spark git commit: [SPARK-16162] Remove dead code OrcTableScan.
Repository: spark Updated Branches: refs/heads/master f34b5c62b -> 4374a46bf [SPARK-16162] Remove dead code OrcTableScan. ## What changes were proposed in this pull request? SPARK-14535 removed all calls to class OrcTableScan. This removes the dead code. ## How was this patch tested? Existing unit tests. Author: Brian Cho Closes #13869 from dafrista/clean-up-orctablescan. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4374a46b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4374a46b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4374a46b Branch: refs/heads/master Commit: 4374a46bfc52ee4f3ae9f61ccedc77a62aa9d4ee Parents: f34b5c6 Author: Brian Cho Authored: Wed Jun 22 22:37:50 2016 -0700 Committer: Reynold Xin Committed: Wed Jun 22 22:37:50 2016 -0700 -- .../spark/sql/hive/orc/OrcFileFormat.scala | 67 +--- 1 file changed, 1 insertion(+), 66 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4374a46b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index a2c8092..5de3507 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -27,12 +27,10 @@ import org.apache.hadoop.hive.ql.io.orc._ import org.apache.hadoop.hive.serde2.objectinspector.{SettableStructObjectInspector, StructObjectInspector} import org.apache.hadoop.hive.serde2.typeinfo.{StructTypeInfo, TypeInfoUtils} import org.apache.hadoop.io.{NullWritable, Writable} -import org.apache.hadoop.mapred.{InputFormat => MapRedInputFormat, JobConf, OutputFormat => MapRedOutputFormat, RecordWriter, Reporter} +import org.apache.hadoop.mapred.{JobConf, OutputFormat => MapRedOutputFormat, RecordWriter, Reporter} import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit} -import org.apache.spark.internal.Logging -import org.apache.spark.rdd.{HadoopRDD, RDD} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -260,69 +258,6 @@ private[orc] class OrcOutputWriter( } } -private[orc] case class OrcTableScan( -@transient sparkSession: SparkSession, -attributes: Seq[Attribute], -filters: Array[Filter], -@transient inputPaths: Seq[FileStatus]) - extends Logging - with HiveInspectors { - - def execute(): RDD[InternalRow] = { -val job = Job.getInstance(sparkSession.sessionState.newHadoopConf()) -val conf = job.getConfiguration - -// Figure out the actual schema from the ORC source (without partition columns) so that we -// can pick the correct ordinals. Note that this assumes that all files have the same schema. 
-val orcFormat = new OrcFileFormat -val dataSchema = - orcFormat -.inferSchema(sparkSession, Map.empty, inputPaths) -.getOrElse(sys.error("Failed to read schema from target ORC files.")) - -// Tries to push down filters if ORC filter push-down is enabled -if (sparkSession.sessionState.conf.orcFilterPushDown) { - OrcFilters.createFilter(dataSchema, filters).foreach { f => -conf.set(OrcTableScan.SARG_PUSHDOWN, f.toKryo) -conf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true) - } -} - -// Sets requested columns -OrcRelation.setRequiredColumns(conf, dataSchema, StructType.fromAttributes(attributes)) - -if (inputPaths.isEmpty) { - // the input path probably be pruned, return an empty RDD. - return sparkSession.sparkContext.emptyRDD[InternalRow] -} -FileInputFormat.setInputPaths(job, inputPaths.map(_.getPath): _*) - -val inputFormatClass = - classOf[OrcInputFormat] -.asInstanceOf[Class[_ <: MapRedInputFormat[NullWritable, Writable]]] - -val rdd = sparkSession.sparkContext.hadoopRDD( - conf.asInstanceOf[JobConf], - inputFormatClass, - classOf[NullWritable], - classOf[Writable] -).asInstanceOf[HadoopRDD[NullWritable, Writable]] - -val wrappedConf = new SerializableConfiguration(conf) - -rdd.mapPartitionsWithInputSplit { case (split: OrcSplit, iterator) => - val writableIterator = iterator.map(_._2) - val maybeStructOI = OrcFileOperator.getObjectInspector(split.getPath.toString, Some(conf)) - OrcRelation.unwrapOrcStructs( -wrappedConf.value, -StructType.fromAttributes(attributes), -maybeStructOI, -writableIterator - ) -} -
spark git commit: [SQL][MINOR] Fix minor formatting issues in SHOW CREATE TABLE output
Repository: spark Updated Branches: refs/heads/branch-2.0 5b4a9a4c3 -> 4ad731ed6 [SQL][MINOR] Fix minor formatting issues in SHOW CREATE TABLE output ## What changes were proposed in this pull request? This PR fixes two minor formatting issues appearing in `SHOW CREATE TABLE` output. Before: ``` CREATE EXTERNAL TABLE ... ... WITH SERDEPROPERTIES ('serialization.format' = '1' ) ... TBLPROPERTIES ('avro.schema.url' = '/tmp/avro/test.avsc', 'transient_lastDdlTime' = '1466638180') ``` After: ``` CREATE EXTERNAL TABLE ... ... WITH SERDEPROPERTIES ( 'serialization.format' = '1' ) ... TBLPROPERTIES ( 'avro.schema.url' = '/tmp/avro/test.avsc', 'transient_lastDdlTime' = '1466638180' ) ``` ## How was this patch tested? Manually tested. Author: Cheng Lian Closes #13864 from liancheng/show-create-table-format-fix. (cherry picked from commit f34b5c62b2da3fe0ea989acea46fff949d349afc) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ad731ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ad731ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ad731ed Branch: refs/heads/branch-2.0 Commit: 4ad731ed6a963131f05c387c2f9536b56d228090 Parents: 5b4a9a4 Author: Cheng Lian Authored: Wed Jun 22 22:28:54 2016 -0700 Committer: Reynold Xin Committed: Wed Jun 22 22:29:00 2016 -0700 -- .../scala/org/apache/spark/sql/execution/command/tables.scala| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4ad731ed/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 3eb93a2..30dc7e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -830,7 +830,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" } - builder ++= serdeProps.mkString("WITH SERDEPROPERTIES (", ",\n ", "\n)\n") + builder ++= serdeProps.mkString("WITH SERDEPROPERTIES (\n ", ",\n ", "\n)\n") } if (storage.inputFormat.isDefined || storage.outputFormat.isDefined) { @@ -864,7 +864,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman } if (props.nonEmpty) { -builder ++= props.mkString("TBLPROPERTIES (", ",\n ", ")\n") +builder ++= props.mkString("TBLPROPERTIES (\n ", ",\n ", "\n)\n") } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SQL][MINOR] Fix minor formatting issues in SHOW CREATE TABLE output
Repository: spark Updated Branches: refs/heads/master 925884a61 -> f34b5c62b [SQL][MINOR] Fix minor formatting issues in SHOW CREATE TABLE output ## What changes were proposed in this pull request? This PR fixes two minor formatting issues appearing in `SHOW CREATE TABLE` output. Before: ``` CREATE EXTERNAL TABLE ... ... WITH SERDEPROPERTIES ('serialization.format' = '1' ) ... TBLPROPERTIES ('avro.schema.url' = '/tmp/avro/test.avsc', 'transient_lastDdlTime' = '1466638180') ``` After: ``` CREATE EXTERNAL TABLE ... ... WITH SERDEPROPERTIES ( 'serialization.format' = '1' ) ... TBLPROPERTIES ( 'avro.schema.url' = '/tmp/avro/test.avsc', 'transient_lastDdlTime' = '1466638180' ) ``` ## How was this patch tested? Manually tested. Author: Cheng Lian Closes #13864 from liancheng/show-create-table-format-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f34b5c62 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f34b5c62 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f34b5c62 Branch: refs/heads/master Commit: f34b5c62b2da3fe0ea989acea46fff949d349afc Parents: 925884a Author: Cheng Lian Authored: Wed Jun 22 22:28:54 2016 -0700 Committer: Reynold Xin Committed: Wed Jun 22 22:28:54 2016 -0700 -- .../scala/org/apache/spark/sql/execution/command/tables.scala| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f34b5c62/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 3eb93a2..30dc7e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -830,7 +830,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" } - builder ++= serdeProps.mkString("WITH SERDEPROPERTIES (", ",\n ", "\n)\n") + builder ++= serdeProps.mkString("WITH SERDEPROPERTIES (\n ", ",\n ", "\n)\n") } if (storage.inputFormat.isDefined || storage.outputFormat.isDefined) { @@ -864,7 +864,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman } if (props.nonEmpty) { -builder ++= props.mkString("TBLPROPERTIES (", ",\n ", ")\n") +builder ++= props.mkString("TBLPROPERTIES (\n ", ",\n ", "\n)\n") } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
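Editor's note: the whole fix is in the `mkString(start, sep, end)` arguments. The standalone sketch below (plain Scala, not Spark code; the property strings mirror the PR description) shows how moving the newline into the prefix and suffix yields the corrected layout.

```scala
// Standalone sketch (not Spark code) of how the corrected mkString arguments
// produce the layout shown in the "After" block above.
object ShowCreateTableFormatSketch {
  def main(args: Array[String]): Unit = {
    val props = Seq(
      "'avro.schema.url' = '/tmp/avro/test.avsc'",
      "'transient_lastDdlTime' = '1466638180'")

    // Old form: the opening parenthesis is glued to the first property and the
    // closing parenthesis to the last one.
    println(props.mkString("TBLPROPERTIES (", ",\n  ", ")\n"))

    // New form: every property sits on its own indented line and the closing
    // parenthesis gets a line of its own.
    println(props.mkString("TBLPROPERTIES (\n  ", ",\n  ", "\n)\n"))
  }
}
```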
spark git commit: [SPARK-15230][SQL] distinct() does not handle column name with dot properly
Repository: spark Updated Branches: refs/heads/master 37f3be5d2 -> 925884a61 [SPARK-15230][SQL] distinct() does not handle column name with dot properly ## What changes were proposed in this pull request? When table is created with column name containing dot, distinct() will fail to run. For example, ```scala val rowRDD = sparkContext.parallelize(Seq(Row(1), Row(1), Row(2))) val schema = StructType(Array(StructField("column.with.dot", IntegerType, nullable = false))) val df = spark.createDataFrame(rowRDD, schema) ``` running the following will have no problem: ```scala df.select(new Column("`column.with.dot`")) ``` but running the query with additional distinct() will cause exception: ```scala df.select(new Column("`column.with.dot`")).distinct() ``` The issue is that distinct() will try to resolve the column name, but the column name in the schema does not have backtick with it. So the solution is to add the backtick before passing the column name to resolve(). ## How was this patch tested? Added a new test case. Author: bomeng Closes #13140 from bomeng/SPARK-15230. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/925884a6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/925884a6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/925884a6 Branch: refs/heads/master Commit: 925884a612dd88beaddf555c74d90856ab040ec7 Parents: 37f3be5 Author: bomeng Authored: Thu Jun 23 11:06:19 2016 +0800 Committer: Wenchen Fan Committed: Thu Jun 23 11:06:19 2016 +0800 -- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 8 +++- .../src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 5 + 2 files changed, 12 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/925884a6/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 02cc398..f1d33c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1812,7 +1812,13 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ def dropDuplicates(colNames: Seq[String]): Dataset[T] = withTypedPlan { -val groupCols = colNames.map(resolve) +val resolver = sparkSession.sessionState.analyzer.resolver +val allColumns = queryExecution.analyzed.output +val groupCols = colNames.map { colName => + allColumns.find(col => resolver(col.name, colName)).getOrElse( +throw new AnalysisException( + s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""")) +} val groupColExprIds = groupCols.map(_.exprId) val aggCols = logicalPlan.output.map { attr => if (groupColExprIds.contains(attr.exprId)) { http://git-wip-us.apache.org/repos/asf/spark/blob/925884a6/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index c8a0f71..1afee9f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1536,4 +1536,9 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { Utils.deleteRecursively(baseDir) } } + + test("SPARK-15230: distinct() does not handle column name with dot properly") { +val df = Seq(1, 1, 2).toDF("column.with.dot") +checkAnswer(df.distinct(), 
Row(1) :: Row(2) :: Nil) + } }
spark git commit: [SPARK-15230][SQL] distinct() does not handle column name with dot properly
Repository: spark Updated Branches: refs/heads/branch-2.0 e0a43235d -> 5b4a9a4c3 [SPARK-15230][SQL] distinct() does not handle column name with dot properly ## What changes were proposed in this pull request? When table is created with column name containing dot, distinct() will fail to run. For example, ```scala val rowRDD = sparkContext.parallelize(Seq(Row(1), Row(1), Row(2))) val schema = StructType(Array(StructField("column.with.dot", IntegerType, nullable = false))) val df = spark.createDataFrame(rowRDD, schema) ``` running the following will have no problem: ```scala df.select(new Column("`column.with.dot`")) ``` but running the query with additional distinct() will cause exception: ```scala df.select(new Column("`column.with.dot`")).distinct() ``` The issue is that distinct() will try to resolve the column name, but the column name in the schema does not have backtick with it. So the solution is to add the backtick before passing the column name to resolve(). ## How was this patch tested? Added a new test case. Author: bomeng Closes #13140 from bomeng/SPARK-15230. (cherry picked from commit 925884a612dd88beaddf555c74d90856ab040ec7) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b4a9a4c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b4a9a4c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b4a9a4c Branch: refs/heads/branch-2.0 Commit: 5b4a9a4c37822cd7528c6bb933da3454fd3bcd37 Parents: e0a4323 Author: bomeng Authored: Thu Jun 23 11:06:19 2016 +0800 Committer: Wenchen Fan Committed: Thu Jun 23 11:06:38 2016 +0800 -- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 8 +++- .../src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 5 + 2 files changed, 12 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5b4a9a4c/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 02cc398..f1d33c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1812,7 +1812,13 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ def dropDuplicates(colNames: Seq[String]): Dataset[T] = withTypedPlan { -val groupCols = colNames.map(resolve) +val resolver = sparkSession.sessionState.analyzer.resolver +val allColumns = queryExecution.analyzed.output +val groupCols = colNames.map { colName => + allColumns.find(col => resolver(col.name, colName)).getOrElse( +throw new AnalysisException( + s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""")) +} val groupColExprIds = groupCols.map(_.exprId) val aggCols = logicalPlan.output.map { attr => if (groupColExprIds.contains(attr.exprId)) { http://git-wip-us.apache.org/repos/asf/spark/blob/5b4a9a4c/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index c8a0f71..1afee9f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1536,4 +1536,9 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { Utils.deleteRecursively(baseDir) } } + + test("SPARK-15230: distinct() does not handle column 
name with dot properly") { +val df = Seq(1, 1, 2).toDF("column.with.dot") +checkAnswer(df.distinct(), Row(1) :: Row(2) :: Nil) + } }
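Editor's note: the snippets from the PR description above, assembled into one runnable program (assuming a local SparkSession) for anyone who wants to reproduce the behavior end to end.

```scala
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// With SPARK-15230 applied, the final distinct() returns Row(1) and Row(2)
// instead of failing to resolve "column.with.dot".
object DistinctWithDotColumn {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("SPARK-15230").getOrCreate()

    val rowRDD = spark.sparkContext.parallelize(Seq(Row(1), Row(1), Row(2)))
    val schema = StructType(Array(StructField("column.with.dot", IntegerType, nullable = false)))
    val df = spark.createDataFrame(rowRDD, schema)

    // Selecting the backtick-quoted column has always worked.
    df.select(new Column("`column.with.dot`")).show()

    // Before the fix this threw an AnalysisException while resolving the name.
    df.select(new Column("`column.with.dot`")).distinct().show()

    spark.stop()
  }
}
```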
spark git commit: [SPARK-16159][SQL] Move RDD creation logic from FileSourceStrategy.apply
Repository: spark Updated Branches: refs/heads/master 9f990fa3f -> 37f3be5d2 [SPARK-16159][SQL] Move RDD creation logic from FileSourceStrategy.apply ## What changes were proposed in this pull request? We embed partitioning logic in FileSourceStrategy.apply, making the function very long. This is a small refactoring to move it into its own functions. Eventually we would be able to move the partitioning functions into a physical operator, rather than doing it in physical planning. ## How was this patch tested? This is a simple code move. Author: Reynold Xin Closes #13862 from rxin/SPARK-16159. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37f3be5d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37f3be5d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37f3be5d Branch: refs/heads/master Commit: 37f3be5d29192db0a54f6c4699237b149bd0ecae Parents: 9f990fa Author: Reynold Xin Authored: Wed Jun 22 18:19:07 2016 -0700 Committer: Reynold Xin Committed: Wed Jun 22 18:19:07 2016 -0700 -- .../sql/execution/datasources/FileScanRDD.scala | 26 +- .../datasources/FileSourceStrategy.scala| 240 +++ 2 files changed, 154 insertions(+), 112 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/37f3be5d/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index f7f68b1..1443057 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -27,9 +27,14 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.vectorized.ColumnarBatch /** - * A single file that should be read, along with partition column values that - * need to be prepended to each row. The reading should start at the first - * valid record found after `start`. + * A part (i.e. "block") of a single file that should be read, along with partition column values + * that need to be prepended to each row. + * + * @param partitionValues value of partition columns to be prepended to each row. + * @param filePath path of the file to read + * @param start the beginning offset (in bytes) of the block. + * @param length number of bytes to read. + * @param locations locality information (list of nodes that have the data). */ case class PartitionedFile( partitionValues: InternalRow, @@ -43,13 +48,14 @@ case class PartitionedFile( } /** - * A collection of files that should be read as a single task possibly from multiple partitioned - * directories. - * - * TODO: This currently does not take locality information about the files into account. + * A collection of file blocks that should be read as a single task + * (possibly from multiple partitioned directories). */ case class FilePartition(index: Int, files: Seq[PartitionedFile]) extends RDDPartition +/** + * An RDD that scans a list of file partitions. 
+ */ class FileScanRDD( @transient private val sparkSession: SparkSession, readFunction: (PartitionedFile) => Iterator[InternalRow], @@ -88,8 +94,8 @@ class FileScanRDD( private[this] var currentFile: PartitionedFile = null private[this] var currentIterator: Iterator[Object] = null - def hasNext = (currentIterator != null && currentIterator.hasNext) || nextIterator() - def next() = { + def hasNext: Boolean = (currentIterator != null && currentIterator.hasNext) || nextIterator() + def next(): Object = { val nextElement = currentIterator.next() // TODO: we should have a better separation of row based and batch based scan, so that we // don't need to run this `if` for every record. @@ -120,7 +126,7 @@ class FileScanRDD( } } - override def close() = { + override def close(): Unit = { updateBytesRead() updateBytesReadWithFileSize() InputFileNameHolder.unsetInputFileName() http://git-wip-us.apache.org/repos/asf/spark/blob/37f3be5d/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 13a86bf..04f166f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceSt
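Editor's note: to make the `FilePartition` doc comment above concrete, here is a small self-contained sketch of packing file blocks into read tasks. It is an illustration of the idea only — the names `FileBlock`/`TaskPartition` and the greedy largest-first heuristic are ours, not the exact logic in FileSourceStrategy (which also derives the per-task byte budget from configuration and handles locality).

```scala
// Group file blocks into tasks so each task reads roughly the same number of bytes.
object FilePackingSketch {
  // Stand-ins for Spark's PartitionedFile / FilePartition.
  case class FileBlock(path: String, start: Long, length: Long)
  case class TaskPartition(index: Int, blocks: Seq[FileBlock])

  def pack(blocks: Seq[FileBlock], maxBytesPerTask: Long): Seq[TaskPartition] = {
    val partitions = scala.collection.mutable.ArrayBuffer.empty[TaskPartition]
    val current = scala.collection.mutable.ArrayBuffer.empty[FileBlock]
    var currentSize = 0L

    // Seal the partition being built and start a fresh one.
    def close(): Unit = if (current.nonEmpty) {
      partitions += TaskPartition(partitions.size, current.toList)
      current.clear()
      currentSize = 0L
    }

    // Largest blocks first tends to even out task sizes.
    blocks.sortBy(-_.length).foreach { block =>
      if (currentSize + block.length > maxBytesPerTask) close()
      current += block
      currentSize += block.length
    }
    close()
    partitions.toList
  }

  def main(args: Array[String]): Unit = {
    val blocks = Seq(
      FileBlock("/data/a.parquet", 0, 96L * 1024 * 1024),
      FileBlock("/data/b.parquet", 0, 40L * 1024 * 1024),
      FileBlock("/data/c.parquet", 0, 10L * 1024 * 1024))
    pack(blocks, maxBytesPerTask = 128L * 1024 * 1024).foreach(println)
  }
}
```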
spark git commit: [SPARK-16024][SQL][TEST] Verify Column Comment for Data Source Tables
Repository: spark Updated Branches: refs/heads/branch-2.0 e2eb8e002 -> e0a43235d [SPARK-16024][SQL][TEST] Verify Column Comment for Data Source Tables What changes were proposed in this pull request? This PR is to improve test coverage. It verifies whether `Comment` of `Column` can be appropriate handled. The test cases verify the related parts in Parser, both SQL and DataFrameWriter interface, and both Hive Metastore catalog and In-memory catalog. How was this patch tested? N/A Author: gatorsmile Closes #13764 from gatorsmile/dataSourceComment. (cherry picked from commit 9f990fa3f9e0b798d8018cf4132b93a3468f33bb) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e0a43235 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e0a43235 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e0a43235 Branch: refs/heads/branch-2.0 Commit: e0a43235d9d59736ceb0d703c653ef1350e143ec Parents: e2eb8e0 Author: gatorsmile Authored: Thu Jun 23 09:12:20 2016 +0800 Committer: Wenchen Fan Committed: Thu Jun 23 09:13:17 2016 +0800 -- .../spark/sql/execution/command/DDLCommandSuite.scala | 10 +++--- .../apache/spark/sql/execution/command/DDLSuite.scala | 13 + .../spark/sql/hive/execution/HiveDDLSuite.scala | 14 ++ 3 files changed, 34 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e0a43235/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 5bee28b..7b96f4c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.execution.datasources.{BucketSpec, CreateTableUsing} import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} -import org.apache.spark.sql.types.{IntegerType, StringType, StructType} +import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StringType, StructType} // TODO: merge this with DDLSuite (SPARK-14441) @@ -349,10 +349,14 @@ class DDLCommandSuite extends PlanTest { } test("create table using - with partitioned by") { -val query = "CREATE TABLE my_tab(a INT, b STRING) USING parquet PARTITIONED BY (a)" +val query = "CREATE TABLE my_tab(a INT comment 'test', b STRING) " + + "USING parquet PARTITIONED BY (a)" val expected = CreateTableUsing( TableIdentifier("my_tab"), - Some(new StructType().add("a", IntegerType).add("b", StringType)), + Some(new StructType() +.add("a", IntegerType, nullable = true, + new MetadataBuilder().putString("comment", s"test").build()) +.add("b", StringType)), "parquet", false, Map.empty, http://git-wip-us.apache.org/repos/asf/spark/blob/e0a43235/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index f40ddcc..47d8a28 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -252,6 +252,19 @@ class DDLSuite 
extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } } + test("desc table for parquet data source table using in-memory catalog") { +assume(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory") +val tabName = "tab1" +withTable(tabName) { + sql(s"CREATE TABLE $tabName(a int comment 'test') USING parquet ") + + checkAnswer( +sql(s"DESC $tabName").select("col_name", "data_type", "comment"), +Row("a", "int", "test") + ) +} + } + test("Alter/Describe Database") { withTempDir { tmpDir => val path = tmpDir.toString http://git-wip-us.apache.org/repos/asf/spark/blob/e0a43235/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLS
spark git commit: [SPARK-16024][SQL][TEST] Verify Column Comment for Data Source Tables
Repository: spark Updated Branches: refs/heads/master 4f869f88e -> 9f990fa3f [SPARK-16024][SQL][TEST] Verify Column Comment for Data Source Tables What changes were proposed in this pull request? This PR is to improve test coverage. It verifies whether `Comment` of `Column` can be appropriate handled. The test cases verify the related parts in Parser, both SQL and DataFrameWriter interface, and both Hive Metastore catalog and In-memory catalog. How was this patch tested? N/A Author: gatorsmile Closes #13764 from gatorsmile/dataSourceComment. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f990fa3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f990fa3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f990fa3 Branch: refs/heads/master Commit: 9f990fa3f9e0b798d8018cf4132b93a3468f33bb Parents: 4f869f8 Author: gatorsmile Authored: Thu Jun 23 09:12:20 2016 +0800 Committer: Wenchen Fan Committed: Thu Jun 23 09:12:20 2016 +0800 -- .../spark/sql/execution/command/DDLCommandSuite.scala | 10 +++--- .../apache/spark/sql/execution/command/DDLSuite.scala | 13 + .../spark/sql/hive/execution/HiveDDLSuite.scala | 14 ++ 3 files changed, 34 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9f990fa3/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 5bee28b..7b96f4c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.execution.datasources.{BucketSpec, CreateTableUsing} import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} -import org.apache.spark.sql.types.{IntegerType, StringType, StructType} +import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StringType, StructType} // TODO: merge this with DDLSuite (SPARK-14441) @@ -349,10 +349,14 @@ class DDLCommandSuite extends PlanTest { } test("create table using - with partitioned by") { -val query = "CREATE TABLE my_tab(a INT, b STRING) USING parquet PARTITIONED BY (a)" +val query = "CREATE TABLE my_tab(a INT comment 'test', b STRING) " + + "USING parquet PARTITIONED BY (a)" val expected = CreateTableUsing( TableIdentifier("my_tab"), - Some(new StructType().add("a", IntegerType).add("b", StringType)), + Some(new StructType() +.add("a", IntegerType, nullable = true, + new MetadataBuilder().putString("comment", s"test").build()) +.add("b", StringType)), "parquet", false, Map.empty, http://git-wip-us.apache.org/repos/asf/spark/blob/9f990fa3/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index f40ddcc..47d8a28 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -252,6 +252,19 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } } + test("desc table for parquet 
data source table using in-memory catalog") { +assume(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory") +val tabName = "tab1" +withTable(tabName) { + sql(s"CREATE TABLE $tabName(a int comment 'test') USING parquet ") + + checkAnswer( +sql(s"DESC $tabName").select("col_name", "data_type", "comment"), +Row("a", "int", "test") + ) +} + } + test("Alter/Describe Database") { withTempDir { tmpDir => val path = tmpDir.toString http://git-wip-us.apache.org/repos/asf/spark/blob/9f990fa3/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index b2f01fc..89f69c8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/executio
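Editor's note: a short illustration of what these tests exercise, assuming a local SparkSession with the default in-memory catalog. It shows the SQL path (`COMMENT` in the column definition surfacing through `DESC`) and the schema-metadata form that the `DDLCommandSuite` assertion builds by hand.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StructType}

object ColumnCommentExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("column-comment").getOrCreate()

    // SQL interface: the comment is attached to the column definition...
    spark.sql("CREATE TABLE tab1(a INT COMMENT 'test') USING parquet")
    // ...and surfaces in DESC output as (col_name, data_type, comment) = (a, int, test).
    spark.sql("DESC tab1").show()

    // In the parsed schema the comment travels as column metadata, which is
    // exactly what the DDLCommandSuite expectation constructs.
    val expected = new StructType()
      .add("a", IntegerType, nullable = true,
        new MetadataBuilder().putString("comment", "test").build())
    println(expected.prettyJson)

    spark.sql("DROP TABLE tab1")
    spark.stop()
  }
}
```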
spark git commit: [SPARK-15956][SQL] When unwrapping ORC avoid pattern matching at runtime
Repository: spark Updated Branches: refs/heads/master 044971eca -> 4f869f88e [SPARK-15956][SQL] When unwrapping ORC avoid pattern matching at runtime ## What changes were proposed in this pull request? Extend the returning of unwrapper functions from primitive types to all types. This PR is based on https://github.com/apache/spark/pull/13676. It only fixes a bug with scala-2.10 compilation. All credit should go to dafrista. ## How was this patch tested? The patch should pass all unit tests. Reading ORC files with non-primitive types with this change reduced the read time by ~15%. Author: Brian Cho Author: Herman van Hovell Closes #13854 from hvanhovell/SPARK-15956-scala210. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f869f88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f869f88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f869f88 Branch: refs/heads/master Commit: 4f869f88ee96fa57be79f972f218111b6feac67f Parents: 044971e Author: Brian Cho Authored: Wed Jun 22 16:56:55 2016 -0700 Committer: Herman van Hovell Committed: Wed Jun 22 16:56:55 2016 -0700 -- .../apache/spark/sql/hive/HiveInspectors.scala | 428 +-- .../org/apache/spark/sql/hive/TableReader.scala | 3 +- .../hive/execution/ScriptTransformation.scala | 6 +- .../org/apache/spark/sql/hive/hiveUDFs.scala| 21 +- .../spark/sql/hive/HiveInspectorSuite.scala | 6 + 5 files changed, 314 insertions(+), 150 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f869f88/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 585befe..bf5cc17 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -239,145 +239,6 @@ private[hive] trait HiveInspectors { } /** - * Converts hive types to native catalyst types. - * @param data the data in Hive type - * @param oi the ObjectInspector associated with the Hive Type - * @return convert the data into catalyst type - * TODO return the function of (data => Any) instead for performance consideration - * - * Strictly follows the following order in unwrapping (constant OI has the higher priority): - * Constant Null object inspector => - *return null - * Constant object inspector => - *extract the value from constant object inspector - * Check whether the `data` is null => - *return null if true - * If object inspector prefers writable => - *extract writable from `data` and then get the catalyst type from the writable - * Extract the java object directly from the object inspector - * - * NOTICE: the complex data type requires recursive unwrapping. 
- */ - def unwrap(data: Any, oi: ObjectInspector): Any = oi match { -case coi: ConstantObjectInspector if coi.getWritableConstantValue == null => null -case poi: WritableConstantStringObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.toString) -case poi: WritableConstantHiveVarcharObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.getHiveVarchar.getValue) -case poi: WritableConstantHiveCharObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.getHiveChar.getValue) -case poi: WritableConstantHiveDecimalObjectInspector => - HiveShim.toCatalystDecimal( -PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector, -poi.getWritableConstantValue.getHiveDecimal) -case poi: WritableConstantTimestampObjectInspector => - val t = poi.getWritableConstantValue - t.getSeconds * 100L + t.getNanos / 1000L -case poi: WritableConstantIntObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantDoubleObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantBooleanObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantLongObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantFloatObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantShortObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantByteObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantBinaryObjectInspector => - val writable = poi.getWritableConstantValue - val temp = new Array[Byte](writable.getLength) -
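Editor's note: the heart of this change is returning an unwrapper *function* per column instead of pattern matching on the ObjectInspector for every value. The self-contained sketch below uses a tiny stand-in ADT (and arbitrary conversions) rather than Hive's ObjectInspector API, so it only illustrates the shape of the optimization.

```scala
object UnwrapperSketch {
  sealed trait Inspector
  case object IntInspector extends Inspector
  case object StringInspector extends Inspector
  case class ListInspector(element: Inspector) extends Inspector

  // Per-value style (what the old unwrap(data, oi) did): the match re-runs per row.
  def unwrap(data: Any, oi: Inspector): Any = oi match {
    case IntInspector     => data.asInstanceOf[Int]
    case StringInspector  => data.asInstanceOf[String].trim
    case ListInspector(e) => data.asInstanceOf[Seq[Any]].map(unwrap(_, e))
  }

  // Per-column style (what the patch generalizes to all types): resolve the
  // match once, return a closure, and apply that closure to each value.
  def unwrapperFor(oi: Inspector): Any => Any = oi match {
    case IntInspector     => data => data.asInstanceOf[Int]
    case StringInspector  => data => data.asInstanceOf[String].trim
    case ListInspector(e) =>
      val elemUnwrapper = unwrapperFor(e)
      data => data.asInstanceOf[Seq[Any]].map(elemUnwrapper)
  }

  def main(args: Array[String]): Unit = {
    val rows: Seq[Any] = Seq(Seq("a ", " b"), Seq("c "))
    val unwrapColumn = unwrapperFor(ListInspector(StringInspector))
    rows.foreach(r => println(unwrapColumn(r)))
  }
}
```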
spark git commit: [SPARK-16131] initialize internal logger lazily in Scala preferred way
Repository: spark Updated Branches: refs/heads/branch-2.0 1d3c56e77 -> e2eb8e002 [SPARK-16131] initialize internal logger lazily in Scala preferred way ## What changes were proposed in this pull request? Initialize logger instance lazily in Scala preferred way ## How was this patch tested? By running `./build/mvn clean test` locally Author: Prajwal Tuladhar Closes #13842 from infynyxx/spark_internal_logger. (cherry picked from commit 044971eca0ff3c2ce62afa665dbd3072d52cbbec) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2eb8e00 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2eb8e00 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2eb8e00 Branch: refs/heads/branch-2.0 Commit: e2eb8e002acb19fd266d2237baec31f74aa02ef8 Parents: 1d3c56e Author: Prajwal Tuladhar Authored: Wed Jun 22 16:30:10 2016 -0700 Committer: Shixiong Zhu Committed: Wed Jun 22 16:30:18 2016 -0700 -- .../scala/org/apache/spark/internal/Logging.scala | 14 -- .../cluster/CoarseGrainedSchedulerBackend.scala | 2 -- 2 files changed, 4 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e2eb8e00/core/src/main/scala/org/apache/spark/internal/Logging.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/Logging.scala b/core/src/main/scala/org/apache/spark/internal/Logging.scala index 66a0cfe..c51050c 100644 --- a/core/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/core/src/main/scala/org/apache/spark/internal/Logging.scala @@ -32,7 +32,10 @@ private[spark] trait Logging { // Make the log field transient so that objects with Logging can // be serialized and used on another machine - @transient private var log_ : Logger = null + @transient lazy val log: Logger = { +initializeLogIfNecessary(false) +LoggerFactory.getLogger(logName) + } // Method to get the logger name for this object protected def logName = { @@ -40,15 +43,6 @@ private[spark] trait Logging { this.getClass.getName.stripSuffix("$") } - // Method to get or create the logger for this object - protected def log: Logger = { -if (log_ == null) { - initializeLogIfNecessary(false) - log_ = LoggerFactory.getLogger(logName) -} -log_ - } - // Log methods that take only a String protected def logInfo(msg: => String) { if (log.isInfoEnabled) log.info(msg) http://git-wip-us.apache.org/repos/asf/spark/blob/e2eb8e00/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 967c4d5..8259923 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -100,8 +100,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // instance across threads private val ser = SparkEnv.get.closureSerializer.newInstance() -override protected def log = CoarseGrainedSchedulerBackend.this.log - protected val addressToExecutorId = new HashMap[RpcAddress, String] private val reviveThread = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16131] initialize internal logger lazily in Scala preferred way
Repository: spark Updated Branches: refs/heads/master 857ecff1d -> 044971eca [SPARK-16131] initialize internal logger lazily in Scala preferred way ## What changes were proposed in this pull request? Initialize logger instance lazily in Scala preferred way ## How was this patch tested? By running `./build/mvn clean test` locally Author: Prajwal Tuladhar Closes #13842 from infynyxx/spark_internal_logger. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/044971ec Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/044971ec Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/044971ec Branch: refs/heads/master Commit: 044971eca0ff3c2ce62afa665dbd3072d52cbbec Parents: 857ecff Author: Prajwal Tuladhar Authored: Wed Jun 22 16:30:10 2016 -0700 Committer: Shixiong Zhu Committed: Wed Jun 22 16:30:10 2016 -0700 -- .../scala/org/apache/spark/internal/Logging.scala | 14 -- .../cluster/CoarseGrainedSchedulerBackend.scala | 2 -- 2 files changed, 4 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/044971ec/core/src/main/scala/org/apache/spark/internal/Logging.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/Logging.scala b/core/src/main/scala/org/apache/spark/internal/Logging.scala index 66a0cfe..c51050c 100644 --- a/core/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/core/src/main/scala/org/apache/spark/internal/Logging.scala @@ -32,7 +32,10 @@ private[spark] trait Logging { // Make the log field transient so that objects with Logging can // be serialized and used on another machine - @transient private var log_ : Logger = null + @transient lazy val log: Logger = { +initializeLogIfNecessary(false) +LoggerFactory.getLogger(logName) + } // Method to get the logger name for this object protected def logName = { @@ -40,15 +43,6 @@ private[spark] trait Logging { this.getClass.getName.stripSuffix("$") } - // Method to get or create the logger for this object - protected def log: Logger = { -if (log_ == null) { - initializeLogIfNecessary(false) - log_ = LoggerFactory.getLogger(logName) -} -log_ - } - // Log methods that take only a String protected def logInfo(msg: => String) { if (log.isInfoEnabled) log.info(msg) http://git-wip-us.apache.org/repos/asf/spark/blob/044971ec/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 967c4d5..8259923 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -100,8 +100,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // instance across threads private val ser = SparkEnv.get.closureSerializer.newInstance() -override protected def log = CoarseGrainedSchedulerBackend.this.log - protected val addressToExecutorId = new HashMap[RpcAddress, String] private val reviveThread = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
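Editor's note: a minimal standalone version of the pattern adopted here, assuming slf4j-api (plus a logging binding) on the classpath. It is not Spark's `Logging` trait, just the same lazy-initialization idea in isolation.

```scala
import org.slf4j.{Logger, LoggerFactory}

// The lazy val replaces the null-check-and-assign getter: its initializer runs
// at most once, and @transient keeps the logger out of serialized closures when
// the trait is mixed into a Serializable class.
trait LazyLogging {
  @transient protected lazy val log: Logger =
    LoggerFactory.getLogger(this.getClass.getName.stripSuffix("$"))

  // By-name parameter: the message string is only built if INFO is enabled.
  protected def logInfo(msg: => String): Unit = {
    if (log.isInfoEnabled) log.info(msg)
  }
}

object LazyLoggingDemo extends LazyLogging {
  def main(args: Array[String]): Unit = logInfo(s"started with ${args.length} args")
}
```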
spark git commit: [SPARK-16155][DOC] remove package grouping in Java docs
Repository: spark Updated Branches: refs/heads/branch-2.0 02435acf3 -> 1d3c56e77 [SPARK-16155][DOC] remove package grouping in Java docs ## What changes were proposed in this pull request? In 1.4 and earlier releases, we have package grouping in the generated Java API docs. See http://spark.apache.org/docs/1.4.0/api/java/index.html. However, this disappeared in 1.5.0: http://spark.apache.org/docs/1.5.0/api/java/index.html. Rather than fixing it, I'd suggest removing grouping. Because it might take some time to fix and it is a manual process to update the grouping in `SparkBuild.scala`. I didn't find anyone complaining about missing groups since 1.5.0 on Google. Manually checked the generated Java API docs and confirmed that they are the same as in master. Author: Xiangrui Meng Closes #13856 from mengxr/SPARK-16155. (cherry picked from commit 857ecff1d8268b28bb287e47cda370c87afe9d41) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d3c56e7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d3c56e7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d3c56e7 Branch: refs/heads/branch-2.0 Commit: 1d3c56e778b28ad4587d07765896814bfc1201f4 Parents: 02435ac Author: Xiangrui Meng Authored: Wed Jun 22 15:52:37 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 15:52:47 2016 -0700 -- project/SparkBuild.scala | 20 1 file changed, 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1d3c56e7/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index bce7f1d..4b44469 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -684,11 +684,6 @@ object Unidoc { import sbtunidoc.Plugin._ import UnidocKeys._ - // for easier specification of JavaDoc package groups - private def packageList(names: String*): String = { -names.map(s => "org.apache.spark." + s).mkString(":") - } - private def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { packages .map(_.filterNot(_.getName.contains("$"))) @@ -731,21 +726,6 @@ object Unidoc { javacOptions in doc := Seq( "-windowtitle", "Spark " + version.value.replaceAll("-SNAPSHOT", "") + " JavaDoc", "-public", - "-group", "Core Java API", packageList("api.java", "api.java.function"), - "-group", "Spark Streaming", packageList( -"streaming.api.java", "streaming.flume", "streaming.kafka", "streaming.kinesis" - ), - "-group", "MLlib", packageList( -"mllib.classification", "mllib.clustering", "mllib.evaluation.binary", "mllib.linalg", -"mllib.linalg.distributed", "mllib.optimization", "mllib.rdd", "mllib.recommendation", -"mllib.regression", "mllib.stat", "mllib.tree", "mllib.tree.configuration", -"mllib.tree.impurity", "mllib.tree.model", "mllib.util", -"mllib.evaluation", "mllib.feature", "mllib.random", "mllib.stat.correlation", -"mllib.stat.test", "mllib.tree.impl", "mllib.tree.loss", -"ml", "ml.attribute", "ml.classification", "ml.clustering", "ml.evaluation", "ml.feature", -"ml.param", "ml.recommendation", "ml.regression", "ml.tuning" - ), - "-group", "Spark SQL", packageList("sql.api.java", "sql.api.java.types", "sql.hive.api.java"), "-noqualifier", "java.lang" ), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16155][DOC] remove package grouping in Java docs
Repository: spark Updated Branches: refs/heads/master 00cc5cca4 -> 857ecff1d [SPARK-16155][DOC] remove package grouping in Java docs ## What changes were proposed in this pull request? In 1.4 and earlier releases, we have package grouping in the generated Java API docs. See http://spark.apache.org/docs/1.4.0/api/java/index.html. However, this disappeared in 1.5.0: http://spark.apache.org/docs/1.5.0/api/java/index.html. Rather than fixing it, I'd suggest removing grouping. Because it might take some time to fix and it is a manual process to update the grouping in `SparkBuild.scala`. I didn't find anyone complaining about missing groups since 1.5.0 on Google. Manually checked the generated Java API docs and confirmed that they are the same as in master. Author: Xiangrui Meng Closes #13856 from mengxr/SPARK-16155. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/857ecff1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/857ecff1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/857ecff1 Branch: refs/heads/master Commit: 857ecff1d8268b28bb287e47cda370c87afe9d41 Parents: 00cc5cc Author: Xiangrui Meng Authored: Wed Jun 22 15:52:37 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 15:52:37 2016 -0700 -- project/SparkBuild.scala | 20 1 file changed, 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/857ecff1/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index bce7f1d..4b44469 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -684,11 +684,6 @@ object Unidoc { import sbtunidoc.Plugin._ import UnidocKeys._ - // for easier specification of JavaDoc package groups - private def packageList(names: String*): String = { -names.map(s => "org.apache.spark." + s).mkString(":") - } - private def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { packages .map(_.filterNot(_.getName.contains("$"))) @@ -731,21 +726,6 @@ object Unidoc { javacOptions in doc := Seq( "-windowtitle", "Spark " + version.value.replaceAll("-SNAPSHOT", "") + " JavaDoc", "-public", - "-group", "Core Java API", packageList("api.java", "api.java.function"), - "-group", "Spark Streaming", packageList( -"streaming.api.java", "streaming.flume", "streaming.kafka", "streaming.kinesis" - ), - "-group", "MLlib", packageList( -"mllib.classification", "mllib.clustering", "mllib.evaluation.binary", "mllib.linalg", -"mllib.linalg.distributed", "mllib.optimization", "mllib.rdd", "mllib.recommendation", -"mllib.regression", "mllib.stat", "mllib.tree", "mllib.tree.configuration", -"mllib.tree.impurity", "mllib.tree.model", "mllib.util", -"mllib.evaluation", "mllib.feature", "mllib.random", "mllib.stat.correlation", -"mllib.stat.test", "mllib.tree.impl", "mllib.tree.loss", -"ml", "ml.attribute", "ml.classification", "ml.clustering", "ml.evaluation", "ml.feature", -"ml.param", "ml.recommendation", "ml.regression", "ml.tuning" - ), - "-group", "Spark SQL", packageList("sql.api.java", "sql.api.java.types", "sql.hive.api.java"), "-noqualifier", "java.lang" ), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16153][MLLIB] switch to multi-line doc to avoid a genjavadoc bug
Repository: spark Updated Branches: refs/heads/branch-2.0 282a3cd02 -> 02435acf3 [SPARK-16153][MLLIB] switch to multi-line doc to avoid a genjavadoc bug ## What changes were proposed in this pull request? We recently deprecated setLabelCol in ChiSqSelectorModel (#13823): ~~~scala /** group setParam */ Since("1.6.0") deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) ~~~ This unfortunately hit a genjavadoc bug and broken doc generation. This is the generated Java code: ~~~java /** group setParam */ public org.apache.spark.ml.feature.ChiSqSelectorModel setOutputCol (java.lang.String value) { throw new RuntimeException(); } * * deprecated labelCol is not used by ChiSqSelectorModel. Since 2.0.0. */ public org.apache.spark.ml.feature.ChiSqSelectorModel setLabelCol (java.lang.String value) { throw new RuntimeException(); } ~~~ Switching to multiline is a workaround. Author: Xiangrui Meng Closes #13855 from mengxr/SPARK-16153. (cherry picked from commit 00cc5cca4522297b63b1522a2b8643b1a098e2b3) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/02435acf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/02435acf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/02435acf Branch: refs/heads/branch-2.0 Commit: 02435acf3bf84f77bb3c70a2fd548af8bad4c28e Parents: 282a3cd Author: Xiangrui Meng Authored: Wed Jun 22 15:50:21 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 15:50:28 2016 -0700 -- .../main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/02435acf/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 38b4db9..712634d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -140,7 +140,9 @@ final class ChiSqSelectorModel private[ml] ( @Since("1.6.0") def setOutputCol(value: String): this.type = set(outputCol, value) - /** @group setParam */ + /** + * @group setParam + */ @Since("1.6.0") @deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16153][MLLIB] switch to multi-line doc to avoid a genjavadoc bug
Repository: spark Updated Branches: refs/heads/master 20d411bc5 -> 00cc5cca4 [SPARK-16153][MLLIB] switch to multi-line doc to avoid a genjavadoc bug ## What changes were proposed in this pull request? We recently deprecated setLabelCol in ChiSqSelectorModel (#13823): ~~~scala /** group setParam */ Since("1.6.0") deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) ~~~ This unfortunately hit a genjavadoc bug and broken doc generation. This is the generated Java code: ~~~java /** group setParam */ public org.apache.spark.ml.feature.ChiSqSelectorModel setOutputCol (java.lang.String value) { throw new RuntimeException(); } * * deprecated labelCol is not used by ChiSqSelectorModel. Since 2.0.0. */ public org.apache.spark.ml.feature.ChiSqSelectorModel setLabelCol (java.lang.String value) { throw new RuntimeException(); } ~~~ Switching to multiline is a workaround. Author: Xiangrui Meng Closes #13855 from mengxr/SPARK-16153. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00cc5cca Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00cc5cca Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00cc5cca Branch: refs/heads/master Commit: 00cc5cca4522297b63b1522a2b8643b1a098e2b3 Parents: 20d411b Author: Xiangrui Meng Authored: Wed Jun 22 15:50:21 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 15:50:21 2016 -0700 -- .../main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00cc5cca/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 38b4db9..712634d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -140,7 +140,9 @@ final class ChiSqSelectorModel private[ml] ( @Since("1.6.0") def setOutputCol(value: String): this.type = set(outputCol, value) - /** @group setParam */ + /** + * @group setParam + */ @Since("1.6.0") @deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
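Editor's note: a minimal shape of the workaround. Spark's `@Since` annotation is left out so the snippet compiles on its own; the point is only that the Scaladoc on the deprecated, annotated setter spans multiple lines, which keeps genjavadoc from emitting a malformed Java comment.

```scala
// Illustrative class, not Spark's ChiSqSelectorModel.
class SelectorModelLike {
  private var labelCol: String = "label"

  /**
   * @group setParam
   */
  @deprecated("labelCol is not used by this model.", "2.0.0")
  def setLabelCol(value: String): this.type = { labelCol = value; this }
}
```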
spark git commit: [SPARK-16078][SQL] from_utc_timestamp/to_utc_timestamp should not depends on local timezone
Repository: spark Updated Branches: refs/heads/branch-2.0 299f427b7 -> 282a3cd02 [SPARK-16078][SQL] from_utc_timestamp/to_utc_timestamp should not depends on local timezone ## What changes were proposed in this pull request? Currently, we use local timezone to parse or format a timestamp (TimestampType), then use Long as the microseconds since epoch UTC. In from_utc_timestamp() and to_utc_timestamp(), we did not consider the local timezone, they could return different results with different local timezone. This PR will do the conversion based on human time (in local timezone), it should return same result in whatever timezone. But because the mapping from absolute timestamp to human time is not exactly one-to-one mapping, it will still return wrong result in some timezone (also in the begging or ending of DST). This PR is kind of the best effort fix. In long term, we should make the TimestampType be timezone aware to fix this totally. ## How was this patch tested? Tested these function in all timezone. Author: Davies Liu Closes #13784 from davies/convert_tz. (cherry picked from commit 20d411bc5d05dd099f6d5234a24e10a519a39bdf) Signed-off-by: Herman van Hovell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/282a3cd0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/282a3cd0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/282a3cd0 Branch: refs/heads/branch-2.0 Commit: 282a3cd02389464d6adbf02921281c963da29b00 Parents: 299f427 Author: Davies Liu Authored: Wed Jun 22 13:40:24 2016 -0700 Committer: Herman van Hovell Committed: Wed Jun 22 13:41:33 2016 -0700 -- .../expressions/datetimeExpressions.scala | 10 +-- .../spark/sql/catalyst/util/DateTimeUtils.scala | 34 -- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 65 3 files changed, 73 insertions(+), 36 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/282a3cd0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 773431d..04c17bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -730,16 +730,17 @@ case class FromUTCTimestamp(left: Expression, right: Expression) """.stripMargin) } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $tzClass.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" |${eval.code} |boolean ${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} + - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $utcTerm, $tzTerm); |} """.stripMargin) } @@ -869,16 +870,17 @@ case class ToUTCTimestamp(left: Expression, right: Expression) """.stripMargin) } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = 
$tzClass.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" |${eval.code} |boolean ${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} - - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $tzTerm, $utcTerm); |} """.stripMargin) } http://git-wip-us.apache.org/repos/asf/spark/blob/282a3cd0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 56bf9a7..df480a
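A minimal sketch of the wall-clock based conversion described above, assuming only java.util.TimeZone; this is not the DateTimeUtils.convertTz code the patch adds, and the helper name is made up. It reads the instant's wall clock in the source timezone and re-anchors that reading in the target timezone, so the result does not depend on the JVM's default timezone (and, as the description concedes, is only approximate around DST transitions).

```scala
import java.util.TimeZone

object ConvertTzSketch {
  // Illustrative only: shift a microseconds-since-epoch timestamp so that its
  // wall-clock reading in `from` is preserved when re-read in `to`.
  def convertTz(micros: Long, from: TimeZone, to: TimeZone): Long = {
    val millis = micros / 1000L
    val microsFraction = micros - millis * 1000L
    // Wall-clock millis of the instant as seen in the source timezone.
    val wallClockMillis = millis + from.getOffset(millis)
    // Re-anchor the same wall-clock reading in the target timezone.
    val resultMillis = wallClockMillis - to.getOffset(wallClockMillis)
    resultMillis * 1000L + microsFraction
  }

  def main(args: Array[String]): Unit = {
    val utc = TimeZone.getTimeZone("UTC")
    val la = TimeZone.getTimeZone("America/Los_Angeles")
    val micros = 1466625600L * 1000000L // 2016-06-22 20:00:00 UTC
    println(convertTz(micros, utc, la)) // same answer under any default timezone
  }
}
```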
spark git commit: [SPARK-16078][SQL] from_utc_timestamp/to_utc_timestamp should not depend on local timezone
Repository: spark Updated Branches: refs/heads/master 43b04b7ec -> 20d411bc5 [SPARK-16078][SQL] from_utc_timestamp/to_utc_timestamp should not depends on local timezone ## What changes were proposed in this pull request? Currently, we use local timezone to parse or format a timestamp (TimestampType), then use Long as the microseconds since epoch UTC. In from_utc_timestamp() and to_utc_timestamp(), we did not consider the local timezone, they could return different results with different local timezone. This PR will do the conversion based on human time (in local timezone), it should return same result in whatever timezone. But because the mapping from absolute timestamp to human time is not exactly one-to-one mapping, it will still return wrong result in some timezone (also in the begging or ending of DST). This PR is kind of the best effort fix. In long term, we should make the TimestampType be timezone aware to fix this totally. ## How was this patch tested? Tested these function in all timezone. Author: Davies Liu Closes #13784 from davies/convert_tz. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20d411bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20d411bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20d411bc Branch: refs/heads/master Commit: 20d411bc5d05dd099f6d5234a24e10a519a39bdf Parents: 43b04b7 Author: Davies Liu Authored: Wed Jun 22 13:40:24 2016 -0700 Committer: Herman van Hovell Committed: Wed Jun 22 13:40:24 2016 -0700 -- .../expressions/datetimeExpressions.scala | 10 +-- .../spark/sql/catalyst/util/DateTimeUtils.scala | 34 -- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 65 3 files changed, 73 insertions(+), 36 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/20d411bc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 773431d..04c17bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -730,16 +730,17 @@ case class FromUTCTimestamp(left: Expression, right: Expression) """.stripMargin) } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $tzClass.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" |${eval.code} |boolean ${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} + - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $utcTerm, $tzTerm); |} """.stripMargin) } @@ -869,16 +870,17 @@ case class ToUTCTimestamp(left: Expression, right: Expression) """.stripMargin) } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $tzClass.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" |${eval.code} |boolean 
${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} - - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $tzTerm, $utcTerm); |} """.stripMargin) } http://git-wip-us.apache.org/repos/asf/spark/blob/20d411bc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 56bf9a7..df480a1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/ca
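For context, a small usage sketch of the two SQL functions touched by this patch; the session setup and literal values are illustrative and not part of the commit. After the fix, the query is expected to return the same rows regardless of the driver JVM's default timezone.

```scala
import org.apache.spark.sql.SparkSession

object FromToUtcTimestampDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("tz-demo").getOrCreate()
    // from_utc_timestamp: treat the input as UTC and render it in the given zone.
    // to_utc_timestamp: treat the input as the given zone and render it in UTC.
    spark.sql(
      """SELECT
        |  from_utc_timestamp('2016-06-22 20:00:00', 'America/Los_Angeles') AS la_time,
        |  to_utc_timestamp('2016-06-22 13:00:00', 'America/Los_Angeles') AS utc_time
      """.stripMargin
    ).show(false)
    spark.stop()
  }
}
```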
spark git commit: [SPARK-15672][R][DOC] R programming guide update
Repository: spark Updated Branches: refs/heads/branch-2.0 e043c02d0 -> 299f427b7 [SPARK-15672][R][DOC] R programming guide update ## What changes were proposed in this pull request? Guide for - UDFs with dapply, dapplyCollect - spark.lapply for running parallel R functions ## How was this patch tested? build locally https://cloud.githubusercontent.com/assets/3419881/16039344/12a3b6a0-31de-11e6-8d77-fe23308075c0.png";> Author: Kai Jiang Closes #13660 from vectorijk/spark-15672-R-guide-update. (cherry picked from commit 43b04b7ecb313a2cee6121dd575de1f7dc785c11) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/299f427b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/299f427b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/299f427b Branch: refs/heads/branch-2.0 Commit: 299f427b70f8dedbc0b554f83c4fde408caf4d15 Parents: e043c02 Author: Kai Jiang Authored: Wed Jun 22 12:50:36 2016 -0700 Committer: Joseph K. Bradley Committed: Wed Jun 22 12:50:44 2016 -0700 -- R/pkg/R/context.R | 2 +- docs/sparkr.md| 77 ++ 2 files changed, 78 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/299f427b/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 96ef943..dd0ceae 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -246,7 +246,7 @@ setCheckpointDir <- function(sc, dirName) { #' \preformatted{ #' train <- function(hyperparam) { #' library(MASS) -#' lm.ridge(ây ~ x+zâ, data, lambda=hyperparam) +#' lm.ridge("y ~ x+z", data, lambda=hyperparam) #' model #' } #' } http://git-wip-us.apache.org/repos/asf/spark/blob/299f427b/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index f018901..9e74e4a 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -255,6 +255,83 @@ head(df) {% endhighlight %} +### Applying User-Defined Function +In SparkR, we support several kinds of User-Defined Functions: + + Run a given function on a large dataset using `dapply` or `dapplyCollect` + +# dapply +Apply a function to each partition of a `SparkDataFrame`. The function to be applied to each partition of the `SparkDataFrame` +and should have only one parameter, to which a `data.frame` corresponds to each partition will be passed. The output of function +should be a `data.frame`. Schema specifies the row format of the resulting a `SparkDataFrame`. It must match the R function's output. + +{% highlight r %} + +# Convert waiting time from hours to seconds. +# Note that we can apply UDF to DataFrame. +schema <- structType(structField("eruptions", "double"), structField("waiting", "double"), + structField("waiting_secs", "double")) +df1 <- dapply(df, function(x) {x <- cbind(x, x$waiting * 60)}, schema) +head(collect(df1)) +## eruptions waiting waiting_secs +##1 3.600 79 4740 +##2 1.800 54 3240 +##3 3.333 74 4440 +##4 2.283 62 3720 +##5 4.533 85 5100 +##6 2.883 55 3300 +{% endhighlight %} + + +# dapplyCollect +Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function +should be a `data.frame`. But, Schema is not required to be passed. Note that `dapplyCollect` only can be used if the +output of UDF run on all the partitions can fit in driver memory. + +{% highlight r %} + +# Convert waiting time from hours to seconds. 
+# Note that we can apply UDF to DataFrame and return a R's data.frame +ldf <- dapplyCollect( + df, + function(x) { + x <- cbind(x, "waiting_secs" = x$waiting * 60) + }) +head(ldf, 3) +## eruptions waiting waiting_secs +##1 3.600 79 4740 +##2 1.800 54 3240 +##3 3.333 74 4440 + +{% endhighlight %} + + + Run local R functions distributed using `spark.lapply` + +# spark.lapply +Similar to `lapply` in native R, `spark.lapply` runs a function over a list of elements and distributes the computations with Spark. +Applies a function in a manner that is similar to `doParallel` or `lapply` to elements of a list. The results of all the computations +should fit in a single machine. If that is not the case they can do something like `df <- createDataFrame(list)` and then use +`dapply` + +{% highlight r %} + +# Perform distributed training of multiple models with spark.lapply. Here, we pass +# a read-only list of arguments which specifies family the generalized linear model shou
spark git commit: [SPARK-15672][R][DOC] R programming guide update
Repository: spark Updated Branches: refs/heads/master 6f915c9ec -> 43b04b7ec [SPARK-15672][R][DOC] R programming guide update ## What changes were proposed in this pull request? Guide for - UDFs with dapply, dapplyCollect - spark.lapply for running parallel R functions ## How was this patch tested? build locally https://cloud.githubusercontent.com/assets/3419881/16039344/12a3b6a0-31de-11e6-8d77-fe23308075c0.png";> Author: Kai Jiang Closes #13660 from vectorijk/spark-15672-R-guide-update. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/43b04b7e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/43b04b7e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/43b04b7e Branch: refs/heads/master Commit: 43b04b7ecb313a2cee6121dd575de1f7dc785c11 Parents: 6f915c9 Author: Kai Jiang Authored: Wed Jun 22 12:50:36 2016 -0700 Committer: Joseph K. Bradley Committed: Wed Jun 22 12:50:36 2016 -0700 -- R/pkg/R/context.R | 2 +- docs/sparkr.md| 77 ++ 2 files changed, 78 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/43b04b7e/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 96ef943..dd0ceae 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -246,7 +246,7 @@ setCheckpointDir <- function(sc, dirName) { #' \preformatted{ #' train <- function(hyperparam) { #' library(MASS) -#' lm.ridge(ây ~ x+zâ, data, lambda=hyperparam) +#' lm.ridge("y ~ x+z", data, lambda=hyperparam) #' model #' } #' } http://git-wip-us.apache.org/repos/asf/spark/blob/43b04b7e/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index f018901..9e74e4a 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -255,6 +255,83 @@ head(df) {% endhighlight %} +### Applying User-Defined Function +In SparkR, we support several kinds of User-Defined Functions: + + Run a given function on a large dataset using `dapply` or `dapplyCollect` + +# dapply +Apply a function to each partition of a `SparkDataFrame`. The function to be applied to each partition of the `SparkDataFrame` +and should have only one parameter, to which a `data.frame` corresponds to each partition will be passed. The output of function +should be a `data.frame`. Schema specifies the row format of the resulting a `SparkDataFrame`. It must match the R function's output. + +{% highlight r %} + +# Convert waiting time from hours to seconds. +# Note that we can apply UDF to DataFrame. +schema <- structType(structField("eruptions", "double"), structField("waiting", "double"), + structField("waiting_secs", "double")) +df1 <- dapply(df, function(x) {x <- cbind(x, x$waiting * 60)}, schema) +head(collect(df1)) +## eruptions waiting waiting_secs +##1 3.600 79 4740 +##2 1.800 54 3240 +##3 3.333 74 4440 +##4 2.283 62 3720 +##5 4.533 85 5100 +##6 2.883 55 3300 +{% endhighlight %} + + +# dapplyCollect +Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function +should be a `data.frame`. But, Schema is not required to be passed. Note that `dapplyCollect` only can be used if the +output of UDF run on all the partitions can fit in driver memory. + +{% highlight r %} + +# Convert waiting time from hours to seconds. 
+# Note that we can apply UDF to DataFrame and return a R's data.frame +ldf <- dapplyCollect( + df, + function(x) { + x <- cbind(x, "waiting_secs" = x$waiting * 60) + }) +head(ldf, 3) +## eruptions waiting waiting_secs +##1 3.600 79 4740 +##2 1.800 54 3240 +##3 3.333 74 4440 + +{% endhighlight %} + + + Run local R functions distributed using `spark.lapply` + +# spark.lapply +Similar to `lapply` in native R, `spark.lapply` runs a function over a list of elements and distributes the computations with Spark. +Applies a function in a manner that is similar to `doParallel` or `lapply` to elements of a list. The results of all the computations +should fit in a single machine. If that is not the case they can do something like `df <- createDataFrame(list)` and then use +`dapply` + +{% highlight r %} + +# Perform distributed training of multiple models with spark.lapply. Here, we pass +# a read-only list of arguments which specifies family the generalized linear model should be. +families <- c("gaussian", "poisson") +train <- function(family) { + model <- glm(Sepal.Length ~ Sepal.W
spark git commit: [SPARK-16003] SerializationDebugger runs into infinite loop
Repository: spark Updated Branches: refs/heads/master 472d611a7 -> 6f915c9ec [SPARK-16003] SerializationDebugger runs into infinite loop ## What changes were proposed in this pull request? This fixes SerializationDebugger to not recurse forever when `writeReplace` returns an object of the same class, which is the case for at least the `SQLMetrics` class. See also the OpenJDK unit tests on the behavior of recursive `writeReplace()`: https://github.com/openjdk-mirror/jdk7u-jdk/blob/f4d80957e89a19a29bb9f9807d2a28351ed7f7df/test/java/io/Serializable/nestedReplace/NestedReplace.java cc davies cloud-fan ## How was this patch tested? Unit tests for SerializationDebugger. Author: Eric Liang Closes #13814 from ericl/spark-16003. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f915c9e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f915c9e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f915c9e Branch: refs/heads/master Commit: 6f915c9ec24003877d1ef675a59145699780a2ff Parents: 472d611 Author: Eric Liang Authored: Wed Jun 22 12:12:34 2016 -0700 Committer: Davies Liu Committed: Wed Jun 22 12:12:34 2016 -0700 -- .../spark/serializer/SerializationDebugger.scala | 9 - .../spark/serializer/SerializationDebuggerSuite.scala | 13 - 2 files changed, 16 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6f915c9e/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala -- diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala index c04b483..5e7a98c 100644 --- a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala +++ b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala @@ -155,7 +155,7 @@ private[spark] object SerializationDebugger extends Logging { // If the object has been replaced using writeReplace(), // then call visit() on it again to test its type again. - if (!finalObj.eq(o)) { + if (finalObj.getClass != o.getClass) { return visit(finalObj, s"writeReplace data (class: ${finalObj.getClass.getName})" :: stack) } @@ -265,11 +265,10 @@ private[spark] object SerializationDebugger extends Logging { if (!desc.hasWriteReplaceMethod) { (o, desc) } else { - // write place val replaced = desc.invokeWriteReplace(o) - // `writeReplace` may return the same object. - if (replaced eq o) { -(o, desc) + // `writeReplace` recursion stops when the returned object has the same class. 
+ if (replaced.getClass == o.getClass) { +(replaced, desc) } else { findObjectAndDescriptor(replaced) } http://git-wip-us.apache.org/repos/asf/spark/blob/6f915c9e/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala index f019b1e..912a516 100644 --- a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala @@ -126,7 +126,11 @@ class SerializationDebuggerSuite extends SparkFunSuite with BeforeAndAfterEach { assert(find(new SerializableClassWithWriteReplace(new SerializableClass1)).isEmpty) } -test("object containing writeObject() and not serializable field") { + test("no infinite loop with writeReplace() which returns class of its own type") { +assert(find(new SerializableClassWithRecursiveWriteReplace).isEmpty) + } + + test("object containing writeObject() and not serializable field") { val s = find(new SerializableClassWithWriteObject(new NotSerializable)) assert(s.size === 3) assert(s(0).contains("NotSerializable")) @@ -229,6 +233,13 @@ class SerializableClassWithWriteReplace(@(transient @param) replacementFieldObje } +class SerializableClassWithRecursiveWriteReplace extends Serializable { + private def writeReplace(): Object = { +new SerializableClassWithRecursiveWriteReplace + } +} + + class ExternalizableClass(objectField: Object) extends java.io.Externalizable { val serializableObjectField = new SerializableClass1 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
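The new stop condition mirrors what java.io.ObjectOutputStream itself does: it keeps substituting through `writeReplace` only while the returned object has a different class, as the linked NestedReplace test demonstrates. Below is a minimal sketch of the pattern that previously looped forever in the debugger; the class name is made up and this is not the `SQLMetrics` code.

```scala
import java.io.{ByteArrayOutputStream, ObjectOutputStream}

// A class whose writeReplace returns a fresh instance of its own class.
class SelfReplacing extends Serializable {
  private def writeReplace(): Object = new SelfReplacing
}

object WriteReplaceDemo {
  def main(args: Array[String]): Unit = {
    val out = new ObjectOutputStream(new ByteArrayOutputStream())
    // Plain Java serialization terminates here because substitution stops once
    // the replacement has the same class; the debugger now applies the same rule.
    out.writeObject(new SelfReplacing)
    out.close()
  }
}
```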
spark git commit: [SPARK-16003] SerializationDebugger runs into infinite loop
Repository: spark Updated Branches: refs/heads/branch-2.0 520828c90 -> e043c02d0 [SPARK-16003] SerializationDebugger runs into infinite loop ## What changes were proposed in this pull request? This fixes SerializationDebugger to not recurse forever when `writeReplace` returns an object of the same class, which is the case for at least the `SQLMetrics` class. See also the OpenJDK unit tests on the behavior of recursive `writeReplace()`: https://github.com/openjdk-mirror/jdk7u-jdk/blob/f4d80957e89a19a29bb9f9807d2a28351ed7f7df/test/java/io/Serializable/nestedReplace/NestedReplace.java cc davies cloud-fan ## How was this patch tested? Unit tests for SerializationDebugger. Author: Eric Liang Closes #13814 from ericl/spark-16003. (cherry picked from commit 6f915c9ec24003877d1ef675a59145699780a2ff) Signed-off-by: Davies Liu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e043c02d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e043c02d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e043c02d Branch: refs/heads/branch-2.0 Commit: e043c02d039809be149622a4d7562f332cfa25aa Parents: 520828c Author: Eric Liang Authored: Wed Jun 22 12:12:34 2016 -0700 Committer: Davies Liu Committed: Wed Jun 22 12:12:44 2016 -0700 -- .../spark/serializer/SerializationDebugger.scala | 9 - .../spark/serializer/SerializationDebuggerSuite.scala | 13 - 2 files changed, 16 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e043c02d/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala -- diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala index c04b483..5e7a98c 100644 --- a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala +++ b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala @@ -155,7 +155,7 @@ private[spark] object SerializationDebugger extends Logging { // If the object has been replaced using writeReplace(), // then call visit() on it again to test its type again. - if (!finalObj.eq(o)) { + if (finalObj.getClass != o.getClass) { return visit(finalObj, s"writeReplace data (class: ${finalObj.getClass.getName})" :: stack) } @@ -265,11 +265,10 @@ private[spark] object SerializationDebugger extends Logging { if (!desc.hasWriteReplaceMethod) { (o, desc) } else { - // write place val replaced = desc.invokeWriteReplace(o) - // `writeReplace` may return the same object. - if (replaced eq o) { -(o, desc) + // `writeReplace` recursion stops when the returned object has the same class. 
+ if (replaced.getClass == o.getClass) { +(replaced, desc) } else { findObjectAndDescriptor(replaced) } http://git-wip-us.apache.org/repos/asf/spark/blob/e043c02d/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala index f019b1e..912a516 100644 --- a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala @@ -126,7 +126,11 @@ class SerializationDebuggerSuite extends SparkFunSuite with BeforeAndAfterEach { assert(find(new SerializableClassWithWriteReplace(new SerializableClass1)).isEmpty) } -test("object containing writeObject() and not serializable field") { + test("no infinite loop with writeReplace() which returns class of its own type") { +assert(find(new SerializableClassWithRecursiveWriteReplace).isEmpty) + } + + test("object containing writeObject() and not serializable field") { val s = find(new SerializableClassWithWriteObject(new NotSerializable)) assert(s.size === 3) assert(s(0).contains("NotSerializable")) @@ -229,6 +233,13 @@ class SerializableClassWithWriteReplace(@(transient @param) replacementFieldObje } +class SerializableClassWithRecursiveWriteReplace extends Serializable { + private def writeReplace(): Object = { +new SerializableClassWithRecursiveWriteReplace + } +} + + class ExternalizableClass(objectField: Object) extends java.io.Externalizable { val serializableObjectField = new SerializableClass1 - To unsubscribe, e-mail: commits-unsubscr
spark git commit: [SPARK-15956][SQL] Revert "[] When unwrapping ORC avoid pattern matching…
Repository: spark Updated Branches: refs/heads/master c2cebdb7d -> 472d611a7 [SPARK-15956][SQL] Revert "[] When unwrapping ORC avoid pattern matching⦠This reverts commit 0a9c02759515c41de37db6381750bc3a316c860c. It breaks the 2.10 build, I'll fix this in a different PR. Author: Herman van Hovell Closes #13853 from hvanhovell/SPARK-15956-revert. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/472d611a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/472d611a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/472d611a Branch: refs/heads/master Commit: 472d611a70da02d95e36da754435a3ac562f8b24 Parents: c2cebdb Author: Herman van Hovell Authored: Wed Jun 22 11:36:32 2016 -0700 Committer: Herman van Hovell Committed: Wed Jun 22 11:36:32 2016 -0700 -- .../apache/spark/sql/hive/HiveInspectors.scala | 428 ++- .../org/apache/spark/sql/hive/TableReader.scala | 3 +- .../hive/execution/ScriptTransformation.scala | 6 +- .../org/apache/spark/sql/hive/hiveUDFs.scala| 21 +- .../spark/sql/hive/HiveInspectorSuite.scala | 6 - 5 files changed, 150 insertions(+), 314 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/472d611a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 1aadc8b..585befe 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -239,6 +239,145 @@ private[hive] trait HiveInspectors { } /** + * Converts hive types to native catalyst types. + * @param data the data in Hive type + * @param oi the ObjectInspector associated with the Hive Type + * @return convert the data into catalyst type + * TODO return the function of (data => Any) instead for performance consideration + * + * Strictly follows the following order in unwrapping (constant OI has the higher priority): + * Constant Null object inspector => + *return null + * Constant object inspector => + *extract the value from constant object inspector + * Check whether the `data` is null => + *return null if true + * If object inspector prefers writable => + *extract writable from `data` and then get the catalyst type from the writable + * Extract the java object directly from the object inspector + * + * NOTICE: the complex data type requires recursive unwrapping. 
+ */ + def unwrap(data: Any, oi: ObjectInspector): Any = oi match { +case coi: ConstantObjectInspector if coi.getWritableConstantValue == null => null +case poi: WritableConstantStringObjectInspector => + UTF8String.fromString(poi.getWritableConstantValue.toString) +case poi: WritableConstantHiveVarcharObjectInspector => + UTF8String.fromString(poi.getWritableConstantValue.getHiveVarchar.getValue) +case poi: WritableConstantHiveCharObjectInspector => + UTF8String.fromString(poi.getWritableConstantValue.getHiveChar.getValue) +case poi: WritableConstantHiveDecimalObjectInspector => + HiveShim.toCatalystDecimal( +PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector, +poi.getWritableConstantValue.getHiveDecimal) +case poi: WritableConstantTimestampObjectInspector => + val t = poi.getWritableConstantValue + t.getSeconds * 100L + t.getNanos / 1000L +case poi: WritableConstantIntObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantDoubleObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantBooleanObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantLongObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantFloatObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantShortObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantByteObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantBinaryObjectInspector => + val writable = poi.getWritableConstantValue + val temp = new Array[Byte](writable.getLength) + System.arraycopy(writable.getBytes, 0, temp, 0, temp.length) + temp +case poi: WritableConstantDateObjectInspector => + DateTimeUtils.fromJavaDate(poi.getWritableConstantValue.get()) +case mi: StandardConstantMapObjectInspector => + // take the value from the map inspector object, rather than the input
spark git commit: [SPARK-16120][STREAMING] getCurrentLogFiles in ReceiverSuite WAL generating and cleaning case uses external variable instead of the passed parameter
Repository: spark Updated Branches: refs/heads/branch-2.0 76d0ef34e -> 520828c90 [SPARK-16120][STREAMING] getCurrentLogFiles in ReceiverSuite WAL generating and cleaning case uses external variable instead of the passed parameter ## What changes were proposed in this pull request? In `ReceiverSuite.scala`, in the test case "write ahead log - generating and cleaning", the inner method `getCurrentLogFiles` uses external variable `logDirectory1` instead of the passed parameter `logDirectory`. This PR fixes this by using the passed method argument instead of variable from the outer scope. ## How was this patch tested? The unit test was re-run and the output logs were checked for the correct paths used. tdas Author: Ahmed Mahran Closes #13825 from ahmed-mahran/b-receiver-suite-wal-gen-cln. (cherry picked from commit c2cebdb7ddff3d041d548fe1cd8de4efb31b294f) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/520828c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/520828c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/520828c9 Branch: refs/heads/branch-2.0 Commit: 520828c90d25acf733ffa70fe269dcfe93b56a31 Parents: 76d0ef3 Author: Ahmed Mahran Authored: Wed Jun 22 10:39:24 2016 -0700 Committer: Shixiong Zhu Committed: Wed Jun 22 10:39:38 2016 -0700 -- .../src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/520828c9/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala index 917232c..1b1e21f 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala @@ -215,7 +215,7 @@ class ReceiverSuite extends TestSuiteBase with Timeouts with Serializable { def getCurrentLogFiles(logDirectory: File): Seq[String] = { try { if (logDirectory.exists()) { - logDirectory1.listFiles().filter { _.getName.startsWith("log") }.map { _.toString } + logDirectory.listFiles().filter { _.getName.startsWith("log") }.map { _.toString } } else { Seq.empty } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16120][STREAMING] getCurrentLogFiles in ReceiverSuite WAL generating and cleaning case uses external variable instead of the passed parameter
Repository: spark Updated Branches: refs/heads/master 0a9c02759 -> c2cebdb7d [SPARK-16120][STREAMING] getCurrentLogFiles in ReceiverSuite WAL generating and cleaning case uses external variable instead of the passed parameter ## What changes were proposed in this pull request? In `ReceiverSuite.scala`, in the test case "write ahead log - generating and cleaning", the inner method `getCurrentLogFiles` uses external variable `logDirectory1` instead of the passed parameter `logDirectory`. This PR fixes this by using the passed method argument instead of variable from the outer scope. ## How was this patch tested? The unit test was re-run and the output logs were checked for the correct paths used. tdas Author: Ahmed Mahran Closes #13825 from ahmed-mahran/b-receiver-suite-wal-gen-cln. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c2cebdb7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c2cebdb7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c2cebdb7 Branch: refs/heads/master Commit: c2cebdb7ddff3d041d548fe1cd8de4efb31b294f Parents: 0a9c027 Author: Ahmed Mahran Authored: Wed Jun 22 10:39:24 2016 -0700 Committer: Shixiong Zhu Committed: Wed Jun 22 10:39:24 2016 -0700 -- .../src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c2cebdb7/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala index 917232c..1b1e21f 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala @@ -215,7 +215,7 @@ class ReceiverSuite extends TestSuiteBase with Timeouts with Serializable { def getCurrentLogFiles(logDirectory: File): Seq[String] = { try { if (logDirectory.exists()) { - logDirectory1.listFiles().filter { _.getName.startsWith("log") }.map { _.toString } + logDirectory.listFiles().filter { _.getName.startsWith("log") }.map { _.toString } } else { Seq.empty } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
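The underlying pitfall is that an inner helper can silently close over an outer value instead of its own parameter and still compile. A small illustrative sketch, with made-up names outside the test suite:

```scala
import java.io.File

object ShadowedParamSketch {
  val logDirectory1 = new File("/tmp/wal-logs") // outer value with a similar name

  def getCurrentLogFiles(logDirectory: File): Seq[String] = {
    // Before the fix, the test's helper referenced logDirectory1 here, so the
    // parameter was ignored and every call listed the same outer directory.
    if (logDirectory.exists()) {
      logDirectory.listFiles().filter(_.getName.startsWith("log")).map(_.toString).toSeq
    } else {
      Seq.empty
    }
  }
}
```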
spark git commit: [SPARK-15956][SQL] When unwrapping ORC avoid pattern matching at runtime
Repository: spark Updated Branches: refs/heads/master 6a6010f00 -> 0a9c02759 [SPARK-15956][SQL] When unwrapping ORC avoid pattern matching at runtime ## What changes were proposed in this pull request? Extend the returning of unwrapper functions from primitive types to all types. ## How was this patch tested? The patch should pass all unit tests. Reading ORC files with non-primitive types with this change reduced the read time by ~15%. === The github diff is very noisy. Attaching the screenshots below for improved readability: ![screen shot 2016-06-14 at 5 33 16 pm](https://cloud.githubusercontent.com/assets/1514239/16064580/4d6f7a98-3257-11e6-9172-65e4baff948b.png) ![screen shot 2016-06-14 at 5 33 28 pm](https://cloud.githubusercontent.com/assets/1514239/16064587/5ae6c244-3257-11e6-8460-69eee70de219.png) Author: Brian Cho Closes #13676 from dafrista/improve-orc-master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a9c0275 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a9c0275 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a9c0275 Branch: refs/heads/master Commit: 0a9c02759515c41de37db6381750bc3a316c860c Parents: 6a6010f Author: Brian Cho Authored: Wed Jun 22 10:38:42 2016 -0700 Committer: Herman van Hovell Committed: Wed Jun 22 10:38:42 2016 -0700 -- .../apache/spark/sql/hive/HiveInspectors.scala | 428 +-- .../org/apache/spark/sql/hive/TableReader.scala | 3 +- .../hive/execution/ScriptTransformation.scala | 6 +- .../org/apache/spark/sql/hive/hiveUDFs.scala| 21 +- .../spark/sql/hive/HiveInspectorSuite.scala | 6 + 5 files changed, 314 insertions(+), 150 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a9c0275/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 585befe..1aadc8b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -239,145 +239,6 @@ private[hive] trait HiveInspectors { } /** - * Converts hive types to native catalyst types. - * @param data the data in Hive type - * @param oi the ObjectInspector associated with the Hive Type - * @return convert the data into catalyst type - * TODO return the function of (data => Any) instead for performance consideration - * - * Strictly follows the following order in unwrapping (constant OI has the higher priority): - * Constant Null object inspector => - *return null - * Constant object inspector => - *extract the value from constant object inspector - * Check whether the `data` is null => - *return null if true - * If object inspector prefers writable => - *extract writable from `data` and then get the catalyst type from the writable - * Extract the java object directly from the object inspector - * - * NOTICE: the complex data type requires recursive unwrapping. 
- */ - def unwrap(data: Any, oi: ObjectInspector): Any = oi match { -case coi: ConstantObjectInspector if coi.getWritableConstantValue == null => null -case poi: WritableConstantStringObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.toString) -case poi: WritableConstantHiveVarcharObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.getHiveVarchar.getValue) -case poi: WritableConstantHiveCharObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.getHiveChar.getValue) -case poi: WritableConstantHiveDecimalObjectInspector => - HiveShim.toCatalystDecimal( -PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector, -poi.getWritableConstantValue.getHiveDecimal) -case poi: WritableConstantTimestampObjectInspector => - val t = poi.getWritableConstantValue - t.getSeconds * 100L + t.getNanos / 1000L -case poi: WritableConstantIntObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantDoubleObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantBooleanObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantLongObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantFloatObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantShortObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantByteObjectInspector => -
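The stated gain (~15% faster reads for non-primitive types) comes from examining the ObjectInspector once per column instead of once per value. A hedged sketch of the difference, using made-up inspector types rather than the real Hive ObjectInspector hierarchy:

```scala
sealed trait Inspector
case object IntInspector extends Inspector
case object StringInspector extends Inspector

object UnwrapperSketch {
  // Matched per value: the inspector is re-examined for every row.
  def unwrap(data: Any, oi: Inspector): Any = oi match {
    case IntInspector    => data.asInstanceOf[Int]
    case StringInspector => data.asInstanceOf[String]
  }

  // Matched once: callers cache the returned function and apply it per value.
  def unwrapperFor(oi: Inspector): Any => Any = oi match {
    case IntInspector    => (data: Any) => data.asInstanceOf[Int]
    case StringInspector => (data: Any) => data.asInstanceOf[String]
  }

  def main(args: Array[String]): Unit = {
    val unwrapInt = unwrapperFor(IntInspector)
    println(Seq[Any](1, 2, 3).map(unwrapInt)) // List(1, 2, 3)
  }
}
```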
spark git commit: [MINOR][MLLIB] DefaultParamsReadable/Writable should be DeveloperApi
Repository: spark Updated Branches: refs/heads/master 18faa588c -> 6a6010f00 [MINOR][MLLIB] DefaultParamsReadable/Writable should be DeveloperApi ## What changes were proposed in this pull request? `DefaultParamsReadable/Writable` are not user-facing. Only developers who implement `Transformer/Estimator` would use it. So this PR changes the annotation to `DeveloperApi`. Author: Xiangrui Meng Closes #13828 from mengxr/default-readable-should-be-developer-api. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a6010f0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a6010f0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a6010f0 Branch: refs/heads/master Commit: 6a6010f0015542dc2753b2cb12fdd1204db63ea6 Parents: 18faa58 Author: Xiangrui Meng Authored: Wed Jun 22 10:06:43 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 10:06:43 2016 -0700 -- .../scala/org/apache/spark/ml/util/ReadWrite.scala | 13 + 1 file changed, 5 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6a6010f0/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index 1582a73..4413fef 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -26,7 +26,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml._ import org.apache.spark.ml.classification.{OneVsRest, OneVsRestModel} @@ -161,7 +161,7 @@ trait MLWritable { } /** - * :: Experimental :: + * :: DeveloperApi :: * * Helper trait for making simple [[Params]] types writable. If a [[Params]] class stores * all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide @@ -171,8 +171,7 @@ trait MLWritable { * * @see [[DefaultParamsReadable]], the counterpart to this trait */ -@Experimental -@Since("2.0.0") +@DeveloperApi trait DefaultParamsWritable extends MLWritable { self: Params => override def write: MLWriter = new DefaultParamsWriter(this) @@ -230,7 +229,7 @@ trait MLReadable[T] { /** - * :: Experimental :: + * :: DeveloperApi :: * * Helper trait for making simple [[Params]] types readable. If a [[Params]] class stores * all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide @@ -239,11 +238,9 @@ trait MLReadable[T] { * [[org.apache.spark.sql.Dataset]]. * * @tparam T ML instance type - * * @see [[DefaultParamsWritable]], the counterpart to this trait */ -@Experimental -@Since("2.0.0") +@DeveloperApi trait DefaultParamsReadable[T] extends MLReadable[T] { override def read: MLReader[T] = new DefaultParamsReader[T] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
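Since these traits target developers writing their own Transformer/Estimator, a short sketch of that audience's usage may help; the class below is made up, keeps all of its state in Params, and therefore only needs the default read/write implementations.

```scala
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.StructType

// A do-nothing transformer: all persisted state lives in Param values.
class NoOpTransformer(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("noop"))

  override def transform(dataset: Dataset[_]): DataFrame = dataset.toDF()
  override def transformSchema(schema: StructType): StructType = schema
  override def copy(extra: ParamMap): NoOpTransformer = defaultCopy(extra)
}

// The companion object gets load()/read from DefaultParamsReadable.
object NoOpTransformer extends DefaultParamsReadable[NoOpTransformer]
```

With this in place, `new NoOpTransformer().write.save(dir)` and `NoOpTransformer.load(dir)` round-trip the instance without any custom persistence code.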
spark git commit: [MINOR][MLLIB] DefaultParamsReadable/Writable should be DeveloperApi
Repository: spark Updated Branches: refs/heads/branch-2.0 0cde3ad6d -> 76d0ef34e [MINOR][MLLIB] DefaultParamsReadable/Writable should be DeveloperApi ## What changes were proposed in this pull request? `DefaultParamsReadable/Writable` are not user-facing. Only developers who implement `Transformer/Estimator` would use it. So this PR changes the annotation to `DeveloperApi`. Author: Xiangrui Meng Closes #13828 from mengxr/default-readable-should-be-developer-api. (cherry picked from commit 6a6010f0015542dc2753b2cb12fdd1204db63ea6) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/76d0ef34 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/76d0ef34 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/76d0ef34 Branch: refs/heads/branch-2.0 Commit: 76d0ef34e4a5b91b883141f839adc493205fa429 Parents: 0cde3ad Author: Xiangrui Meng Authored: Wed Jun 22 10:06:43 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 10:06:49 2016 -0700 -- .../scala/org/apache/spark/ml/util/ReadWrite.scala | 13 + 1 file changed, 5 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/76d0ef34/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index 1582a73..4413fef 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -26,7 +26,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml._ import org.apache.spark.ml.classification.{OneVsRest, OneVsRestModel} @@ -161,7 +161,7 @@ trait MLWritable { } /** - * :: Experimental :: + * :: DeveloperApi :: * * Helper trait for making simple [[Params]] types writable. If a [[Params]] class stores * all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide @@ -171,8 +171,7 @@ trait MLWritable { * * @see [[DefaultParamsReadable]], the counterpart to this trait */ -@Experimental -@Since("2.0.0") +@DeveloperApi trait DefaultParamsWritable extends MLWritable { self: Params => override def write: MLWriter = new DefaultParamsWriter(this) @@ -230,7 +229,7 @@ trait MLReadable[T] { /** - * :: Experimental :: + * :: DeveloperApi :: * * Helper trait for making simple [[Params]] types readable. If a [[Params]] class stores * all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide @@ -239,11 +238,9 @@ trait MLReadable[T] { * [[org.apache.spark.sql.Dataset]]. * * @tparam T ML instance type - * * @see [[DefaultParamsWritable]], the counterpart to this trait */ -@Experimental -@Since("2.0.0") +@DeveloperApi trait DefaultParamsReadable[T] extends MLReadable[T] { override def read: MLReader[T] = new DefaultParamsReader[T] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16127][ML][PYSPARK] Audit @Since annotations related to ml.linalg
Repository: spark Updated Branches: refs/heads/branch-2.0 1cfdd25fd -> 0cde3ad6d [SPARK-16127][ML][PYPSARK] Audit @Since annotations related to ml.linalg [SPARK-14615](https://issues.apache.org/jira/browse/SPARK-14615) and #12627 changed `spark.ml` pipelines to use the new `ml.linalg` classes for `Vector`/`Matrix`. Some `Since` annotations for public methods/vals have not been updated accordingly to be `2.0.0`. This PR updates them. ## How was this patch tested? Existing unit tests. Author: Nick Pentreath Closes #13840 from MLnick/SPARK-16127-ml-linalg-since. (cherry picked from commit 18faa588ca11190890d2eb569d7497fbb25eee5c) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0cde3ad6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0cde3ad6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0cde3ad6 Branch: refs/heads/branch-2.0 Commit: 0cde3ad6d8ac822b73f42b8158ba09f7be00a2c9 Parents: 1cfdd25 Author: Nick Pentreath Authored: Wed Jun 22 10:05:25 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 10:05:31 2016 -0700 -- .../spark/ml/classification/LogisticRegression.scala| 2 +- .../classification/MultilayerPerceptronClassifier.scala | 2 +- .../org/apache/spark/ml/classification/NaiveBayes.scala | 4 ++-- .../scala/org/apache/spark/ml/clustering/KMeans.scala | 2 +- .../main/scala/org/apache/spark/ml/clustering/LDA.scala | 4 ++-- .../apache/spark/ml/feature/ElementwiseProduct.scala| 6 +++--- .../scala/org/apache/spark/ml/feature/Normalizer.scala | 12 ++-- .../apache/spark/ml/feature/PolynomialExpansion.scala | 12 ++-- .../scala/org/apache/spark/ml/feature/Word2Vec.scala| 2 +- .../spark/ml/regression/AFTSurvivalRegression.scala | 6 +++--- .../apache/spark/ml/regression/IsotonicRegression.scala | 4 ++-- .../apache/spark/ml/regression/LinearRegression.scala | 6 +++--- python/pyspark/ml/classification.py | 8 python/pyspark/ml/regression.py | 8 ++-- 14 files changed, 41 insertions(+), 37 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0cde3ad6/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 2fa8fbc..be69d46 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -482,7 +482,7 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] { @Experimental class LogisticRegressionModel private[spark] ( @Since("1.4.0") override val uid: String, -@Since("1.6.0") val coefficients: Vector, +@Since("2.0.0") val coefficients: Vector, @Since("1.3.0") val intercept: Double) extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams with MLWritable { http://git-wip-us.apache.org/repos/asf/spark/blob/0cde3ad6/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 7005421..76ef32a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -296,7 +296,7 @@ object MultilayerPerceptronClassifier class MultilayerPerceptronClassificationModel private[ml] ( @Since("1.5.0") override val uid: String, @Since("1.5.0") val layers: Array[Int], -@Since("1.5.0") val weights: Vector) +@Since("2.0.0") val weights: Vector) extends PredictionModel[Vector, MultilayerPerceptronClassificationModel] with Serializable with MLWritable { http://git-wip-us.apache.org/repos/asf/spark/blob/0cde3ad6/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index a9d4930..7c34031 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classif
spark git commit: [SPARK-16127][ML][PYSPARK] Audit @Since annotations related to ml.linalg
Repository: spark Updated Branches: refs/heads/master ea3a12b01 -> 18faa588c [SPARK-16127][ML][PYPSARK] Audit @Since annotations related to ml.linalg [SPARK-14615](https://issues.apache.org/jira/browse/SPARK-14615) and #12627 changed `spark.ml` pipelines to use the new `ml.linalg` classes for `Vector`/`Matrix`. Some `Since` annotations for public methods/vals have not been updated accordingly to be `2.0.0`. This PR updates them. ## How was this patch tested? Existing unit tests. Author: Nick Pentreath Closes #13840 from MLnick/SPARK-16127-ml-linalg-since. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18faa588 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18faa588 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18faa588 Branch: refs/heads/master Commit: 18faa588ca11190890d2eb569d7497fbb25eee5c Parents: ea3a12b Author: Nick Pentreath Authored: Wed Jun 22 10:05:25 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 10:05:25 2016 -0700 -- .../spark/ml/classification/LogisticRegression.scala| 2 +- .../classification/MultilayerPerceptronClassifier.scala | 2 +- .../org/apache/spark/ml/classification/NaiveBayes.scala | 4 ++-- .../scala/org/apache/spark/ml/clustering/KMeans.scala | 2 +- .../main/scala/org/apache/spark/ml/clustering/LDA.scala | 4 ++-- .../apache/spark/ml/feature/ElementwiseProduct.scala| 6 +++--- .../scala/org/apache/spark/ml/feature/Normalizer.scala | 12 ++-- .../apache/spark/ml/feature/PolynomialExpansion.scala | 12 ++-- .../scala/org/apache/spark/ml/feature/Word2Vec.scala| 2 +- .../spark/ml/regression/AFTSurvivalRegression.scala | 6 +++--- .../apache/spark/ml/regression/IsotonicRegression.scala | 4 ++-- .../apache/spark/ml/regression/LinearRegression.scala | 6 +++--- python/pyspark/ml/classification.py | 8 python/pyspark/ml/regression.py | 8 ++-- 14 files changed, 41 insertions(+), 37 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/18faa588/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 2fa8fbc..be69d46 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -482,7 +482,7 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] { @Experimental class LogisticRegressionModel private[spark] ( @Since("1.4.0") override val uid: String, -@Since("1.6.0") val coefficients: Vector, +@Since("2.0.0") val coefficients: Vector, @Since("1.3.0") val intercept: Double) extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams with MLWritable { http://git-wip-us.apache.org/repos/asf/spark/blob/18faa588/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 7005421..76ef32a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -296,7 +296,7 @@ object MultilayerPerceptronClassifier 
class MultilayerPerceptronClassificationModel private[ml] ( @Since("1.5.0") override val uid: String, @Since("1.5.0") val layers: Array[Int], -@Since("1.5.0") val weights: Vector) +@Since("2.0.0") val weights: Vector) extends PredictionModel[Vector, MultilayerPerceptronClassificationModel] with Serializable with MLWritable { http://git-wip-us.apache.org/repos/asf/spark/blob/18faa588/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index a9d4930..7c34031 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -130,8 +130,8 @@ object NaiveBayes extends DefaultParamsReadable[NaiveBayes] {
spark git commit: [SPARK-16107][R] group glm methods in documentation
Repository: spark Updated Branches: refs/heads/branch-2.0 503eb882c -> 1cfdd25fd [SPARK-16107][R] group glm methods in documentation ## What changes were proposed in this pull request? This groups GLM methods (spark.glm, summary, print, predict and write.ml) in the documentation. The example code was updated. ## How was this patch tested? N/A (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) ![screen shot 2016-06-21 at 2 31 37 pm](https://cloud.githubusercontent.com/assets/15318264/16247077/f6eafc04-37bc-11e6-89a8-7898ff3e4078.png) ![screen shot 2016-06-21 at 2 31 45 pm](https://cloud.githubusercontent.com/assets/15318264/16247078/f6eb1c16-37bc-11e6-940a-2b595b10617c.png) Author: Junyang Qian Author: Junyang Qian Closes #13820 from junyangq/SPARK-16107. (cherry picked from commit ea3a12b0147821960f8dabdc58d726f07f1f0e52) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1cfdd25f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1cfdd25f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1cfdd25f Branch: refs/heads/branch-2.0 Commit: 1cfdd25fdb87012187b1e01f9c5ac4b6218dc840 Parents: 503eb88 Author: Junyang Qian Authored: Wed Jun 22 09:13:08 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 09:13:15 2016 -0700 -- R/pkg/R/mllib.R | 80 +++- 1 file changed, 36 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1cfdd25f/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b83b3b3..dbff1b9 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -53,9 +53,10 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) #' @note KMeansModel since 2.0.0 setClass("KMeansModel", representation(jobj = "jobj")) -#' Fits a generalized linear model +#' Generalized Linear Models #' -#' Fits a generalized linear model against a Spark DataFrame. +#' Fits generalized linear model against a Spark DataFrame. Users can print, make predictions on the +#' produced model and save the model to the input path. #' #' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula @@ -66,8 +67,9 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param tol Positive convergence tolerance of iterations. #' @param maxIter Integer giving the maximal number of IRLS iterations. 
-#' @return a fitted generalized linear model +#' @return \code{spark.glm} returns a fitted generalized linear model #' @rdname spark.glm +#' @name spark.glm #' @export #' @examples #' \dontrun{ @@ -76,8 +78,21 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' df <- createDataFrame(iris) #' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") #' summary(model) +#' +#' # fitted values on training data +#' fitted <- predict(model, df) +#' head(select(fitted, "Sepal_Length", "prediction")) +#' +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(model, path) +#' +#' # can also read back the saved model and print +#' savedModel <- read.ml(path) +#' summary(savedModel) #' } #' @note spark.glm since 2.0.0 +#' @seealso \link{glm}, \link{read.ml} setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25) { if (is.character(family)) { @@ -99,10 +114,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), return(new("GeneralizedLinearRegressionModel", jobj = jobj)) }) -#' Fits a generalized linear model (R-compliant). +#' Generalized Linear Models (R-compliant) #' #' Fits a generalized linear model, similarly to R's glm(). -#' #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #'operators are supported, including '~', '.', ':', '+', and '-'. #' @param data SparkDataFrame for training. @@ -112,7 +126,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param epsilon Positive convergence tolerance of iterations. #' @param maxit Integer giving the maximal number of IRLS iterations. -#' @return a fitted generalized linear model +#' @return \code{glm} returns a fitted generalized linear model. #' @rdname glm #' @export #' @examples @@ -124,24 +138,21 @@ setMethod("spark.g
spark git commit: [SPARK-16107][R] group glm methods in documentation
Repository: spark Updated Branches: refs/heads/master cf1995a97 -> ea3a12b01 [SPARK-16107][R] group glm methods in documentation ## What changes were proposed in this pull request? This groups GLM methods (spark.glm, summary, print, predict and write.ml) in the documentation. The example code was updated. ## How was this patch tested? N/A (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) ![screen shot 2016-06-21 at 2 31 37 pm](https://cloud.githubusercontent.com/assets/15318264/16247077/f6eafc04-37bc-11e6-89a8-7898ff3e4078.png) ![screen shot 2016-06-21 at 2 31 45 pm](https://cloud.githubusercontent.com/assets/15318264/16247078/f6eb1c16-37bc-11e6-940a-2b595b10617c.png) Author: Junyang Qian Author: Junyang Qian Closes #13820 from junyangq/SPARK-16107. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ea3a12b0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ea3a12b0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ea3a12b0 Branch: refs/heads/master Commit: ea3a12b0147821960f8dabdc58d726f07f1f0e52 Parents: cf1995a Author: Junyang Qian Authored: Wed Jun 22 09:13:08 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 09:13:08 2016 -0700 -- R/pkg/R/mllib.R | 80 +++- 1 file changed, 36 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ea3a12b0/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b83b3b3..dbff1b9 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -53,9 +53,10 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) #' @note KMeansModel since 2.0.0 setClass("KMeansModel", representation(jobj = "jobj")) -#' Fits a generalized linear model +#' Generalized Linear Models #' -#' Fits a generalized linear model against a Spark DataFrame. +#' Fits generalized linear model against a Spark DataFrame. Users can print, make predictions on the +#' produced model and save the model to the input path. #' #' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula @@ -66,8 +67,9 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param tol Positive convergence tolerance of iterations. #' @param maxIter Integer giving the maximal number of IRLS iterations. 
-#' @return a fitted generalized linear model +#' @return \code{spark.glm} returns a fitted generalized linear model #' @rdname spark.glm +#' @name spark.glm #' @export #' @examples #' \dontrun{ @@ -76,8 +78,21 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' df <- createDataFrame(iris) #' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") #' summary(model) +#' +#' # fitted values on training data +#' fitted <- predict(model, df) +#' head(select(fitted, "Sepal_Length", "prediction")) +#' +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(model, path) +#' +#' # can also read back the saved model and print +#' savedModel <- read.ml(path) +#' summary(savedModel) #' } #' @note spark.glm since 2.0.0 +#' @seealso \link{glm}, \link{read.ml} setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25) { if (is.character(family)) { @@ -99,10 +114,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), return(new("GeneralizedLinearRegressionModel", jobj = jobj)) }) -#' Fits a generalized linear model (R-compliant). +#' Generalized Linear Models (R-compliant) #' #' Fits a generalized linear model, similarly to R's glm(). -#' #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #'operators are supported, including '~', '.', ':', '+', and '-'. #' @param data SparkDataFrame for training. @@ -112,7 +126,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param epsilon Positive convergence tolerance of iterations. #' @param maxit Integer giving the maximal number of IRLS iterations. -#' @return a fitted generalized linear model +#' @return \code{glm} returns a fitted generalized linear model. #' @rdname glm #' @export #' @examples @@ -124,24 +138,21 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' summary(model) #' } #' @note glm since
spark git commit: [SPARK-15783][CORE] Fix Flakiness in BlacklistIntegrationSuite
Repository: spark Updated Branches: refs/heads/master 01277d4b2 -> cf1995a97 [SPARK-15783][CORE] Fix Flakiness in BlacklistIntegrationSuite ## What changes were proposed in this pull request? Five changes here -- the first two were causing failures with BlacklistIntegrationSuite. 1. The testing framework didn't include the reviveOffers thread, so the test which involved delay scheduling might never submit offers late enough for the delay scheduling to kick in. So the periodic revive offers were added in, just like in the real scheduler. 2. `assertEmptyDataStructures` would occasionally fail, because it appeared there was still an active job. This is because in DAGScheduler, the jobWaiter is notified of the job completion before the data structures are cleaned up. Most of the time the test code that is waiting on the jobWaiter won't become active until after the data structures are cleared, but occasionally the race goes the other way, and the assertions fail. 3. `DAGSchedulerSuite` was not stopping all the inner parts it was setting up, so each test was leaking a number of threads. So we stop those parts too. 4. It turns out that `assertMapOutputAvailable` is not terribly useful in this framework -- most of the places I was trying to use it suffer from some race. 5. When there is an exception in the backend, try to improve the error message a little bit. Previously the exception was printed to the console, but the test would fail with a timeout, and the logs wouldn't show anything. ## How was this patch tested? I ran all the tests in `BlacklistIntegrationSuite` 5k times and everything in `DAGSchedulerSuite` 1k times on my laptop. I also ran a full Jenkins build with `BlacklistIntegrationSuite` 500 times and `DAGSchedulerSuite` 50 times, see https://github.com/apache/spark/pull/13548. (I tried more times but Jenkins timed out.) To check for more leaked threads, I added some code to dump the list of all threads at the end of each test in DAGSchedulerSuite, which is how I discovered the mapOutputTracker and eventLoop were leaking threads. (I removed that code from the final PR; it was just part of the testing.) And I'll run Jenkins on this a couple of times to do one more check. Author: Imran Rashid Closes #13565 from squito/blacklist_extra_tests.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf1995a9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf1995a9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf1995a9 Branch: refs/heads/master Commit: cf1995a97645f0b44c997f4fdbba631fd6b91a16 Parents: 01277d4 Author: Imran Rashid Authored: Wed Jun 22 08:35:41 2016 -0500 Committer: Imran Rashid Committed: Wed Jun 22 08:35:41 2016 -0500 -- .../apache/spark/scheduler/DAGScheduler.scala | 4 +- .../scheduler/BlacklistIntegrationSuite.scala | 16 ++--- .../spark/scheduler/DAGSchedulerSuite.scala | 9 ++- .../scheduler/SchedulerIntegrationSuite.scala | 73 4 files changed, 76 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cf1995a9/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index d4e0d6d..4eb7c81 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1465,8 +1465,10 @@ class DAGScheduler( } if (ableToCancelStages) { - job.listener.jobFailed(error) + // SPARK-15783 important to cleanup state first, just for tests where we have some asserts + // against the state. Otherwise we have a *little* bit of flakiness in the tests. cleanupStateForJobAndIndependentStages(job) + job.listener.jobFailed(error) listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobFailed(error))) } } http://git-wip-us.apache.org/repos/asf/spark/blob/cf1995a9/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index d8a4b19..8ba2697 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark._ class BlacklistIntegrationSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend]{ val badHost =
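To make the race in item 2 above concrete, here is a minimal, self-contained Scala sketch. It is not Spark code and all names are invented; it only illustrates why waking a waiter before clearing bookkeeping lets an assertion observe stale state, and why the reordering in DAGScheduler.scala shown above removes the flakiness.

```scala
import java.util.concurrent.CountDownLatch
import scala.collection.mutable

object NotifyBeforeCleanupDemo {
  val activeJobs = mutable.Set("job-0")   // stand-in for the scheduler's bookkeeping
  val jobDone = new CountDownLatch(1)     // stand-in for the jobWaiter

  // Old ordering: wake the waiter first, clean up afterwards.
  def failJobBuggy(): Unit = {
    jobDone.countDown()
    Thread.sleep(10)                      // cleanup races with the asserting thread
    activeJobs.clear()
  }

  // Fixed ordering: clean up state first, then wake the waiter.
  def failJobFixed(): Unit = {
    activeJobs.clear()
    jobDone.countDown()
  }

  def main(args: Array[String]): Unit = {
    new Thread(new Runnable { def run(): Unit = failJobBuggy() }).start()
    jobDone.await()
    // Flaky with failJobBuggy, always true with failJobFixed.
    println(s"activeJobs empty when waiter wakes up: ${activeJobs.isEmpty}")
  }
}
```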
spark git commit: [SPARK-6005][TESTS] Fix flaky test: o.a.s.streaming.kafka.DirectKafkaStreamSuite.offset recovery
Repository: spark Updated Branches: refs/heads/branch-1.6 d98fb19c1 -> 4fdac3c27 [SPARK-6005][TESTS] Fix flaky test: o.a.s.streaming.kafka.DirectKafkaStreamSuite.offset recovery ## What changes were proposed in this pull request? Because this test extracts data from `DStream.generatedRDDs` before stopping, it may get data before checkpointing. Then after recovering from the checkpoint, `recoveredOffsetRanges` may contain something not in `offsetRangesBeforeStop`, which will fail the test. Adding `Thread.sleep(1000)` before `ssc.stop()` will reproduce this failure. This PR just moves the logic of `offsetRangesBeforeStop` (also renamed to `offsetRangesAfterStop`) after `ssc.stop()` to fix the flaky test. ## How was this patch tested? Jenkins unit tests. Author: Shixiong Zhu Closes #12903 from zsxwing/SPARK-6005. (cherry picked from commit 9533f5390a3ad7ab96a7bea01cdb6aed89503a51) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4fdac3c2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4fdac3c2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4fdac3c2 Branch: refs/heads/branch-1.6 Commit: 4fdac3c271eccc5db69c45788af15e955752a163 Parents: d98fb19 Author: Shixiong Zhu Authored: Tue May 10 13:26:53 2016 -0700 Committer: Sean Owen Committed: Wed Jun 22 14:10:50 2016 +0100 -- .../kafka/DirectKafkaStreamSuite.scala | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4fdac3c2/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala -- diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala index 02225d5..feea0ae 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala @@ -280,14 +280,20 @@ class DirectKafkaStreamSuite sendDataAndWaitForReceive(i) } +ssc.stop() + // Verify that offset ranges were generated -val offsetRangesBeforeStop = getOffsetRanges(kafkaStream) -assert(offsetRangesBeforeStop.size >= 1, "No offset ranges generated") +// Since "offsetRangesAfterStop" will be used to compare with "recoveredOffsetRanges", we should +// collect offset ranges after stopping. Otherwise, because new RDDs keep being generated before +// stopping, we may not be able to get the latest RDDs, then "recoveredOffsetRanges" will +// contain something not in "offsetRangesAfterStop". 
+val offsetRangesAfterStop = getOffsetRanges(kafkaStream) +assert(offsetRangesAfterStop.size >= 1, "No offset ranges generated") assert( - offsetRangesBeforeStop.head._2.forall { _.fromOffset === 0 }, + offsetRangesAfterStop.head._2.forall { _.fromOffset === 0 }, "starting offset not zero" ) -ssc.stop() + logInfo("== RESTARTING ") // Recover context from checkpoints @@ -297,12 +303,14 @@ class DirectKafkaStreamSuite // Verify offset ranges have been recovered val recoveredOffsetRanges = getOffsetRanges(recoveredStream) assert(recoveredOffsetRanges.size > 0, "No offset ranges recovered") -val earlierOffsetRangesAsSets = offsetRangesBeforeStop.map { x => (x._1, x._2.toSet) } +val earlierOffsetRangesAsSets = offsetRangesAfterStop.map { x => (x._1, x._2.toSet) } assert( recoveredOffsetRanges.forall { or => earlierOffsetRangesAsSets.contains((or._1, or._2.toSet)) }, - "Recovered ranges are not the same as the ones generated" + "Recovered ranges are not the same as the ones generated\n" + +s"recoveredOffsetRanges: $recoveredOffsetRanges\n" + +s"earlierOffsetRangesAsSets: $earlierOffsetRangesAsSets" ) // Restart context, give more data and verify the total at the end // If the total is write that means each records has been received only once
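The essence of the fix is that the snapshot used for comparison is taken only after `ssc.stop()`, and the recovered ranges are then compared per batch time as sets. Below is a small self-contained sketch of that comparison logic; the tuple type is an illustrative stand-in for Kafka's `OffsetRange`, not the real class, and the values are made up.

```scala
object OffsetRangeComparisonSketch {
  // (partition, fromOffset, untilOffset): simplified stand-in for OffsetRange
  type Range = (Int, Long, Long)

  def main(args: Array[String]): Unit = {
    // ranges captured after the streaming context was stopped, keyed by batch time
    val afterStop: Seq[(Long, Seq[Range])] =
      Seq(1000L -> Seq((0, 0L, 5L), (1, 0L, 3L)), 2000L -> Seq((0, 5L, 9L)))
    // ranges recovered from the checkpoint (same data, possibly different ordering)
    val recovered: Seq[(Long, Seq[Range])] =
      Seq(1000L -> Seq((1, 0L, 3L), (0, 0L, 5L)))

    val earlierAsSets = afterStop.map { case (time, rs) => (time, rs.toSet) }
    val allRecoveredSeen = recovered.forall { case (time, rs) =>
      earlierAsSets.contains((time, rs.toSet))
    }
    println(s"recovered ranges are a subset of the after-stop ranges: $allRecoveredSeen")
  }
}
```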
spark git commit: [SPARK-16097][SQL] Encoders.tuple should handle null object correctly
Repository: spark Updated Branches: refs/heads/branch-2.0 60bd704b5 -> 503eb882c [SPARK-16097][SQL] Encoders.tuple should handle null object correctly ## What changes were proposed in this pull request? Although the top-level input object cannot be null, when we use `Encoders.tuple` to combine 2 encoders, their input objects are no longer top level and can be null. We should handle this case. ## How was this patch tested? A new test in DatasetSuite. Author: Wenchen Fan Closes #13807 from cloud-fan/bug. (cherry picked from commit 01277d4b259dcf9cad25eece1377162b7a8c946d) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/503eb882 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/503eb882 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/503eb882 Branch: refs/heads/branch-2.0 Commit: 503eb882c14eac9681981199ccf8f699cab23bf0 Parents: 60bd704 Author: Wenchen Fan Authored: Wed Jun 22 18:32:14 2016 +0800 Committer: Cheng Lian Committed: Wed Jun 22 18:37:36 2016 +0800 -- .../catalyst/encoders/ExpressionEncoder.scala | 48 ++-- .../org/apache/spark/sql/DatasetSuite.scala | 7 +++ 2 files changed, 42 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/503eb882/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 0023ce6..1fac26c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateSafeProjection import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, Invoke, NewInstance} import org.apache.spark.sql.catalyst.optimizer.SimplifyCasts import org.apache.spark.sql.catalyst.plans.logical.{CatalystSerde, DeserializeToObject, LocalRelation} -import org.apache.spark.sql.types.{ObjectType, StructField, StructType} +import org.apache.spark.sql.types.{BooleanType, ObjectType, StructField, StructType} import org.apache.spark.util.Utils /** @@ -110,16 +110,34 @@ object ExpressionEncoder { val cls = Utils.getContextOrSparkClassLoader.loadClass(s"scala.Tuple${encoders.size}") -val serializer = encoders.map { - case e if e.flat => e.serializer.head - case other => CreateStruct(other.serializer) -}.zipWithIndex.map { case (expr, index) => - expr.transformUp { -case BoundReference(0, t, _) => - Invoke( -BoundReference(0, ObjectType(cls), nullable = true), -s"_${index + 1}", -t) +val serializer = encoders.zipWithIndex.map { case (enc, index) => + val originalInputObject = enc.serializer.head.collect { case b: BoundReference => b }.head + val newInputObject = Invoke( +BoundReference(0, ObjectType(cls), nullable = true), +s"_${index + 1}", +originalInputObject.dataType) + + val newSerializer = enc.serializer.map(_.transformUp { +case b: BoundReference if b == originalInputObject => newInputObject + }) + + if (enc.flat) { +newSerializer.head + } else { +// For non-flat encoder, the input object is not top level anymore after being combined to +// a tuple encoder, thus it can be null and we should wrap the `CreateStruct` with `If` and +// null check to handle null case correctly. +// e.g.
for Encoder[(Int, String)], the serializer expressions will create 2 columns, and is +// not able to handle the case when the input tuple is null. This is not a problem as there +// is a check to make sure the input object won't be null. However, if this encoder is used +// to create a bigger tuple encoder, the original input object becomes a filed of the new +// input tuple and can be null. So instead of creating a struct directly here, we should add +// a null/None check and return a null struct if the null/None check fails. +val struct = CreateStruct(newSerializer) +val nullCheck = Or( + IsNull(newInputObject), + Invoke(Literal.fromObject(None), "equals", BooleanType, newInputObject :: Nil)) +If(nullCheck, Literal.create(null, struct.dataType), struct) } } @@ -203,8 +221,12 @@ case class ExpressionEncoder[T]( // (intermediate value is not an attribute). We assume that all
spark git commit: [SPARK-16097][SQL] Encoders.tuple should handle null object correctly
Repository: spark Updated Branches: refs/heads/master 39ad53f7f -> 01277d4b2 [SPARK-16097][SQL] Encoders.tuple should handle null object correctly ## What changes were proposed in this pull request? Although the top-level input object cannot be null, when we use `Encoders.tuple` to combine 2 encoders, their input objects are no longer top level and can be null. We should handle this case. ## How was this patch tested? A new test in DatasetSuite. Author: Wenchen Fan Closes #13807 from cloud-fan/bug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/01277d4b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/01277d4b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/01277d4b Branch: refs/heads/master Commit: 01277d4b259dcf9cad25eece1377162b7a8c946d Parents: 39ad53f Author: Wenchen Fan Authored: Wed Jun 22 18:32:14 2016 +0800 Committer: Cheng Lian Committed: Wed Jun 22 18:32:14 2016 +0800 -- .../catalyst/encoders/ExpressionEncoder.scala | 48 ++-- .../org/apache/spark/sql/DatasetSuite.scala | 7 +++ 2 files changed, 42 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/01277d4b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 0023ce6..1fac26c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateSafeProjection import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, Invoke, NewInstance} import org.apache.spark.sql.catalyst.optimizer.SimplifyCasts import org.apache.spark.sql.catalyst.plans.logical.{CatalystSerde, DeserializeToObject, LocalRelation} -import org.apache.spark.sql.types.{ObjectType, StructField, StructType} +import org.apache.spark.sql.types.{BooleanType, ObjectType, StructField, StructType} import org.apache.spark.util.Utils /** @@ -110,16 +110,34 @@ object ExpressionEncoder { val cls = Utils.getContextOrSparkClassLoader.loadClass(s"scala.Tuple${encoders.size}") -val serializer = encoders.map { - case e if e.flat => e.serializer.head - case other => CreateStruct(other.serializer) -}.zipWithIndex.map { case (expr, index) => - expr.transformUp { -case BoundReference(0, t, _) => - Invoke( -BoundReference(0, ObjectType(cls), nullable = true), -s"_${index + 1}", -t) +val serializer = encoders.zipWithIndex.map { case (enc, index) => + val originalInputObject = enc.serializer.head.collect { case b: BoundReference => b }.head + val newInputObject = Invoke( +BoundReference(0, ObjectType(cls), nullable = true), +s"_${index + 1}", +originalInputObject.dataType) + + val newSerializer = enc.serializer.map(_.transformUp { +case b: BoundReference if b == originalInputObject => newInputObject + }) + + if (enc.flat) { +newSerializer.head + } else { +// For non-flat encoder, the input object is not top level anymore after being combined to +// a tuple encoder, thus it can be null and we should wrap the `CreateStruct` with `If` and +// null check to handle null case correctly. +// e.g.
for Encoder[(Int, String)], the serializer expressions will create 2 columns, and is +// not able to handle the case when the input tuple is null. This is not a problem as there +// is a check to make sure the input object won't be null. However, if this encoder is used +// to create a bigger tuple encoder, the original input object becomes a filed of the new +// input tuple and can be null. So instead of creating a struct directly here, we should add +// a null/None check and return a null struct if the null/None check fails. +val struct = CreateStruct(newSerializer) +val nullCheck = Or( + IsNull(newInputObject), + Invoke(Literal.fromObject(None), "equals", BooleanType, newInputObject :: Nil)) +If(nullCheck, Literal.create(null, struct.dataType), struct) } } @@ -203,8 +221,12 @@ case class ExpressionEncoder[T]( // (intermediate value is not an attribute). We assume that all serializer expressions use a same // `BoundReference` to refer to the object, and throw exception if
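As a quick way to see the behaviour this change targets, the sketch below builds a nested tuple encoder whose inner (non-flat) component is null. The session setup and values are illustrative only; the point is that after the fix the inner null is serialized as a null struct instead of failing.

```scala
import org.apache.spark.sql.{Encoders, SparkSession}

object TupleEncoderNullDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("tuple-encoder-null-demo")
      .getOrCreate()

    // The inner (String, String) encoder is non-flat; once combined into a bigger tuple
    // encoder its input object is no longer top level and may legitimately be null.
    val enc = Encoders.tuple(Encoders.tuple(Encoders.STRING, Encoders.STRING), Encoders.STRING)
    val data = Seq((("a", "b"), "c"), (null, "d"))

    val ds = spark.createDataset(data)(enc)
    ds.show() // the first column of the second row shows up as null rather than throwing
    spark.stop()
  }
}
```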
spark git commit: [SPARK-16121] ListingFileCatalog does not list in parallel anymore
Repository: spark Updated Branches: refs/heads/branch-2.0 838143a2a -> 60bd704b5 [SPARK-16121] ListingFileCatalog does not list in parallel anymore ## What changes were proposed in this pull request? Seems the fix of SPARK-14959 breaks the parallel partitioning discovery. This PR fixes the problem ## How was this patch tested? Tested manually. (This PR also adds a proper test for SPARK-14959) Author: Yin Huai Closes #13830 from yhuai/SPARK-16121. (cherry picked from commit 39ad53f7ffddae5ba0ff0a76089ba671b14c44c8) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60bd704b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60bd704b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60bd704b Branch: refs/heads/branch-2.0 Commit: 60bd704b541c4d1991922ffd3dd5b47de9bd5821 Parents: 838143a Author: Yin Huai Authored: Wed Jun 22 18:07:07 2016 +0800 Committer: Cheng Lian Committed: Wed Jun 22 18:07:27 2016 +0800 -- .../datasources/ListingFileCatalog.scala| 58 ++-- .../datasources/fileSourceInterfaces.scala | 7 ++- .../datasources/FileSourceStrategySuite.scala | 45 ++- 3 files changed, 101 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/60bd704b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index f713fde..675e755 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources import scala.collection.mutable import scala.util.Try -import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path} import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.SparkSession @@ -73,21 +73,67 @@ class ListingFileCatalog( cachedPartitionSpec = null } - protected def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery + * discovery threshold. + */ + protected[spark] def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sparkSession) } else { + // Right now, the number of paths is less than the value of + // parallelPartitionDiscoveryThreshold. So, we will list file statues at the driver. + // If there is any child that has more files than the threshold, we will use parallel + // listing. 
+ // Dummy jobconf to get to the pathFilter defined in configuration val jobConf = new JobConf(hadoopConf, this.getClass) val pathFilter = FileInputFormat.getInputPathFilter(jobConf) + val statuses: Seq[FileStatus] = paths.flatMap { path => val fs = path.getFileSystem(hadoopConf) logTrace(s"Listing $path on driver") -Try { - HadoopFsRelation.listLeafFiles(fs, fs.getFileStatus(path), pathFilter) -}.getOrElse(Array.empty[FileStatus]) + +val childStatuses = { + // TODO: We need to avoid of using Try at here. + val stats = Try(fs.listStatus(path)).getOrElse(Array.empty[FileStatus]) + if (pathFilter != null) stats.filter(f => pathFilter.accept(f.getPath)) else stats +} + +childStatuses.map { + case f: LocatedFileStatus => f + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // paths exceeds threshold. + case f => +if (f.isDirectory ) { + // If f is a directory, we do not need to call getFileBlockLocations (SPARK-14959). + f +
spark git commit: [SPARK-16121] ListingFileCatalog does not list in parallel anymore
Repository: spark Updated Branches: refs/heads/master d281b0baf -> 39ad53f7f [SPARK-16121] ListingFileCatalog does not list in parallel anymore ## What changes were proposed in this pull request? Seems the fix of SPARK-14959 breaks the parallel partitioning discovery. This PR fixes the problem ## How was this patch tested? Tested manually. (This PR also adds a proper test for SPARK-14959) Author: Yin Huai Closes #13830 from yhuai/SPARK-16121. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39ad53f7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39ad53f7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39ad53f7 Branch: refs/heads/master Commit: 39ad53f7ffddae5ba0ff0a76089ba671b14c44c8 Parents: d281b0b Author: Yin Huai Authored: Wed Jun 22 18:07:07 2016 +0800 Committer: Cheng Lian Committed: Wed Jun 22 18:07:07 2016 +0800 -- .../datasources/ListingFileCatalog.scala| 58 ++-- .../datasources/fileSourceInterfaces.scala | 7 ++- .../datasources/FileSourceStrategySuite.scala | 45 ++- 3 files changed, 101 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39ad53f7/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index f713fde..675e755 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources import scala.collection.mutable import scala.util.Try -import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path} import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.SparkSession @@ -73,21 +73,67 @@ class ListingFileCatalog( cachedPartitionSpec = null } - protected def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery + * discovery threshold. + */ + protected[spark] def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sparkSession) } else { + // Right now, the number of paths is less than the value of + // parallelPartitionDiscoveryThreshold. So, we will list file statues at the driver. + // If there is any child that has more files than the threshold, we will use parallel + // listing. + // Dummy jobconf to get to the pathFilter defined in configuration val jobConf = new JobConf(hadoopConf, this.getClass) val pathFilter = FileInputFormat.getInputPathFilter(jobConf) + val statuses: Seq[FileStatus] = paths.flatMap { path => val fs = path.getFileSystem(hadoopConf) logTrace(s"Listing $path on driver") -Try { - HadoopFsRelation.listLeafFiles(fs, fs.getFileStatus(path), pathFilter) -}.getOrElse(Array.empty[FileStatus]) + +val childStatuses = { + // TODO: We need to avoid of using Try at here. 
+ val stats = Try(fs.listStatus(path)).getOrElse(Array.empty[FileStatus]) + if (pathFilter != null) stats.filter(f => pathFilter.accept(f.getPath)) else stats +} + +childStatuses.map { + case f: LocatedFileStatus => f + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // paths exceeds threshold. + case f => +if (f.isDirectory ) { + // If f is a directory, we do not need to call getFileBlockLocations (SPARK-14959). + f +} else { + HadoopFsRelation.createLocatedFileStatus(f, fs.getFileBlockLocations(f
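For reference, the switch between driver-side and job-based listing is driven by the threshold that `parallelPartitionDiscoveryThreshold` reads, the `spark.sql.sources.parallelPartitionDiscovery.threshold` setting. The snippet below is a minimal sketch of exercising it; the table path is a placeholder and the lowered threshold is only for demonstration.

```scala
import org.apache.spark.sql.SparkSession

object ParallelListingSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[4]")
      .appName("parallel-listing-sketch")
      // Lower the threshold so that even a handful of input paths triggers the
      // parallel (Spark-job based) leaf-file listing instead of driver-side listing.
      .config("spark.sql.sources.parallelPartitionDiscovery.threshold", "1")
      .getOrCreate()

    val df = spark.read.parquet("/path/to/partitioned/table") // placeholder path
    df.printSchema()
    spark.stop()
  }
}
```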
spark git commit: [SPARK-15162][SPARK-15164][PYSPARK][DOCS][ML] update some pydocs
Repository: spark Updated Branches: refs/heads/branch-2.0 e7a489c7f -> 838143a2a [SPARK-15162][SPARK-15164][PYSPARK][DOCS][ML] update some pydocs ## What changes were proposed in this pull request? Mark ml.classification algorithms as experimental to match Scala algorithms, update PyDoc for for thresholds on `LogisticRegression` to have same level of info as Scala, and enable mathjax for PyDoc. ## How was this patch tested? Built docs locally & PySpark SQL tests Author: Holden Karau Closes #12938 from holdenk/SPARK-15162-SPARK-15164-update-some-pydocs. (cherry picked from commit d281b0bafe6aa23085d4d2b68f0ce321f1978b50) Signed-off-by: Nick Pentreath Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/838143a2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/838143a2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/838143a2 Branch: refs/heads/branch-2.0 Commit: 838143a2a02192a9ebc955b8060a6520b62d9644 Parents: e7a489c Author: Holden Karau Authored: Wed Jun 22 11:54:49 2016 +0200 Committer: Nick Pentreath Committed: Wed Jun 22 11:55:10 2016 +0200 -- .../ml/classification/LogisticRegression.scala | 5 ++- python/docs/conf.py | 1 + python/pyspark/ml/classification.py | 38 ++-- 3 files changed, 39 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/838143a2/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index a7ba39e..2fa8fbc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -72,10 +72,9 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * Get threshold for binary classification. * - * If [[threshold]] is set, returns that value. - * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification), + * If [[thresholds]] is set with length 2 (i.e., binary classification), * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}. - * Otherwise, returns [[threshold]] default value. + * Otherwise, returns [[threshold]] if set, or its default value if unset. * * @group getParam * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2. http://git-wip-us.apache.org/repos/asf/spark/blob/838143a2/python/docs/conf.py -- diff --git a/python/docs/conf.py b/python/docs/conf.py index d35bf73..50fb317 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -32,6 +32,7 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'epytext', +'sphinx.ext.mathjax', ] # Add any paths that contain templates here, relative to this directory. http://git-wip-us.apache.org/repos/asf/spark/blob/838143a2/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index e86c27e..d6d713c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -49,6 +49,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, HasWeightCol, JavaMLWritable, JavaMLReadable): """ +.. note:: Experimental + Logistic regression. 
Currently, this class only supports binary classification. @@ -96,7 +98,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti threshold = Param(Params._dummy(), "threshold", "Threshold in binary classification prediction, in range [0, 1]." + - " If threshold and thresholds are both set, they must match.", + " If threshold and thresholds are both set, they must match." + + "e.g. if threshold is p, then thresholds must be equal to [1-p, p].", typeConverter=TypeConverters.toFloat) @keyword_only @@ -154,7 +157,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti @since("1.4.0") def getThreshold(self): """ -Gets the value of threshold or its default value. +Get threshold for binar
spark git commit: [SPARK-15162][SPARK-15164][PYSPARK][DOCS][ML] update some pydocs
Repository: spark Updated Branches: refs/heads/master 0e3ce7533 -> d281b0baf [SPARK-15162][SPARK-15164][PYSPARK][DOCS][ML] update some pydocs ## What changes were proposed in this pull request? Mark ml.classification algorithms as experimental to match Scala algorithms, update PyDoc for for thresholds on `LogisticRegression` to have same level of info as Scala, and enable mathjax for PyDoc. ## How was this patch tested? Built docs locally & PySpark SQL tests Author: Holden Karau Closes #12938 from holdenk/SPARK-15162-SPARK-15164-update-some-pydocs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d281b0ba Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d281b0ba Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d281b0ba Branch: refs/heads/master Commit: d281b0bafe6aa23085d4d2b68f0ce321f1978b50 Parents: 0e3ce75 Author: Holden Karau Authored: Wed Jun 22 11:54:49 2016 +0200 Committer: Nick Pentreath Committed: Wed Jun 22 11:54:49 2016 +0200 -- .../ml/classification/LogisticRegression.scala | 5 ++- python/docs/conf.py | 1 + python/pyspark/ml/classification.py | 38 ++-- 3 files changed, 39 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d281b0ba/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index a7ba39e..2fa8fbc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -72,10 +72,9 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * Get threshold for binary classification. * - * If [[threshold]] is set, returns that value. - * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification), + * If [[thresholds]] is set with length 2 (i.e., binary classification), * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}. - * Otherwise, returns [[threshold]] default value. + * Otherwise, returns [[threshold]] if set, or its default value if unset. * * @group getParam * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2. http://git-wip-us.apache.org/repos/asf/spark/blob/d281b0ba/python/docs/conf.py -- diff --git a/python/docs/conf.py b/python/docs/conf.py index d35bf73..50fb317 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -32,6 +32,7 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'epytext', +'sphinx.ext.mathjax', ] # Add any paths that contain templates here, relative to this directory. http://git-wip-us.apache.org/repos/asf/spark/blob/d281b0ba/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index e86c27e..d6d713c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -49,6 +49,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, HasWeightCol, JavaMLWritable, JavaMLReadable): """ +.. note:: Experimental + Logistic regression. Currently, this class only supports binary classification. 
@@ -96,7 +98,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti threshold = Param(Params._dummy(), "threshold", "Threshold in binary classification prediction, in range [0, 1]." + - " If threshold and thresholds are both set, they must match.", + " If threshold and thresholds are both set, they must match." + + "e.g. if threshold is p, then thresholds must be equal to [1-p, p].", typeConverter=TypeConverters.toFloat) @keyword_only @@ -154,7 +157,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti @since("1.4.0") def getThreshold(self): """ -Gets the value of threshold or its default value. +Get threshold for binary classification. + +If :py:attr:`thresholds` is set with length 2 (i.e., binary classification), +
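The threshold/thresholds relationship documented above is easy to sanity-check numerically. The small standalone snippet below simply evaluates the documented formula threshold = 1 / (1 + thresholds(0) / thresholds(1)) and its inverse thresholds = [1 - p, p]; it does not call the Spark API, and the sample values are arbitrary.

```scala
object ThresholdEquivalenceCheck {
  // threshold implied by a length-2 thresholds array, per the LogisticRegression docs
  def thresholdFrom(thresholds: Array[Double]): Double =
    1.0 / (1.0 + thresholds(0) / thresholds(1))

  // thresholds array implied by a binary threshold p
  def thresholdsFrom(p: Double): Array[Double] = Array(1.0 - p, p)

  def main(args: Array[String]): Unit = {
    println(thresholdFrom(Array(0.4, 0.6)))      // 0.6
    println(thresholdFrom(Array(0.5, 0.5)))      // 0.5
    println(thresholdsFrom(0.6).mkString(", "))  // 0.4, 0.6
  }
}
```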