spark git commit: Revert "[SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaivour"
Repository: spark Updated Branches: refs/heads/branch-2.0 9e7e2f916 -> 7d10e4bdd Revert "[SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaivour" This reverts commit 9e7e2f9164e0b3bd555e795b871626057b4fed31. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7d10e4bd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7d10e4bd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7d10e4bd Branch: refs/heads/branch-2.0 Commit: 7d10e4bdd2adbeb10904665536e4949381f19cf5 Parents: 9e7e2f9 Author: Reynold Xin Authored: Sun Jun 5 23:40:35 2016 -0700 Committer: Reynold Xin Committed: Sun Jun 5 23:40:35 2016 -0700 -- python/pyspark/sql/readwriter.py| 81 ++-- .../execution/datasources/csv/CSVOptions.scala | 11 +-- .../execution/datasources/csv/CSVSuite.scala| 11 --- 3 files changed, 48 insertions(+), 55 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7d10e4bd/python/pyspark/sql/readwriter.py -- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 19aa8dd..9208a52 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -303,11 +303,10 @@ class DataFrameReader(object): return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(path))) @since(2.0) -def csv(self, path, schema=None, sep=u',', encoding=u'UTF-8', quote=u'\"', escape=u'\\', -comment=None, header='false', ignoreLeadingWhiteSpace='false', -ignoreTrailingWhiteSpace='false', nullValue='', nanValue='NaN', positiveInf='Inf', -negativeInf='Inf', dateFormat=None, maxColumns='20480', maxCharsPerColumn='100', -mode='PERMISSIVE'): +def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, +comment=None, header=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, +nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, +maxColumns=None, maxCharsPerColumn=None, mode=None): """Loads a CSV file and returns the result as a [[DataFrame]]. This function goes through the input once to determine the input schema. To avoid going @@ -316,41 +315,44 @@ class DataFrameReader(object): :param path: string, or list of strings, for input path(s). :param schema: an optional :class:`StructType` for the input schema. :param sep: sets the single character as a separator for each field and value. -The default value is ``,``. -:param encoding: decodes the CSV files by the given encoding type. -The default value is ``UTF-8``. +If None is set, it uses the default value, ``,``. +:param encoding: decodes the CSV files by the given encoding type. If None is set, + it uses the default value, ``UTF-8``. :param quote: sets the single character used for escaping quoted values where the - separator can be part of the value. The default value is ``"``. + separator can be part of the value. If None is set, it uses the default + value, ``"``. :param escape: sets the single character used for escaping quotes inside an already - quoted value. The default value is ``\``. + quoted value. If None is set, it uses the default value, ``\``. :param comment: sets the single character used for skipping lines beginning with this character. By default (None), it is disabled. -:param header: uses the first line as names of columns. The default value is ``false``. +:param header: uses the first line as names of columns. If None is set, it uses the + default value, ``false``. 
:param ignoreLeadingWhiteSpace: defines whether or not leading whitespaces from values -being read should be skipped. The default value is -``false``. +being read should be skipped. If None is set, it uses +the default value, ``false``. :param ignoreTrailingWhiteSpace: defines whether or not trailing whitespaces from values - being read should be skipped. The default value is - ``false``. -:param nullValue: sets the string representation of a null value. The default value is a - empty strin
spark git commit: Revert "[SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaivour"
Repository: spark Updated Branches: refs/heads/master b7e8d1cb3 -> 32f2f95db Revert "[SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaivour" This reverts commit b7e8d1cb3ce932ba4a784be59744af8a8ef027ce. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/32f2f95d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/32f2f95d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/32f2f95d Branch: refs/heads/master Commit: 32f2f95dbdfb21491e46d4b608fd4e8ac7ab8973 Parents: b7e8d1c Author: Reynold Xin Authored: Sun Jun 5 23:40:13 2016 -0700 Committer: Reynold Xin Committed: Sun Jun 5 23:40:13 2016 -0700 -- python/pyspark/sql/readwriter.py| 81 ++-- .../execution/datasources/csv/CSVOptions.scala | 11 +-- .../execution/datasources/csv/CSVSuite.scala| 11 --- 3 files changed, 48 insertions(+), 55 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/32f2f95d/python/pyspark/sql/readwriter.py -- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 19aa8dd..9208a52 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -303,11 +303,10 @@ class DataFrameReader(object): return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(path))) @since(2.0) -def csv(self, path, schema=None, sep=u',', encoding=u'UTF-8', quote=u'\"', escape=u'\\', -comment=None, header='false', ignoreLeadingWhiteSpace='false', -ignoreTrailingWhiteSpace='false', nullValue='', nanValue='NaN', positiveInf='Inf', -negativeInf='Inf', dateFormat=None, maxColumns='20480', maxCharsPerColumn='100', -mode='PERMISSIVE'): +def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, +comment=None, header=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, +nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, +maxColumns=None, maxCharsPerColumn=None, mode=None): """Loads a CSV file and returns the result as a [[DataFrame]]. This function goes through the input once to determine the input schema. To avoid going @@ -316,41 +315,44 @@ class DataFrameReader(object): :param path: string, or list of strings, for input path(s). :param schema: an optional :class:`StructType` for the input schema. :param sep: sets the single character as a separator for each field and value. -The default value is ``,``. -:param encoding: decodes the CSV files by the given encoding type. -The default value is ``UTF-8``. +If None is set, it uses the default value, ``,``. +:param encoding: decodes the CSV files by the given encoding type. If None is set, + it uses the default value, ``UTF-8``. :param quote: sets the single character used for escaping quoted values where the - separator can be part of the value. The default value is ``"``. + separator can be part of the value. If None is set, it uses the default + value, ``"``. :param escape: sets the single character used for escaping quotes inside an already - quoted value. The default value is ``\``. + quoted value. If None is set, it uses the default value, ``\``. :param comment: sets the single character used for skipping lines beginning with this character. By default (None), it is disabled. -:param header: uses the first line as names of columns. The default value is ``false``. +:param header: uses the first line as names of columns. If None is set, it uses the + default value, ``false``. 
:param ignoreLeadingWhiteSpace: defines whether or not leading whitespaces from values -being read should be skipped. The default value is -``false``. +being read should be skipped. If None is set, it uses +the default value, ``false``. :param ignoreTrailingWhiteSpace: defines whether or not trailing whitespaces from values - being read should be skipped. The default value is - ``false``. -:param nullValue: sets the string representation of a null value. The default value is a - empty string. -
spark git commit: [SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaivour
Repository: spark Updated Branches: refs/heads/master 79268aa46 -> b7e8d1cb3 [SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaivour ## What changes were proposed in this pull request? This pr fixes the behaviour of `format("csv").option("quote", null)` along with one of spark-csv. Also, it explicitly sets default values for CSV options in python. ## How was this patch tested? Added tests in CSVSuite. Author: Takeshi YAMAMURO Closes #13372 from maropu/SPARK-15585. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7e8d1cb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7e8d1cb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7e8d1cb Branch: refs/heads/master Commit: b7e8d1cb3ce932ba4a784be59744af8a8ef027ce Parents: 79268aa Author: Takeshi YAMAMURO Authored: Sun Jun 5 23:35:04 2016 -0700 Committer: Reynold Xin Committed: Sun Jun 5 23:35:04 2016 -0700 -- python/pyspark/sql/readwriter.py| 81 ++-- .../execution/datasources/csv/CSVOptions.scala | 11 ++- .../execution/datasources/csv/CSVSuite.scala| 11 +++ 3 files changed, 55 insertions(+), 48 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b7e8d1cb/python/pyspark/sql/readwriter.py -- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 9208a52..19aa8dd 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -303,10 +303,11 @@ class DataFrameReader(object): return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(path))) @since(2.0) -def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, -comment=None, header=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, -nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, -maxColumns=None, maxCharsPerColumn=None, mode=None): +def csv(self, path, schema=None, sep=u',', encoding=u'UTF-8', quote=u'\"', escape=u'\\', +comment=None, header='false', ignoreLeadingWhiteSpace='false', +ignoreTrailingWhiteSpace='false', nullValue='', nanValue='NaN', positiveInf='Inf', +negativeInf='Inf', dateFormat=None, maxColumns='20480', maxCharsPerColumn='100', +mode='PERMISSIVE'): """Loads a CSV file and returns the result as a [[DataFrame]]. This function goes through the input once to determine the input schema. To avoid going @@ -315,44 +316,41 @@ class DataFrameReader(object): :param path: string, or list of strings, for input path(s). :param schema: an optional :class:`StructType` for the input schema. :param sep: sets the single character as a separator for each field and value. -If None is set, it uses the default value, ``,``. -:param encoding: decodes the CSV files by the given encoding type. If None is set, - it uses the default value, ``UTF-8``. +The default value is ``,``. +:param encoding: decodes the CSV files by the given encoding type. +The default value is ``UTF-8``. :param quote: sets the single character used for escaping quoted values where the - separator can be part of the value. If None is set, it uses the default - value, ``"``. + separator can be part of the value. The default value is ``"``. :param escape: sets the single character used for escaping quotes inside an already - quoted value. If None is set, it uses the default value, ``\``. + quoted value. The default value is ``\``. :param comment: sets the single character used for skipping lines beginning with this character. By default (None), it is disabled. 
-:param header: uses the first line as names of columns. If None is set, it uses the - default value, ``false``. +:param header: uses the first line as names of columns. The default value is ``false``. :param ignoreLeadingWhiteSpace: defines whether or not leading whitespaces from values -being read should be skipped. If None is set, it uses -the default value, ``false``. +being read should be skipped. The default value is +``false``. :param ignoreTrailingWhiteSpace: defines whether or not trailing whitespaces from values -
spark git commit: [SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaivour
Repository: spark Updated Branches: refs/heads/branch-2.0 790de600b -> 9e7e2f916 [SPARK-15585][SQL] Fix NULL handling along with a spark-csv behaivour ## What changes were proposed in this pull request? This pr fixes the behaviour of `format("csv").option("quote", null)` along with one of spark-csv. Also, it explicitly sets default values for CSV options in python. ## How was this patch tested? Added tests in CSVSuite. Author: Takeshi YAMAMURO Closes #13372 from maropu/SPARK-15585. (cherry picked from commit b7e8d1cb3ce932ba4a784be59744af8a8ef027ce) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9e7e2f91 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9e7e2f91 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9e7e2f91 Branch: refs/heads/branch-2.0 Commit: 9e7e2f9164e0b3bd555e795b871626057b4fed31 Parents: 790de60 Author: Takeshi YAMAMURO Authored: Sun Jun 5 23:35:04 2016 -0700 Committer: Reynold Xin Committed: Sun Jun 5 23:35:10 2016 -0700 -- python/pyspark/sql/readwriter.py| 81 ++-- .../execution/datasources/csv/CSVOptions.scala | 11 ++- .../execution/datasources/csv/CSVSuite.scala| 11 +++ 3 files changed, 55 insertions(+), 48 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9e7e2f91/python/pyspark/sql/readwriter.py -- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 9208a52..19aa8dd 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -303,10 +303,11 @@ class DataFrameReader(object): return self._df(self._jreader.text(self._spark._sc._jvm.PythonUtils.toSeq(path))) @since(2.0) -def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=None, -comment=None, header=None, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None, -nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, -maxColumns=None, maxCharsPerColumn=None, mode=None): +def csv(self, path, schema=None, sep=u',', encoding=u'UTF-8', quote=u'\"', escape=u'\\', +comment=None, header='false', ignoreLeadingWhiteSpace='false', +ignoreTrailingWhiteSpace='false', nullValue='', nanValue='NaN', positiveInf='Inf', +negativeInf='Inf', dateFormat=None, maxColumns='20480', maxCharsPerColumn='100', +mode='PERMISSIVE'): """Loads a CSV file and returns the result as a [[DataFrame]]. This function goes through the input once to determine the input schema. To avoid going @@ -315,44 +316,41 @@ class DataFrameReader(object): :param path: string, or list of strings, for input path(s). :param schema: an optional :class:`StructType` for the input schema. :param sep: sets the single character as a separator for each field and value. -If None is set, it uses the default value, ``,``. -:param encoding: decodes the CSV files by the given encoding type. If None is set, - it uses the default value, ``UTF-8``. +The default value is ``,``. +:param encoding: decodes the CSV files by the given encoding type. +The default value is ``UTF-8``. :param quote: sets the single character used for escaping quoted values where the - separator can be part of the value. If None is set, it uses the default - value, ``"``. + separator can be part of the value. The default value is ``"``. :param escape: sets the single character used for escaping quotes inside an already - quoted value. If None is set, it uses the default value, ``\``. + quoted value. The default value is ``\``. 
:param comment: sets the single character used for skipping lines beginning with this character. By default (None), it is disabled. -:param header: uses the first line as names of columns. If None is set, it uses the - default value, ``false``. +:param header: uses the first line as names of columns. The default value is ``false``. :param ignoreLeadingWhiteSpace: defines whether or not leading whitespaces from values -being read should be skipped. If None is set, it uses -the default value, ``false``. +being read should be skipped. The default value is +``false``. :
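Taken together, the four commits above first document and then revert how the PySpark `csv()` reader exposes its option defaults. For reference, here is a minimal Scala sketch (Spark 2.0-era API) of reading a CSV file with several of those options set explicitly; the input path and option values are hypothetical, and the behaviour of a null `quote` (the case SPARK-15585 targets) is deliberately not asserted here.

```scala
// Minimal sketch of reading CSV with some of the options documented above set
// explicitly. The path and values are hypothetical; the documented defaults
// apply whenever an option is left unset.
import org.apache.spark.sql.SparkSession

object CsvReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("csv-read-sketch").getOrCreate()

    val df = spark.read
      .option("sep", ",")            // field separator, default ","
      .option("header", "true")      // use the first line as column names
      .option("nullValue", "")       // string that should be read back as null
      .option("nanValue", "NaN")     // string representation of NaN
      .option("mode", "PERMISSIVE")  // keep corrupt records instead of failing
      .csv("/tmp/people.csv")        // hypothetical input path

    df.printSchema()
    df.show()
    spark.stop()
  }
}
```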
spark git commit: [SPARK-15704][SQL] add a test case in DatasetAggregatorSuite for regression testing
Repository: spark Updated Branches: refs/heads/master 26c1089c3 -> 79268aa46 [SPARK-15704][SQL] add a test case in DatasetAggregatorSuite for regression testing ## What changes were proposed in this pull request? This change fixes a crash in TungstenAggregate while executing "Dataset complex Aggregator" test case due to IndexOutOfBoundsException. jira entry for detail: https://issues.apache.org/jira/browse/SPARK-15704 ## How was this patch tested? Using existing unit tests (including DatasetBenchmark) Author: Hiroshi Inoue Closes #13446 from inouehrs/fix_aggregate. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/79268aa4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/79268aa4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/79268aa4 Branch: refs/heads/master Commit: 79268aa461abd237bc4f96a7d31457c98e11798c Parents: 26c1089 Author: Hiroshi Inoue Authored: Sun Jun 5 20:10:33 2016 -0700 Committer: Wenchen Fan Committed: Sun Jun 5 20:10:33 2016 -0700 -- .../spark/sql/DatasetAggregatorSuite.scala | 19 +++ 1 file changed, 19 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/79268aa4/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala index ead7bd9..f9b4cd8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.StringType object ComplexResultAgg extends Aggregator[(String, Int), (Long, Long), (Long, Long)] { @@ -52,6 +53,16 @@ object ClassInputAgg extends Aggregator[AggData, Int, Int] { } +object ClassBufferAggregator extends Aggregator[AggData, AggData, Int] { + override def zero: AggData = AggData(0, "") + override def reduce(b: AggData, a: AggData): AggData = AggData(b.a + a.a, "") + override def finish(reduction: AggData): Int = reduction.a + override def merge(b1: AggData, b2: AggData): AggData = AggData(b1.a + b2.a, "") + override def bufferEncoder: Encoder[AggData] = Encoders.product[AggData] + override def outputEncoder: Encoder[Int] = Encoders.scalaInt +} + + object ComplexBufferAgg extends Aggregator[AggData, (Int, AggData), Int] { override def zero: (Int, AggData) = 0 -> AggData(0, "0") override def reduce(b: (Int, AggData), a: AggData): (Int, AggData) = (b._1 + 1, a) @@ -173,6 +184,14 @@ class DatasetAggregatorSuite extends QueryTest with SharedSQLContext { ("one", 1)) } + test("Typed aggregation using aggregator") { +// based on Dataset complex Aggregator test of DatasetBenchmark +val ds = Seq(AggData(1, "x"), AggData(2, "y"), AggData(3, "z")).toDS() +checkDataset( + ds.select(ClassBufferAggregator.toColumn), + 6) + } + test("typed aggregation: complex input") { val ds = Seq(AggData(1, "one"), AggData(2, "two")).toDS() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15704][SQL] add a test case in DatasetAggregatorSuite for regression testing
Repository: spark Updated Branches: refs/heads/branch-2.0 d8370ef11 -> 790de600b [SPARK-15704][SQL] add a test case in DatasetAggregatorSuite for regression testing ## What changes were proposed in this pull request? This change fixes a crash in TungstenAggregate while executing "Dataset complex Aggregator" test case due to IndexOutOfBoundsException. jira entry for detail: https://issues.apache.org/jira/browse/SPARK-15704 ## How was this patch tested? Using existing unit tests (including DatasetBenchmark) Author: Hiroshi Inoue Closes #13446 from inouehrs/fix_aggregate. (cherry picked from commit 79268aa461abd237bc4f96a7d31457c98e11798c) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/790de600 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/790de600 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/790de600 Branch: refs/heads/branch-2.0 Commit: 790de600beb3f6cae1914f59a61a43c02440884f Parents: d8370ef Author: Hiroshi Inoue Authored: Sun Jun 5 20:10:33 2016 -0700 Committer: Wenchen Fan Committed: Sun Jun 5 20:10:39 2016 -0700 -- .../spark/sql/DatasetAggregatorSuite.scala | 19 +++ 1 file changed, 19 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/790de600/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala index ead7bd9..f9b4cd8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetAggregatorSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.scalalang.typed import org.apache.spark.sql.functions._ import org.apache.spark.sql.test.SharedSQLContext +import org.apache.spark.sql.types.StringType object ComplexResultAgg extends Aggregator[(String, Int), (Long, Long), (Long, Long)] { @@ -52,6 +53,16 @@ object ClassInputAgg extends Aggregator[AggData, Int, Int] { } +object ClassBufferAggregator extends Aggregator[AggData, AggData, Int] { + override def zero: AggData = AggData(0, "") + override def reduce(b: AggData, a: AggData): AggData = AggData(b.a + a.a, "") + override def finish(reduction: AggData): Int = reduction.a + override def merge(b1: AggData, b2: AggData): AggData = AggData(b1.a + b2.a, "") + override def bufferEncoder: Encoder[AggData] = Encoders.product[AggData] + override def outputEncoder: Encoder[Int] = Encoders.scalaInt +} + + object ComplexBufferAgg extends Aggregator[AggData, (Int, AggData), Int] { override def zero: (Int, AggData) = 0 -> AggData(0, "0") override def reduce(b: (Int, AggData), a: AggData): (Int, AggData) = (b._1 + 1, a) @@ -173,6 +184,14 @@ class DatasetAggregatorSuite extends QueryTest with SharedSQLContext { ("one", 1)) } + test("Typed aggregation using aggregator") { +// based on Dataset complex Aggregator test of DatasetBenchmark +val ds = Seq(AggData(1, "x"), AggData(2, "y"), AggData(3, "z")).toDS() +checkDataset( + ds.select(ClassBufferAggregator.toColumn), + 6) + } + test("typed aggregation: complex input") { val ds = Seq(AggData(1, "one"), AggData(2, "two")).toDS() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
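The regression test added by SPARK-15704 exercises a typed `Aggregator` whose buffer is a case class rather than a tuple. A self-contained sketch of the same pattern follows; the object and case-class names are illustrative, and a local SparkSession is assumed.

```scala
// Sketch of a typed Aggregator with a case-class buffer, mirroring the
// ClassBufferAggregator added in this test. Names here are illustrative.
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

case class AggData(a: Int, b: String)

object SumOfA extends Aggregator[AggData, AggData, Int] {
  def zero: AggData = AggData(0, "")
  def reduce(buf: AggData, in: AggData): AggData = AggData(buf.a + in.a, "")
  def merge(b1: AggData, b2: AggData): AggData = AggData(b1.a + b2.a, "")
  def finish(buf: AggData): Int = buf.a
  def bufferEncoder: Encoder[AggData] = Encoders.product[AggData]
  def outputEncoder: Encoder[Int] = Encoders.scalaInt
}

object AggregatorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("agg-sketch").getOrCreate()
    import spark.implicits._

    val ds = Seq(AggData(1, "x"), AggData(2, "y"), AggData(3, "z")).toDS()
    // Applies the aggregator over the whole Dataset; expected single value: 6.
    ds.select(SumOfA.toColumn).show()

    spark.stop()
  }
}
```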
spark git commit: [SPARK-15748][SQL] Replace inefficient foldLeft() call with flatMap() in PartitionStatistics
Repository: spark Updated Branches: refs/heads/branch-2.0 38a626a54 -> d8370ef11 [SPARK-15748][SQL] Replace inefficient foldLeft() call with flatMap() in PartitionStatistics `PartitionStatistics` uses `foldLeft` and list concatenation (`++`) to flatten an iterator of lists, but this is extremely inefficient compared to simply doing `flatMap`/`flatten` because it performs many unnecessary object allocations. Simply replacing this `foldLeft` by a `flatMap` results in decent performance gains when constructing PartitionStatistics instances for tables with many columns. This patch fixes this and also makes two similar changes in MLlib and streaming to try to fix all known occurrences of this pattern. Author: Josh Rosen Closes #13491 from JoshRosen/foldleft-to-flatmap. (cherry picked from commit 26c1089c37149061f838129bb53330ded68ff4c9) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d8370ef1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d8370ef1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d8370ef1 Branch: refs/heads/branch-2.0 Commit: d8370ef117c96ebb30e9213d8d89fd3edbd796d7 Parents: 38a626a Author: Josh Rosen Authored: Sun Jun 5 16:51:00 2016 -0700 Committer: Reynold Xin Committed: Sun Jun 5 16:51:06 2016 -0700 -- mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala| 2 +- .../org/apache/spark/sql/execution/columnar/ColumnStats.scala| 4 ++-- .../main/scala/org/apache/spark/streaming/ui/StreamingPage.scala | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d8370ef1/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index 94d1b83..8ed40c3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -422,7 +422,7 @@ private[ml] object MetaAlgorithmReadWrite { case rformModel: RFormulaModel => Array(rformModel.pipelineModel) case _: Params => Array() } -val subStageMaps = subStages.map(getUidMapImpl).foldLeft(List.empty[(String, Params)])(_ ++ _) +val subStageMaps = subStages.flatMap(getUidMapImpl) List((instance.uid, instance)) ++ subStageMaps } } http://git-wip-us.apache.org/repos/asf/spark/blob/d8370ef1/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala index 5d44769..470307b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala @@ -33,9 +33,9 @@ private[columnar] class ColumnStatisticsSchema(a: Attribute) extends Serializabl } private[columnar] class PartitionStatistics(tableSchema: Seq[Attribute]) extends Serializable { - val (forAttribute, schema) = { + val (forAttribute: AttributeMap[ColumnStatisticsSchema], schema: Seq[AttributeReference]) = { val allStats = tableSchema.map(a => a -> new ColumnStatisticsSchema(a)) -(AttributeMap(allStats), allStats.map(_._2.schema).foldLeft(Seq.empty[Attribute])(_ ++ _)) +(AttributeMap(allStats), allStats.flatMap(_._2.schema)) } } 
http://git-wip-us.apache.org/repos/asf/spark/blob/d8370ef1/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala index b97e24f..46cd309 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala @@ -396,11 +396,11 @@ private[ui] class StreamingPage(parent: StreamingTab) .map(_.ceil.toLong) .getOrElse(0L) -val content = listener.receivedRecordRateWithBatchTime.toList.sortBy(_._1).map { +val content: Seq[Node] = listener.receivedRecordRateWithBatchTime.toList.sortBy(_._1).flatMap { case (streamId, recordRates) => generateInputDStreamRow( jsCollector, streamId, recordRates, minX, maxX, minY, maxYCalculated) -}.foldLeft[Seq[Node]](Nil)(_ ++ _) +} // scalastyle:off ---
spark git commit: [SPARK-15748][SQL] Replace inefficient foldLeft() call with flatMap() in PartitionStatistics
Repository: spark Updated Branches: refs/heads/master 30c4774f3 -> 26c1089c3 [SPARK-15748][SQL] Replace inefficient foldLeft() call with flatMap() in PartitionStatistics `PartitionStatistics` uses `foldLeft` and list concatenation (`++`) to flatten an iterator of lists, but this is extremely inefficient compared to simply doing `flatMap`/`flatten` because it performs many unnecessary object allocations. Simply replacing this `foldLeft` by a `flatMap` results in decent performance gains when constructing PartitionStatistics instances for tables with many columns. This patch fixes this and also makes two similar changes in MLlib and streaming to try to fix all known occurrences of this pattern. Author: Josh Rosen Closes #13491 from JoshRosen/foldleft-to-flatmap. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26c1089c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26c1089c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26c1089c Branch: refs/heads/master Commit: 26c1089c37149061f838129bb53330ded68ff4c9 Parents: 30c4774 Author: Josh Rosen Authored: Sun Jun 5 16:51:00 2016 -0700 Committer: Reynold Xin Committed: Sun Jun 5 16:51:00 2016 -0700 -- mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala| 2 +- .../org/apache/spark/sql/execution/columnar/ColumnStats.scala| 4 ++-- .../main/scala/org/apache/spark/streaming/ui/StreamingPage.scala | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26c1089c/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index 94d1b83..8ed40c3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -422,7 +422,7 @@ private[ml] object MetaAlgorithmReadWrite { case rformModel: RFormulaModel => Array(rformModel.pipelineModel) case _: Params => Array() } -val subStageMaps = subStages.map(getUidMapImpl).foldLeft(List.empty[(String, Params)])(_ ++ _) +val subStageMaps = subStages.flatMap(getUidMapImpl) List((instance.uid, instance)) ++ subStageMaps } } http://git-wip-us.apache.org/repos/asf/spark/blob/26c1089c/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala index 5d44769..470307b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/ColumnStats.scala @@ -33,9 +33,9 @@ private[columnar] class ColumnStatisticsSchema(a: Attribute) extends Serializabl } private[columnar] class PartitionStatistics(tableSchema: Seq[Attribute]) extends Serializable { - val (forAttribute, schema) = { + val (forAttribute: AttributeMap[ColumnStatisticsSchema], schema: Seq[AttributeReference]) = { val allStats = tableSchema.map(a => a -> new ColumnStatisticsSchema(a)) -(AttributeMap(allStats), allStats.map(_._2.schema).foldLeft(Seq.empty[Attribute])(_ ++ _)) +(AttributeMap(allStats), allStats.flatMap(_._2.schema)) } } http://git-wip-us.apache.org/repos/asf/spark/blob/26c1089c/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala -- diff --git 
a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala index b97e24f..46cd309 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ui/StreamingPage.scala @@ -396,11 +396,11 @@ private[ui] class StreamingPage(parent: StreamingTab) .map(_.ceil.toLong) .getOrElse(0L) -val content = listener.receivedRecordRateWithBatchTime.toList.sortBy(_._1).map { +val content: Seq[Node] = listener.receivedRecordRateWithBatchTime.toList.sortBy(_._1).flatMap { case (streamId, recordRates) => generateInputDStreamRow( jsCollector, streamId, recordRates, minX, maxX, minY, maxYCalculated) -}.foldLeft[Seq[Node]](Nil)(_ ++ _) +} // scalastyle:off - To unsubscribe, e-mail: commits-unsubscr...@spark.apach
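The core of SPARK-15748 is replacing `foldLeft(...)(_ ++ _)` with `flatMap`/`flatten`. A standalone sketch with made-up data shows the two shapes side by side: the fold re-copies the accumulated prefix on every step (roughly quadratic allocation), while `flatten` visits each inner element once.

```scala
// Standalone illustration of the pattern this patch removes vs. the one it adds.
// The data here is made up; only the shapes of the two expressions matter.
object FoldVsFlatten {
  def main(args: Array[String]): Unit = {
    val nested: Seq[Seq[Int]] = Seq.tabulate(1000)(i => Seq(i, i + 1))

    // Before: each step copies everything accumulated so far into a new Seq.
    val viaFold: Seq[Int] = nested.foldLeft(Seq.empty[Int])(_ ++ _)

    // After: a single pass over the nested collections.
    val viaFlatten: Seq[Int] = nested.flatten

    assert(viaFold == viaFlatten)
    println(s"both produce ${viaFlatten.length} elements")
  }
}
```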
spark git commit: [SPARK-15657][SQL] RowEncoder should validate the data type of input object
Repository: spark Updated Branches: refs/heads/branch-2.0 e6e1d8232 -> 38a626a54 [SPARK-15657][SQL] RowEncoder should validate the data type of input object ## What changes were proposed in this pull request? This PR improves the error handling of `RowEncoder`. When we create a `RowEncoder` with a given schema, we should validate the data type of input object. e.g. we should throw an exception when a field is boolean but is declared as a string column. This PR also removes the support to use `Product` as a valid external type of struct type. This support is added at https://github.com/apache/spark/pull/9712, but is incomplete, e.g. nested product, product in array are both not working. However, we never officially support this feature and I think it's ok to ban it. ## How was this patch tested? new tests in `RowEncoderSuite`. Author: Wenchen Fan Closes #13401 from cloud-fan/bug. (cherry picked from commit 30c4774f33fed63b7d400d220d710fb432f599a8) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/38a626a5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/38a626a5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/38a626a5 Branch: refs/heads/branch-2.0 Commit: 38a626a54dd0fac0ca460e1ba534048de513bc29 Parents: e6e1d82 Author: Wenchen Fan Authored: Sun Jun 5 15:59:52 2016 -0700 Committer: Cheng Lian Committed: Sun Jun 5 16:00:00 2016 -0700 -- .../main/scala/org/apache/spark/sql/Row.scala | 10 +--- .../sql/catalyst/encoders/RowEncoder.scala | 17 -- .../catalyst/expressions/objects/objects.scala | 61 +--- .../sql/catalyst/encoders/RowEncoderSuite.scala | 47 ++- 4 files changed, 95 insertions(+), 40 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/38a626a5/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index a257b83..391001d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -304,15 +304,7 @@ trait Row extends Serializable { * * @throws ClassCastException when data type does not match. */ - def getStruct(i: Int): Row = { -// Product and Row both are recognized as StructType in a Row -val t = get(i) -if (t.isInstanceOf[Product]) { - Row.fromTuple(t.asInstanceOf[Product]) -} else { - t.asInstanceOf[Row] -} - } + def getStruct(i: Int): Row = getAs[Row](i) /** * Returns the value at position i. 
http://git-wip-us.apache.org/repos/asf/spark/blob/38a626a5/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index 6cd7b34..67fca15 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -51,7 +51,7 @@ import org.apache.spark.unsafe.types.UTF8String * BinaryType -> byte array * ArrayType -> scala.collection.Seq or Array * MapType -> scala.collection.Map - * StructType -> org.apache.spark.sql.Row or Product + * StructType -> org.apache.spark.sql.Row * }}} */ object RowEncoder { @@ -121,11 +121,15 @@ object RowEncoder { case t @ ArrayType(et, _) => et match { case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType => +// TODO: validate input type for primitive array. NewInstance( classOf[GenericArrayData], inputObject :: Nil, dataType = t) - case _ => MapObjects(serializerFor(_, et), inputObject, externalDataTypeForInput(et)) + case _ => MapObjects( +element => serializerFor(ValidateExternalType(element, et), et), +inputObject, +ObjectType(classOf[Object])) } case t @ MapType(kt, vt, valueNullable) => @@ -151,8 +155,9 @@ object RowEncoder { case StructType(fields) => val nonNullOutput = CreateNamedStruct(fields.zipWithIndex.flatMap { case (field, index) => val fieldValue = serializerFor( - GetExternalRowField( -inputObject, index, field.name, externalDataTypeForInput(field.dataType)), + ValidateExternalType( +GetExternalRowField(inputObject, index, field.name), +field.dataType),
spark git commit: [SPARK-15657][SQL] RowEncoder should validate the data type of input object
Repository: spark Updated Branches: refs/heads/master 8a9110510 -> 30c4774f3 [SPARK-15657][SQL] RowEncoder should validate the data type of input object ## What changes were proposed in this pull request? This PR improves the error handling of `RowEncoder`. When we create a `RowEncoder` with a given schema, we should validate the data type of input object. e.g. we should throw an exception when a field is boolean but is declared as a string column. This PR also removes the support to use `Product` as a valid external type of struct type. This support is added at https://github.com/apache/spark/pull/9712, but is incomplete, e.g. nested product, product in array are both not working. However, we never officially support this feature and I think it's ok to ban it. ## How was this patch tested? new tests in `RowEncoderSuite`. Author: Wenchen Fan Closes #13401 from cloud-fan/bug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/30c4774f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/30c4774f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/30c4774f Branch: refs/heads/master Commit: 30c4774f33fed63b7d400d220d710fb432f599a8 Parents: 8a91105 Author: Wenchen Fan Authored: Sun Jun 5 15:59:52 2016 -0700 Committer: Cheng Lian Committed: Sun Jun 5 15:59:52 2016 -0700 -- .../main/scala/org/apache/spark/sql/Row.scala | 10 +--- .../sql/catalyst/encoders/RowEncoder.scala | 17 -- .../catalyst/expressions/objects/objects.scala | 61 +--- .../sql/catalyst/encoders/RowEncoderSuite.scala | 47 ++- 4 files changed, 95 insertions(+), 40 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/30c4774f/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala index a257b83..391001d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/Row.scala @@ -304,15 +304,7 @@ trait Row extends Serializable { * * @throws ClassCastException when data type does not match. */ - def getStruct(i: Int): Row = { -// Product and Row both are recognized as StructType in a Row -val t = get(i) -if (t.isInstanceOf[Product]) { - Row.fromTuple(t.asInstanceOf[Product]) -} else { - t.asInstanceOf[Row] -} - } + def getStruct(i: Int): Row = getAs[Row](i) /** * Returns the value at position i. http://git-wip-us.apache.org/repos/asf/spark/blob/30c4774f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala index 6cd7b34..67fca15 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/RowEncoder.scala @@ -51,7 +51,7 @@ import org.apache.spark.unsafe.types.UTF8String * BinaryType -> byte array * ArrayType -> scala.collection.Seq or Array * MapType -> scala.collection.Map - * StructType -> org.apache.spark.sql.Row or Product + * StructType -> org.apache.spark.sql.Row * }}} */ object RowEncoder { @@ -121,11 +121,15 @@ object RowEncoder { case t @ ArrayType(et, _) => et match { case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType => +// TODO: validate input type for primitive array. 
NewInstance( classOf[GenericArrayData], inputObject :: Nil, dataType = t) - case _ => MapObjects(serializerFor(_, et), inputObject, externalDataTypeForInput(et)) + case _ => MapObjects( +element => serializerFor(ValidateExternalType(element, et), et), +inputObject, +ObjectType(classOf[Object])) } case t @ MapType(kt, vt, valueNullable) => @@ -151,8 +155,9 @@ object RowEncoder { case StructType(fields) => val nonNullOutput = CreateNamedStruct(fields.zipWithIndex.flatMap { case (field, index) => val fieldValue = serializerFor( - GetExternalRowField( -inputObject, index, field.name, externalDataTypeForInput(field.dataType)), + ValidateExternalType( +GetExternalRowField(inputObject, index, field.name), +field.dataType), field.dataType) val convertedField = if (field.nullable) { If( @@ -183,7 +188,7 @@
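Although `RowEncoder` itself is internal, the validation added by SPARK-15657 is visible through the public `createDataFrame(rows, schema)` path: a `Row` whose field does not match the declared column type should now fail during conversion instead of yielding a corrupt value. The sketch below illustrates that mismatch; the exact exception type and message are not asserted here.

```scala
// Sketch of the external-type mismatch this validation targets: the schema
// declares a StringType column, but the Row carries a Boolean in that slot.
// The expectation (not asserted as a specific exception) is that conversion
// now fails rather than silently producing bad data.
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import scala.util.Try

object RowEncoderMismatchSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("row-encoder-sketch").getOrCreate()

    val schema = StructType(Seq(StructField("name", StringType, nullable = true)))
    val rows = java.util.Arrays.asList(Row(true)) // Boolean where a String is declared

    val attempt = Try {
      spark.createDataFrame(rows, schema).collect()
    }
    println(s"conversion succeeded: ${attempt.isSuccess}") // expected: false after this change

    spark.stop()
  }
}
```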
spark git commit: [MINOR][R][DOC] Fix R documentation generation instruction.
Repository: spark Updated Branches: refs/heads/branch-2.0 1ece135b9 -> e6e1d8232 [MINOR][R][DOC] Fix R documentation generation instruction. ## What changes were proposed in this pull request? changes in R/README.md - Make step of generating SparkR document more clear. - link R/DOCUMENTATION.md from R/README.md - turn on some code syntax highlight in R/README.md ## How was this patch tested? local test Author: Kai Jiang Closes #13488 from vectorijk/R-Readme. (cherry picked from commit 8a9110510c9e4cbbcb0dede62cb4b9dd1c6bc8cc) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e6e1d823 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e6e1d823 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e6e1d823 Branch: refs/heads/branch-2.0 Commit: e6e1d823289a3ba18bb9b34413d7ed5a31416a23 Parents: 1ece135 Author: Kai Jiang Authored: Sun Jun 5 13:03:02 2016 -0700 Committer: Shivaram Venkataraman Committed: Sun Jun 5 13:03:17 2016 -0700 -- R/DOCUMENTATION.md | 12 ++-- R/README.md| 30 ++ 2 files changed, 20 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e6e1d823/R/DOCUMENTATION.md -- diff --git a/R/DOCUMENTATION.md b/R/DOCUMENTATION.md index 931d015..7314a1f 100644 --- a/R/DOCUMENTATION.md +++ b/R/DOCUMENTATION.md @@ -1,12 +1,12 @@ # SparkR Documentation -SparkR documentation is generated using in-source comments annotated using using -`roxygen2`. After making changes to the documentation, to generate man pages, +SparkR documentation is generated by using in-source comments and annotated by using +[`roxygen2`](https://cran.r-project.org/web/packages/roxygen2/index.html). After making changes to the documentation and generating man pages, you can run the following from an R console in the SparkR home directory - -library(devtools) -devtools::document(pkg="./pkg", roclets=c("rd")) - +```R +library(devtools) +devtools::document(pkg="./pkg", roclets=c("rd")) +``` You can verify if your changes are good by running R CMD check pkg/ http://git-wip-us.apache.org/repos/asf/spark/blob/e6e1d823/R/README.md -- diff --git a/R/README.md b/R/README.md index 044f953..932d527 100644 --- a/R/README.md +++ b/R/README.md @@ -7,8 +7,7 @@ SparkR is an R package that provides a light-weight frontend to use Spark from R Libraries of sparkR need to be created in `$SPARK_HOME/R/lib`. This can be done by running the script `$SPARK_HOME/R/install-dev.sh`. By default the above script uses the system wide installation of R. However, this can be changed to any user installed location of R by setting the environment variable `R_HOME` the full path of the base directory where R is installed, before running install-dev.sh script. Example: - -``` +```bash # where /home/username/R is where R is installed and /home/username/R/bin contains the files R and RScript export R_HOME=/home/username/R ./install-dev.sh @@ -20,8 +19,8 @@ export R_HOME=/home/username/R Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run -``` - build/mvn -DskipTests -Psparkr package +```bash +build/mvn -DskipTests -Psparkr package ``` Running sparkR @@ -40,9 +39,8 @@ To set other options like driver memory, executor memory etc. 
you can pass in th Using SparkR from RStudio -If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. For example - -``` +If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. For example +```R # Set this to where Spark is installed Sys.setenv(SPARK_HOME="/Users/username/spark") # This line loads SparkR from the installed directory @@ -59,25 +57,25 @@ Once you have made your changes, please include unit tests for them and run exis Generating documentation -The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script. +The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` a
spark git commit: [MINOR][R][DOC] Fix R documentation generation instruction.
Repository: spark Updated Branches: refs/heads/master 372fa61f5 -> 8a9110510 [MINOR][R][DOC] Fix R documentation generation instruction. ## What changes were proposed in this pull request? changes in R/README.md - Make step of generating SparkR document more clear. - link R/DOCUMENTATION.md from R/README.md - turn on some code syntax highlight in R/README.md ## How was this patch tested? local test Author: Kai Jiang Closes #13488 from vectorijk/R-Readme. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a911051 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a911051 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a911051 Branch: refs/heads/master Commit: 8a9110510c9e4cbbcb0dede62cb4b9dd1c6bc8cc Parents: 372fa61 Author: Kai Jiang Authored: Sun Jun 5 13:03:02 2016 -0700 Committer: Shivaram Venkataraman Committed: Sun Jun 5 13:03:02 2016 -0700 -- R/DOCUMENTATION.md | 12 ++-- R/README.md| 30 ++ 2 files changed, 20 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8a911051/R/DOCUMENTATION.md -- diff --git a/R/DOCUMENTATION.md b/R/DOCUMENTATION.md index 931d015..7314a1f 100644 --- a/R/DOCUMENTATION.md +++ b/R/DOCUMENTATION.md @@ -1,12 +1,12 @@ # SparkR Documentation -SparkR documentation is generated using in-source comments annotated using using -`roxygen2`. After making changes to the documentation, to generate man pages, +SparkR documentation is generated by using in-source comments and annotated by using +[`roxygen2`](https://cran.r-project.org/web/packages/roxygen2/index.html). After making changes to the documentation and generating man pages, you can run the following from an R console in the SparkR home directory - -library(devtools) -devtools::document(pkg="./pkg", roclets=c("rd")) - +```R +library(devtools) +devtools::document(pkg="./pkg", roclets=c("rd")) +``` You can verify if your changes are good by running R CMD check pkg/ http://git-wip-us.apache.org/repos/asf/spark/blob/8a911051/R/README.md -- diff --git a/R/README.md b/R/README.md index 044f953..932d527 100644 --- a/R/README.md +++ b/R/README.md @@ -7,8 +7,7 @@ SparkR is an R package that provides a light-weight frontend to use Spark from R Libraries of sparkR need to be created in `$SPARK_HOME/R/lib`. This can be done by running the script `$SPARK_HOME/R/install-dev.sh`. By default the above script uses the system wide installation of R. However, this can be changed to any user installed location of R by setting the environment variable `R_HOME` the full path of the base directory where R is installed, before running install-dev.sh script. Example: - -``` +```bash # where /home/username/R is where R is installed and /home/username/R/bin contains the files R and RScript export R_HOME=/home/username/R ./install-dev.sh @@ -20,8 +19,8 @@ export R_HOME=/home/username/R Build Spark with [Maven](http://spark.apache.org/docs/latest/building-spark.html#building-with-buildmvn) and include the `-Psparkr` profile to build the R package. For example to use the default Hadoop versions you can run -``` - build/mvn -DskipTests -Psparkr package +```bash +build/mvn -DskipTests -Psparkr package ``` Running sparkR @@ -40,9 +39,8 @@ To set other options like driver memory, executor memory etc. you can pass in th Using SparkR from RStudio -If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. 
For example - -``` +If you wish to use SparkR from RStudio or other R frontends you will need to set some environment variables which point SparkR to your Spark installation. For example +```R # Set this to where Spark is installed Sys.setenv(SPARK_HOME="/Users/username/spark") # This line loads SparkR from the installed directory @@ -59,25 +57,25 @@ Once you have made your changes, please include unit tests for them and run exis Generating documentation -The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script. +The SparkR documentation (Rd files and HTML files) are not a part of the source repository. To generate them you can run the script `R/create-docs.sh`. This script uses `devtools` and `knitr` to generate the docs and these packages need to be installed on the machine before using the script. Als
spark git commit: [SPARK-15770][ML] Annotation audit for Experimental and DeveloperApi
Repository: spark Updated Branches: refs/heads/branch-2.0 8c0ec85e6 -> 1ece135b9 [SPARK-15770][ML] Annotation audit for Experimental and DeveloperApi ## What changes were proposed in this pull request? 1, remove comments `:: Experimental ::` for non-experimental API 2, add comments `:: Experimental ::` for experimental API 3, add comments `:: DeveloperApi ::` for developerApi API ## How was this patch tested? manual tests Author: Zheng RuiFeng Closes #13514 from zhengruifeng/del_experimental. (cherry picked from commit 372fa61f511843f53498b9e843a84cfdd76fa2b2) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1ece135b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1ece135b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1ece135b Branch: refs/heads/branch-2.0 Commit: 1ece135b959d2a7f332e6ad06814a83fabc75d0f Parents: 8c0ec85 Author: Zheng RuiFeng Authored: Sun Jun 5 11:55:25 2016 -0700 Committer: Reynold Xin Committed: Sun Jun 5 11:55:34 2016 -0700 -- .../apache/spark/ml/attribute/AttributeType.scala | 3 +++ .../scala/org/apache/spark/ml/clustering/LDA.scala | 4 .../spark/mllib/clustering/BisectingKMeans.scala | 4 .../mllib/clustering/BisectingKMeansModel.scala| 2 ++ .../apache/spark/mllib/clustering/LDAModel.scala | 17 - .../spark/mllib/evaluation/MulticlassMetrics.scala | 1 - .../spark/mllib/evaluation/RankingMetrics.scala| 1 - .../spark/mllib/feature/StandardScaler.scala | 6 ++ .../org/apache/spark/mllib/random/RandomRDDs.scala | 5 + .../spark/mllib/rdd/MLPairRDDFunctions.scala | 4 .../org/apache/spark/mllib/rdd/RDDFunctions.scala | 4 .../apache/spark/mllib/recommendation/ALS.scala| 1 + .../apache/spark/mllib/tree/model/Predict.scala| 1 + 13 files changed, 50 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1ece135b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala index 5c7089b..078fecf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala @@ -27,6 +27,9 @@ import org.apache.spark.annotation.DeveloperApi @DeveloperApi sealed abstract class AttributeType(val name: String) +/** + * :: DeveloperApi :: + */ @DeveloperApi object AttributeType { http://git-wip-us.apache.org/repos/asf/spark/blob/1ece135b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 5aec692..609e50e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -675,6 +675,8 @@ class DistributedLDAModel private[ml] ( private var _checkpointFiles: Array[String] = oldDistributedModel.checkpointFiles /** + * :: DeveloperApi :: + * * If using checkpointing and [[LDA.keepLastCheckpoint]] is set to true, then there may be * saved checkpoint files. This method is provided so that users can manage those files. * @@ -689,6 +691,8 @@ class DistributedLDAModel private[ml] ( def getCheckpointFiles: Array[String] = _checkpointFiles /** + * :: DeveloperApi :: + * * Remove any remaining checkpoint files from training. 
* * @see [[getCheckpointFiles]] http://git-wip-us.apache.org/repos/asf/spark/blob/1ece135b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index e4bd0dc..91edcf2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -31,6 +31,8 @@ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel /** + * :: Experimental :: + * * A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques" * by Steinbach, Karypis, and Kumar, with modification to fit Spark. * The algorithm starts from a single cluster that contains all points. @@ -396,6 +398,8 @@ private object BisectingKMeans extend
spark git commit: [SPARK-15770][ML] Annotation audit for Experimental and DeveloperApi
Repository: spark Updated Branches: refs/heads/master 4e767d0f9 -> 372fa61f5 [SPARK-15770][ML] Annotation audit for Experimental and DeveloperApi ## What changes were proposed in this pull request? 1, remove comments `:: Experimental ::` for non-experimental API 2, add comments `:: Experimental ::` for experimental API 3, add comments `:: DeveloperApi ::` for developerApi API ## How was this patch tested? manual tests Author: Zheng RuiFeng Closes #13514 from zhengruifeng/del_experimental. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/372fa61f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/372fa61f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/372fa61f Branch: refs/heads/master Commit: 372fa61f511843f53498b9e843a84cfdd76fa2b2 Parents: 4e767d0 Author: Zheng RuiFeng Authored: Sun Jun 5 11:55:25 2016 -0700 Committer: Reynold Xin Committed: Sun Jun 5 11:55:25 2016 -0700 -- .../apache/spark/ml/attribute/AttributeType.scala | 3 +++ .../scala/org/apache/spark/ml/clustering/LDA.scala | 4 .../spark/mllib/clustering/BisectingKMeans.scala | 4 .../mllib/clustering/BisectingKMeansModel.scala| 2 ++ .../apache/spark/mllib/clustering/LDAModel.scala | 17 - .../spark/mllib/evaluation/MulticlassMetrics.scala | 1 - .../spark/mllib/evaluation/RankingMetrics.scala| 1 - .../spark/mllib/feature/StandardScaler.scala | 6 ++ .../org/apache/spark/mllib/random/RandomRDDs.scala | 5 + .../spark/mllib/rdd/MLPairRDDFunctions.scala | 4 .../org/apache/spark/mllib/rdd/RDDFunctions.scala | 4 .../apache/spark/mllib/recommendation/ALS.scala| 1 + .../apache/spark/mllib/tree/model/Predict.scala| 1 + 13 files changed, 50 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/372fa61f/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala index 5c7089b..078fecf 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/attribute/AttributeType.scala @@ -27,6 +27,9 @@ import org.apache.spark.annotation.DeveloperApi @DeveloperApi sealed abstract class AttributeType(val name: String) +/** + * :: DeveloperApi :: + */ @DeveloperApi object AttributeType { http://git-wip-us.apache.org/repos/asf/spark/blob/372fa61f/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala index 5aec692..609e50e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/LDA.scala @@ -675,6 +675,8 @@ class DistributedLDAModel private[ml] ( private var _checkpointFiles: Array[String] = oldDistributedModel.checkpointFiles /** + * :: DeveloperApi :: + * * If using checkpointing and [[LDA.keepLastCheckpoint]] is set to true, then there may be * saved checkpoint files. This method is provided so that users can manage those files. * @@ -689,6 +691,8 @@ class DistributedLDAModel private[ml] ( def getCheckpointFiles: Array[String] = _checkpointFiles /** + * :: DeveloperApi :: + * * Remove any remaining checkpoint files from training. 
* * @see [[getCheckpointFiles]] http://git-wip-us.apache.org/repos/asf/spark/blob/372fa61f/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala index e4bd0dc..91edcf2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/BisectingKMeans.scala @@ -31,6 +31,8 @@ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel /** + * :: Experimental :: + * * A bisecting k-means algorithm based on the paper "A comparison of document clustering techniques" * by Steinbach, Karypis, and Kumar, with modification to fit Spark. * The algorithm starts from a single cluster that contains all points. @@ -396,6 +398,8 @@ private object BisectingKMeans extends Serializable { } /** + * :: Experimental :: + * * Represents a node in a clustering tree. * *
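For readers unfamiliar with the convention being audited here: Spark pairs the `@Experimental` and `@DeveloperApi` annotations with a matching `:: Experimental ::` / `:: DeveloperApi ::` marker at the top of the scaladoc, so the generated API docs carry the stability warning. A minimal sketch of the pattern this patch enforces follows; the class and method names are illustrative, not taken from the patch.

```scala
import org.apache.spark.annotation.{DeveloperApi, Experimental}

/**
 * :: Experimental ::
 *
 * The scaladoc marker and the annotation are kept in sync, which is what the
 * audit checks: an `@Experimental` API carries the `:: Experimental ::`
 * header, and non-experimental APIs do not.
 */
@Experimental
class IllustrativeClusterer {  // hypothetical class, for illustration only

  /**
   * :: DeveloperApi ::
   *
   * Same rule for developer-facing hooks: `@DeveloperApi` is mirrored by a
   * `:: DeveloperApi ::` scaladoc header.
   */
  @DeveloperApi
  def lowLevelHook(): Unit = ()  // hypothetical method, for illustration only
}
```

The diff above applies exactly this pairing: it adds the scaladoc markers where the annotation was already present and removes them where it was not.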
spark git commit: [SPARK-15723] Fixed local-timezone-brittle test where short-timezone form "EST" is …
Repository: spark Updated Branches: refs/heads/branch-1.6 a0cf7d0b2 -> 6a9f19dd5 [SPARK-15723] Fixed local-timezone-brittle test where short-timezone form "EST" is … ## What changes were proposed in this pull request? Stop using the abbreviated and ambiguous timezone "EST" in a test, since it is machine-local default timezone dependent, and fails in different timezones. Fixed [SPARK-15723](https://issues.apache.org/jira/browse/SPARK-15723). ## How was this patch tested? Note that to reproduce this problem in any locale/timezone, you can modify the scalatest-maven-plugin argLine to add a timezone: -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize} -Duser.timezone="Australia/Sydney" and run $ mvn test -DwildcardSuites=org.apache.spark.status.api.v1.SimpleDateParamSuite -Dtest=none. Equally this will fix it in an affected timezone: -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize} -Duser.timezone="America/New_York" To test the fix, apply the above change to `pom.xml` to set test TZ to `Australia/Sydney`, and confirm the test now passes. Author: Brett Randall Closes #13462 from javabrett/SPARK-15723-SimpleDateParamSuite. (cherry picked from commit 4e767d0f9042bfea6074c2637438859699ec4dc3) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a9f19dd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a9f19dd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a9f19dd Branch: refs/heads/branch-1.6 Commit: 6a9f19dd57dadb80bccc328cf1d099bed04f7f18 Parents: a0cf7d0 Author: Brett Randall Authored: Sun Jun 5 15:31:56 2016 +0100 Committer: Sean Owen Committed: Sun Jun 5 16:12:49 2016 +0100 -- .../org/apache/spark/status/api/v1/SimpleDateParamSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6a9f19dd/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala index 63b0e77..18baeb1 100644 --- a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala @@ -26,7 +26,8 @@ class SimpleDateParamSuite extends SparkFunSuite with Matchers { test("date parsing") { new SimpleDateParam("2015-02-20T23:21:17.190GMT").timestamp should be (1424474477190L) -new SimpleDateParam("2015-02-20T17:21:17.190EST").timestamp should be (1424470877190L) +// don't use EST, it is ambiguous, use -0500 instead, see SPARK-15723 +new SimpleDateParam("2015-02-20T17:21:17.190-0500").timestamp should be (1424470877190L) new SimpleDateParam("2015-02-20").timestamp should be (1424390400000L) // GMT intercept[WebApplicationException] { new SimpleDateParam("invalid date")
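The ambiguity being removed is easy to reproduce outside Spark: a short zone abbreviation such as `EST` is resolved against JVM-local zone data, so the instant it parses to can depend on the machine's default timezone, while a numeric offset like `-0500` is absolute. Below is a minimal sketch of the difference, using `java.text.SimpleDateFormat` directly with an illustrative pattern; it is not the `SimpleDateParam` implementation itself.

```scala
import java.text.SimpleDateFormat
import java.util.TimeZone

object ZoneAbbreviationSketch {
  def main(args: Array[String]): Unit = {
    // Simulate one of the environments where the old test failed (SPARK-15723).
    TimeZone.setDefault(TimeZone.getTimeZone("Australia/Sydney"))
    val fmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSz") // pattern assumed for illustration

    // A numeric offset denotes the same instant on every machine.
    println(fmt.parse("2015-02-20T17:21:17.190-0500").getTime) // 1424470877190

    // An abbreviation is resolved against machine-local zone data, so the
    // instant it yields can vary with the default timezone.
    println(fmt.parse("2015-02-20T17:21:17.190EST").getTime)   // may not equal 1424470877190 here
  }
}
```

Because the second parse is environment-dependent, asserting an exact epoch value for it makes the test brittle, which is why the patch switches the input to the explicit `-0500` form.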
spark git commit: [SPARK-15723] Fixed local-timezone-brittle test where short-timezone form "EST" is …
Repository: spark Updated Branches: refs/heads/branch-2.0 32a64d8fc -> 8c0ec85e6 [SPARK-15723] Fixed local-timezone-brittle test where short-timezone form "EST" is … ## What changes were proposed in this pull request? Stop using the abbreviated and ambiguous timezone "EST" in a test, since it is machine-local default timezone dependent, and fails in different timezones. Fixed [SPARK-15723](https://issues.apache.org/jira/browse/SPARK-15723). ## How was this patch tested? Note that to reproduce this problem in any locale/timezone, you can modify the scalatest-maven-plugin argLine to add a timezone: -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize} -Duser.timezone="Australia/Sydney" and run $ mvn test -DwildcardSuites=org.apache.spark.status.api.v1.SimpleDateParamSuite -Dtest=none. Equally this will fix it in an affected timezone: -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize} -Duser.timezone="America/New_York" To test the fix, apply the above change to `pom.xml` to set test TZ to `Australia/Sydney`, and confirm the test now passes. Author: Brett Randall Closes #13462 from javabrett/SPARK-15723-SimpleDateParamSuite. (cherry picked from commit 4e767d0f9042bfea6074c2637438859699ec4dc3) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8c0ec85e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8c0ec85e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8c0ec85e Branch: refs/heads/branch-2.0 Commit: 8c0ec85e62f762c11e0686d1c35d1dfec05df9de Parents: 32a64d8 Author: Brett Randall Authored: Sun Jun 5 15:31:56 2016 +0100 Committer: Sean Owen Committed: Sun Jun 5 16:12:24 2016 +0100 -- .../org/apache/spark/status/api/v1/SimpleDateParamSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8c0ec85e/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala index 63b0e77..18baeb1 100644 --- a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala @@ -26,7 +26,8 @@ class SimpleDateParamSuite extends SparkFunSuite with Matchers { test("date parsing") { new SimpleDateParam("2015-02-20T23:21:17.190GMT").timestamp should be (1424474477190L) -new SimpleDateParam("2015-02-20T17:21:17.190EST").timestamp should be (1424470877190L) +// don't use EST, it is ambiguous, use -0500 instead, see SPARK-15723 +new SimpleDateParam("2015-02-20T17:21:17.190-0500").timestamp should be (1424470877190L) new SimpleDateParam("2015-02-20").timestamp should be (1424390400000L) // GMT intercept[WebApplicationException] { new SimpleDateParam("invalid date")
spark git commit: [SPARK-15723] Fixed local-timezone-brittle test where short-timezone form "EST" is …
Repository: spark Updated Branches: refs/heads/master 0f307db5e -> 4e767d0f9 [SPARK-15723] Fixed local-timezone-brittle test where short-timezone form "EST" is … ## What changes were proposed in this pull request? Stop using the abbreviated and ambiguous timezone "EST" in a test, since it is machine-local default timezone dependent, and fails in different timezones. Fixed [SPARK-15723](https://issues.apache.org/jira/browse/SPARK-15723). ## How was this patch tested? Note that to reproduce this problem in any locale/timezone, you can modify the scalatest-maven-plugin argLine to add a timezone: -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize} -Duser.timezone="Australia/Sydney" and run $ mvn test -DwildcardSuites=org.apache.spark.status.api.v1.SimpleDateParamSuite -Dtest=none. Equally this will fix it in an affected timezone: -ea -Xmx3g -XX:MaxPermSize=${MaxPermGen} -XX:ReservedCodeCacheSize=${CodeCacheSize} -Duser.timezone="America/New_York" To test the fix, apply the above change to `pom.xml` to set test TZ to `Australia/Sydney`, and confirm the test now passes. Author: Brett Randall Closes #13462 from javabrett/SPARK-15723-SimpleDateParamSuite. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e767d0f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e767d0f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e767d0f Branch: refs/heads/master Commit: 4e767d0f9042bfea6074c2637438859699ec4dc3 Parents: 0f307db Author: Brett Randall Authored: Sun Jun 5 15:31:56 2016 +0100 Committer: Sean Owen Committed: Sun Jun 5 15:31:56 2016 +0100 -- .../org/apache/spark/status/api/v1/SimpleDateParamSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e767d0f/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala index 63b0e77..18baeb1 100644 --- a/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/api/v1/SimpleDateParamSuite.scala @@ -26,7 +26,8 @@ class SimpleDateParamSuite extends SparkFunSuite with Matchers { test("date parsing") { new SimpleDateParam("2015-02-20T23:21:17.190GMT").timestamp should be (1424474477190L) -new SimpleDateParam("2015-02-20T17:21:17.190EST").timestamp should be (1424470877190L) +// don't use EST, it is ambiguous, use -0500 instead, see SPARK-15723 +new SimpleDateParam("2015-02-20T17:21:17.190-0500").timestamp should be (1424470877190L) new SimpleDateParam("2015-02-20").timestamp should be (1424390400000L) // GMT intercept[WebApplicationException] { new SimpleDateParam("invalid date")