svn commit: r25861 - in /dev/spark/2.4.0-SNAPSHOT-2018_03_20_16_01-477d6bd-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Tue Mar 20 23:15:32 2018
New Revision: 25861

Log: Apache Spark 2.4.0-SNAPSHOT-2018_03_20_16_01-477d6bd docs

[This commit notification would consist of 1449 parts, which exceeds the limit of 50, so it was shortened to the summary.]
svn commit: r25856 - in /dev/spark/2.3.1-SNAPSHOT-2018_03_20_14_01-0b880db-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Tue Mar 20 21:15:40 2018
New Revision: 25856

Log: Apache Spark 2.3.1-SNAPSHOT-2018_03_20_14_01-0b880db docs

[This commit notification would consist of 1443 parts, which exceeds the limit of 50, so it was shortened to the summary.]
spark git commit: [SPARK-23500][SQL] Fix complex type simplification rules to apply to entire plan
Repository: spark
Updated Branches:
  refs/heads/master 2c4b9962f -> 477d6bd72

[SPARK-23500][SQL] Fix complex type simplification rules to apply to entire plan

## What changes were proposed in this pull request?

Complex type simplification optimizer rules were not applied to the entire plan, only to the expressions reachable from the root node. This patch fixes the rules so they transform the entire plan.

## How was this patch tested?

New unit test + ran sql / core tests.

Author: Henry Robinson

Closes #20687 from henryr/spark-25000.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/477d6bd7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/477d6bd7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/477d6bd7

Branch: refs/heads/master
Commit: 477d6bd7265e255fd43e53edda02019b32f29bb2
Parents: 2c4b996
Author: Henry Robinson
Authored: Tue Mar 20 13:27:50 2018 -0700
Committer: gatorsmile
Committed: Tue Mar 20 13:27:50 2018 -0700

--
 .../sql/catalyst/optimizer/ComplexTypes.scala  | 61
 .../sql/catalyst/optimizer/Optimizer.scala     |  4 +-
 .../catalyst/optimizer/complexTypesSuite.scala | 55
 3 files changed, 76 insertions(+), 44 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/477d6bd7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
index be0009e..db7d6d3 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
@@ -18,39 +18,39 @@ package org.apache.spark.sql.catalyst.optimizer

 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.Rule

 /**
-* push down operations into [[CreateNamedStructLike]].
-*/
-object SimplifyCreateStructOps extends Rule[LogicalPlan] {
-  override def apply(plan: LogicalPlan): LogicalPlan = {
-    plan.transformExpressionsUp {
-      // push down field extraction
+ * Simplify redundant [[CreateNamedStructLike]], [[CreateArray]] and [[CreateMap]] expressions.
+ */
+object SimplifyExtractValueOps extends Rule[LogicalPlan] {
+  override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    // One place where this optimization is invalid is an aggregation where the select
+    // list expression is a function of a grouping expression:
+    //
+    //   SELECT struct(a,b).a FROM tbl GROUP BY struct(a,b)
+    //
+    // cannot be simplified to SELECT a FROM tbl GROUP BY struct(a,b). So just skip this
+    // optimization for Aggregates (although this misses some cases where the optimization
+    // can be made).
+    case a: Aggregate => a
+    case p => p.transformExpressionsUp {
+      // Remove redundant field extraction.
       case GetStructField(createNamedStructLike: CreateNamedStructLike, ordinal, _) =>
         createNamedStructLike.valExprs(ordinal)
-    }
-  }
-}
-
-/**
-* push down operations into [[CreateArray]].
-*/
-object SimplifyCreateArrayOps extends Rule[LogicalPlan] {
-  override def apply(plan: LogicalPlan): LogicalPlan = {
-    plan.transformExpressionsUp {
-      // push down field selection (array of structs)
-      case GetArrayStructFields(CreateArray(elems), field, ordinal, numFields, containsNull) =>
-        // instead of selecting the field on the entire array,
-        // select it from each member of the array.
-        // pushing down the operation this way open other optimizations opportunities
-        // (i.e. struct(...,x,...).x)
+      // Remove redundant array indexing.
+      case GetArrayStructFields(CreateArray(elems), field, ordinal, _, _) =>
+        // Instead of selecting the field on the entire array, select it from each member
+        // of the array. Pushing down the operation this way may open other optimizations
+        // opportunities (i.e. struct(...,x,...).x)
         CreateArray(elems.map(GetStructField(_, ordinal, Some(field.name))))
-      // push down item selection.
+
+      // Remove redundant map lookup.
       case ga @ GetArrayItem(CreateArray(elems), IntegerLiteral(idx)) =>
-        // instead of creating the array and then selecting one row,
-        // remove array creation altgether.
+        // Instead of
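The effect of the fixed rule is easiest to see from spark-shell. Below is a minimal sketch (assuming a build containing this patch; the table and column names are illustrative, not taken from the patch) showing both the simplification and the Aggregate case the rule now deliberately skips:

```scala
// Minimal spark-shell sketch of SimplifyExtractValueOps (illustrative names).
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("simplify-demo").getOrCreate()
import spark.implicits._

Seq((1, 2), (3, 4)).toDF("a", "b").createOrReplaceTempView("tbl")

// struct(a, b).a collapses to a plain reference to a; no struct is built.
spark.sql("SELECT struct(a, b).a FROM tbl").explain(true)

// The unsafe case the rule now skips: the select-list expression is a function
// of a grouping expression, so the struct must survive under the Aggregate.
spark.sql("SELECT struct(a, b).a FROM tbl GROUP BY struct(a, b)").explain(true)
```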
svn commit: r25852 - in /dev/spark/2.4.0-SNAPSHOT-2018_03_20_12_03-2c4b996-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Tue Mar 20 19:17:34 2018
New Revision: 25852

Log: Apache Spark 2.4.0-SNAPSHOT-2018_03_20_12_03-2c4b996 docs

[This commit notification would consist of 1449 parts, which exceeds the limit of 50, so it was shortened to the summary.]
spark git commit: [SPARK-23574][SQL] Report SinglePartition in DataSourceV2ScanExec when there's exactly 1 data reader factory.
Repository: spark
Updated Branches:
  refs/heads/master 7f5e8aa26 -> 2c4b9962f

[SPARK-23574][SQL] Report SinglePartition in DataSourceV2ScanExec when there's exactly 1 data reader factory.

## What changes were proposed in this pull request?

Report SinglePartition in DataSourceV2ScanExec when there's exactly 1 data reader factory. Note that this means reader factories end up being constructed as partitioning is checked; let me know if you think that could be a problem.

## How was this patch tested?

Existing unit tests.

Author: Jose Torres

Closes #20726 from jose-torres/SPARK-23574.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2c4b9962
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2c4b9962
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2c4b9962

Branch: refs/heads/master
Commit: 2c4b9962fdf8c1beb66126ca41628c72eb6c2383
Parents: 7f5e8aa
Author: Jose Torres
Authored: Tue Mar 20 11:46:51 2018 -0700
Committer: Wenchen Fan
Committed: Tue Mar 20 11:46:51 2018 -0700

--
 .../v2/reader/SupportsReportPartitioning.java |  3 ++
 .../datasources/v2/DataSourceRDD.scala        |  4 +--
 .../datasources/v2/DataSourceV2ScanExec.scala | 29 +++-
 .../ContinuousDataSourceRDDIter.scala         |  4 +--
 .../sql/sources/v2/DataSourceV2Suite.scala    | 20 +-
 .../sql/streaming/StreamingQuerySuite.scala   |  4 +--
 6 files changed, 50 insertions(+), 14 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/2c4b9962/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java
--
diff --git a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java
index 5405a91..6076287 100644
--- a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java
+++ b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java
@@ -23,6 +23,9 @@ import org.apache.spark.sql.sources.v2.reader.partitioning.Partitioning;
 /**
  * A mix in interface for {@link DataSourceReader}. Data source readers can implement this
  * interface to report data partitioning and try to avoid shuffle at Spark side.
+ *
+ * Note that, when the reader creates exactly one {@link DataReaderFactory}, Spark may avoid
+ * adding a shuffle even if the reader does not implement this interface.
  */
 @InterfaceStability.Evolving
 public interface SupportsReportPartitioning extends DataSourceReader {

http://git-wip-us.apache.org/repos/asf/spark/blob/2c4b9962/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala
index 5ed0ba7..f85971b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala
@@ -29,11 +29,11 @@ class DataSourceRDDPartition[T : ClassTag](val index: Int, val readerFactory: Da

 class DataSourceRDD[T: ClassTag](
     sc: SparkContext,
-    @transient private val readerFactories: java.util.List[DataReaderFactory[T]])
+    @transient private val readerFactories: Seq[DataReaderFactory[T]])
   extends RDD[T](sc, Nil) {

   override protected def getPartitions: Array[Partition] = {
-    readerFactories.asScala.zipWithIndex.map {
+    readerFactories.zipWithIndex.map {
       case (readerFactory, index) => new DataSourceRDDPartition(index, readerFactory)
     }.toArray
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/2c4b9962/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala
index cb691ba..3a5e7bf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala
@@ -25,12 +25,14 @@ import org.apache.spark.sql.catalyst.InternalRow
 import
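The planning consequence is small but useful: a scan that yields exactly one reader factory can claim SinglePartition as its output partitioning, so the planner does not have to insert an exchange to satisfy a single-partition requirement. A self-contained sketch of that decision (an illustration of the idea, not the exact DataSourceV2ScanExec code):

```scala
// Sketch of the one-factory special case: model output partitioning as a small ADT
// and pick SinglePartition when the scan produced exactly one reader factory.
sealed trait OutputPartitioning
case object SinglePartition extends OutputPartitioning
final case class UnknownPartitioning(numPartitions: Int) extends OutputPartitioning

def scanPartitioning[T](readerFactories: Seq[T]): OutputPartitioning =
  if (readerFactories.size == 1) SinglePartition // one factory => one partition, no shuffle needed
  else UnknownPartitioning(readerFactories.size) // otherwise nothing is promised

// One factory reports SinglePartition; two factories promise nothing.
println(scanPartitioning(Seq("factory-0")))              // SinglePartition
println(scanPartitioning(Seq("factory-0", "factory-1"))) // UnknownPartitioning(2)
```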
spark git commit: [SPARK-21898][ML] Feature parity for KolmogorovSmirnovTest in MLlib
Repository: spark
Updated Branches:
  refs/heads/master 5e7bc2ace -> 7f5e8aa26

[SPARK-21898][ML] Feature parity for KolmogorovSmirnovTest in MLlib

## What changes were proposed in this pull request?

Feature parity for KolmogorovSmirnovTest in MLlib: implement a `DataFrame` interface for the `KolmogorovSmirnovTest` in `mllib.stat`.

## How was this patch tested?

Test suite added.

Author: WeichenXu
Author: jkbradley

Closes #19108 from WeichenXu123/ml-ks-test.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f5e8aa2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f5e8aa2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f5e8aa2

Branch: refs/heads/master
Commit: 7f5e8aa2606b0ee0297ceb6f4603bd368e3b0291
Parents: 5e7bc2a
Author: WeichenXu
Authored: Tue Mar 20 11:14:34 2018 -0700
Committer: Joseph K. Bradley
Committed: Tue Mar 20 11:14:34 2018 -0700

--
 .../spark/ml/stat/KolmogorovSmirnovTest.scala   | 113 +++
 .../ml/stat/JavaKolmogorovSmirnovTestSuite.java |  84 +++
 .../ml/stat/KolmogorovSmirnovTestSuite.scala    | 140 +++
 3 files changed, 337 insertions(+)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/7f5e8aa2/mllib/src/main/scala/org/apache/spark/ml/stat/KolmogorovSmirnovTest.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/KolmogorovSmirnovTest.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/KolmogorovSmirnovTest.scala
new file mode 100644
index 0000000..8d80e77
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/stat/KolmogorovSmirnovTest.scala
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.stat
+
+import scala.annotation.varargs
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.api.java.function.Function
+import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.functions.col
+
+/**
+ * :: Experimental ::
+ *
+ * Conduct the two-sided Kolmogorov Smirnov (KS) test for data sampled from a
+ * continuous distribution. By comparing the largest difference between the empirical cumulative
+ * distribution of the sample data and the theoretical distribution, we can provide a test for
+ * the null hypothesis that the sample data comes from that theoretical distribution.
+ *
+ * For more information on KS Test:
+ * @see <a href="https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test">
+ * Kolmogorov-Smirnov test (Wikipedia)</a>
+ */
+@Experimental
+@Since("2.4.0")
+object KolmogorovSmirnovTest {
+
+  /** Used to construct output schema of test */
+  private case class KolmogorovSmirnovTestResult(
+      pValue: Double,
+      statistic: Double)
+
+  private def getSampleRDD(dataset: DataFrame, sampleCol: String): RDD[Double] = {
+    SchemaUtils.checkNumericType(dataset.schema, sampleCol)
+    import dataset.sparkSession.implicits._
+    dataset.select(col(sampleCol).cast("double")).as[Double].rdd
+  }
+
+  /**
+   * Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a
+   * continuous distribution. By comparing the largest difference between the empirical cumulative
+   * distribution of the sample data and the theoretical distribution, we can provide a test for
+   * the null hypothesis that the sample data comes from that theoretical distribution.
+   *
+   * @param dataset a `DataFrame` containing the sample of data to test
+   * @param sampleCol Name of sample column in dataset, of any numerical type
+   * @param cdf a `Double => Double` function to calculate the theoretical CDF at a given value
+   * @return DataFrame containing the test result for the input sampled data.
+   *         This DataFrame will
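A short usage sketch of the new `DataFrame` API, based on the scaladoc above (assumes a build containing this patch, and uses commons-math3, which Spark already depends on, to supply the theoretical CDF):

```scala
// Run the two-sided KS test of a numeric column against a standard normal CDF.
import org.apache.commons.math3.distribution.NormalDistribution
import org.apache.spark.ml.stat.KolmogorovSmirnovTest
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("ks-demo").getOrCreate()
import spark.implicits._

val data = Seq(-0.1, 0.0, 0.1, 0.15, 0.2, 0.25, 0.3).toDF("sample")

// The cdf parameter is the Double => Double function documented above.
val stdNormal = new NormalDistribution(0.0, 1.0)
val result = KolmogorovSmirnovTest.test(
  data, "sample", (x: Double) => stdNormal.cumulativeProbability(x))

// The output schema follows KolmogorovSmirnovTestResult: pValue and statistic.
result.select("pValue", "statistic").show()
```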
spark git commit: [SPARK-23649][SQL] Skipping chars disallowed in UTF-8
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 175b221bc -> 367a16118

[SPARK-23649][SQL] Skipping chars disallowed in UTF-8

The mapping from a UTF-8 char's first byte to the char's size doesn't cover the whole range 0-255; it is defined only for 0-253:
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L60-L65
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L190

If the first byte of a char is 253-255, an IndexOutOfBoundsException is thrown. Besides that, the values for 244-252 are incorrect according to the current Unicode standard for UTF-8:
http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf

As a consequence of the exception above, the length of an input string in UTF-8 encoding cannot be calculated if the string contains chars whose first byte is 253 or higher. On the user's side this shows up as, for example, schema inference crashing on a CSV file that contains such chars, even though the file can be read if the schema is specified explicitly or the mode is set to multiline.

The proposed changes build a correct mapping from the first byte of a UTF-8 char to its size (it now covers all cases) and skip disallowed chars (counting each as one octet).

Added a test and a file with a char that is disallowed in UTF-8 (0xFF).

Author: Maxim Gekk

Closes #20796 from MaxGekk/skip-wrong-utf8-chars.

(cherry picked from commit 5e7bc2acef4a1e11d0d8056ef5c12cd5c8f220da)
Signed-off-by: Wenchen Fan

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/367a1611
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/367a1611
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/367a1611

Branch: refs/heads/branch-2.2
Commit: 367a16118289e1c03507c14f966e8b1ebd688489
Parents: 175b221
Author: Maxim Gekk
Authored: Tue Mar 20 10:34:56 2018 -0700
Committer: Wenchen Fan
Committed: Tue Mar 20 10:37:29 2018 -0700

--
 .../apache/spark/unsafe/types/UTF8String.java | 48
 .../spark/unsafe/types/UTF8StringSuite.java   | 23 +-
 2 files changed, 62 insertions(+), 9 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/367a1611/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
--
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 23636ca..a11e63c 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -57,12 +57,43 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,

   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }

-  private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5,
-    6, 6};
+  /**
+   * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte, which
+   * indicates the size of the char. See the Unicode standard, page 126, Table 3-6:
+   * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf
+   *
+   * Binary    Hex          Comments
+   * 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
+   * 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
+   * 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
+   * 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
+   * 11110xxx  0xF0..0xF7   First byte of a 4-byte character encoding
+   *
+   * As a consequence of the well-formedness conditions specified in
+   * Table 3-7 (page 126), the following byte values are disallowed in UTF-8:
+   * C0–C1, F5–FF.
+   */
+  private static byte[] bytesOfCodePointInUTF8 = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F
+    // Continuation bytes cannot appear as the first byte
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
spark git commit: [SPARK-23649][SQL] Skipping chars disallowed in UTF-8
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 c854b6ca7 -> 0b880db65

[SPARK-23649][SQL] Skipping chars disallowed in UTF-8

## What changes were proposed in this pull request?

The mapping from a UTF-8 char's first byte to the char's size doesn't cover the whole range 0-255; it is defined only for 0-253:
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L60-L65
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L190

If the first byte of a char is 253-255, an IndexOutOfBoundsException is thrown. Besides that, the values for 244-252 are incorrect according to the current Unicode standard for UTF-8:
http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf

As a consequence of the exception above, the length of an input string in UTF-8 encoding cannot be calculated if the string contains chars whose first byte is 253 or higher. On the user's side this shows up as, for example, schema inference crashing on a CSV file that contains such chars, even though the file can be read if the schema is specified explicitly or the mode is set to multiline.

The proposed changes build a correct mapping from the first byte of a UTF-8 char to its size (it now covers all cases) and skip disallowed chars (counting each as one octet).

## How was this patch tested?

Added a test and a file with a char that is disallowed in UTF-8 (0xFF).

Author: Maxim Gekk

Closes #20796 from MaxGekk/skip-wrong-utf8-chars.

(cherry picked from commit 5e7bc2acef4a1e11d0d8056ef5c12cd5c8f220da)
Signed-off-by: Wenchen Fan

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b880db6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b880db6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b880db6

Branch: refs/heads/branch-2.3
Commit: 0b880db65b647e549b78721859b1712dff733ec9
Parents: c854b6c
Author: Maxim Gekk
Authored: Tue Mar 20 10:34:56 2018 -0700
Committer: Wenchen Fan
Committed: Tue Mar 20 10:35:14 2018 -0700

--
 .../apache/spark/unsafe/types/UTF8String.java | 48
 .../spark/unsafe/types/UTF8StringSuite.java   | 23 +-
 2 files changed, 62 insertions(+), 9 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/0b880db6/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
--
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index b0d0c44..5d468ae 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -57,12 +57,43 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,

   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }

-  private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5,
-    6, 6};
+  /**
+   * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte, which
+   * indicates the size of the char. See the Unicode standard, page 126, Table 3-6:
+   * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf
+   *
+   * Binary    Hex          Comments
+   * 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
+   * 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
+   * 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
+   * 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
+   * 11110xxx  0xF0..0xF7   First byte of a 4-byte character encoding
+   *
+   * As a consequence of the well-formedness conditions specified in
+   * Table 3-7 (page 126), the following byte values are disallowed in UTF-8:
+   * C0–C1, F5–FF.
+   */
+  private static byte[] bytesOfCodePointInUTF8 = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F
+    // Continuation bytes cannot appear as the first byte
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..0x8F
+    0, 0, 0, 0,
spark git commit: [SPARK-23649][SQL] Skipping chars disallowed in UTF-8
Repository: spark
Updated Branches:
  refs/heads/master 566321852 -> 5e7bc2ace

[SPARK-23649][SQL] Skipping chars disallowed in UTF-8

## What changes were proposed in this pull request?

The mapping from a UTF-8 char's first byte to the char's size doesn't cover the whole range 0-255; it is defined only for 0-253:
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L60-L65
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L190

If the first byte of a char is 253-255, an IndexOutOfBoundsException is thrown. Besides that, the values for 244-252 are incorrect according to the current Unicode standard for UTF-8:
http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf

As a consequence of the exception above, the length of an input string in UTF-8 encoding cannot be calculated if the string contains chars whose first byte is 253 or higher. On the user's side this shows up as, for example, schema inference crashing on a CSV file that contains such chars, even though the file can be read if the schema is specified explicitly or the mode is set to multiline.

The proposed changes build a correct mapping from the first byte of a UTF-8 char to its size (it now covers all cases) and skip disallowed chars (counting each as one octet).

## How was this patch tested?

Added a test and a file with a char that is disallowed in UTF-8 (0xFF).

Author: Maxim Gekk

Closes #20796 from MaxGekk/skip-wrong-utf8-chars.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e7bc2ac
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e7bc2ac
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e7bc2ac

Branch: refs/heads/master
Commit: 5e7bc2acef4a1e11d0d8056ef5c12cd5c8f220da
Parents: 5663218
Author: Maxim Gekk
Authored: Tue Mar 20 10:34:56 2018 -0700
Committer: Wenchen Fan
Committed: Tue Mar 20 10:34:56 2018 -0700

--
 .../apache/spark/unsafe/types/UTF8String.java | 48
 .../spark/unsafe/types/UTF8StringSuite.java   | 23 +-
 2 files changed, 62 insertions(+), 9 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/5e7bc2ac/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
--
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index b0d0c44..5d468ae 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -57,12 +57,43 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,

   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }

-  private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    5, 5, 5, 5,
-    6, 6};
+  /**
+   * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte, which
+   * indicates the size of the char. See the Unicode standard, page 126, Table 3-6:
+   * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf
+   *
+   * Binary    Hex          Comments
+   * 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
+   * 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
+   * 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
+   * 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
+   * 11110xxx  0xF0..0xF7   First byte of a 4-byte character encoding
+   *
+   * As a consequence of the well-formedness conditions specified in
+   * Table 3-7 (page 126), the following byte values are disallowed in UTF-8:
+   * C0–C1, F5–FF.
+   */
+  private static byte[] bytesOfCodePointInUTF8 = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F
+    // Continuation bytes cannot appear as the first byte
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..0x8F
+    0, 0, 0, 0,
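The new table plus the skipping behavior can be reproduced in a few lines. Below is a self-contained Scala sketch of the semantics described in the commit message (an illustration of the idea, not the Java code from the patch): disallowed and continuation first bytes map to 0 and are then counted as a single octet so iteration can move past them.

```scala
// Build the 256-entry first-byte table: 0 marks bytes that cannot start a char.
val bytesOfCodePointInUTF8: Array[Byte] = {
  val table = new Array[Byte](256)
  for (b <- 0x00 to 0x7F) table(b) = 1 // 1-byte chars (ASCII)
  // 0x80..0xBF are continuation bytes and 0xC0..0xC1 are disallowed: both stay 0.
  for (b <- 0xC2 to 0xDF) table(b) = 2 // first byte of a 2-byte char
  for (b <- 0xE0 to 0xEF) table(b) = 3 // first byte of a 3-byte char
  for (b <- 0xF0 to 0xF4) table(b) = 4 // first byte of a 4-byte char; 0xF5..0xFF stay 0
  table
}

// Disallowed first bytes are counted as one octet instead of raising an exception.
def numBytesForFirstByte(b: Byte): Int = {
  val size = bytesOfCodePointInUTF8(b & 0xFF)
  if (size == 0) 1 else size
}

println(numBytesForFirstByte(0xFF.toByte)) // 1 (previously out of bounds in the 0..253 table)
println(numBytesForFirstByte('a'.toByte))  // 1
println(numBytesForFirstByte(0xE2.toByte)) // 3 (0xE2 starts 3-byte chars such as the euro sign)
```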
svn commit: r25843 - in /dev/spark/2.3.1-SNAPSHOT-2018_03_20_02_01-c854b6c-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Tue Mar 20 09:16:01 2018
New Revision: 25843

Log: Apache Spark 2.3.1-SNAPSHOT-2018_03_20_02_01-c854b6c docs

[This commit notification would consist of 1443 parts, which exceeds the limit of 50, so it was shortened to the summary.]
spark git commit: [SPARK-23691][PYTHON][BRANCH-2.3] Use sql_conf util in PySpark tests where possible
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 2f82c037d -> c854b6ca7

[SPARK-23691][PYTHON][BRANCH-2.3] Use sql_conf util in PySpark tests where possible

## What changes were proposed in this pull request?

This PR backports https://github.com/apache/spark/pull/20830 to reduce the diff against master and restore the default values back in PySpark tests.

https://github.com/apache/spark/commit/d6632d185e147fcbe6724545488ad80dce20277e added a useful util. This backport extracts and brings this util:

```python
@contextmanager
def sql_conf(self, pairs):
    ...
```

to allow configurations to be set/unset within a block:

```python
with self.sql_conf({"spark.blah.blah.blah": "blah"}):
    # test codes
```

This PR proposes to use this util where possible in PySpark tests. Note that there already appear to be a few places that affect tests without restoring the original value back in the unittest classes.

## How was this patch tested?

Likewise, manually tested via:

```
./run-tests --modules=pyspark-sql --python-executables=python2
./run-tests --modules=pyspark-sql --python-executables=python3
```

Author: hyukjinkwon

Closes #20863 from HyukjinKwon/backport-20830.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c854b6ca
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c854b6ca
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c854b6ca

Branch: refs/heads/branch-2.3
Commit: c854b6ca7ba4dc33138c12ba4606ff8fbe82aef2
Parents: 2f82c03
Author: hyukjinkwon
Authored: Tue Mar 20 17:53:09 2018 +0900
Committer: hyukjinkwon
Committed: Tue Mar 20 17:53:09 2018 +0900

--
 python/pyspark/sql/tests.py | 143 ---
 1 file changed, 72 insertions(+), 71 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/c854b6ca/python/pyspark/sql/tests.py
--
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 82c5500..d806e5d 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -33,6 +33,7 @@ import datetime
 import array
 import ctypes
 import py4j
+from contextlib import contextmanager

 try:
     import xmlrunner
@@ -201,6 +202,28 @@ class ReusedSQLTestCase(ReusedPySparkTestCase):
                 "\n\nResult:\n%s\n%s" % (result, result.dtypes))
         self.assertTrue(expected.equals(result), msg=msg)

+    @contextmanager
+    def sql_conf(self, pairs):
+        """
+        A convenient context manager to test some configuration specific logic. This sets
+        `value` to the configuration `key` and then restores it back when it exits.
+        """
+        assert isinstance(pairs, dict), "pairs should be a dictionary."
+
+        keys = pairs.keys()
+        new_values = pairs.values()
+        old_values = [self.spark.conf.get(key, None) for key in keys]
+        for key, new_value in zip(keys, new_values):
+            self.spark.conf.set(key, new_value)
+        try:
+            yield
+        finally:
+            for key, old_value in zip(keys, old_values):
+                if old_value is None:
+                    self.spark.conf.unset(key)
+                else:
+                    self.spark.conf.set(key, old_value)
+

 class DataTypeTests(unittest.TestCase):
     # regression test for SPARK-6055
@@ -2409,17 +2432,13 @@ class SQLTests(ReusedSQLTestCase):
         df1 = self.spark.range(1).toDF("a")
         df2 = self.spark.range(1).toDF("b")
-        try:
-            self.spark.conf.set("spark.sql.crossJoin.enabled", "false")
+        with self.sql_conf({"spark.sql.crossJoin.enabled": False}):
             self.assertRaises(AnalysisException,
                               lambda: df1.join(df2, how="inner").collect())
-            self.spark.conf.set("spark.sql.crossJoin.enabled", "true")
+
+        with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
             actual = df1.join(df2, how="inner").collect()
             expected = [Row(a=0, b=0)]
             self.assertEqual(actual, expected)
-        finally:
-            # We should unset this. Otherwise, other tests are affected.
-            self.spark.conf.unset("spark.sql.crossJoin.enabled")

     # Regression test for invalid join methods when on is None, Spark-14761
     def test_invalid_join_method(self):
@@ -2891,21 +2910,18 @@ class SQLTests(ReusedSQLTestCase):
         self.assertPandasEqual(pdf, df.toPandas())

         orig_env_tz = os.environ.get('TZ', None)
-        orig_session_tz = self.spark.conf.get('spark.sql.session.timeZone')
         try:
             tz = 'America/Los_Angeles'
             os.environ['TZ'] = tz
             time.tzset()
-            self.spark.conf.set('spark.sql.session.timeZone', tz)
-
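On the Scala side the same set-then-restore pattern is usually written as a loan function. A hypothetical sketch follows (this helper is not Spark API; Spark's own test code has a similar withSQLConf helper in SQLTestUtils):

```scala
// Set the given SQL confs, run the body, then restore (or unset) the previous values.
import org.apache.spark.sql.SparkSession

def withSQLConf[T](spark: SparkSession)(pairs: (String, String)*)(body: => T): T = {
  val previous = pairs.map { case (key, _) => key -> spark.conf.getOption(key) }
  pairs.foreach { case (key, value) => spark.conf.set(key, value) }
  try body
  finally previous.foreach {
    case (key, Some(old)) => spark.conf.set(key, old) // restore the original value
    case (key, None)      => spark.conf.unset(key)    // the key was not set before
  }
}

// Usage: the conf change is visible only inside the block.
// withSQLConf(spark)("spark.sql.crossJoin.enabled" -> "true") {
//   df1.join(df2).collect()
// }
```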
svn commit: r25842 - in /dev/spark/2.4.0-SNAPSHOT-2018_03_20_00_01-5663218-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Tue Mar 20 07:17:24 2018
New Revision: 25842

Log: Apache Spark 2.4.0-SNAPSHOT-2018_03_20_00_01-5663218 docs

[This commit notification would consist of 1449 parts, which exceeds the limit of 50, so it was shortened to the summary.]