svn commit: r25861 - in /dev/spark/2.4.0-SNAPSHOT-2018_03_20_16_01-477d6bd-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-03-20 Thread pwendell
Author: pwendell
Date: Tue Mar 20 23:15:32 2018
New Revision: 25861

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_03_20_16_01-477d6bd docs


[This commit notification would consist of 1449 parts, 
which exceeds the limit of 50, so it was shortened to this summary.]




svn commit: r25856 - in /dev/spark/2.3.1-SNAPSHOT-2018_03_20_14_01-0b880db-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-03-20 Thread pwendell
Author: pwendell
Date: Tue Mar 20 21:15:40 2018
New Revision: 25856

Log:
Apache Spark 2.3.1-SNAPSHOT-2018_03_20_14_01-0b880db docs


[This commit notification would consist of 1443 parts, 
which exceeds the limit of 50, so it was shortened to this summary.]




spark git commit: [SPARK-23500][SQL] Fix complex type simplification rules to apply to entire plan

2018-03-20 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/master 2c4b9962f -> 477d6bd72


[SPARK-23500][SQL] Fix complex type simplification rules to apply to entire plan

## What changes were proposed in this pull request?

Complex type simplification optimizer rules were not applied to the
entire plan, just the expressions reachable from the root node. This
patch fixes the rules to transform the entire plan.
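
To make the scope of the fix concrete, here is a hypothetical, self-contained Scala sketch (not taken from the patch; `ComplexTypeSimplificationDemo` is a made-up name) of a query whose struct extraction ends up below the plan root and therefore could be missed before this change:

```scala
import org.apache.spark.sql.SparkSession

object ComplexTypeSimplificationDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("complex-type-demo").getOrCreate()
    import spark.implicits._

    val df = Seq((1, 10), (2, 20)).toDF("a", "b")

    // The struct is built in a Project and its field is consumed by a Filter higher
    // up in the tree; after predicate pushdown the condition becomes
    // struct(a, b).a > 1 on a non-root node, which only a whole-plan transform
    // can simplify back to a > 1.
    val q = df.selectExpr("struct(a, b) AS s", "b").filter("s.a > 1").select("b")
    q.explain(true) // inspect the optimized plan; the intermediate struct should be gone

    spark.stop()
  }
}
```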

## How was this patch tested?

New unit test + ran sql / core tests.

Author: Henry Robinson 
Author: Henry Robinson 

Closes #20687 from henryr/spark-25000.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/477d6bd7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/477d6bd7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/477d6bd7

Branch: refs/heads/master
Commit: 477d6bd7265e255fd43e53edda02019b32f29bb2
Parents: 2c4b996
Author: Henry Robinson 
Authored: Tue Mar 20 13:27:50 2018 -0700
Committer: gatorsmile 
Committed: Tue Mar 20 13:27:50 2018 -0700

--
 .../sql/catalyst/optimizer/ComplexTypes.scala   | 61 
 .../sql/catalyst/optimizer/Optimizer.scala  |  4 +-
 .../catalyst/optimizer/complexTypesSuite.scala  | 55 --
 3 files changed, 76 insertions(+), 44 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/477d6bd7/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
index be0009e..db7d6d3 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/ComplexTypes.scala
@@ -18,39 +18,39 @@
 package org.apache.spark.sql.catalyst.optimizer
 
 import org.apache.spark.sql.catalyst.expressions._
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
 import org.apache.spark.sql.catalyst.rules.Rule
 
 /**
-* push down operations into [[CreateNamedStructLike]].
-*/
-object SimplifyCreateStructOps extends Rule[LogicalPlan] {
-  override def apply(plan: LogicalPlan): LogicalPlan = {
-plan.transformExpressionsUp {
-  // push down field extraction
+ * Simplify redundant [[CreateNamedStructLike]], [[CreateArray]] and 
[[CreateMap]] expressions.
+ */
+object SimplifyExtractValueOps extends Rule[LogicalPlan] {
+  override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+// One place where this optimization is invalid is an aggregation where 
the select
+// list expression is a function of a grouping expression:
+//
+// SELECT struct(a,b).a FROM tbl GROUP BY struct(a,b)
+//
+// cannot be simplified to SELECT a FROM tbl GROUP BY struct(a,b). So just 
skip this
+// optimization for Aggregates (although this misses some cases where the 
optimization
+// can be made).
+case a: Aggregate => a
+case p => p.transformExpressionsUp {
+  // Remove redundant field extraction.
   case GetStructField(createNamedStructLike: CreateNamedStructLike, 
ordinal, _) =>
 createNamedStructLike.valExprs(ordinal)
-}
-  }
-}
 
-/**
-* push down operations into [[CreateArray]].
-*/
-object SimplifyCreateArrayOps extends Rule[LogicalPlan] {
-  override def apply(plan: LogicalPlan): LogicalPlan = {
-plan.transformExpressionsUp {
-  // push down field selection (array of structs)
-  case GetArrayStructFields(CreateArray(elems), field, ordinal, numFields, 
containsNull) =>
-// instead f selecting the field on the entire array,
-// select it from each member of the array.
-// pushing down the operation this way open other optimizations 
opportunities
-// (i.e. struct(...,x,...).x)
+  // Remove redundant array indexing.
+  case GetArrayStructFields(CreateArray(elems), field, ordinal, _, _) =>
+// Instead of selecting the field on the entire array, select it from 
each member
+// of the array. Pushing down the operation this way may open other 
optimizations
+// opportunities (i.e. struct(...,x,...).x)
 CreateArray(elems.map(GetStructField(_, ordinal, Some(field.name
-  // push down item selection.
+
+  // Remove redundant map lookup.
   case ga @ GetArrayItem(CreateArray(elems), IntegerLiteral(idx)) =>
-// instead of creating the array and then selecting one row,
-// remove array creation altgether.
+// Instead of 

svn commit: r25852 - in /dev/spark/2.4.0-SNAPSHOT-2018_03_20_12_03-2c4b996-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-03-20 Thread pwendell
Author: pwendell
Date: Tue Mar 20 19:17:34 2018
New Revision: 25852

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_03_20_12_03-2c4b996 docs


[This commit notification would consist of 1449 parts, 
which exceeds the limit of 50, so it was shortened to this summary.]




spark git commit: [SPARK-23574][SQL] Report SinglePartition in DataSourceV2ScanExec when there's exactly 1 data reader factory.

2018-03-20 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/master 7f5e8aa26 -> 2c4b9962f


[SPARK-23574][SQL] Report SinglePartition in DataSourceV2ScanExec when there's 
exactly 1 data reader factory.

## What changes were proposed in this pull request?

Report SinglePartition in DataSourceV2ScanExec when there's exactly 1 data 
reader factory.

Note that this means reader factories end up being constructed as partitioning 
is checked; let me know if you think that could be a problem.
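
A minimal sketch of the reported-partitioning decision this change introduces, under the assumption that a scan with exactly one reader factory trivially has all rows in one partition; `PartitioningSketch` and `outputPartitioningFor` are illustrative names, not part of DataSourceV2ScanExec:

```scala
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition, UnknownPartitioning}

object PartitioningSketch {
  // With exactly one reader factory, every row lives in a single partition, so the
  // scan can truthfully report SinglePartition and let Spark skip a shuffle that
  // would otherwise be inserted to satisfy a single-partition requirement.
  def outputPartitioningFor(numReaderFactories: Int): Partitioning =
    if (numReaderFactories == 1) SinglePartition
    else UnknownPartitioning(numReaderFactories)
}
```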

## How was this patch tested?

existing unit tests

Author: Jose Torres 
Author: Jose Torres 

Closes #20726 from jose-torres/SPARK-23574.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2c4b9962
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2c4b9962
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2c4b9962

Branch: refs/heads/master
Commit: 2c4b9962fdf8c1beb66126ca41628c72eb6c2383
Parents: 7f5e8aa
Author: Jose Torres 
Authored: Tue Mar 20 11:46:51 2018 -0700
Committer: Wenchen Fan 
Committed: Tue Mar 20 11:46:51 2018 -0700

--
 .../v2/reader/SupportsReportPartitioning.java   |  3 ++
 .../datasources/v2/DataSourceRDD.scala  |  4 +--
 .../datasources/v2/DataSourceV2ScanExec.scala   | 29 +++-
 .../ContinuousDataSourceRDDIter.scala   |  4 +--
 .../sql/sources/v2/DataSourceV2Suite.scala  | 20 +-
 .../sql/streaming/StreamingQuerySuite.scala |  4 +--
 6 files changed, 50 insertions(+), 14 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2c4b9962/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java
--
diff --git 
a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java
 
b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java
index 5405a91..6076287 100644
--- 
a/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java
+++ 
b/sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/SupportsReportPartitioning.java
@@ -23,6 +23,9 @@ import 
org.apache.spark.sql.sources.v2.reader.partitioning.Partitioning;
 /**
  * A mix in interface for {@link DataSourceReader}. Data source readers can 
implement this
  * interface to report data partitioning and try to avoid shuffle at Spark 
side.
+ *
+ * Note that, when the reader creates exactly one {@link DataReaderFactory}, 
Spark may avoid
+ * adding a shuffle even if the reader does not implement this interface.
  */
 @InterfaceStability.Evolving
 public interface SupportsReportPartitioning extends DataSourceReader {

http://git-wip-us.apache.org/repos/asf/spark/blob/2c4b9962/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala
index 5ed0ba7..f85971b 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceRDD.scala
@@ -29,11 +29,11 @@ class DataSourceRDDPartition[T : ClassTag](val index: Int, 
val readerFactory: Da
 
 class DataSourceRDD[T: ClassTag](
 sc: SparkContext,
-@transient private val readerFactories: 
java.util.List[DataReaderFactory[T]])
+@transient private val readerFactories: Seq[DataReaderFactory[T]])
   extends RDD[T](sc, Nil) {
 
   override protected def getPartitions: Array[Partition] = {
-readerFactories.asScala.zipWithIndex.map {
+readerFactories.zipWithIndex.map {
   case (readerFactory, index) => new DataSourceRDDPartition(index, 
readerFactory)
 }.toArray
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/2c4b9962/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala
 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala
index cb691ba..3a5e7bf 100644
--- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala
+++ 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2ScanExec.scala
@@ -25,12 +25,14 @@ import org.apache.spark.sql.catalyst.InternalRow
 import 

spark git commit: [SPARK-21898][ML] Feature parity for KolmogorovSmirnovTest in MLlib

2018-03-20 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 5e7bc2ace -> 7f5e8aa26


[SPARK-21898][ML] Feature parity for KolmogorovSmirnovTest in MLlib

## What changes were proposed in this pull request?

Feature parity for KolmogorovSmirnovTest in MLlib.
Implement a `DataFrame`-based interface for `KolmogorovSmirnovTest`, mirroring the RDD-based version in `mllib.stat`.
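
A hedged usage sketch of the new `ml.stat` API added by this patch (the argument order and output column names follow the diff below; the data values and `KSTestDemo` name are made up):

```scala
import org.apache.spark.ml.stat.KolmogorovSmirnovTest
import org.apache.spark.sql.SparkSession

object KSTestDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("ks-demo").getOrCreate()
    import spark.implicits._

    val data = Seq(-0.7, -0.2, 0.1, 0.3, 1.5).toDF("sample")

    // Two-sided KS test of the "sample" column against a standard normal CDF;
    // "norm" with mean 0.0 and stddev 1.0 mirrors mllib's Statistics.kolmogorovSmirnovTest.
    val result = KolmogorovSmirnovTest.test(data, "sample", "norm", 0.0, 1.0)
    result.select("pValue", "statistic").show()

    spark.stop()
  }
}
```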

## How was this patch tested?

Test suite added.

Author: WeichenXu 
Author: jkbradley 

Closes #19108 from WeichenXu123/ml-ks-test.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7f5e8aa2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7f5e8aa2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7f5e8aa2

Branch: refs/heads/master
Commit: 7f5e8aa2606b0ee0297ceb6f4603bd368e3b0291
Parents: 5e7bc2a
Author: WeichenXu 
Authored: Tue Mar 20 11:14:34 2018 -0700
Committer: Joseph K. Bradley 
Committed: Tue Mar 20 11:14:34 2018 -0700

--
 .../spark/ml/stat/KolmogorovSmirnovTest.scala   | 113 +++
 .../ml/stat/JavaKolmogorovSmirnovTestSuite.java |  84 +++
 .../ml/stat/KolmogorovSmirnovTestSuite.scala| 140 +++
 3 files changed, 337 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7f5e8aa2/mllib/src/main/scala/org/apache/spark/ml/stat/KolmogorovSmirnovTest.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/stat/KolmogorovSmirnovTest.scala 
b/mllib/src/main/scala/org/apache/spark/ml/stat/KolmogorovSmirnovTest.scala
new file mode 100644
index 000..8d80e77
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/stat/KolmogorovSmirnovTest.scala
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.stat
+
+import scala.annotation.varargs
+
+import org.apache.spark.annotation.{Experimental, Since}
+import org.apache.spark.api.java.function.Function
+import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.functions.col
+
+/**
+ * :: Experimental ::
+ *
+ * Conduct the two-sided Kolmogorov Smirnov (KS) test for data sampled from a
+ * continuous distribution. By comparing the largest difference between the 
empirical cumulative
+ * distribution of the sample data and the theoretical distribution we can 
provide a test for the
+ * the null hypothesis that the sample data comes from that theoretical 
distribution.
+ * For more information on KS Test:
+ * @see <a href="https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test">
+ * Kolmogorov-Smirnov test (Wikipedia)</a>
+ */
+@Experimental
+@Since("2.4.0")
+object KolmogorovSmirnovTest {
+
+  /** Used to construct output schema of test */
+  private case class KolmogorovSmirnovTestResult(
+  pValue: Double,
+  statistic: Double)
+
+  private def getSampleRDD(dataset: DataFrame, sampleCol: String): RDD[Double] 
= {
+SchemaUtils.checkNumericType(dataset.schema, sampleCol)
+import dataset.sparkSession.implicits._
+dataset.select(col(sampleCol).cast("double")).as[Double].rdd
+  }
+
+  /**
+   * Conduct the two-sided Kolmogorov-Smirnov (KS) test for data sampled from a
+   * continuous distribution. By comparing the largest difference between the 
empirical cumulative
+   * distribution of the sample data and the theoretical distribution we can 
provide a test for the
+   * the null hypothesis that the sample data comes from that theoretical 
distribution.
+   *
+   * @param dataset a `DataFrame` containing the sample of data to test
+   * @param sampleCol Name of sample column in dataset, of any numerical type
+   * @param cdf a `Double => Double` function to calculate the theoretical CDF 
at a given value
+   * @return DataFrame containing the test result for the input sampled data.
+   * This DataFrame will 

spark git commit: [SPARK-23649][SQL] Skipping chars disallowed in UTF-8

2018-03-20 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 175b221bc -> 367a16118


[SPARK-23649][SQL] Skipping chars disallowed in UTF-8

The mapping of a UTF-8 char's first byte to the char's size doesn't cover the whole range 0-255; it is defined only for 0-253:
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L60-L65
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L190

If the first byte of a char is 253-255, an IndexOutOfBoundsException is thrown. Besides that, the values for 244-252 are not correct according to the recent Unicode standard for UTF-8:
http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf

As a consequence of the exception above, the length of a UTF-8 encoded input string cannot be calculated if the string contains chars whose first byte is 253 or higher. On the user's side this shows up as, for example, schema inference crashing on a CSV file that contains such chars, even though the file can be read if the schema is specified explicitly or if the mode is set to multiline.

The proposed changes build a correct mapping from the first byte of a UTF-8 char to its size (it now covers all cases) and skip disallowed chars (counting each as one octet).

Added a test and a file with a char that is disallowed in UTF-8 (0xFF).
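
A self-contained sketch of the mapping idea, written in Scala rather than the patched Java and using hypothetical names (`Utf8FirstByte`, `numBytesForFirstByte`); it is not the actual UTF8String code:

```scala
object Utf8FirstByte {
  // Derive the byte length of a UTF-8 encoded char from its first byte, counting
  // continuation bytes (0x80-0xBF) and bytes disallowed as a first byte
  // (0xC0-0xC1, 0xF5-0xFF) as a single octet instead of throwing.
  def numBytesForFirstByte(b: Byte): Int = {
    val u = b & 0xFF
    if (u < 0x80) 1       // 0xxxxxxx: 1-byte (ASCII) character
    else if (u < 0xC2) 1  // continuation bytes and overlong 0xC0-0xC1: skip as one octet
    else if (u < 0xE0) 2  // 110xxxxx: first byte of a 2-byte character
    else if (u < 0xF0) 3  // 1110xxxx: first byte of a 3-byte character
    else if (u < 0xF5) 4  // 11110xxx (0xF0-0xF4): first byte of a 4-byte character
    else 1                // 0xF5-0xFF never appear in well-formed UTF-8: skip as one octet
  }

  def main(args: Array[String]): Unit = {
    // "a" (1 byte), "©" (2 bytes), "€" (3 bytes) plus a disallowed 0xFF byte.
    val bytes = "a©€".getBytes("UTF-8") :+ 0xFF.toByte
    var i = 0
    var chars = 0
    while (i < bytes.length) {
      i += numBytesForFirstByte(bytes(i))
      chars += 1
    }
    println(s"counted $chars chars in ${bytes.length} bytes") // 4 chars in 7 bytes
  }
}
```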

Author: Maxim Gekk 

Closes #20796 from MaxGekk/skip-wrong-utf8-chars.

(cherry picked from commit 5e7bc2acef4a1e11d0d8056ef5c12cd5c8f220da)
Signed-off-by: Wenchen Fan 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/367a1611
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/367a1611
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/367a1611

Branch: refs/heads/branch-2.2
Commit: 367a16118289e1c03507c14f966e8b1ebd688489
Parents: 175b221
Author: Maxim Gekk 
Authored: Tue Mar 20 10:34:56 2018 -0700
Committer: Wenchen Fan 
Committed: Tue Mar 20 10:37:29 2018 -0700

--
 .../apache/spark/unsafe/types/UTF8String.java   | 48 
 .../spark/unsafe/types/UTF8StringSuite.java | 23 +-
 2 files changed, 62 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/367a1611/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
--
diff --git 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 23636ca..a11e63c 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -57,12 +57,43 @@ public final class UTF8String implements 
Comparable, Externalizable,
   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }
 
-  private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2,
-2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-4, 4, 4, 4, 4, 4, 4, 4,
-5, 5, 5, 5,
-6, 6};
+  /**
+   * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte 
which
+   * indicates the size of the char. See Unicode standard in page 126, Table 
3-6:
+   * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf
+   *
+   * Binary    Hex          Comments
+   * 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
+   * 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
+   * 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
+   * 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
+   * 11110xxx  0xF0..0xF7   First byte of a 4-byte character encoding
+   *
+   * As a consequence of the well-formedness conditions specified in
+   * Table 3-7 (page 126), the following byte values are disallowed in UTF-8:
+   *   C0–C1, F5–FF.
+   */
+  private static byte[] bytesOfCodePointInUTF8 = {
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F
+// Continuation bytes cannot appear as the first byte
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

spark git commit: [SPARK-23649][SQL] Skipping chars disallowed in UTF-8

2018-03-20 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 c854b6ca7 -> 0b880db65


[SPARK-23649][SQL] Skipping chars disallowed in UTF-8

## What changes were proposed in this pull request?

The mapping of a UTF-8 char's first byte to the char's size doesn't cover the whole range 0-255; it is defined only for 0-253:
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L60-L65
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L190

If the first byte of a char is 253-255, an IndexOutOfBoundsException is thrown. Besides that, the values for 244-252 are not correct according to the recent Unicode standard for UTF-8:
http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf

As a consequence of the exception above, the length of a UTF-8 encoded input string cannot be calculated if the string contains chars whose first byte is 253 or higher. On the user's side this shows up as, for example, schema inference crashing on a CSV file that contains such chars, even though the file can be read if the schema is specified explicitly or if the mode is set to multiline.

The proposed changes build a correct mapping from the first byte of a UTF-8 char to its size (it now covers all cases) and skip disallowed chars (counting each as one octet).

## How was this patch tested?

Added a test and a file with a char that is disallowed in UTF-8 (0xFF).

Author: Maxim Gekk 

Closes #20796 from MaxGekk/skip-wrong-utf8-chars.

(cherry picked from commit 5e7bc2acef4a1e11d0d8056ef5c12cd5c8f220da)
Signed-off-by: Wenchen Fan 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b880db6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b880db6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b880db6

Branch: refs/heads/branch-2.3
Commit: 0b880db65b647e549b78721859b1712dff733ec9
Parents: c854b6c
Author: Maxim Gekk 
Authored: Tue Mar 20 10:34:56 2018 -0700
Committer: Wenchen Fan 
Committed: Tue Mar 20 10:35:14 2018 -0700

--
 .../apache/spark/unsafe/types/UTF8String.java   | 48 
 .../spark/unsafe/types/UTF8StringSuite.java | 23 +-
 2 files changed, 62 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0b880db6/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
--
diff --git 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index b0d0c44..5d468ae 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -57,12 +57,43 @@ public final class UTF8String implements 
Comparable, Externalizable,
   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }
 
-  private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2,
-2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-4, 4, 4, 4, 4, 4, 4, 4,
-5, 5, 5, 5,
-6, 6};
+  /**
+   * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte 
which
+   * indicates the size of the char. See Unicode standard in page 126, Table 
3-6:
+   * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf
+   *
+   * Binary    Hex          Comments
+   * 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
+   * 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
+   * 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
+   * 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
+   * 11110xxx  0xF0..0xF7   First byte of a 4-byte character encoding
+   *
+   * As a consequence of the well-formedness conditions specified in
+   * Table 3-7 (page 126), the following byte values are disallowed in UTF-8:
+   *   C0–C1, F5–FF.
+   */
+  private static byte[] bytesOfCodePointInUTF8 = {
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F
+// 

spark git commit: [SPARK-23649][SQL] Skipping chars disallowed in UTF-8

2018-03-20 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/master 566321852 -> 5e7bc2ace


[SPARK-23649][SQL] Skipping chars disallowed in UTF-8

## What changes were proposed in this pull request?

The mapping of a UTF-8 char's first byte to the char's size doesn't cover the whole range 0-255; it is defined only for 0-253:
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L60-L65
https://github.com/apache/spark/blob/master/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java#L190

If the first byte of a char is 253-255, an IndexOutOfBoundsException is thrown. Besides that, the values for 244-252 are not correct according to the recent Unicode standard for UTF-8:
http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf

As a consequence of the exception above, the length of a UTF-8 encoded input string cannot be calculated if the string contains chars whose first byte is 253 or higher. On the user's side this shows up as, for example, schema inference crashing on a CSV file that contains such chars, even though the file can be read if the schema is specified explicitly or if the mode is set to multiline.

The proposed changes build a correct mapping from the first byte of a UTF-8 char to its size (it now covers all cases) and skip disallowed chars (counting each as one octet).

## How was this patch tested?

Added a test and a file with a char that is disallowed in UTF-8 (0xFF).

Author: Maxim Gekk 

Closes #20796 from MaxGekk/skip-wrong-utf8-chars.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5e7bc2ac
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5e7bc2ac
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5e7bc2ac

Branch: refs/heads/master
Commit: 5e7bc2acef4a1e11d0d8056ef5c12cd5c8f220da
Parents: 5663218
Author: Maxim Gekk 
Authored: Tue Mar 20 10:34:56 2018 -0700
Committer: Wenchen Fan 
Committed: Tue Mar 20 10:34:56 2018 -0700

--
 .../apache/spark/unsafe/types/UTF8String.java   | 48 
 .../spark/unsafe/types/UTF8StringSuite.java | 23 +-
 2 files changed, 62 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5e7bc2ac/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
--
diff --git 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index b0d0c44..5d468ae 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -57,12 +57,43 @@ public final class UTF8String implements 
Comparable, Externalizable,
   public Object getBaseObject() { return base; }
   public long getBaseOffset() { return offset; }
 
-  private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2,
-2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-4, 4, 4, 4, 4, 4, 4, 4,
-5, 5, 5, 5,
-6, 6};
+  /**
+   * A char in UTF-8 encoding can take 1-4 bytes depending on the first byte 
which
+   * indicates the size of the char. See Unicode standard in page 126, Table 
3-6:
+   * http://www.unicode.org/versions/Unicode10.0.0/UnicodeStandard-10.0.pdf
+   *
+   * Binary    Hex          Comments
+   * 0xxxxxxx  0x00..0x7F   Only byte of a 1-byte character encoding
+   * 10xxxxxx  0x80..0xBF   Continuation bytes (1-3 continuation bytes)
+   * 110xxxxx  0xC0..0xDF   First byte of a 2-byte character encoding
+   * 1110xxxx  0xE0..0xEF   First byte of a 3-byte character encoding
+   * 11110xxx  0xF0..0xF7   First byte of a 4-byte character encoding
+   *
+   * As a consequence of the well-formedness conditions specified in
+   * Table 3-7 (page 126), the following byte values are disallowed in UTF-8:
+   *   C0–C1, F5–FF.
+   */
+  private static byte[] bytesOfCodePointInUTF8 = {
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00..0x0F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10..0x1F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20..0x2F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30..0x3F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40..0x4F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50..0x5F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60..0x6F
+1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70..0x7F
+// Continuation bytes cannot appear as the first byte
+0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80..0x8F
+0, 0, 0, 0, 

svn commit: r25843 - in /dev/spark/2.3.1-SNAPSHOT-2018_03_20_02_01-c854b6c-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-03-20 Thread pwendell
Author: pwendell
Date: Tue Mar 20 09:16:01 2018
New Revision: 25843

Log:
Apache Spark 2.3.1-SNAPSHOT-2018_03_20_02_01-c854b6c docs


[This commit notification would consist of 1443 parts, 
which exceeds the limit of 50, so it was shortened to this summary.]




spark git commit: [SPARK-23691][PYTHON][BRANCH-2.3] Use sql_conf util in PySpark tests where possible

2018-03-20 Thread gurwls223
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 2f82c037d -> c854b6ca7


[SPARK-23691][PYTHON][BRANCH-2.3] Use sql_conf util in PySpark tests where 
possible

## What changes were proposed in this pull request?

This PR backports https://github.com/apache/spark/pull/20830 to reduce the diff 
against master and restore the default value back in PySpark tests.

https://github.com/apache/spark/commit/d6632d185e147fcbe6724545488ad80dce20277e 
added a useful util. This backport extracts and brings over this util:

```python
@contextmanager
def sql_conf(self, pairs):
...
```

to allow configuration set/unset within a block:

```python
with self.sql_conf({"spark.blah.blah.blah": "blah"}):
# test codes
```

This PR proposes to use this util where possible in PySpark tests.

Note that there already appear to be a few places that affect tests without restoring the 
original value in the unittest classes.

## How was this patch tested?

Likewise, manually tested via:

```
./run-tests --modules=pyspark-sql --python-executables=python2
./run-tests --modules=pyspark-sql --python-executables=python3
```

Author: hyukjinkwon 

Closes #20863 from HyukjinKwon/backport-20830.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c854b6ca
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c854b6ca
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c854b6ca

Branch: refs/heads/branch-2.3
Commit: c854b6ca7ba4dc33138c12ba4606ff8fbe82aef2
Parents: 2f82c03
Author: hyukjinkwon 
Authored: Tue Mar 20 17:53:09 2018 +0900
Committer: hyukjinkwon 
Committed: Tue Mar 20 17:53:09 2018 +0900

--
 python/pyspark/sql/tests.py | 143 ---
 1 file changed, 72 insertions(+), 71 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c854b6ca/python/pyspark/sql/tests.py
--
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 82c5500..d806e5d 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -33,6 +33,7 @@ import datetime
 import array
 import ctypes
 import py4j
+from contextlib import contextmanager
 
 try:
 import xmlrunner
@@ -201,6 +202,28 @@ class ReusedSQLTestCase(ReusedPySparkTestCase):
"\n\nResult:\n%s\n%s" % (result, result.dtypes))
 self.assertTrue(expected.equals(result), msg=msg)
 
+@contextmanager
+def sql_conf(self, pairs):
+"""
+A convenient context manager to test some configuration specific 
logic. This sets
+`value` to the configuration `key` and then restores it back when it 
exits.
+"""
+assert isinstance(pairs, dict), "pairs should be a dictionary."
+
+keys = pairs.keys()
+new_values = pairs.values()
+old_values = [self.spark.conf.get(key, None) for key in keys]
+for key, new_value in zip(keys, new_values):
+self.spark.conf.set(key, new_value)
+try:
+yield
+finally:
+for key, old_value in zip(keys, old_values):
+if old_value is None:
+self.spark.conf.unset(key)
+else:
+self.spark.conf.set(key, old_value)
+
 
 class DataTypeTests(unittest.TestCase):
 # regression test for SPARK-6055
@@ -2409,17 +2432,13 @@ class SQLTests(ReusedSQLTestCase):
 df1 = self.spark.range(1).toDF("a")
 df2 = self.spark.range(1).toDF("b")
 
-try:
-self.spark.conf.set("spark.sql.crossJoin.enabled", "false")
+with self.sql_conf({"spark.sql.crossJoin.enabled": False}):
 self.assertRaises(AnalysisException, lambda: df1.join(df2, 
how="inner").collect())
 
-self.spark.conf.set("spark.sql.crossJoin.enabled", "true")
+with self.sql_conf({"spark.sql.crossJoin.enabled": True}):
 actual = df1.join(df2, how="inner").collect()
 expected = [Row(a=0, b=0)]
 self.assertEqual(actual, expected)
-finally:
-# We should unset this. Otherwise, other tests are affected.
-self.spark.conf.unset("spark.sql.crossJoin.enabled")
 
 # Regression test for invalid join methods when on is None, Spark-14761
 def test_invalid_join_method(self):
@@ -2891,21 +2910,18 @@ class SQLTests(ReusedSQLTestCase):
 self.assertPandasEqual(pdf, df.toPandas())
 
 orig_env_tz = os.environ.get('TZ', None)
-orig_session_tz = self.spark.conf.get('spark.sql.session.timeZone')
 try:
 tz = 'America/Los_Angeles'
 os.environ['TZ'] = tz
 time.tzset()
-self.spark.conf.set('spark.sql.session.timeZone', tz)
-
-   

svn commit: r25842 - in /dev/spark/2.4.0-SNAPSHOT-2018_03_20_00_01-5663218-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s

2018-03-20 Thread pwendell
Author: pwendell
Date: Tue Mar 20 07:17:24 2018
New Revision: 25842

Log:
Apache Spark 2.4.0-SNAPSHOT-2018_03_20_00_01-5663218 docs


[This commit notification would consist of 1449 parts, 
which exceeds the limit of 50, so it was shortened to this summary.]
