spark git commit: [SPARK-15553][SQL] Dataset.createTempView should use CreateViewCommand
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 ada319844 -> 36045106d

[SPARK-15553][SQL] Dataset.createTempView should use CreateViewCommand

## What changes were proposed in this pull request?

Let `Dataset.createTempView` and `Dataset.createOrReplaceTempView` use `CreateViewCommand`, rather than calling `SparkSession.createTempView`. Besides, this patch also removes `SparkSession.createTempView`.

## How was this patch tested?

Existing tests.

Author: Liang-Chi Hsieh

Closes #13327 from viirya/dataset-createtempview.

(cherry picked from commit f1b220d1d4d12121fe0b3b175da44488da68)
Signed-off-by: Reynold Xin

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/36045106
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/36045106
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/36045106

Branch: refs/heads/branch-2.0
Commit: 36045106d43b3952c55bae4439dbc86892399b3c
Parents: ada3198
Author: Liang-Chi Hsieh
Authored: Fri May 27 21:24:08 2016 -0700
Committer: Reynold Xin
Committed: Fri May 27 21:24:14 2016 -0700

--
 .../spark/sql/catalyst/catalog/interface.scala  |  5 +
 .../scala/org/apache/spark/sql/Dataset.scala    | 23 +++-
 .../scala/org/apache/spark/sql/SQLContext.scala |  2 +-
 .../org/apache/spark/sql/SparkSession.scala     | 11 --
 .../spark/sql/execution/SparkSqlParser.scala    | 18 +++
 .../spark/sql/execution/command/cache.scala     |  3 +--
 .../spark/sql/execution/command/views.scala     |  8 +--
 7 files changed, 39 insertions(+), 31 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/36045106/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 4a073d1..77731b1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -50,6 +50,11 @@ case class CatalogStorageFormat(
     compressed: Boolean,
     serdeProperties: Map[String, String])

+object CatalogStorageFormat {
+  /** Empty storage format for default values and copies. */
+  val EmptyStorageFormat = CatalogStorageFormat(locationUri = None, inputFormat = None,
+    outputFormat = None, serde = None, compressed = false, serdeProperties = Map.empty)
+}

 /**
  * A column in a table.

http://git-wip-us.apache.org/repos/asf/spark/blob/36045106/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index abd16f2..7aeec20 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -35,6 +35,7 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst._
 import org.apache.spark.sql.catalyst.analysis._
+import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.encoders._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
@@ -44,7 +45,7 @@ import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util.usePrettyExpression
 import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution}
-import org.apache.spark.sql.execution.command.ExplainCommand
+import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand}
 import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
 import org.apache.spark.sql.execution.python.EvaluatePython
@@ -2329,8 +2330,14 @@ class Dataset[T] private[sql](
    * @since 2.0.0
    */
   @throws[AnalysisException]
-  def createTempView(viewName: String): Unit = {
-    sparkSession.createTempView(viewName, toDF(), replaceIfExists = false)
+  def createTempView(viewName: String): Unit = withPlan {
+    val tableDesc = CatalogTable(
+      identifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(viewName),
+      tableType = CatalogTableType.VIEW,
+      schema = Seq.empty[CatalogColumn],
+      storage = CatalogStorageFormat.EmptyStorageFormat)
+    CreateViewCommand(tableDesc, logicalPlan, allowExisting = false, replace = false,
+      isTemporary = true, sql = "")
   }
spark git commit: [SPARK-15553][SQL] Dataset.createTempView should use CreateViewCommand
Repository: spark
Updated Branches:
  refs/heads/master 73178c755 -> f1b220eee

[SPARK-15553][SQL] Dataset.createTempView should use CreateViewCommand

## What changes were proposed in this pull request?

Let `Dataset.createTempView` and `Dataset.createOrReplaceTempView` use `CreateViewCommand`, rather than calling `SparkSession.createTempView`. Besides, this patch also removes `SparkSession.createTempView`.

## How was this patch tested?

Existing tests.

Author: Liang-Chi Hsieh

Closes #13327 from viirya/dataset-createtempview.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f1b220ee
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f1b220ee
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f1b220ee

Branch: refs/heads/master
Commit: f1b220d1d4d12121fe0b3b175da44488da68
Parents: 73178c7
Author: Liang-Chi Hsieh
Authored: Fri May 27 21:24:08 2016 -0700
Committer: Reynold Xin
Committed: Fri May 27 21:24:08 2016 -0700

--
 .../spark/sql/catalyst/catalog/interface.scala  |  5 +
 .../scala/org/apache/spark/sql/Dataset.scala    | 23 +++-
 .../scala/org/apache/spark/sql/SQLContext.scala |  2 +-
 .../org/apache/spark/sql/SparkSession.scala     | 11 --
 .../spark/sql/execution/SparkSqlParser.scala    | 18 +++
 .../spark/sql/execution/command/cache.scala     |  3 +--
 .../spark/sql/execution/command/views.scala     |  8 +--
 7 files changed, 39 insertions(+), 31 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/f1b220ee/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index 4a073d1..77731b1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -50,6 +50,11 @@ case class CatalogStorageFormat(
     compressed: Boolean,
     serdeProperties: Map[String, String])

+object CatalogStorageFormat {
+  /** Empty storage format for default values and copies. */
+  val EmptyStorageFormat = CatalogStorageFormat(locationUri = None, inputFormat = None,
+    outputFormat = None, serde = None, compressed = false, serdeProperties = Map.empty)
+}

 /**
  * A column in a table.

http://git-wip-us.apache.org/repos/asf/spark/blob/f1b220ee/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index abd16f2..7aeec20 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -35,6 +35,7 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst._
 import org.apache.spark.sql.catalyst.analysis._
+import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.encoders._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate._
@@ -44,7 +45,7 @@ import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.util.usePrettyExpression
 import org.apache.spark.sql.execution.{FileRelation, LogicalRDD, QueryExecution, SQLExecution}
-import org.apache.spark.sql.execution.command.ExplainCommand
+import org.apache.spark.sql.execution.command.{CreateViewCommand, ExplainCommand}
 import org.apache.spark.sql.execution.datasources.{CreateTableUsingAsSelect, LogicalRelation}
 import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
 import org.apache.spark.sql.execution.python.EvaluatePython
@@ -2329,8 +2330,14 @@ class Dataset[T] private[sql](
    * @since 2.0.0
    */
   @throws[AnalysisException]
-  def createTempView(viewName: String): Unit = {
-    sparkSession.createTempView(viewName, toDF(), replaceIfExists = false)
+  def createTempView(viewName: String): Unit = withPlan {
+    val tableDesc = CatalogTable(
+      identifier = sparkSession.sessionState.sqlParser.parseTableIdentifier(viewName),
+      tableType = CatalogTableType.VIEW,
+      schema = Seq.empty[CatalogColumn],
+      storage = CatalogStorageFormat.EmptyStorageFormat)
+    CreateViewCommand(tableDesc, logicalPlan, allowExisting = false, replace = false,
+      isTemporary = true, sql = "")
   }

   /**
@@
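The user-facing contract that `CreateViewCommand` now implements is easy to exercise from the public API. A minimal, self-contained sketch (the object name and view name are illustrative, not part of the patch):

```scala
import org.apache.spark.sql.{AnalysisException, SparkSession}

object TempViewExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("TempViewExample").master("local[*]").getOrCreate()
    val df = spark.range(3).toDF("id")

    // Registers the Dataset under a session-scoped name; after this patch the
    // call builds a CreateViewCommand instead of going through the removed
    // SparkSession.createTempView helper.
    df.createTempView("ids")
    assert(spark.sql("SELECT count(*) FROM ids").head().getLong(0) == 3L)

    // allowExisting = false, replace = false: re-creating the same view fails.
    try df.createTempView("ids") catch {
      case e: AnalysisException => println(s"expected: ${e.getMessage}")
    }

    // createOrReplaceTempView takes the replace = true path instead.
    df.filter("id > 0").createOrReplaceTempView("ids")
    spark.stop()
  }
}
```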
[1/2] spark git commit: [SPARK-15633][MINOR] Make package name for Java tests consistent
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 3801fb4f3 -> ada319844

http://git-wip-us.apache.org/repos/asf/spark/blob/ada31984/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
--
diff --git a/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java b/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
new file mode 100644
index 000..cf5607f
--- /dev/null
+++ b/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
@@ -0,0 +1,910 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package test.org.apache.spark.java8.dstream;
+
+import java.io.Serializable;
+import java.util.*;
+
+import scala.Tuple2;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.spark.Accumulator;
+import org.apache.spark.HashPartitioner;
+import org.apache.spark.api.java.Optional;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.streaming.*;
+import org.apache.spark.streaming.api.java.JavaDStream;
+import org.apache.spark.streaming.api.java.JavaPairDStream;
+import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
+
+/**
+ * Most of these tests replicate org.apache.spark.streaming.JavaAPISuite using java 8
+ * lambda syntax.
+ */
+@SuppressWarnings("unchecked")
+public class Java8APISuite extends LocalJavaStreamingContext implements Serializable {
+
+  @Test
+  public void testMap() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("hello", "world"),
+      Arrays.asList("goodnight", "moon"));
+
+    List<List<Integer>> expected = Arrays.asList(
+      Arrays.asList(5, 5),
+      Arrays.asList(9, 4));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<Integer> letterCount = stream.map(String::length);
+    JavaTestUtils.attachTestOutputStream(letterCount);
+    List<List<Integer>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    assertOrderInvariantEquals(expected, result);
+  }
+
+  @Test
+  public void testFilter() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("giants", "dodgers"),
+      Arrays.asList("yankees", "red sox"));
+
+    List<List<String>> expected = Arrays.asList(
+      Arrays.asList("giants"),
+      Arrays.asList("yankees"));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<String> filtered = stream.filter(s -> s.contains("a"));
+    JavaTestUtils.attachTestOutputStream(filtered);
+    List<List<String>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    assertOrderInvariantEquals(expected, result);
+  }
+
+  @Test
+  public void testMapPartitions() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("giants", "dodgers"),
+      Arrays.asList("yankees", "red sox"));
+
+    List<List<String>> expected = Arrays.asList(
+      Arrays.asList("GIANTSDODGERS"),
+      Arrays.asList("YANKEESRED SOX"));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<String> mapped = stream.mapPartitions(in -> {
+      String out = "";
+      while (in.hasNext()) {
+        out = out + in.next().toUpperCase();
+      }
+      return Lists.newArrayList(out).iterator();
+    });
+    JavaTestUtils.attachTestOutputStream(mapped);
+    List<List<String>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    Assert.assertEquals(expected, result);
+  }
+
+  @Test
+  public void testReduce() {
+    List<List<Integer>> inputData = Arrays.asList(
+      Arrays.asList(1, 2, 3),
+      Arrays.asList(4, 5, 6),
+      Arrays.asList(7, 8, 9));
+
+    List<List<Integer>> expected = Arrays.asList(
+      Arrays.asList(6),
+      Arrays.asList(15),
+      Arrays.asList(24));
+
+    JavaDStream<Integer> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<Integer> reduced = stream.reduce((x, y) -> x + y);
+    JavaTestUtils.attachTestOutputStream(reduced);
+    List<List<Integer>> result =
[2/2] spark git commit: [SPARK-15633][MINOR] Make package name for Java tests consistent
[SPARK-15633][MINOR] Make package name for Java tests consistent

## What changes were proposed in this pull request?

This is a simple patch that makes package names for Java 8 test suites consistent. I moved everything to test.org.apache.spark so we can test package private APIs properly. Also added "java8" as the package name so we can easily run all the tests related to Java 8.

## How was this patch tested?

This is a test only change.

Author: Reynold Xin

Closes #13364 from rxin/SPARK-15633.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/73178c75
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/73178c75
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/73178c75

Branch: refs/heads/master
Commit: 73178c75565e20f53e6ee1478f3d976732c64438
Parents: 9893dc9
Author: Reynold Xin
Authored: Fri May 27 21:20:02 2016 -0700
Committer: Reynold Xin
Committed: Fri May 27 21:20:02 2016 -0700

--
 .../scala/org/apache/spark/SparkFunSuite.scala  |   2 +-
 .../java/org/apache/spark/Java8APISuite.java    | 393
 .../spark/sql/Java8DatasetAggregatorSuite.java  |  61 --
 .../apache/spark/streaming/Java8APISuite.java   | 909 --
 .../apache/spark/java8/Java8RDDAPISuite.java    | 395
 .../spark/java8/dstream/Java8APISuite.java      | 910 +++
 .../java8/sql/Java8DatasetAggregatorSuite.java  |  62 ++
 .../scala/org/apache/spark/JDK8ScalaSuite.scala |  27 -
 .../org/apache/spark/java8/JDK8ScalaSuite.scala |  30 +
 .../spark/sql/JavaDatasetAggregatorSuite.java   | 134 +++
 .../sql/JavaDatasetAggregatorSuiteBase.java     |  75 ++
 .../org/apache/spark/sql/JavaSaveLoadSuite.java | 106 +++
 .../sql/sources/JavaDatasetAggregatorSuite.java | 134 ---
 .../sources/JavaDatasetAggregatorSuiteBase.java |  75 --
 .../spark/sql/sources/JavaSaveLoadSuite.java    | 106 ---
 15 files changed, 1713 insertions(+), 1706 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/73178c75/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
index 0081bca..cd87680 100644
--- a/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
+++ b/core/src/test/scala/org/apache/spark/SparkFunSuite.scala
@@ -26,7 +26,7 @@ import org.apache.spark.util.AccumulatorContext
 /**
  * Base abstract class for all unit tests in Spark for handling common functionality.
  */
-private[spark] abstract class SparkFunSuite
+abstract class SparkFunSuite
   extends FunSuite
   with BeforeAndAfterAll
   with Logging {

http://git-wip-us.apache.org/repos/asf/spark/blob/73178c75/external/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java
--
diff --git a/external/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java b/external/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java
deleted file mode 100644
index 6ac5ca9..000
--- a/external/java8-tests/src/test/java/org/apache/spark/Java8APISuite.java
+++ /dev/null
@@ -1,393 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark;
-
-import java.io.File;
-import java.io.Serializable;
-import java.util.*;
-
-import scala.Tuple2;
-
-import com.google.common.collect.Iterables;
-import com.google.common.io.Files;
-import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.SequenceFileOutputFormat;
-import org.junit.After;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import org.apache.spark.api.java.JavaDoubleRDD;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.Optional;
-import
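The one-line `SparkFunSuite` change above is what makes the move possible: `private[spark]` restricts access to the `org.apache.spark` package tree, and the new `test.org.apache.spark` root sits outside it. A small illustration of the rule (the class names here are made up for the sketch):

```scala
// file: VisibilitySketch.scala
package org.apache.spark {
  // Mirrors the old SparkFunSuite declaration.
  private[spark] abstract class Visible
}

package test.org.apache.spark {
  class Placeholder // stands in for a relocated test suite

  // This would NOT compile while Visible is private[spark], because
  // test.org.apache.spark is a different top-level package:
  // abstract class MySuite extends org.apache.spark.Visible
}
```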
[1/2] spark git commit: [SPARK-15633][MINOR] Make package name for Java tests consistent
Repository: spark
Updated Branches:
  refs/heads/master 9893dc975 -> 73178c755

http://git-wip-us.apache.org/repos/asf/spark/blob/73178c75/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
--
diff --git a/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java b/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
new file mode 100644
index 000..cf5607f
--- /dev/null
+++ b/external/java8-tests/src/test/java/test/org/apache/spark/java8/dstream/Java8APISuite.java
@@ -0,0 +1,910 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package test.org.apache.spark.java8.dstream;
+
+import java.io.Serializable;
+import java.util.*;
+
+import scala.Tuple2;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.spark.Accumulator;
+import org.apache.spark.HashPartitioner;
+import org.apache.spark.api.java.Optional;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.streaming.*;
+import org.apache.spark.streaming.api.java.JavaDStream;
+import org.apache.spark.streaming.api.java.JavaPairDStream;
+import org.apache.spark.streaming.api.java.JavaMapWithStateDStream;
+
+/**
+ * Most of these tests replicate org.apache.spark.streaming.JavaAPISuite using java 8
+ * lambda syntax.
+ */
+@SuppressWarnings("unchecked")
+public class Java8APISuite extends LocalJavaStreamingContext implements Serializable {
+
+  @Test
+  public void testMap() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("hello", "world"),
+      Arrays.asList("goodnight", "moon"));
+
+    List<List<Integer>> expected = Arrays.asList(
+      Arrays.asList(5, 5),
+      Arrays.asList(9, 4));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<Integer> letterCount = stream.map(String::length);
+    JavaTestUtils.attachTestOutputStream(letterCount);
+    List<List<Integer>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    assertOrderInvariantEquals(expected, result);
+  }
+
+  @Test
+  public void testFilter() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("giants", "dodgers"),
+      Arrays.asList("yankees", "red sox"));
+
+    List<List<String>> expected = Arrays.asList(
+      Arrays.asList("giants"),
+      Arrays.asList("yankees"));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<String> filtered = stream.filter(s -> s.contains("a"));
+    JavaTestUtils.attachTestOutputStream(filtered);
+    List<List<String>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    assertOrderInvariantEquals(expected, result);
+  }
+
+  @Test
+  public void testMapPartitions() {
+    List<List<String>> inputData = Arrays.asList(
+      Arrays.asList("giants", "dodgers"),
+      Arrays.asList("yankees", "red sox"));
+
+    List<List<String>> expected = Arrays.asList(
+      Arrays.asList("GIANTSDODGERS"),
+      Arrays.asList("YANKEESRED SOX"));
+
+    JavaDStream<String> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<String> mapped = stream.mapPartitions(in -> {
+      String out = "";
+      while (in.hasNext()) {
+        out = out + in.next().toUpperCase();
+      }
+      return Lists.newArrayList(out).iterator();
+    });
+    JavaTestUtils.attachTestOutputStream(mapped);
+    List<List<String>> result = JavaTestUtils.runStreams(ssc, 2, 2);
+
+    Assert.assertEquals(expected, result);
+  }
+
+  @Test
+  public void testReduce() {
+    List<List<Integer>> inputData = Arrays.asList(
+      Arrays.asList(1, 2, 3),
+      Arrays.asList(4, 5, 6),
+      Arrays.asList(7, 8, 9));
+
+    List<List<Integer>> expected = Arrays.asList(
+      Arrays.asList(6),
+      Arrays.asList(15),
+      Arrays.asList(24));
+
+    JavaDStream<Integer> stream = JavaTestUtils.attachTestInputStream(ssc, inputData, 1);
+    JavaDStream<Integer> reduced = stream.reduce((x, y) -> x + y);
+    JavaTestUtils.attachTestOutputStream(reduced);
+    List<List<Integer>> result =
spark git commit: [SPARK-15610][ML] update error message for k in pca
Repository: spark
Updated Branches:
  refs/heads/master 88c9c467a -> 9893dc975

[SPARK-15610][ML] update error message for k in pca

## What changes were proposed in this pull request?

Fix the wrong bound of `k` in `PCA`:
`require(k <= sources.first().size, ...` -> `require(k < sources.first().size`

BTW, remove unused import in `ml.ElementwiseProduct`

## How was this patch tested?

manual tests

Author: Zheng RuiFeng

Closes #13356 from zhengruifeng/fix_pca.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9893dc97
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9893dc97
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9893dc97

Branch: refs/heads/master
Commit: 9893dc975784551a62f65bbd709f8972e0204b2a
Parents: 88c9c46
Author: Zheng RuiFeng
Authored: Fri May 27 21:57:41 2016 -0500
Committer: Sean Owen
Committed: Fri May 27 21:57:41 2016 -0500

--
 .../scala/org/apache/spark/ml/feature/ElementwiseProduct.scala | 1 -
 mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala  | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/9893dc97/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
index 91989c3..9d2e60f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
@@ -23,7 +23,6 @@ import org.apache.spark.ml.linalg.{Vector, VectorUDT}
 import org.apache.spark.ml.param.Param
 import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
 import org.apache.spark.mllib.feature
-import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
 import org.apache.spark.mllib.linalg.VectorImplicits._
 import org.apache.spark.sql.types.DataType

http://git-wip-us.apache.org/repos/asf/spark/blob/9893dc97/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
index 30c403e..15b7220 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
@@ -40,8 +40,9 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
    */
   @Since("1.4.0")
   def fit(sources: RDD[Vector]): PCAModel = {
-    require(k <= sources.first().size,
-      s"source vector size is ${sources.first().size} must be greater than k=$k")
+    val numFeatures = sources.first().size
+    require(k <= numFeatures,
+      s"source vector size $numFeatures must be no less than k=$k")

     val mat = new RowMatrix(sources)
     val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k)
@@ -58,7 +59,6 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
       case m =>
         throw new IllegalArgumentException("Unsupported matrix format. Expected " +
           s"SparseMatrix or DenseMatrix. Instead got: ${m.getClass}")
-
     }
     val denseExplainedVariance = explainedVariance match {
       case dv: DenseVector =>

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
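For reference, the `require` that gets the reworded message sits on the public `mllib` PCA path. A hedged usage sketch (the object name and data are illustrative):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors

object PcaBoundExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("PcaBoundExample").setMaster("local[*]"))
    val data = sc.parallelize(Seq(
      Vectors.dense(1.0, 0.0, 3.0),
      Vectors.dense(2.0, 1.0, 0.0),
      Vectors.dense(4.0, 2.0, 1.0)))

    // k (2) must not exceed the number of features (3); the patch only
    // rewords the error raised when the require fails.
    val model = new PCA(2).fit(data)
    println(model.pc)

    // new PCA(4).fit(data) would now fail with
    // "source vector size 3 must be no less than k=4"
    sc.stop()
  }
}
```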
spark git commit: [SPARK-15610][ML] update error message for k in pca
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 6d82e0c1b -> 3801fb4f3

[SPARK-15610][ML] update error message for k in pca

## What changes were proposed in this pull request?

Fix the wrong bound of `k` in `PCA`:
`require(k <= sources.first().size, ...` -> `require(k < sources.first().size`

BTW, remove unused import in `ml.ElementwiseProduct`

## How was this patch tested?

manual tests

Author: Zheng RuiFeng

Closes #13356 from zhengruifeng/fix_pca.

(cherry picked from commit 9893dc975784551a62f65bbd709f8972e0204b2a)
Signed-off-by: Sean Owen

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3801fb4f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3801fb4f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3801fb4f

Branch: refs/heads/branch-2.0
Commit: 3801fb4f35ba1ffb8dbaf8326eff927b738551f2
Parents: 6d82e0c
Author: Zheng RuiFeng
Authored: Fri May 27 21:57:41 2016 -0500
Committer: Sean Owen
Committed: Fri May 27 21:57:48 2016 -0500

--
 .../scala/org/apache/spark/ml/feature/ElementwiseProduct.scala | 1 -
 mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala  | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/3801fb4f/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
index 91989c3..9d2e60f 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala
@@ -23,7 +23,6 @@ import org.apache.spark.ml.linalg.{Vector, VectorUDT}
 import org.apache.spark.ml.param.Param
 import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
 import org.apache.spark.mllib.feature
-import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
 import org.apache.spark.mllib.linalg.VectorImplicits._
 import org.apache.spark.sql.types.DataType

http://git-wip-us.apache.org/repos/asf/spark/blob/3801fb4f/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
index 30c403e..15b7220 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/PCA.scala
@@ -40,8 +40,9 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
    */
   @Since("1.4.0")
   def fit(sources: RDD[Vector]): PCAModel = {
-    require(k <= sources.first().size,
-      s"source vector size is ${sources.first().size} must be greater than k=$k")
+    val numFeatures = sources.first().size
+    require(k <= numFeatures,
+      s"source vector size $numFeatures must be no less than k=$k")

     val mat = new RowMatrix(sources)
     val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k)
@@ -58,7 +59,6 @@ class PCA @Since("1.4.0") (@Since("1.4.0") val k: Int) {
       case m =>
         throw new IllegalArgumentException("Unsupported matrix format. Expected " +
           s"SparseMatrix or DenseMatrix. Instead got: ${m.getClass}")
-
     }
     val denseExplainedVariance = explainedVariance match {
       case dv: DenseVector =>

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15562][ML] Delete temp directory after program exit in DataFrameExample
Repository: spark
Updated Branches:
  refs/heads/master 5d4dafe8f -> 88c9c467a

[SPARK-15562][ML] Delete temp directory after program exit in DataFrameExample

## What changes were proposed in this pull request?

The temp directory used to save records is not deleted after program exit in DataFrameExample. Although it called deleteOnExit, that doesn't work because the directory is not empty. Similar things happened in ContextCleanerSuite. Update the code to make sure the temp directory is deleted after program exit.

## How was this patch tested?

unit tests and local build.

Author: dding3

Closes #13328 from dding3/master.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/88c9c467
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/88c9c467
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/88c9c467

Branch: refs/heads/master
Commit: 88c9c467a31630c558719679ca0894873a268b27
Parents: 5d4dafe
Author: dding3
Authored: Fri May 27 21:01:50 2016 -0500
Committer: Sean Owen
Committed: Fri May 27 21:01:50 2016 -0500

--
 core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala   | 4 ++--
 .../scala/org/apache/spark/examples/ml/DataFrameExample.scala    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/88c9c467/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
index 69ff6c7..6724af9 100644
--- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
@@ -32,6 +32,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.{RDD, ReliableRDDCheckpointData}
 import org.apache.spark.shuffle.sort.SortShuffleManager
 import org.apache.spark.storage._
+import org.apache.spark.util.Utils

 /**
  * An abstract base class for context cleaner tests, which sets up a context with a config
@@ -206,8 +207,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
   }

   test("automatically cleanup normal checkpoint") {
-    val checkpointDir = java.io.File.createTempFile("temp", "")
-    checkpointDir.deleteOnExit()
+    val checkpointDir = Utils.createTempDir()
     checkpointDir.delete()
     var rdd = newPairRDD()
     sc.setCheckpointDir(checkpointDir.toString)

http://git-wip-us.apache.org/repos/asf/spark/blob/88c9c467/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
--
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
index c69027b..11faa61 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
@@ -28,6 +28,7 @@ import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import org.apache.spark.util.Utils

 /**
  * An example of how to use [[org.apache.spark.sql.DataFrame]] for ML. Run with
@@ -86,8 +87,7 @@ object DataFrameExample {
     println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

     // Save the records in a parquet file.
-    val tmpDir = Files.createTempDir()
-    tmpDir.deleteOnExit()
+    val tmpDir = Utils.createTempDir()
     val outputDir = new File(tmpDir, "dataframe").toString
     println(s"Saving to $outputDir as Parquet file.")
     df.write.parquet(outputDir)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
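`Utils.createTempDir` is Spark-internal, but the behaviour it provides over `deleteOnExit()` is easy to reproduce. A standalone sketch of the idea (the helper names here are ours, not Spark's exact implementation):

```scala
import java.io.File
import java.nio.file.Files

// File.deleteOnExit() only removes empty directories, so a temp dir that still
// holds output survives JVM exit. Registering a shutdown hook that deletes the
// tree recursively fixes that; this sketch mimics what Utils.createTempDir does.
object TempDirSketch {
  def deleteRecursively(f: File): Unit = {
    if (f.isDirectory) {
      Option(f.listFiles()).getOrElse(Array.empty[File]).foreach(deleteRecursively)
    }
    f.delete()
  }

  def createTempDir(prefix: String = "spark"): File = {
    val dir = Files.createTempDirectory(prefix).toFile
    sys.addShutdownHook(deleteRecursively(dir)) // runs even if dir is non-empty
    dir
  }

  def main(args: Array[String]): Unit = {
    val dir = createTempDir()
    Files.write(new File(dir, "part-00000").toPath, "data".getBytes("UTF-8"))
    println(s"wrote into $dir; it will be removed on exit")
  }
}
```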
spark git commit: [SPARK-15562][ML] Delete temp directory after program exit in DataFrameExample
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 8467e2102 -> 6d82e0c1b

[SPARK-15562][ML] Delete temp directory after program exit in DataFrameExample

## What changes were proposed in this pull request?

The temp directory used to save records is not deleted after program exit in DataFrameExample. Although it called deleteOnExit, that doesn't work because the directory is not empty. Similar things happened in ContextCleanerSuite. Update the code to make sure the temp directory is deleted after program exit.

## How was this patch tested?

unit tests and local build.

Author: dding3

Closes #13328 from dding3/master.

(cherry picked from commit 88c9c467a31630c558719679ca0894873a268b27)
Signed-off-by: Sean Owen

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d82e0c1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d82e0c1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d82e0c1

Branch: refs/heads/branch-2.0
Commit: 6d82e0c1b8b4368e91aeebfc80430a61762c7e88
Parents: 8467e21
Author: dding3
Authored: Fri May 27 21:01:50 2016 -0500
Committer: Sean Owen
Committed: Fri May 27 21:01:56 2016 -0500

--
 core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala   | 4 ++--
 .../scala/org/apache/spark/examples/ml/DataFrameExample.scala    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/6d82e0c1/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
index 69ff6c7..6724af9 100644
--- a/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
+++ b/core/src/test/scala/org/apache/spark/ContextCleanerSuite.scala
@@ -32,6 +32,7 @@ import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.{RDD, ReliableRDDCheckpointData}
 import org.apache.spark.shuffle.sort.SortShuffleManager
 import org.apache.spark.storage._
+import org.apache.spark.util.Utils

 /**
  * An abstract base class for context cleaner tests, which sets up a context with a config
@@ -206,8 +207,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase {
   }

   test("automatically cleanup normal checkpoint") {
-    val checkpointDir = java.io.File.createTempFile("temp", "")
-    checkpointDir.deleteOnExit()
+    val checkpointDir = Utils.createTempDir()
     checkpointDir.delete()
     var rdd = newPairRDD()
     sc.setCheckpointDir(checkpointDir.toString)

http://git-wip-us.apache.org/repos/asf/spark/blob/6d82e0c1/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
--
diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
index c69027b..11faa61 100644
--- a/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
+++ b/examples/src/main/scala/org/apache/spark/examples/ml/DataFrameExample.scala
@@ -28,6 +28,7 @@ import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import org.apache.spark.util.Utils

 /**
  * An example of how to use [[org.apache.spark.sql.DataFrame]] for ML. Run with
@@ -86,8 +87,7 @@ object DataFrameExample {
     println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

     // Save the records in a parquet file.
-    val tmpDir = Files.createTempDir()
-    tmpDir.deleteOnExit()
+    val tmpDir = Utils.createTempDir()
     val outputDir = new File(tmpDir, "dataframe").toString
     println(s"Saving to $outputDir as Parquet file.")
     df.write.parquet(outputDir)

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 80a40e8e2 -> 8467e2102

[SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue

## What changes were proposed in this pull request?

In the MLlib naive Bayes example, the Scala and Python examples don't use libsvm data, but the Java example does. This changes the Scala and Python examples to use the same libsvm data as the Java example.

## How was this patch tested?

Manual tests

Author: wm...@hotmail.com

Closes #13301 from wangmiao1981/example.

(cherry picked from commit 5d4dafe8fdea49dcbd6b0e4c23e3791fa30c8911)
Signed-off-by: Sean Owen

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8467e210
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8467e210
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8467e210

Branch: refs/heads/branch-2.0
Commit: 8467e2102886da1cefb43f2aaa69864375fe91bc
Parents: 80a40e8
Author: wm...@hotmail.com
Authored: Fri May 27 20:59:24 2016 -0500
Committer: Sean Owen
Committed: Fri May 27 20:59:34 2016 -0500

--
 data/mllib/sample_naive_bayes_data.txt                 | 12
 .../spark/examples/mllib/JavaNaiveBayesExample.java    |  4 ++--
 examples/src/main/python/mllib/naive_bayes_example.py  | 13 -
 .../spark/examples/mllib/NaiveBayesExample.scala       | 14 --
 4 files changed, 10 insertions(+), 33 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/8467e210/data/mllib/sample_naive_bayes_data.txt
--
diff --git a/data/mllib/sample_naive_bayes_data.txt b/data/mllib/sample_naive_bayes_data.txt
deleted file mode 100644
index bd22bea..000
--- a/data/mllib/sample_naive_bayes_data.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-0,1 0 0
-0,2 0 0
-0,3 0 0
-0,4 0 0
-1,0 1 0
-1,0 2 0
-1,0 3 0
-1,0 4 0
-2,0 0 1
-2,0 0 2
-2,0 0 3
-2,0 0 4
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/8467e210/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
--
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
index 2b17dbb..f4ec04b 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
@@ -36,9 +36,9 @@ public class JavaNaiveBayesExample {
     SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
     JavaSparkContext jsc = new JavaSparkContext(sparkConf);
     // $example on$
-    String path = "data/mllib/sample_naive_bayes_data.txt";
+    String path = "data/mllib/sample_libsvm_data.txt";
     JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
-    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4}, 12345);
+    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
     JavaRDD<LabeledPoint> training = tmp[0]; // training set
     JavaRDD<LabeledPoint> test = tmp[1]; // test set
     final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);

http://git-wip-us.apache.org/repos/asf/spark/blob/8467e210/examples/src/main/python/mllib/naive_bayes_example.py
--
diff --git a/examples/src/main/python/mllib/naive_bayes_example.py b/examples/src/main/python/mllib/naive_bayes_example.py
index 35724f7..749353b 100644
--- a/examples/src/main/python/mllib/naive_bayes_example.py
+++ b/examples/src/main/python/mllib/naive_bayes_example.py
@@ -29,15 +29,9 @@ import shutil

 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils

-
-def parseLine(line):
-    parts = line.split(',')
-    label = float(parts[0])
-    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
-    return LabeledPoint(label, features)
 # $example off$

 if __name__ == "__main__":
@@ -45,10 +39,11 @@ if __name__ == "__main__":

     sc = SparkContext(appName="PythonNaiveBayesExample")

     # $example on$
-    data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)
+    # Load and parse the data file.
+    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

     # Split data approximately into training (60%) and test (40%)
-    training, test = data.randomSplit([0.6, 0.4], seed=0)
+    training, test = data.randomSplit([0.6, 0.4])

     # Train a naive Bayes model.
     model = NaiveBayes.train(training, 1.0)
spark git commit: [SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue
Repository: spark
Updated Branches:
  refs/heads/master 4a2fb8b87 -> 5d4dafe8f

[SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue

## What changes were proposed in this pull request?

In the MLlib naive Bayes example, the Scala and Python examples don't use libsvm data, but the Java example does. This changes the Scala and Python examples to use the same libsvm data as the Java example.

## How was this patch tested?

Manual tests

Author: wm...@hotmail.com

Closes #13301 from wangmiao1981/example.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5d4dafe8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5d4dafe8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5d4dafe8

Branch: refs/heads/master
Commit: 5d4dafe8fdea49dcbd6b0e4c23e3791fa30c8911
Parents: 4a2fb8b
Author: wm...@hotmail.com
Authored: Fri May 27 20:59:24 2016 -0500
Committer: Sean Owen
Committed: Fri May 27 20:59:24 2016 -0500

--
 data/mllib/sample_naive_bayes_data.txt                 | 12
 .../spark/examples/mllib/JavaNaiveBayesExample.java    |  4 ++--
 examples/src/main/python/mllib/naive_bayes_example.py  | 13 -
 .../spark/examples/mllib/NaiveBayesExample.scala       | 14 --
 4 files changed, 10 insertions(+), 33 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/5d4dafe8/data/mllib/sample_naive_bayes_data.txt
--
diff --git a/data/mllib/sample_naive_bayes_data.txt b/data/mllib/sample_naive_bayes_data.txt
deleted file mode 100644
index bd22bea..000
--- a/data/mllib/sample_naive_bayes_data.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-0,1 0 0
-0,2 0 0
-0,3 0 0
-0,4 0 0
-1,0 1 0
-1,0 2 0
-1,0 3 0
-1,0 4 0
-2,0 0 1
-2,0 0 2
-2,0 0 3
-2,0 0 4
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/5d4dafe8/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
--
diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
index 2b17dbb..f4ec04b 100644
--- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
+++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
@@ -36,9 +36,9 @@ public class JavaNaiveBayesExample {
     SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
     JavaSparkContext jsc = new JavaSparkContext(sparkConf);
     // $example on$
-    String path = "data/mllib/sample_naive_bayes_data.txt";
+    String path = "data/mllib/sample_libsvm_data.txt";
     JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
-    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4}, 12345);
+    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
     JavaRDD<LabeledPoint> training = tmp[0]; // training set
     JavaRDD<LabeledPoint> test = tmp[1]; // test set
     final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);

http://git-wip-us.apache.org/repos/asf/spark/blob/5d4dafe8/examples/src/main/python/mllib/naive_bayes_example.py
--
diff --git a/examples/src/main/python/mllib/naive_bayes_example.py b/examples/src/main/python/mllib/naive_bayes_example.py
index 35724f7..749353b 100644
--- a/examples/src/main/python/mllib/naive_bayes_example.py
+++ b/examples/src/main/python/mllib/naive_bayes_example.py
@@ -29,15 +29,9 @@ import shutil

 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils

-
-def parseLine(line):
-    parts = line.split(',')
-    label = float(parts[0])
-    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
-    return LabeledPoint(label, features)
 # $example off$

 if __name__ == "__main__":
@@ -45,10 +39,11 @@ if __name__ == "__main__":

     sc = SparkContext(appName="PythonNaiveBayesExample")

     # $example on$
-    data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)
+    # Load and parse the data file.
+    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

     # Split data approximately into training (60%) and test (40%)
-    training, test = data.randomSplit([0.6, 0.4], seed=0)
+    training, test = data.randomSplit([0.6, 0.4])

     # Train a naive Bayes model.
     model = NaiveBayes.train(training, 1.0)
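The Scala example (its diff is not shown in full above) follows the same pattern as the Java and Python versions. A self-contained sketch of the updated flow, with an added accuracy check (the object name and the accuracy step are ours, not part of the commit):

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.util.MLUtils

object NaiveBayesLibSVMExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("NaiveBayesLibSVMExample").setMaster("local[*]"))

    // Load the same libsvm file the Java example uses, replacing the removed
    // sample_naive_bayes_data.txt and its hand-rolled parseLine helper.
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val Array(training, test) = data.randomSplit(Array(0.6, 0.4))

    val model = NaiveBayes.train(training, lambda = 1.0)
    val accuracy = test.map(p => (model.predict(p.features), p.label))
      .filter { case (pred, label) => pred == label }
      .count().toDouble / test.count()
    println(s"Test accuracy: $accuracy")

    sc.stop()
  }
}
```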
spark git commit: [SPARK-15594][SQL] ALTER TABLE SERDEPROPERTIES does not respect partition spec
Repository: spark
Updated Branches:
  refs/heads/master 776d183c8 -> 4a2fb8b87

[SPARK-15594][SQL] ALTER TABLE SERDEPROPERTIES does not respect partition spec

## What changes were proposed in this pull request?

These commands ignore the partition spec and change the storage properties of the table itself:
```
ALTER TABLE table_name PARTITION (a=1, b=2) SET SERDE 'my_serde'
ALTER TABLE table_name PARTITION (a=1, b=2) SET SERDEPROPERTIES ('key1'='val1')
```
Now they change the storage properties of the specified partition.

## How was this patch tested?

DDLSuite

Author: Andrew Or

Closes #13343 from andrewor14/alter-table-serdeproperties.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4a2fb8b8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4a2fb8b8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4a2fb8b8

Branch: refs/heads/master
Commit: 4a2fb8b87ca4517e0f4a1d7a1a1b3c08c1c1294d
Parents: 776d183
Author: Andrew Or
Authored: Fri May 27 17:27:24 2016 -0700
Committer: Yin Huai
Committed: Fri May 27 17:27:24 2016 -0700

--
 .../spark/sql/execution/command/ddl.scala      | 26 ++--
 .../spark/sql/execution/command/DDLSuite.scala | 64
 2 files changed, 84 insertions(+), 6 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/4a2fb8b8/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 95bac94..5fd0b83 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -293,7 +293,7 @@ case class AlterTableSerDePropertiesCommand(
     tableName: TableIdentifier,
     serdeClassName: Option[String],
     serdeProperties: Option[Map[String, String]],
-    partition: Option[Map[String, String]])
+    partSpec: Option[TablePartitionSpec])
   extends RunnableCommand {

   // should never happen if we parsed things correctly
@@ -306,15 +306,29 @@ case class AlterTableSerDePropertiesCommand(
       "ALTER TABLE SERDEPROPERTIES")
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
-    // Do not support setting serde for datasource tables
+    // For datasource tables, disallow setting serde or specifying partition
+    if (partSpec.isDefined && DDLUtils.isDatasourceTable(table)) {
+      throw new AnalysisException("Operation not allowed: ALTER TABLE SET " +
+        "[SERDE | SERDEPROPERTIES] for a specific partition is not supported " +
+        "for tables created with the datasource API")
+    }
     if (serdeClassName.isDefined && DDLUtils.isDatasourceTable(table)) {
       throw new AnalysisException("Operation not allowed: ALTER TABLE SET SERDE is " +
         "not supported for tables created with the datasource API")
     }
-    val newTable = table.withNewStorage(
-      serde = serdeClassName.orElse(table.storage.serde),
-      serdeProperties = table.storage.serdeProperties ++ serdeProperties.getOrElse(Map()))
-    catalog.alterTable(newTable)
+    if (partSpec.isEmpty) {
+      val newTable = table.withNewStorage(
+        serde = serdeClassName.orElse(table.storage.serde),
+        serdeProperties = table.storage.serdeProperties ++ serdeProperties.getOrElse(Map()))
+      catalog.alterTable(newTable)
+    } else {
+      val spec = partSpec.get
+      val part = catalog.getPartition(tableName, spec)
+      val newPart = part.copy(storage = part.storage.copy(
+        serde = serdeClassName.orElse(part.storage.serde),
+        serdeProperties = part.storage.serdeProperties ++ serdeProperties.getOrElse(Map())))
+      catalog.alterPartitions(tableName, Seq(newPart))
+    }
     Seq.empty[Row]
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/4a2fb8b8/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index ccb4006..5d45cfb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -538,6 +538,14 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
     testSetSerde(isDatasourceTable = true)
   }

+  test("alter table: set serde
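A hypothetical walk-through of the fixed behaviour against a partitioned Hive table (the table name, column names, and property key below are illustrative only):

```scala
import org.apache.spark.sql.SparkSession

object AlterSerdePropsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("AlterSerdePropsExample")
      .enableHiveSupport()
      .getOrCreate()

    spark.sql("CREATE TABLE logs (msg STRING) PARTITIONED BY (a INT, b INT)")
    spark.sql("ALTER TABLE logs ADD PARTITION (a=1, b=2)")

    // Before the patch this silently rewrote the storage properties of the
    // table itself; with the fix it updates only partition (a=1, b=2).
    spark.sql(
      "ALTER TABLE logs PARTITION (a=1, b=2) SET SERDEPROPERTIES ('key1'='val1')")

    spark.stop()
  }
}
```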
spark git commit: [SPARK-15594][SQL] ALTER TABLE SERDEPROPERTIES does not respect partition spec
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 dc6e94157 -> 80a40e8e2

[SPARK-15594][SQL] ALTER TABLE SERDEPROPERTIES does not respect partition spec

## What changes were proposed in this pull request?

These commands ignore the partition spec and change the storage properties of the table itself:
```
ALTER TABLE table_name PARTITION (a=1, b=2) SET SERDE 'my_serde'
ALTER TABLE table_name PARTITION (a=1, b=2) SET SERDEPROPERTIES ('key1'='val1')
```
Now they change the storage properties of the specified partition.

## How was this patch tested?

DDLSuite

Author: Andrew Or

Closes #13343 from andrewor14/alter-table-serdeproperties.

(cherry picked from commit 4a2fb8b87ca4517e0f4a1d7a1a1b3c08c1c1294d)
Signed-off-by: Yin Huai

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/80a40e8e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/80a40e8e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/80a40e8e

Branch: refs/heads/branch-2.0
Commit: 80a40e8e2cc198c34dabbc431d4ca302319fbbad
Parents: dc6e941
Author: Andrew Or
Authored: Fri May 27 17:27:24 2016 -0700
Committer: Yin Huai
Committed: Fri May 27 17:27:38 2016 -0700

--
 .../spark/sql/execution/command/ddl.scala      | 26 ++--
 .../spark/sql/execution/command/DDLSuite.scala | 64
 2 files changed, 84 insertions(+), 6 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/80a40e8e/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
index 95bac94..5fd0b83 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala
@@ -293,7 +293,7 @@ case class AlterTableSerDePropertiesCommand(
     tableName: TableIdentifier,
     serdeClassName: Option[String],
     serdeProperties: Option[Map[String, String]],
-    partition: Option[Map[String, String]])
+    partSpec: Option[TablePartitionSpec])
   extends RunnableCommand {

   // should never happen if we parsed things correctly
@@ -306,15 +306,29 @@ case class AlterTableSerDePropertiesCommand(
       "ALTER TABLE SERDEPROPERTIES")
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
-    // Do not support setting serde for datasource tables
+    // For datasource tables, disallow setting serde or specifying partition
+    if (partSpec.isDefined && DDLUtils.isDatasourceTable(table)) {
+      throw new AnalysisException("Operation not allowed: ALTER TABLE SET " +
+        "[SERDE | SERDEPROPERTIES] for a specific partition is not supported " +
+        "for tables created with the datasource API")
+    }
     if (serdeClassName.isDefined && DDLUtils.isDatasourceTable(table)) {
       throw new AnalysisException("Operation not allowed: ALTER TABLE SET SERDE is " +
         "not supported for tables created with the datasource API")
     }
-    val newTable = table.withNewStorage(
-      serde = serdeClassName.orElse(table.storage.serde),
-      serdeProperties = table.storage.serdeProperties ++ serdeProperties.getOrElse(Map()))
-    catalog.alterTable(newTable)
+    if (partSpec.isEmpty) {
+      val newTable = table.withNewStorage(
+        serde = serdeClassName.orElse(table.storage.serde),
+        serdeProperties = table.storage.serdeProperties ++ serdeProperties.getOrElse(Map()))
+      catalog.alterTable(newTable)
+    } else {
+      val spec = partSpec.get
+      val part = catalog.getPartition(tableName, spec)
+      val newPart = part.copy(storage = part.storage.copy(
+        serde = serdeClassName.orElse(part.storage.serde),
+        serdeProperties = part.storage.serdeProperties ++ serdeProperties.getOrElse(Map())))
+      catalog.alterPartitions(tableName, Seq(newPart))
+    }
     Seq.empty[Row]
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/80a40e8e/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index ccb4006..5d45cfb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -538,6 +538,14 @@ class DDLSuite extends QueryTest with
spark git commit: [SPARK-9876][SQL] Update Parquet to 1.8.1.
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 9c137b2e3 -> dc6e94157

[SPARK-9876][SQL] Update Parquet to 1.8.1.

## What changes were proposed in this pull request?

This includes minimal changes to get Spark using the current release of Parquet, 1.8.1.

## How was this patch tested?

This uses the existing Parquet tests.

Author: Ryan Blue

Closes #13280 from rdblue/SPARK-9876-update-parquet.

(cherry picked from commit 776d183c82b424ef7c3cae30537d8afe9b9eee83)
Signed-off-by: Cheng Lian

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dc6e9415
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dc6e9415
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dc6e9415

Branch: refs/heads/branch-2.0
Commit: dc6e94157ce08df91aa1a31db8e5ec733a1ab0c5
Parents: 9c137b2
Author: Ryan Blue
Authored: Fri May 27 16:59:38 2016 -0700
Committer: Cheng Lian
Committed: Fri May 27 16:59:50 2016 -0700

--
 dev/deps/spark-deps-hadoop-2.2                  | 11 ++-
 dev/deps/spark-deps-hadoop-2.3                  | 11 ++-
 dev/deps/spark-deps-hadoop-2.4                  | 11 ++-
 dev/deps/spark-deps-hadoop-2.6                  | 11 ++-
 dev/deps/spark-deps-hadoop-2.7                  | 11 ++-
 pom.xml                                         |  2 +-
 .../SpecificParquetRecordReaderBase.java        | 20 +++--
 .../parquet/CatalystReadSupport.scala           | 12 ++-
 .../parquet/CatalystSchemaConverter.scala       | 16
 .../datasources/parquet/ParquetFilters.scala    | 83
 .../parquet/ParquetSchemaSuite.scala            | 20 +++--
 11 files changed, 91 insertions(+), 117 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/dc6e9415/dev/deps/spark-deps-hadoop-2.2
--
diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2
index 578691c..deec033 100644
--- a/dev/deps/spark-deps-hadoop-2.2
+++ b/dev/deps/spark-deps-hadoop-2.2
@@ -129,14 +129,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.7.0.jar
-parquet-common-1.7.0.jar
-parquet-encoding-1.7.0.jar
+parquet-column-1.8.1.jar
+parquet-common-1.8.1.jar
+parquet-encoding-1.8.1.jar
 parquet-format-2.3.0-incubating.jar
-parquet-generator-1.7.0.jar
-parquet-hadoop-1.7.0.jar
+parquet-hadoop-1.8.1.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.7.0.jar
+parquet-jackson-1.8.1.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/dc6e9415/dev/deps/spark-deps-hadoop-2.3
--
diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3
index fc6306f..43c7dd3 100644
--- a/dev/deps/spark-deps-hadoop-2.3
+++ b/dev/deps/spark-deps-hadoop-2.3
@@ -136,14 +136,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.7.0.jar
-parquet-common-1.7.0.jar
-parquet-encoding-1.7.0.jar
+parquet-column-1.8.1.jar
+parquet-common-1.8.1.jar
+parquet-encoding-1.8.1.jar
 parquet-format-2.3.0-incubating.jar
-parquet-generator-1.7.0.jar
-parquet-hadoop-1.7.0.jar
+parquet-hadoop-1.8.1.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.7.0.jar
+parquet-jackson-1.8.1.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/dc6e9415/dev/deps/spark-deps-hadoop-2.4
--
diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4
index dee1417..7186b30 100644
--- a/dev/deps/spark-deps-hadoop-2.4
+++ b/dev/deps/spark-deps-hadoop-2.4
@@ -136,14 +136,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.7.0.jar
-parquet-common-1.7.0.jar
-parquet-encoding-1.7.0.jar
+parquet-column-1.8.1.jar
+parquet-common-1.8.1.jar
+parquet-encoding-1.8.1.jar
 parquet-format-2.3.0-incubating.jar
-parquet-generator-1.7.0.jar
-parquet-hadoop-1.7.0.jar
+parquet-hadoop-1.8.1.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.7.0.jar
+parquet-jackson-1.8.1.jar
 pmml-model-1.2.15.jar
 pmml-schema-1.2.15.jar
 protobuf-java-2.5.0.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/dc6e9415/dev/deps/spark-deps-hadoop-2.6
--
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 9695661..3e4ed74 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -144,14 +144,13 @@ opencsv-2.3.jar
 oro-2.0.8.jar
spark git commit: [SPARK-9876][SQL] Update Parquet to 1.8.1.
Repository: spark Updated Branches: refs/heads/master 019afd9c7 -> 776d183c8 [SPARK-9876][SQL] Update Parquet to 1.8.1. ## What changes were proposed in this pull request? This includes minimal changes to get Spark using the current release of Parquet, 1.8.1. ## How was this patch tested? This uses the existing Parquet tests. Author: Ryan BlueCloses #13280 from rdblue/SPARK-9876-update-parquet. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/776d183c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/776d183c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/776d183c Branch: refs/heads/master Commit: 776d183c82b424ef7c3cae30537d8afe9b9eee83 Parents: 019afd9 Author: Ryan Blue Authored: Fri May 27 16:59:38 2016 -0700 Committer: Cheng Lian Committed: Fri May 27 16:59:38 2016 -0700 -- dev/deps/spark-deps-hadoop-2.2 | 11 ++- dev/deps/spark-deps-hadoop-2.3 | 11 ++- dev/deps/spark-deps-hadoop-2.4 | 11 ++- dev/deps/spark-deps-hadoop-2.6 | 11 ++- dev/deps/spark-deps-hadoop-2.7 | 11 ++- pom.xml | 2 +- .../SpecificParquetRecordReaderBase.java| 20 +++-- .../parquet/CatalystReadSupport.scala | 12 ++- .../parquet/CatalystSchemaConverter.scala | 16 .../datasources/parquet/ParquetFilters.scala| 83 .../parquet/ParquetSchemaSuite.scala| 20 +++-- 11 files changed, 91 insertions(+), 117 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/776d183c/dev/deps/spark-deps-hadoop-2.2 -- diff --git a/dev/deps/spark-deps-hadoop-2.2 b/dev/deps/spark-deps-hadoop-2.2 index 578691c..deec033 100644 --- a/dev/deps/spark-deps-hadoop-2.2 +++ b/dev/deps/spark-deps-hadoop-2.2 @@ -129,14 +129,13 @@ opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar -parquet-column-1.7.0.jar -parquet-common-1.7.0.jar -parquet-encoding-1.7.0.jar +parquet-column-1.8.1.jar +parquet-common-1.8.1.jar +parquet-encoding-1.8.1.jar parquet-format-2.3.0-incubating.jar -parquet-generator-1.7.0.jar -parquet-hadoop-1.7.0.jar +parquet-hadoop-1.8.1.jar parquet-hadoop-bundle-1.6.0.jar -parquet-jackson-1.7.0.jar +parquet-jackson-1.8.1.jar pmml-model-1.2.15.jar pmml-schema-1.2.15.jar protobuf-java-2.5.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/776d183c/dev/deps/spark-deps-hadoop-2.3 -- diff --git a/dev/deps/spark-deps-hadoop-2.3 b/dev/deps/spark-deps-hadoop-2.3 index fc6306f..43c7dd3 100644 --- a/dev/deps/spark-deps-hadoop-2.3 +++ b/dev/deps/spark-deps-hadoop-2.3 @@ -136,14 +136,13 @@ opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar -parquet-column-1.7.0.jar -parquet-common-1.7.0.jar -parquet-encoding-1.7.0.jar +parquet-column-1.8.1.jar +parquet-common-1.8.1.jar +parquet-encoding-1.8.1.jar parquet-format-2.3.0-incubating.jar -parquet-generator-1.7.0.jar -parquet-hadoop-1.7.0.jar +parquet-hadoop-1.8.1.jar parquet-hadoop-bundle-1.6.0.jar -parquet-jackson-1.7.0.jar +parquet-jackson-1.8.1.jar pmml-model-1.2.15.jar pmml-schema-1.2.15.jar protobuf-java-2.5.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/776d183c/dev/deps/spark-deps-hadoop-2.4 -- diff --git a/dev/deps/spark-deps-hadoop-2.4 b/dev/deps/spark-deps-hadoop-2.4 index dee1417..7186b30 100644 --- a/dev/deps/spark-deps-hadoop-2.4 +++ b/dev/deps/spark-deps-hadoop-2.4 @@ -136,14 +136,13 @@ opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar -parquet-column-1.7.0.jar -parquet-common-1.7.0.jar -parquet-encoding-1.7.0.jar +parquet-column-1.8.1.jar +parquet-common-1.8.1.jar +parquet-encoding-1.8.1.jar 
parquet-format-2.3.0-incubating.jar -parquet-generator-1.7.0.jar -parquet-hadoop-1.7.0.jar +parquet-hadoop-1.8.1.jar parquet-hadoop-bundle-1.6.0.jar -parquet-jackson-1.7.0.jar +parquet-jackson-1.8.1.jar pmml-model-1.2.15.jar pmml-schema-1.2.15.jar protobuf-java-2.5.0.jar http://git-wip-us.apache.org/repos/asf/spark/blob/776d183c/dev/deps/spark-deps-hadoop-2.6 -- diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6 index 9695661..3e4ed74 100644 --- a/dev/deps/spark-deps-hadoop-2.6 +++ b/dev/deps/spark-deps-hadoop-2.6 @@ -144,14 +144,13 @@ opencsv-2.3.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar -parquet-column-1.7.0.jar -parquet-common-1.7.0.jar -parquet-encoding-1.7.0.jar
spark git commit: [SPARK-15431][SQL][BRANCH-2.0-TEST] rework the clisuite test cases
Repository: spark Updated Branches: refs/heads/master 21b2605dc -> 019afd9c7 [SPARK-15431][SQL][BRANCH-2.0-TEST] rework the clisuite test cases ## What changes were proposed in this pull request? This PR reworks on the CliSuite test cases for `LIST FILES/JARS` commands. CC yhuai Thanks! Author: Xin WuCloses #13361 from xwu0226/SPARK-15431-clisuite-new. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/019afd9c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/019afd9c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/019afd9c Branch: refs/heads/master Commit: 019afd9c78a9f40e1d07f0a74868010206e90ed5 Parents: 21b2605 Author: Xin Wu Authored: Fri May 27 14:07:12 2016 -0700 Committer: Yin Huai Committed: Fri May 27 14:07:12 2016 -0700 -- .../spark/sql/hive/thriftserver/CliSuite.scala | 37 ++-- 1 file changed, 26 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/019afd9c/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala -- diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala index 656fe97..75535ca 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala @@ -62,13 +62,13 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { /** * Run a CLI operation and expect all the queries and expected answers to be returned. + * * @param timeout maximum time for the commands to complete * @param extraArgs any extra arguments * @param errorResponses a sequence of strings whose presence in the stdout of the forked process * is taken as an immediate error condition. That is: if a line containing * with one of these strings is found, fail the test immediately. * The default value is `Seq("Error:")` - * * @param queriesAndExpectedAnswers one or more tuples of query + answer */ def runCliWithin( @@ -239,22 +239,37 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { "" -> "This is a test for Spark-11624") } - ignore("list jars") { + test("list jars") { val jarFile = Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar") runCliWithin(2.minute)( - s"ADD JAR $jarFile" -> "", - s"LIST JARS" -> "TestUDTF.jar", - s"List JAR $jarFile" -> "TestUDTF.jar" + s"ADD JAR $jarFile;" -> "", + s"LIST JARS;" -> "TestUDTF.jar" +) + } + + test("list jar ") { +val jarFile = Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar") +runCliWithin(2.minute)( + s"ADD JAR $jarFile;" -> "", + s"List JAR $jarFile;" -> "TestUDTF.jar" +) + } + + test("list files") { +val dataFilePath = Thread.currentThread(). + getContextClassLoader.getResource("data/files/small_kv.txt") +runCliWithin(2.minute)( + s"ADD FILE $dataFilePath;" -> "", + s"LIST FILES;" -> "small_kv.txt" ) } - ignore("list files") { -val dataFilePath = Thread.currentThread().getContextClassLoader - .getResource("data/files/small_kv.txt") + test("list file ") { +val dataFilePath = Thread.currentThread(). 
+ getContextClassLoader.getResource("data/files/small_kv.txt") runCliWithin(2.minute)( - s"ADD FILE $dataFilePath" -> "", - s"LIST FILES" -> "small_kv.txt", - s"LIST FILE $dataFilePath" -> "small_kv.txt" + s"ADD FILE $dataFilePath;" -> "", + s"LIST FILE $dataFilePath;" -> "small_kv.txt" ) } }
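As a side note on what these reworked tests exercise: the same resource-management commands should also be reachable through a SparkSession's SQL interface, not only through the forked spark-sql CLI. A minimal sketch, assuming the LIST JARS/FILES statements parse identically outside the thrift-server CLI; the paths are placeholders and `spark` is an existing SparkSession:

    // Sketch of the commands the tests cover, issued via spark.sql instead
    // of the forked CLI process. Paths below are illustrative placeholders.
    spark.sql("ADD JAR /tmp/TestUDTF.jar")
    spark.sql("LIST JARS").show(truncate = false)   // registered jars
    spark.sql("ADD FILE /tmp/data/files/small_kv.txt")
    spark.sql("LIST FILE /tmp/data/files/small_kv.txt").show(truncate = false)

The trailing semicolons the tests now send (e.g. `ADD JAR $jarFile;`) are the CLI's statement terminator; they are not part of the commands themselves.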
spark git commit: [SPARK-15431][SQL][BRANCH-2.0-TEST] rework the clisuite test cases
Repository: spark Updated Branches: refs/heads/branch-2.0 dcf498e8a -> 9c137b2e3 [SPARK-15431][SQL][BRANCH-2.0-TEST] rework the clisuite test cases ## What changes were proposed in this pull request? This PR reworks on the CliSuite test cases for `LIST FILES/JARS` commands. CC yhuai Thanks! Author: Xin WuCloses #13361 from xwu0226/SPARK-15431-clisuite-new. (cherry picked from commit 019afd9c78a9f40e1d07f0a74868010206e90ed5) Signed-off-by: Yin Huai Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9c137b2e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9c137b2e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9c137b2e Branch: refs/heads/branch-2.0 Commit: 9c137b2e361ad80845dbf086c173bc430c53d2a2 Parents: dcf498e Author: Xin Wu Authored: Fri May 27 14:07:12 2016 -0700 Committer: Yin Huai Committed: Fri May 27 14:07:27 2016 -0700 -- .../spark/sql/hive/thriftserver/CliSuite.scala | 37 ++-- 1 file changed, 26 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9c137b2e/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala -- diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala index 656fe97..75535ca 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala @@ -62,13 +62,13 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { /** * Run a CLI operation and expect all the queries and expected answers to be returned. + * * @param timeout maximum time for the commands to complete * @param extraArgs any extra arguments * @param errorResponses a sequence of strings whose presence in the stdout of the forked process * is taken as an immediate error condition. That is: if a line containing * with one of these strings is found, fail the test immediately. * The default value is `Seq("Error:")` - * * @param queriesAndExpectedAnswers one or more tuples of query + answer */ def runCliWithin( @@ -239,22 +239,37 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging { "" -> "This is a test for Spark-11624") } - ignore("list jars") { + test("list jars") { val jarFile = Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar") runCliWithin(2.minute)( - s"ADD JAR $jarFile" -> "", - s"LIST JARS" -> "TestUDTF.jar", - s"List JAR $jarFile" -> "TestUDTF.jar" + s"ADD JAR $jarFile;" -> "", + s"LIST JARS;" -> "TestUDTF.jar" +) + } + + test("list jar ") { +val jarFile = Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar") +runCliWithin(2.minute)( + s"ADD JAR $jarFile;" -> "", + s"List JAR $jarFile;" -> "TestUDTF.jar" +) + } + + test("list files") { +val dataFilePath = Thread.currentThread(). + getContextClassLoader.getResource("data/files/small_kv.txt") +runCliWithin(2.minute)( + s"ADD FILE $dataFilePath;" -> "", + s"LIST FILES;" -> "small_kv.txt" ) } - ignore("list files") { -val dataFilePath = Thread.currentThread().getContextClassLoader - .getResource("data/files/small_kv.txt") + test("list file ") { +val dataFilePath = Thread.currentThread(). 
+ getContextClassLoader.getResource("data/files/small_kv.txt") runCliWithin(2.minute)( - s"ADD FILE $dataFilePath" -> "", - s"LIST FILES" -> "small_kv.txt", - s"LIST FILE $dataFilePath" -> "small_kv.txt" + s"ADD FILE $dataFilePath;" -> "", + s"LIST FILE $dataFilePath;" -> "small_kv.txt" ) } }
spark git commit: [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix
Repository: spark Updated Branches: refs/heads/branch-2.0 a778d3c90 -> dcf498e8a [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix ## What changes were proposed in this pull request? We're using `asML` to convert the mllib vector/matrix to ml vector/matrix now. Using `as` is more correct given that this conversion actually shares the same underline data structure. As a result, in this PR, `toBreeze` will be changed to `asBreeze`. This is a private API, as a result, it will not affect any user's application. ## How was this patch tested? unit tests Author: DB TsaiCloses #13198 from dbtsai/minor. (cherry picked from commit 21b2605dc4900894ea7a911e039781ecc2a18c14) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dcf498e8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dcf498e8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dcf498e8 Branch: refs/heads/branch-2.0 Commit: dcf498e8aafd2b53c5680cf7f3ada31829686b62 Parents: a778d3c Author: DB Tsai Authored: Fri May 27 14:02:39 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 14:02:51 2016 -0700 -- .../org/apache/spark/ml/linalg/Matrices.scala | 16 ++-- .../org/apache/spark/ml/linalg/Vectors.scala| 8 +++--- .../distribution/MultivariateGaussian.scala | 8 +++--- .../ml/linalg/BreezeMatrixConversionSuite.scala | 4 +-- .../ml/linalg/BreezeVectorConversionSuite.scala | 4 +-- .../apache/spark/ml/linalg/MatricesSuite.scala | 14 +-- .../apache/spark/ml/linalg/VectorsSuite.scala | 2 +- .../scala/org/apache/spark/ml/ann/Layer.scala | 8 +++--- .../ml/classification/LogisticRegression.scala | 2 +- .../spark/ml/clustering/GaussianMixture.scala | 2 +- .../apache/spark/ml/feature/MaxAbsScaler.scala | 2 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 2 +- .../ml/regression/AFTSurvivalRegression.scala | 2 +- .../spark/ml/regression/LinearRegression.scala | 2 +- .../apache/spark/mllib/classification/SVM.scala | 2 +- .../mllib/clustering/GaussianMixture.scala | 2 +- .../mllib/clustering/GaussianMixtureModel.scala | 4 +-- .../spark/mllib/clustering/LDAModel.scala | 26 ++-- .../spark/mllib/clustering/LDAOptimizer.scala | 6 ++--- .../mllib/clustering/StreamingKMeans.scala | 4 +-- .../apache/spark/mllib/linalg/Matrices.scala| 16 ++-- .../org/apache/spark/mllib/linalg/Vectors.scala | 8 +++--- .../mllib/linalg/distributed/BlockMatrix.scala | 8 +++--- .../mllib/linalg/distributed/RowMatrix.scala| 16 ++-- .../mllib/optimization/GradientDescent.scala| 4 +-- .../apache/spark/mllib/optimization/LBFGS.scala | 4 +-- .../spark/mllib/optimization/Updater.scala | 14 +-- .../apache/spark/mllib/regression/Lasso.scala | 2 +- .../mllib/regression/LinearRegression.scala | 2 +- .../mllib/regression/RidgeRegression.scala | 2 +- .../stat/correlation/PearsonCorrelation.scala | 2 +- .../distribution/MultivariateGaussian.scala | 8 +++--- .../spark/mllib/stat/test/ChiSqTest.scala | 2 +- .../ml/classification/NaiveBayesSuite.scala | 6 ++--- .../LogisticRegressionSuite.scala | 4 +-- .../mllib/classification/NaiveBayesSuite.scala | 4 +-- .../spark/mllib/clustering/LDASuite.scala | 4 +-- .../mllib/clustering/StreamingKMeansSuite.scala | 2 +- .../spark/mllib/feature/NormalizerSuite.scala | 16 ++-- .../linalg/BreezeMatrixConversionSuite.scala| 4 +-- .../linalg/BreezeVectorConversionSuite.scala| 4 +-- .../spark/mllib/linalg/MatricesSuite.scala | 14 +-- .../spark/mllib/linalg/VectorsSuite.scala | 2 +- 
.../linalg/distributed/BlockMatrixSuite.scala | 2 +- .../distributed/IndexedRowMatrixSuite.scala | 10 .../linalg/distributed/RowMatrixSuite.scala | 14 +-- .../spark/mllib/stat/CorrelationSuite.scala | 6 ++--- .../apache/spark/mllib/util/MLUtilsSuite.scala | 6 ++--- project/MimaExcludes.scala | 3 +++ 49 files changed, 156 insertions(+), 153 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dcf498e8/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala -- diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index a47526d..0ea687b 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@
spark git commit: [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix
Repository: spark Updated Branches: refs/heads/master 130b8d07b -> 21b2605dc [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix ## What changes were proposed in this pull request? We're using `asML` to convert the mllib vector/matrix to ml vector/matrix now. Using `as` is more correct given that this conversion actually shares the same underline data structure. As a result, in this PR, `toBreeze` will be changed to `asBreeze`. This is a private API, as a result, it will not affect any user's application. ## How was this patch tested? unit tests Author: DB TsaiCloses #13198 from dbtsai/minor. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/21b2605d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/21b2605d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/21b2605d Branch: refs/heads/master Commit: 21b2605dc4900894ea7a911e039781ecc2a18c14 Parents: 130b8d0 Author: DB Tsai Authored: Fri May 27 14:02:39 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 14:02:39 2016 -0700 -- .../org/apache/spark/ml/linalg/Matrices.scala | 16 ++-- .../org/apache/spark/ml/linalg/Vectors.scala| 8 +++--- .../distribution/MultivariateGaussian.scala | 8 +++--- .../ml/linalg/BreezeMatrixConversionSuite.scala | 4 +-- .../ml/linalg/BreezeVectorConversionSuite.scala | 4 +-- .../apache/spark/ml/linalg/MatricesSuite.scala | 14 +-- .../apache/spark/ml/linalg/VectorsSuite.scala | 2 +- .../scala/org/apache/spark/ml/ann/Layer.scala | 8 +++--- .../ml/classification/LogisticRegression.scala | 2 +- .../spark/ml/clustering/GaussianMixture.scala | 2 +- .../apache/spark/ml/feature/MaxAbsScaler.scala | 2 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 2 +- .../ml/regression/AFTSurvivalRegression.scala | 2 +- .../spark/ml/regression/LinearRegression.scala | 2 +- .../apache/spark/mllib/classification/SVM.scala | 2 +- .../mllib/clustering/GaussianMixture.scala | 2 +- .../mllib/clustering/GaussianMixtureModel.scala | 4 +-- .../spark/mllib/clustering/LDAModel.scala | 26 ++-- .../spark/mllib/clustering/LDAOptimizer.scala | 6 ++--- .../mllib/clustering/StreamingKMeans.scala | 4 +-- .../apache/spark/mllib/linalg/Matrices.scala| 16 ++-- .../org/apache/spark/mllib/linalg/Vectors.scala | 8 +++--- .../mllib/linalg/distributed/BlockMatrix.scala | 8 +++--- .../mllib/linalg/distributed/RowMatrix.scala| 16 ++-- .../mllib/optimization/GradientDescent.scala| 4 +-- .../apache/spark/mllib/optimization/LBFGS.scala | 4 +-- .../spark/mllib/optimization/Updater.scala | 14 +-- .../apache/spark/mllib/regression/Lasso.scala | 2 +- .../mllib/regression/LinearRegression.scala | 2 +- .../mllib/regression/RidgeRegression.scala | 2 +- .../stat/correlation/PearsonCorrelation.scala | 2 +- .../distribution/MultivariateGaussian.scala | 8 +++--- .../spark/mllib/stat/test/ChiSqTest.scala | 2 +- .../ml/classification/NaiveBayesSuite.scala | 6 ++--- .../LogisticRegressionSuite.scala | 4 +-- .../mllib/classification/NaiveBayesSuite.scala | 4 +-- .../spark/mllib/clustering/LDASuite.scala | 4 +-- .../mllib/clustering/StreamingKMeansSuite.scala | 2 +- .../spark/mllib/feature/NormalizerSuite.scala | 16 ++-- .../linalg/BreezeMatrixConversionSuite.scala| 4 +-- .../linalg/BreezeVectorConversionSuite.scala| 4 +-- .../spark/mllib/linalg/MatricesSuite.scala | 14 +-- .../spark/mllib/linalg/VectorsSuite.scala | 2 +- .../linalg/distributed/BlockMatrixSuite.scala | 2 +- .../distributed/IndexedRowMatrixSuite.scala | 10 .../linalg/distributed/RowMatrixSuite.scala | 
14 +-- .../spark/mllib/stat/CorrelationSuite.scala | 6 ++--- .../apache/spark/mllib/util/MLUtilsSuite.scala | 6 ++--- project/MimaExcludes.scala | 3 +++ 49 files changed, 156 insertions(+), 153 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/21b2605d/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala -- diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index a47526d..0ea687b 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -69,7 +69,7 @@ sealed trait Matrix extends Serializable { def rowIter: Iterator[Vector] = this.transpose.colIter /** Converts to
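A small illustration of the naming rationale in the description above: an `as`-style conversion is a view over the same underlying storage rather than a copy. This is a standalone sketch against Breeze directly, not the (private) `asBreeze` method itself:

    import breeze.linalg.{DenseVector => BDV}

    // Wrapping an existing array in a Breeze DenseVector shares the storage,
    // so a mutation through one handle is visible through the other,
    // i.e. "as" rather than "to" semantics.
    val values = Array(1.0, 2.0, 3.0)
    val view: BDV[Double] = new BDV(values)
    values(0) = 9.0
    assert(view(0) == 9.0)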
spark git commit: [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest
Repository: spark Updated Branches: refs/heads/branch-2.0 e6e2f293d -> a778d3c90 [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest ## What changes were proposed in this pull request? 1. Add `_transfer_param_map_to/from_java` for OneVsRest; 2. Add `_compare_params` in ml/tests.py to help compare params. 3. Add `test_onevsrest` as the integration test for OneVsRest. ## How was this patch tested? Python unit test. Author: yinxusenCloses #12875 from yinxusen/SPARK-15008. (cherry picked from commit 130b8d07b8eb08f2ad522081a95032b90247094d) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a778d3c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a778d3c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a778d3c9 Branch: refs/heads/branch-2.0 Commit: a778d3c90599eb76e6bca87b7aa3c0f9910f24c5 Parents: e6e2f29 Author: yinxusen Authored: Fri May 27 13:18:29 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 13:18:36 2016 -0700 -- python/pyspark/ml/tests.py | 69 +++-- 1 file changed, 46 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a778d3c9/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index a7c93ac..4358175 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -747,12 +747,32 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def _compare_params(self, m1, m2, param): +""" +Compare 2 ML Params instances for the given param, and assert both have the same param value +and parent. The param must be a parameter of m1. +""" +# Prevent key not found error in case of some param in neither paramMap nor defaultParamMap. +if m1.isDefined(param): +paramValue1 = m1.getOrDefault(param) +paramValue2 = m2.getOrDefault(m2.getParam(param.name)) +if isinstance(paramValue1, Params): +self._compare_pipelines(paramValue1, paramValue2) +else: +self.assertEqual(paramValue1, paramValue2) # for general types param +# Assert parents are equal +self.assertEqual(param.parent, m2.getParam(param.name).parent) +else: +# If m1 is not defined param, then m2 should not, too. See SPARK-14931. +self.assertFalse(m2.isDefined(m2.getParam(param.name))) + def _compare_pipelines(self, m1, m2): """ Compare 2 ML types, asserting that they are equivalent. 
This currently supports: - basic types - Pipeline, PipelineModel + - OneVsRest, OneVsRestModel This checks: - uid - type @@ -763,8 +783,7 @@ class PersistenceTest(SparkSessionTestCase): if isinstance(m1, JavaParams): self.assertEqual(len(m1.params), len(m2.params)) for p in m1.params: -self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p)) -self.assertEqual(p.parent, m2.getParam(p.name).parent) +self._compare_params(m1, m2, p) elif isinstance(m1, Pipeline): self.assertEqual(len(m1.getStages()), len(m2.getStages())) for s1, s2 in zip(m1.getStages(), m2.getStages()): @@ -773,6 +792,13 @@ class PersistenceTest(SparkSessionTestCase): self.assertEqual(len(m1.stages), len(m2.stages)) for s1, s2 in zip(m1.stages, m2.stages): self._compare_pipelines(s1, s2) +elif isinstance(m1, OneVsRest) or isinstance(m1, OneVsRestModel): +for p in m1.params: +self._compare_params(m1, m2, p) +if isinstance(m1, OneVsRestModel): +self.assertEqual(len(m1.models), len(m2.models)) +for x, y in zip(m1.models, m2.models): +self._compare_pipelines(x, y) else: raise RuntimeError("_compare_pipelines does not yet support type: %s" % type(m1)) @@ -833,6 +859,24 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def test_onevsrest(self): +temp_path = tempfile.mkdtemp() +df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), + (1.0, Vectors.sparse(2, [], [])), + (2.0, Vectors.dense(0.5, 0.5))] * 10, +["label", "features"]) +lr =
spark git commit: [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest
Repository: spark Updated Branches: refs/heads/master a3550e374 -> 130b8d07b [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest ## What changes were proposed in this pull request? 1. Add `_transfer_param_map_to/from_java` for OneVsRest; 2. Add `_compare_params` in ml/tests.py to help compare params. 3. Add `test_onevsrest` as the integration test for OneVsRest. ## How was this patch tested? Python unit test. Author: yinxusenCloses #12875 from yinxusen/SPARK-15008. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/130b8d07 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/130b8d07 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/130b8d07 Branch: refs/heads/master Commit: 130b8d07b8eb08f2ad522081a95032b90247094d Parents: a3550e3 Author: yinxusen Authored: Fri May 27 13:18:29 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 13:18:29 2016 -0700 -- python/pyspark/ml/tests.py | 69 +++-- 1 file changed, 46 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/130b8d07/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index a7c93ac..4358175 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -747,12 +747,32 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def _compare_params(self, m1, m2, param): +""" +Compare 2 ML Params instances for the given param, and assert both have the same param value +and parent. The param must be a parameter of m1. +""" +# Prevent key not found error in case of some param in neither paramMap nor defaultParamMap. +if m1.isDefined(param): +paramValue1 = m1.getOrDefault(param) +paramValue2 = m2.getOrDefault(m2.getParam(param.name)) +if isinstance(paramValue1, Params): +self._compare_pipelines(paramValue1, paramValue2) +else: +self.assertEqual(paramValue1, paramValue2) # for general types param +# Assert parents are equal +self.assertEqual(param.parent, m2.getParam(param.name).parent) +else: +# If m1 is not defined param, then m2 should not, too. See SPARK-14931. +self.assertFalse(m2.isDefined(m2.getParam(param.name))) + def _compare_pipelines(self, m1, m2): """ Compare 2 ML types, asserting that they are equivalent. 
This currently supports: - basic types - Pipeline, PipelineModel + - OneVsRest, OneVsRestModel This checks: - uid - type @@ -763,8 +783,7 @@ class PersistenceTest(SparkSessionTestCase): if isinstance(m1, JavaParams): self.assertEqual(len(m1.params), len(m2.params)) for p in m1.params: -self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p)) -self.assertEqual(p.parent, m2.getParam(p.name).parent) +self._compare_params(m1, m2, p) elif isinstance(m1, Pipeline): self.assertEqual(len(m1.getStages()), len(m2.getStages())) for s1, s2 in zip(m1.getStages(), m2.getStages()): @@ -773,6 +792,13 @@ class PersistenceTest(SparkSessionTestCase): self.assertEqual(len(m1.stages), len(m2.stages)) for s1, s2 in zip(m1.stages, m2.stages): self._compare_pipelines(s1, s2) +elif isinstance(m1, OneVsRest) or isinstance(m1, OneVsRestModel): +for p in m1.params: +self._compare_params(m1, m2, p) +if isinstance(m1, OneVsRestModel): +self.assertEqual(len(m1.models), len(m2.models)) +for x, y in zip(m1.models, m2.models): +self._compare_pipelines(x, y) else: raise RuntimeError("_compare_pipelines does not yet support type: %s" % type(m1)) @@ -833,6 +859,24 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def test_onevsrest(self): +temp_path = tempfile.mkdtemp() +df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), + (1.0, Vectors.sparse(2, [], [])), + (2.0, Vectors.dense(0.5, 0.5))] * 10, +["label", "features"]) +lr = LogisticRegression(maxIter=5, regParam=0.01) +ovr = OneVsRest(classifier=lr) +model = ovr.fit(df) +ovrPath = temp_path + "/ovr" +
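For readers coming from the Scala side, a rough equivalent of the fitting step in the new `test_onevsrest` test; `df` is assumed to be a DataFrame with "label" and "features" columns like the test data, and the parameter values mirror the test:

    import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}

    // Fit a one-vs-rest wrapper around binary logistic regression,
    // as the Python test does; `df` is assumed to already exist.
    val lr = new LogisticRegression().setMaxIter(5).setRegParam(0.01)
    val ovr = new OneVsRest().setClassifier(lr)
    val ovrModel = ovr.fit(df)
    println(s"trained ${ovrModel.models.length} binary models")  // one per class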
spark git commit: [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS
Repository: spark Updated Branches: refs/heads/branch-2.0 5dd1423f4 -> e6e2f293d [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS ## What changes were proposed in this pull request? * Document ```WeightedLeastSquares```(normal equation) and ```IterativelyReweightedLeastSquares```. * Copy ```L-BFGS``` documents from ```spark.mllib``` to ```spark.ml```. Due to the session ```Optimization of linear methods``` is used for developers, I think we should provide the brief introduction of the optimization method, necessary references and how it implements in Spark. It's not necessary to paste all mathematical formula and derivation here. If developers/users want to learn more, they can track reference. ## How was this patch tested? Document update, no tests. Author: Yanbo LiangCloses #13262 from yanboliang/spark-15484. (cherry picked from commit a3550e3747e21c79a5110132dc127ee83879062a) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e6e2f293 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e6e2f293 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e6e2f293 Branch: refs/heads/branch-2.0 Commit: e6e2f293d6830ce118050e789773a09b3888fd30 Parents: 5dd1423 Author: Yanbo Liang Authored: Fri May 27 13:16:22 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 13:16:37 2016 -0700 -- docs/ml-advanced.md | 85 ++-- .../IterativelyReweightedLeastSquares.scala | 2 +- 2 files changed, 81 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e6e2f293/docs/ml-advanced.md -- diff --git a/docs/ml-advanced.md b/docs/ml-advanced.md index 91731d7..1c5f844 100644 --- a/docs/ml-advanced.md +++ b/docs/ml-advanced.md @@ -4,10 +4,85 @@ title: Advanced topics - spark.ml displayTitle: Advanced topics - spark.ml --- -# Optimization of linear methods +* Table of contents +{:toc} + +`\[ +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\x}{\mathbf{x}} +\newcommand{\y}{\mathbf{y}} +\newcommand{\wv}{\mathbf{w}} +\newcommand{\av}{\mathbf{\alpha}} +\newcommand{\bv}{\mathbf{b}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} +\newcommand{\zero}{\mathbf{0}} +\]` + +# Optimization of linear methods (developer) + +## Limited-memory BFGS (L-BFGS) +[L-BFGS](http://en.wikipedia.org/wiki/Limited-memory_BFGS) is an optimization +algorithm in the family of quasi-Newton methods to solve the optimization problems of the form +`$\min_{\wv \in\R^d} \; f(\wv)$`. The L-BFGS method approximates the objective function locally as a +quadratic without evaluating the second partial derivatives of the objective function to construct the +Hessian matrix. The Hessian matrix is approximated by previous gradient evaluations, so there is no +vertical scalability issue (the number of training features) unlike computing the Hessian matrix +explicitly in Newton's method. As a result, L-BFGS often achieves faster convergence compared with +other first-order optimizations. -The optimization algorithm underlying the implementation is called [Orthant-Wise Limited-memory -QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) -(OWL-QN). It is an extension of L-BFGS that can effectively handle L1 -regularization and elastic net. 
+Quasi-Newton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) +(OWL-QN) is an extension of L-BFGS that can effectively handle L1 and elastic net regularization. + +L-BFGS is used as a solver for [LinearRegression](api/scala/index.html#org.apache.spark.ml.regression.LinearRegression), +[LogisticRegression](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression), +[AFTSurvivalRegression](api/scala/index.html#org.apache.spark.ml.regression.AFTSurvivalRegression) +and [MultilayerPerceptronClassifier](api/scala/index.html#org.apache.spark.ml.classification.MultilayerPerceptronClassifier). + +MLlib L-BFGS solver calls the corresponding implementation in [breeze](https://github.com/scalanlp/breeze/blob/master/math/src/main/scala/breeze/optimize/LBFGS.scala). + +## Normal equation solver for weighted least squares + +MLlib implements normal equation solver for [weighted least squares](https://en.wikipedia.org/wiki/Least_squares#Weighted_least_squares) by
spark git commit: [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS
Repository: spark Updated Branches: refs/heads/master c96244f5a -> a3550e374 [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS ## What changes were proposed in this pull request? * Document ```WeightedLeastSquares```(normal equation) and ```IterativelyReweightedLeastSquares```. * Copy ```L-BFGS``` documents from ```spark.mllib``` to ```spark.ml```. Due to the session ```Optimization of linear methods``` is used for developers, I think we should provide the brief introduction of the optimization method, necessary references and how it implements in Spark. It's not necessary to paste all mathematical formula and derivation here. If developers/users want to learn more, they can track reference. ## How was this patch tested? Document update, no tests. Author: Yanbo LiangCloses #13262 from yanboliang/spark-15484. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a3550e37 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a3550e37 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a3550e37 Branch: refs/heads/master Commit: a3550e3747e21c79a5110132dc127ee83879062a Parents: c96244f Author: Yanbo Liang Authored: Fri May 27 13:16:22 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 13:16:22 2016 -0700 -- docs/ml-advanced.md | 85 ++-- .../IterativelyReweightedLeastSquares.scala | 2 +- 2 files changed, 81 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a3550e37/docs/ml-advanced.md -- diff --git a/docs/ml-advanced.md b/docs/ml-advanced.md index 91731d7..1c5f844 100644 --- a/docs/ml-advanced.md +++ b/docs/ml-advanced.md @@ -4,10 +4,85 @@ title: Advanced topics - spark.ml displayTitle: Advanced topics - spark.ml --- -# Optimization of linear methods +* Table of contents +{:toc} + +`\[ +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\x}{\mathbf{x}} +\newcommand{\y}{\mathbf{y}} +\newcommand{\wv}{\mathbf{w}} +\newcommand{\av}{\mathbf{\alpha}} +\newcommand{\bv}{\mathbf{b}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} +\newcommand{\zero}{\mathbf{0}} +\]` + +# Optimization of linear methods (developer) + +## Limited-memory BFGS (L-BFGS) +[L-BFGS](http://en.wikipedia.org/wiki/Limited-memory_BFGS) is an optimization +algorithm in the family of quasi-Newton methods to solve the optimization problems of the form +`$\min_{\wv \in\R^d} \; f(\wv)$`. The L-BFGS method approximates the objective function locally as a +quadratic without evaluating the second partial derivatives of the objective function to construct the +Hessian matrix. The Hessian matrix is approximated by previous gradient evaluations, so there is no +vertical scalability issue (the number of training features) unlike computing the Hessian matrix +explicitly in Newton's method. As a result, L-BFGS often achieves faster convergence compared with +other first-order optimizations. -The optimization algorithm underlying the implementation is called [Orthant-Wise Limited-memory -QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) -(OWL-QN). It is an extension of L-BFGS that can effectively handle L1 -regularization and elastic net. +Quasi-Newton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) +(OWL-QN) is an extension of L-BFGS that can effectively handle L1 and elastic net regularization. 
+ +L-BFGS is used as a solver for [LinearRegression](api/scala/index.html#org.apache.spark.ml.regression.LinearRegression), +[LogisticRegression](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression), +[AFTSurvivalRegression](api/scala/index.html#org.apache.spark.ml.regression.AFTSurvivalRegression) +and [MultilayerPerceptronClassifier](api/scala/index.html#org.apache.spark.ml.classification.MultilayerPerceptronClassifier). + +MLlib L-BFGS solver calls the corresponding implementation in [breeze](https://github.com/scalanlp/breeze/blob/master/math/src/main/scala/breeze/optimize/LBFGS.scala). + +## Normal equation solver for weighted least squares + +MLlib implements normal equation solver for [weighted least squares](https://en.wikipedia.org/wiki/Least_squares#Weighted_least_squares) by [WeightedLeastSquares](https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala). + +Given $n$ weighted observations $(w_i, a_i, b_i)$: + +* $w_i$ the weight of i-th observation +* $a_i$
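To make the new section's setup concrete, here is a compact unregularized statement of the problem the normal-equation solver addresses, in the $(w_i, a_i, b_i)$ notation just introduced, with $\mathbf{a}_i$ the feature vector and $b_i$ the label of the $i$-th observation and $\mathbf{x}$ the coefficient vector:

$$
\min_{\mathbf{x}} \; \frac{1}{2} \sum_{i=1}^n w_i \left( \mathbf{a}_i^T \mathbf{x} - b_i \right)^2
$$

Setting the gradient to zero gives the normal equation the solver materializes,

$$
A^T W A \, \mathbf{x} = A^T W \mathbf{b}, \qquad W = \mathrm{diag}(w_1, \ldots, w_n),
$$

where $A$ stacks the $\mathbf{a}_i$ as rows. Since $A^T W A$ is only $d \times d$ for $d$ features, it can be accumulated in a single pass over the data and solved locally, which is consistent with the feature-count cap mentioned in the GLM user-guide commit below.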
spark git commit: [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression
Repository: spark Updated Branches: refs/heads/branch-2.0 d76e066d3 -> 5dd1423f4 [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression ## What changes were proposed in this pull request? This patch adds a user guide section for generalized linear regression and includes the examples from [#12754](https://github.com/apache/spark/pull/12754). ## How was this patch tested? Documentation only, no tests required. ## Approach In general, it is a bit unclear what level of detail ought to be included in the user guide since there is a lot of variability within the current user guide. I tried to give a fairly brief mathematical introduction to GLMs, and cover what types of problems they could be used for. Additionally, I included a brief blurb on the IRLS solver. The input/output columns are given in a table as is found elsewhere in the docs (though, again, these appear rather intermittently in the current docs), as well as a table providing the supported families and their link functions. Author: sethahCloses #13139 from sethah/SPARK-15186. (cherry picked from commit c96244f5acd8b335e34694c171bab32d92e6e0fb) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5dd1423f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5dd1423f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5dd1423f Branch: refs/heads/branch-2.0 Commit: 5dd1423f462f03b7ae625a93cdaf9d882969afb6 Parents: d76e066 Author: sethah Authored: Fri May 27 12:55:48 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 12:56:00 2016 -0700 -- docs/ml-classification-regression.md | 132 ++ 1 file changed, 132 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5dd1423f/docs/ml-classification-regression.md -- diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index f1a21f4..ff8dec6 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -374,6 +374,138 @@ regression model and extracting model summary statistics. +## Generalized linear regression + +Contrasted with linear regression where the output is assumed to follow a Gaussian +distribution, [generalized linear models](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLMs) are specifications of linear models where the response variable $Y_i$ follows some +distribution from the [exponential family of distributions](https://en.wikipedia.org/wiki/Exponential_family). +Spark's `GeneralizedLinearRegression` interface +allows for flexible specification of GLMs which can be used for various types of +prediction problems including linear regression, Poisson regression, logistic regression, and others. +Currently in `spark.ml`, only a subset of the exponential family distributions are supported and they are listed +[below](#available-families). + +**NOTE**: Spark currently only supports up to 4096 features through its `GeneralizedLinearRegression` +interface, and will throw an exception if this constraint is exceeded. See the [advanced section](ml-advanced) for more details. + Still, for linear and logistic regression, models with an increased number of features can be trained + using the `LinearRegression` and `LogisticRegression` estimators. + +GLMs require exponential family distributions that can be written in their "canonical" or "natural" form, aka +[natural exponential family distributions](https://en.wikipedia.org/wiki/Natural_exponential_family). 
The form of a natural exponential family distribution is given as: + +$$ +f_Y(y|\theta, \tau) = h(y, \tau)\exp{\left( \frac{\theta \cdot y - A(\theta)}{d(\tau)} \right)} +$$ + +where $\theta$ is the parameter of interest and $\tau$ is a dispersion parameter. In a GLM the response variable $Y_i$ is assumed to be drawn from a natural exponential family distribution: + +$$ +Y_i \sim f\left(\cdot|\theta_i, \tau \right) +$$ + +where the parameter of interest $\theta_i$ is related to the expected value of the response variable $\mu_i$ by + +$$ +\mu_i = A'(\theta_i) +$$ + +Here, $A'(\theta_i)$ is defined by the form of the distribution selected. GLMs also allow specification +of a link function, which defines the relationship between the expected value of the response variable $\mu_i$ +and the so called _linear predictor_ $\eta_i$: + +$$ +g(\mu_i) = \eta_i = \vec{x_i}^T \cdot \vec{\beta} +$$ + +Often, the link function is chosen such that $A' = g^{-1}$, which yields a simplified relationship +between the parameter of interest $\theta$ and the
spark git commit: [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression
Repository: spark Updated Branches: refs/heads/master a96e4151a -> c96244f5a [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression ## What changes were proposed in this pull request? This patch adds a user guide section for generalized linear regression and includes the examples from [#12754](https://github.com/apache/spark/pull/12754). ## How was this patch tested? Documentation only, no tests required. ## Approach In general, it is a bit unclear what level of detail ought to be included in the user guide since there is a lot of variability within the current user guide. I tried to give a fairly brief mathematical introduction to GLMs, and cover what types of problems they could be used for. Additionally, I included a brief blurb on the IRLS solver. The input/output columns are given in a table as is found elsewhere in the docs (though, again, these appear rather intermittently in the current docs), as well as a table providing the supported families and their link functions. Author: sethahCloses #13139 from sethah/SPARK-15186. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c96244f5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c96244f5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c96244f5 Branch: refs/heads/master Commit: c96244f5acd8b335e34694c171bab32d92e6e0fb Parents: a96e415 Author: sethah Authored: Fri May 27 12:55:48 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 12:55:48 2016 -0700 -- docs/ml-classification-regression.md | 132 ++ 1 file changed, 132 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c96244f5/docs/ml-classification-regression.md -- diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index f1a21f4..ff8dec6 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -374,6 +374,138 @@ regression model and extracting model summary statistics. +## Generalized linear regression + +Contrasted with linear regression where the output is assumed to follow a Gaussian +distribution, [generalized linear models](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLMs) are specifications of linear models where the response variable $Y_i$ follows some +distribution from the [exponential family of distributions](https://en.wikipedia.org/wiki/Exponential_family). +Spark's `GeneralizedLinearRegression` interface +allows for flexible specification of GLMs which can be used for various types of +prediction problems including linear regression, Poisson regression, logistic regression, and others. +Currently in `spark.ml`, only a subset of the exponential family distributions are supported and they are listed +[below](#available-families). + +**NOTE**: Spark currently only supports up to 4096 features through its `GeneralizedLinearRegression` +interface, and will throw an exception if this constraint is exceeded. See the [advanced section](ml-advanced) for more details. + Still, for linear and logistic regression, models with an increased number of features can be trained + using the `LinearRegression` and `LogisticRegression` estimators. + +GLMs require exponential family distributions that can be written in their "canonical" or "natural" form, aka +[natural exponential family distributions](https://en.wikipedia.org/wiki/Natural_exponential_family). 
The form of a natural exponential family distribution is given as: + +$$ +f_Y(y|\theta, \tau) = h(y, \tau)\exp{\left( \frac{\theta \cdot y - A(\theta)}{d(\tau)} \right)} +$$ + +where $\theta$ is the parameter of interest and $\tau$ is a dispersion parameter. In a GLM the response variable $Y_i$ is assumed to be drawn from a natural exponential family distribution: + +$$ +Y_i \sim f\left(\cdot|\theta_i, \tau \right) +$$ + +where the parameter of interest $\theta_i$ is related to the expected value of the response variable $\mu_i$ by + +$$ +\mu_i = A'(\theta_i) +$$ + +Here, $A'(\theta_i)$ is defined by the form of the distribution selected. GLMs also allow specification +of a link function, which defines the relationship between the expected value of the response variable $\mu_i$ +and the so called _linear predictor_ $\eta_i$: + +$$ +g(\mu_i) = \eta_i = \vec{x_i}^T \cdot \vec{\beta} +$$ + +Often, the link function is chosen such that $A' = g^{-1}$, which yields a simplified relationship +between the parameter of interest $\theta$ and the linear predictor $\eta$. In this case, the link +function $g(\mu)$ is said to be the "canonical" link function. + +$$ +\theta_i =
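A short usage sketch of the estimator this guide documents: a Poisson GLM with its canonical log link. `training` is assumed to be a DataFrame with "label" and "features" columns, and the parameter values are illustrative only:

    import org.apache.spark.ml.regression.GeneralizedLinearRegression

    val glr = new GeneralizedLinearRegression()
      .setFamily("poisson")   // response distribution from the exponential family
      .setLink("log")         // canonical link for Poisson
      .setMaxIter(10)
      .setRegParam(0.3)

    // `training` is an assumed pre-existing DataFrame(label, features).
    val model = glr.fit(training)
    println(s"coefficients: ${model.coefficients} intercept: ${model.intercept}")

The family and link names accepted by these setters are the ones listed in the guide's supported-families table.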
spark git commit: [SPARK-14400][SQL] ScriptTransformation does not fail the job for bad user command
Repository: spark Updated Branches: refs/heads/master b376a4eab -> a96e4151a [SPARK-14400][SQL] ScriptTransformation does not fail the job for bad user command ## What changes were proposed in this pull request? - Refer to the Jira for the problem: jira : https://issues.apache.org/jira/browse/SPARK-14400 - The fix is to check if the process has exited with a non-zero exit code in `hasNext()`. I have moved this and checking of writer thread exception to a separate method. ## How was this patch tested? - Ran a job which had incorrect transform script command and saw that the job fails - Existing unit tests for `ScriptTransformationSuite`. Added a new unit test Author: Tejas PatilCloses #12194 from tejasapatil/script_transform. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a96e4151 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a96e4151 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a96e4151 Branch: refs/heads/master Commit: a96e4151a9d429cfaf457c07b4ce174890a3b39b Parents: b376a4e Author: Tejas Patil Authored: Fri May 27 12:05:11 2016 -0700 Committer: Reynold Xin Committed: Fri May 27 12:05:11 2016 -0700 -- .../spark/sql/execution/SparkPlanTest.scala | 7 +- .../hive/execution/ScriptTransformation.scala | 90 +--- .../execution/ScriptTransformationSuite.scala | 18 +++- 3 files changed, 81 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a96e4151/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala index 9fe0e96..b29e822 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala @@ -231,7 +231,12 @@ object SparkPlanTest { } } - private def executePlan(outputPlan: SparkPlan, spark: SQLContext): Seq[Row] = { + /** + * Runs the plan + * @param outputPlan SparkPlan to be executed + * @param spark SqlContext used for execution of the plan + */ + def executePlan(outputPlan: SparkPlan, spark: SQLContext): Seq[Row] = { val execution = new QueryExecution(spark.sparkSession, null) { override lazy val sparkPlan: SparkPlan = outputPlan transform { case plan: SparkPlan => http://git-wip-us.apache.org/repos/asf/spark/blob/a96e4151/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala index f6e6a75..9e25e1d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala @@ -32,7 +32,7 @@ import org.apache.hadoop.hive.serde2.AbstractSerDe import org.apache.hadoop.hive.serde2.objectinspector._ import org.apache.hadoop.io.Writable -import org.apache.spark.TaskContext +import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} @@ -127,45 +127,71 @@ case class ScriptTransformation( } val mutableRow = new SpecificMutableRow(output.map(_.dataType)) +private def checkFailureAndPropagate(cause: Throwable = null): 
Unit = { + if (writerThread.exception.isDefined) { +throw writerThread.exception.get + } + + // Checks if the proc is still alive (incase the command ran was bad) + // The ideal way to do this is to use Java 8's Process#isAlive() + // but it cannot be used because Spark still supports Java 7. + // Following is a workaround used to check if a process is alive in Java 7 + // TODO: Once builds are switched to Java 8, this can be changed + try { +val exitCode = proc.exitValue() +if (exitCode != 0) { + logError(stderrBuffer.toString) // log the stderr circular buffer + throw new SparkException(s"Subprocess exited with status $exitCode. " + +s"Error: ${stderrBuffer.toString}", cause) +} + } catch { +case _: IllegalThreadStateException => +// This means that the process is still
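The Java 7 workaround in the patch is worth isolating. A minimal self-contained sketch of the same idiom, with an arbitrary placeholder command:

    // Process#exitValue() throws IllegalThreadStateException while the
    // subprocess is still running, so catching that exception doubles as
    // the Process#isAlive() check that only exists from Java 8 onward.
    val proc = new ProcessBuilder("sh", "-c", "sleep 1").start()
    val alive =
      try { proc.exitValue(); false }   // returned: process has exited
      catch { case _: IllegalThreadStateException => true }
    println(s"subprocess alive: $alive")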
spark git commit: [SPARK-14400][SQL] ScriptTransformation does not fail the job for bad user command
Repository: spark Updated Branches: refs/heads/branch-2.0 5ea58898c -> d76e066d3 [SPARK-14400][SQL] ScriptTransformation does not fail the job for bad user command ## What changes were proposed in this pull request? - Refer to the Jira for the problem: jira : https://issues.apache.org/jira/browse/SPARK-14400 - The fix is to check if the process has exited with a non-zero exit code in `hasNext()`. I have moved this and checking of writer thread exception to a separate method. ## How was this patch tested? - Ran a job which had incorrect transform script command and saw that the job fails - Existing unit tests for `ScriptTransformationSuite`. Added a new unit test Author: Tejas PatilCloses #12194 from tejasapatil/script_transform. (cherry picked from commit a96e4151a9d429cfaf457c07b4ce174890a3b39b) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d76e066d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d76e066d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d76e066d Branch: refs/heads/branch-2.0 Commit: d76e066d3355a9942af3ae4f23d81a948c236e5e Parents: 5ea5889 Author: Tejas Patil Authored: Fri May 27 12:05:11 2016 -0700 Committer: Reynold Xin Committed: Fri May 27 12:05:17 2016 -0700 -- .../spark/sql/execution/SparkPlanTest.scala | 7 +- .../hive/execution/ScriptTransformation.scala | 90 +--- .../execution/ScriptTransformationSuite.scala | 18 +++- 3 files changed, 81 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d76e066d/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala index 9fe0e96..b29e822 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SparkPlanTest.scala @@ -231,7 +231,12 @@ object SparkPlanTest { } } - private def executePlan(outputPlan: SparkPlan, spark: SQLContext): Seq[Row] = { + /** + * Runs the plan + * @param outputPlan SparkPlan to be executed + * @param spark SqlContext used for execution of the plan + */ + def executePlan(outputPlan: SparkPlan, spark: SQLContext): Seq[Row] = { val execution = new QueryExecution(spark.sparkSession, null) { override lazy val sparkPlan: SparkPlan = outputPlan transform { case plan: SparkPlan => http://git-wip-us.apache.org/repos/asf/spark/blob/d76e066d/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala index f6e6a75..9e25e1d 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/ScriptTransformation.scala @@ -32,7 +32,7 @@ import org.apache.hadoop.hive.serde2.AbstractSerDe import org.apache.hadoop.hive.serde2.objectinspector._ import org.apache.hadoop.io.Writable -import org.apache.spark.TaskContext +import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} @@ -127,45 +127,71 @@ case class ScriptTransformation( } val mutableRow = new 
SpecificMutableRow(output.map(_.dataType)) +private def checkFailureAndPropagate(cause: Throwable = null): Unit = { + if (writerThread.exception.isDefined) { +throw writerThread.exception.get + } + + // Checks if the proc is still alive (incase the command ran was bad) + // The ideal way to do this is to use Java 8's Process#isAlive() + // but it cannot be used because Spark still supports Java 7. + // Following is a workaround used to check if a process is alive in Java 7 + // TODO: Once builds are switched to Java 8, this can be changed + try { +val exitCode = proc.exitValue() +if (exitCode != 0) { + logError(stderrBuffer.toString) // log the stderr circular buffer + throw new SparkException(s"Subprocess exited with status $exitCode. " + +s"Error: ${stderrBuffer.toString}", cause) +}
spark git commit: [HOTFIX] Scala 2.10 compile GaussianMixtureModel
Repository: spark
Updated Branches: refs/heads/branch-2.0 17f43cc87 -> 5ea58898c

[HOTFIX] Scala 2.10 compile GaussianMixtureModel

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5ea58898
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5ea58898
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5ea58898
Branch: refs/heads/branch-2.0
Commit: 5ea58898cc9413fd0b04b60db230c8894d8bb9ef
Parents: 17f43cc
Author: Andrew Or
Authored: Fri May 27 11:43:01 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:43:50 2016 -0700
--
 .../org/apache/spark/mllib/clustering/GaussianMixtureModel.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/5ea58898/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 4b06816..f470b0f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -170,7 +170,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] {
         (weight, new MultivariateGaussian(mu, sigma))
       }.unzip

-      new GaussianMixtureModel(weights, gaussians)
+      new GaussianMixtureModel(weights.toArray, gaussians.toArray)
     }
 }
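The hotfix appears to work around a Scala 2.10/2.11 inference difference: `unzip` on the mapped collection yields plain sequences on 2.10, while the `GaussianMixtureModel` constructor takes `Array`s, so the explicit `.toArray` is needed for 2.10 to compile. A self-contained sketch of the same pattern (illustrative names only):

  // Hypothetical stand-in for the (weight, gaussian) pairs in the loader.
  val pairs: Array[(Double, String)] = Array((0.4, "g0"), (0.6, "g1"))
  val (weights, labels) = pairs.unzip
  def build(w: Array[Double], l: Array[String]): Unit = ()
  // build(weights, labels) may fail to compile on Scala 2.10;
  // the explicit conversion compiles on both 2.10 and 2.11:
  build(weights.toArray, labels.toArray)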
spark git commit: [HOTFIX] Scala 2.10 compile GaussianMixtureModel
Repository: spark
Updated Branches: refs/heads/master 1b98fa2e4 -> b376a4eab

[HOTFIX] Scala 2.10 compile GaussianMixtureModel

The identical one-line `.toArray` fix as the branch-2.0 commit above, applied to master.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b376a4ea
Branch: refs/heads/master
Commit: b376a4eabc82d622ea26290345c01465af7a628d
Parents: 1b98fa2
Author: Andrew Or
Authored: Fri May 27 11:43:01 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:43:01 2016 -0700
spark git commit: [YARN][DOC][MINOR] Remove several obsolete env variables and update the doc
Repository: spark
Updated Branches: refs/heads/branch-2.0 074989af9 -> 17f43cc87

[YARN][DOC][MINOR] Remove several obsolete env variables and update the doc

## What changes were proposed in this pull request?

Remove several obsolete env variables that are no longer supported for Spark on YARN, and update the docs to include several changes with 2.0.

## How was this patch tested?

N/A

CC vanzin tgravescs

Author: jerryshao

Closes #13296 from jerryshao/yarn-doc.

(cherry picked from commit 1b98fa2e4382d3d8385cf1ac25d7fd3ae5650475)
Signed-off-by: Marcelo Vanzin

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/17f43cc8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/17f43cc8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/17f43cc8
Branch: refs/heads/branch-2.0
Commit: 17f43cc87ed8b3c77b7c34163340da8e2da48eb1
Parents: 074989a
Author: jerryshao
Authored: Fri May 27 11:31:25 2016 -0700
Committer: Marcelo Vanzin
Committed: Fri May 27 11:31:37 2016 -0700
--
 conf/spark-env.sh.template | 4
 docs/running-on-yarn.md    | 4
 2 files changed, 4 insertions(+), 4 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/17f43cc8/conf/spark-env.sh.template
--
diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template
index a031cd6..9cffdc3 100755
--- a/conf/spark-env.sh.template
+++ b/conf/spark-env.sh.template
@@ -40,10 +40,6 @@
 # - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
 # - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
 # - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)
-# - SPARK_YARN_APP_NAME, The name of your application (Default: Spark)
-# - SPARK_YARN_QUEUE, The hadoop queue to use for allocation requests (Default: 'default')
-# - SPARK_YARN_DIST_FILES, Comma separated list of files to be distributed with the job.
-# - SPARK_YARN_DIST_ARCHIVES, Comma separated list of archives to be distributed with the job.

 # Options for the daemons used in the standalone deploy mode
 # - SPARK_MASTER_IP, to bind the master to a different IP address or hostname

http://git-wip-us.apache.org/repos/asf/spark/blob/17f43cc8/docs/running-on-yarn.md
--
diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md
index f2fbe3c..9833806 100644
--- a/docs/running-on-yarn.md
+++ b/docs/running-on-yarn.md
@@ -60,6 +60,8 @@
 Running Spark on YARN requires a binary distribution of Spark which is built with YARN support.
 Binary distributions can be downloaded from the [downloads page](http://spark.apache.org/downloads.html) of the project website.
 To build Spark yourself, refer to [Building Spark](building-spark.html).

+To make Spark runtime jars accessible from YARN side, you can specify `spark.yarn.archive` or `spark.yarn.jars`. For details please refer to [Spark Properties](running-on-yarn.html#spark-properties). If neither `spark.yarn.archive` nor `spark.yarn.jars` is specified, Spark will create a zip file with all jars under `$SPARK_HOME/jars` and upload it to the distributed cache.
+
 # Configuration

 Most of the configs are the same for Spark on YARN as for other deployment modes. See the [configuration page](configuration.html) for more information on those. These are configs that are specific to Spark on YARN.

@@ -99,6 +101,8 @@
 to the same log file).

 If you need a reference to the proper location to put log files in the YARN so that YARN can properly display and aggregate them, use `spark.yarn.app.container.log.dir` in your `log4j.properties`. For example, `log4j.appender.file_appender.File=${spark.yarn.app.container.log.dir}/spark.log`. For streaming applications, configuring `RollingFileAppender` and setting file location to YARN's log directory will avoid disk overflow caused by large log files, and logs can be accessed using YARN's log utility.

+To use a custom metrics.properties for the application master and executors, update the `$SPARK_CONF_DIR/metrics.properties` file. It will automatically be uploaded with other configurations, so you don't need to specify it manually with `--files`.
+
 Spark Properties
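A hedged sketch of wiring up the `spark.yarn.archive` property described in the doc change above, for a client-mode launch where the YARN client runs in-process (the HDFS path is a placeholder, not a Spark default; `spark-submit --conf spark.yarn.archive=...` is the more common route):

  import org.apache.spark.sql.SparkSession

  val spark = SparkSession.builder()
    .appName("yarn-archive-example")
    .master("yarn")
    // Pre-staged zip of $SPARK_HOME/jars; avoids the per-application upload
    // to the distributed cache described above.
    .config("spark.yarn.archive", "hdfs:///apps/spark/spark-libs.zip") // placeholder path
    .getOrCreate()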
spark git commit: [YARN][DOC][MINOR] Remove several obsolete env variables and update the doc
Repository: spark
Updated Branches: refs/heads/master 623aae590 -> 1b98fa2e4

[YARN][DOC][MINOR] Remove several obsolete env variables and update the doc

The identical documentation patch as the branch-2.0 commit above; this is the original master commit it was cherry-picked from.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b98fa2e
Branch: refs/heads/master
Commit: 1b98fa2e4382d3d8385cf1ac25d7fd3ae5650475
Parents: 623aae5
Author: jerryshao
Authored: Fri May 27 11:31:25 2016 -0700
Committer: Marcelo Vanzin
Committed: Fri May 27 11:31:25 2016 -0700
spark git commit: [SPARK-15531][DEPLOY] spark-class tries to use too much memory when running Launcher
Repository: spark
Updated Branches: refs/heads/branch-2.0 30e87b55b -> 074989af9

[SPARK-15531][DEPLOY] spark-class tries to use too much memory when running Launcher

## What changes were proposed in this pull request?

Explicitly limit the launcher JVM memory to a modest 128m.

## How was this patch tested?

Jenkins tests.

Author: Sean Owen

Closes #13360 from srowen/SPARK-15531.

(cherry picked from commit 623aae5907f4ba8b7079c21328e0c0b5fef7bb00)
Signed-off-by: Marcelo Vanzin

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/074989af
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/074989af
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/074989af
Branch: refs/heads/branch-2.0
Commit: 074989af945b8ebb2779f94b8714752b67f3e82f
Parents: 30e87b5
Author: Sean Owen
Authored: Fri May 27 11:28:28 2016 -0700
Committer: Marcelo Vanzin
Committed: Fri May 27 11:28:39 2016 -0700
--
 bin/spark-class      | 2 +-
 bin/spark-class2.cmd | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/074989af/bin/spark-class
--
diff --git a/bin/spark-class b/bin/spark-class
index 23a60c6..658e076 100755
--- a/bin/spark-class
+++ b/bin/spark-class
@@ -68,7 +68,7 @@
 # The exit code of the launcher is appended to the output, so the parent shell removes it from the
 # command array and checks the value to see if the launcher succeeded.
 build_command() {
-  "$RUNNER" -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@"
+  "$RUNNER" -Xmx128m -cp "$LAUNCH_CLASSPATH" org.apache.spark.launcher.Main "$@"
   printf "%d\0" $?
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/074989af/bin/spark-class2.cmd
--
diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd
index db68021..869c0b2 100644
--- a/bin/spark-class2.cmd
+++ b/bin/spark-class2.cmd
@@ -55,7 +55,7 @@
 if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java

 rem The launcher library prints the command to be executed in a single line suitable for being
 rem executed by the batch interpreter. So read all the output of the launcher into a variable.
 set LAUNCHER_OUTPUT=%temp%\spark-class-launcher-output-%RANDOM%.txt
-"%RUNNER%" -cp "%LAUNCH_CLASSPATH%" org.apache.spark.launcher.Main %* > %LAUNCHER_OUTPUT%
+"%RUNNER%" -Xmx128m -cp "%LAUNCH_CLASSPATH%" org.apache.spark.launcher.Main %* > %LAUNCHER_OUTPUT%
 for /f "tokens=*" %%i in (%LAUNCHER_OUTPUT%) do (
   set SPARK_CMD=%%i
 )
spark git commit: [SPARK-15531][DEPLOY] spark-class tries to use too much memory when running Launcher
Repository: spark
Updated Branches: refs/heads/master ce756daa4 -> 623aae590

[SPARK-15531][DEPLOY] spark-class tries to use too much memory when running Launcher

The identical `-Xmx128m` patch as the branch-2.0 commit above; this is the original master commit it was cherry-picked from.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/623aae59
Branch: refs/heads/master
Commit: 623aae5907f4ba8b7079c21328e0c0b5fef7bb00
Parents: ce756da
Author: Sean Owen
Authored: Fri May 27 11:28:28 2016 -0700
Committer: Marcelo Vanzin
Committed: Fri May 27 11:28:28 2016 -0700
spark git commit: [SPARK-15569] Reduce frequency of updateBytesWritten function in Disk…
Repository: spark
Updated Branches: refs/heads/master 5bdbedf22 -> ce756daa4

[SPARK-15569] Reduce frequency of updateBytesWritten function in Disk…

## What changes were proposed in this pull request?

Profiling a Spark job that spills a large amount of intermediate data, we found that a significant portion of time is spent in the DiskObjectWriter.updateBytesWritten function. Looking at the code, we see that the function is called too frequently to update the number of bytes written to disk. We should reduce the frequency to avoid this.

## How was this patch tested?

Tested by running the job on a cluster; this change yielded a 20% CPU gain.

Author: Sital Kedia

Closes #13332 from sitalkedia/DiskObjectWriter.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ce756daa
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ce756daa
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ce756daa
Branch: refs/heads/master
Commit: ce756daa4f012ebdc5a41bf5a89ff11b6dfdab8c
Parents: 5bdbedf
Author: Sital Kedia
Authored: Fri May 27 11:22:39 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:22:39 2016 -0700
--
 .../apache/spark/storage/DiskBlockObjectWriter.scala |  3 +--
 .../spark/storage/DiskBlockObjectWriterSuite.scala   | 12 ++--
 2 files changed, 7 insertions(+), 8 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/ce756daa/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
--
diff --git a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
index ab97d2e..5b493f4 100644
--- a/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
+++ b/core/src/main/scala/org/apache/spark/storage/DiskBlockObjectWriter.scala
@@ -203,8 +203,7 @@ private[spark] class DiskBlockObjectWriter(
     numRecordsWritten += 1
     writeMetrics.incRecordsWritten(1)

-    // TODO: call updateBytesWritten() less frequently.
-    if (numRecordsWritten % 32 == 0) {
+    if (numRecordsWritten % 16384 == 0) {
       updateBytesWritten()
     }
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/ce756daa/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
--
diff --git a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
index 8eff3c2..ec4ef4b 100644
--- a/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
+++ b/core/src/test/scala/org/apache/spark/storage/DiskBlockObjectWriterSuite.scala
@@ -53,13 +53,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach {
     assert(writeMetrics.recordsWritten === 1)
     // Metrics don't update on every write
     assert(writeMetrics.bytesWritten == 0)
-    // After 32 writes, metrics should update
-    for (i <- 0 until 32) {
+    // After 16384 writes, metrics should update
+    for (i <- 0 until 16384) {
       writer.flush()
       writer.write(Long.box(i), Long.box(i))
     }
     assert(writeMetrics.bytesWritten > 0)
-    assert(writeMetrics.recordsWritten === 33)
+    assert(writeMetrics.recordsWritten === 16385)
     writer.commitAndClose()
     assert(file.length() == writeMetrics.bytesWritten)
   }
@@ -75,13 +75,13 @@ class DiskBlockObjectWriterSuite extends SparkFunSuite with BeforeAndAfterEach {
     assert(writeMetrics.recordsWritten === 1)
     // Metrics don't update on every write
     assert(writeMetrics.bytesWritten == 0)
-    // After 32 writes, metrics should update
-    for (i <- 0 until 32) {
+    // After 16384 writes, metrics should update
+    for (i <- 0 until 16384) {
       writer.flush()
       writer.write(Long.box(i), Long.box(i))
     }
     assert(writeMetrics.bytesWritten > 0)
-    assert(writeMetrics.recordsWritten === 33)
+    assert(writeMetrics.recordsWritten === 16385)
     writer.revertPartialWritesAndClose()
     assert(writeMetrics.bytesWritten == 0)
     assert(writeMetrics.recordsWritten == 0)
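The patch is an instance of a general metrics-throttling pattern: pay for the expensive byte-count refresh only once every N records instead of on every write. A minimal standalone sketch of that pattern (names and the size probe are illustrative; Spark's real writer reads the file stream position):

  class ThrottledWriter(refreshEvery: Int = 16384) {
    private var numRecordsWritten = 0L
    var bytesWritten = 0L
    var probes = 0L // counts how often the expensive path actually runs
    // Illustrative stand-in for an expensive size probe (e.g. a channel position call).
    private def currentBytes(): Long = { probes += 1; numRecordsWritten * 8L }
    def recordWritten(): Unit = {
      numRecordsWritten += 1
      if (numRecordsWritten % refreshEvery == 0) bytesWritten = currentBytes()
    }
  }

With this shape, writing 1,000,000 records triggers only 61 probes rather than 1,000,000, which matches the CPU savings reported above; the trade-off is that bytesWritten can lag the true file size by up to one refresh interval until the final commit reconciles it.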
spark git commit: [SPARK-15569] Reduce frequency of updateBytesWritten function in Disk…
Repository: spark
Updated Branches: refs/heads/branch-2.0 89fdb6972 -> 30e87b55b

[SPARK-15569] Reduce frequency of updateBytesWritten function in Disk…

The identical patch as the master commit above, cherry-picked to branch-2.0.

(cherry picked from commit ce756daa4f012ebdc5a41bf5a89ff11b6dfdab8c)
Signed-off-by: Andrew Or

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/30e87b55
Branch: refs/heads/branch-2.0
Commit: 30e87b55b6f59ca029778087710effc768fafc35
Parents: 89fdb69
Author: Sital Kedia
Authored: Fri May 27 11:22:39 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:22:48 2016 -0700
spark git commit: [MINOR][DOCS] Typo fixes in Dataset scaladoc
Repository: spark
Updated Branches: refs/heads/branch-2.0 f52a95248 -> 89fdb6972

[MINOR][DOCS] Typo fixes in Dataset scaladoc

## What changes were proposed in this pull request?

Minor typo fixes in the Dataset scaladoc:
* Corrected context type as SparkSession, not SQLContext.

liancheng rxin andrewor14

## How was this patch tested?

Compiled locally

Author: Xinh Huynh

Closes #13330 from xinhhuynh/fix-dataset-typos.

(cherry picked from commit 5bdbedf2201efa6c34392aa9eff709761f027e1d)
Signed-off-by: Andrew Or

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/89fdb697
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/89fdb697
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/89fdb697
Branch: refs/heads/branch-2.0
Commit: 89fdb6972d5410f250bc56f8a834c939ee6653d2
Parents: f52a952
Author: Xinh Huynh
Authored: Fri May 27 11:13:53 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:14:01 2016 -0700
--
 sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/89fdb697/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 85f0cf8..abd16f2 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -121,7 +121,7 @@ private[sql] object Dataset {
  *
  * A more concrete example in Scala:
  * {{{
- *   // To create Dataset[Row] using SQLContext
+ *   // To create Dataset[Row] using SparkSession
  *   val people = spark.read.parquet("...")
  *   val department = spark.read.parquet("...")
  *
@@ -133,7 +133,7 @@ private[sql] object Dataset {
  *
  * and in Java:
  * {{{
- *   // To create Dataset<Row> using SQLContext
+ *   // To create Dataset<Row> using SparkSession
  *   Dataset<Row> people = spark.read().parquet("...");
 *   Dataset<Row> department = spark.read().parquet("...");
 *
spark git commit: [MINOR][DOCS] Typo fixes in Dataset scaladoc
Repository: spark
Updated Branches: refs/heads/master a52e68133 -> 5bdbedf22

[MINOR][DOCS] Typo fixes in Dataset scaladoc

The identical scaladoc patch as the branch-2.0 commit above; this is the original master commit it was cherry-picked from.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5bdbedf2
Branch: refs/heads/master
Commit: 5bdbedf2201efa6c34392aa9eff709761f027e1d
Parents: a52e681
Author: Xinh Huynh
Authored: Fri May 27 11:13:53 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:13:53 2016 -0700
spark git commit: [SPARK-15597][SQL] Add SparkSession.emptyDataset
Repository: spark
Updated Branches: refs/heads/branch-2.0 e69639f43 -> f52a95248

[SPARK-15597][SQL] Add SparkSession.emptyDataset

## What changes were proposed in this pull request?

This patch adds a new function emptyDataset to SparkSession, for creating an empty dataset.

## How was this patch tested?

Added a test case.

Author: Reynold Xin

Closes #13344 from rxin/SPARK-15597.

(cherry picked from commit a52e6813392ba4bdb1b818694b7ced8f6caa6a2b)
Signed-off-by: Andrew Or

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f52a9524
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f52a9524
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f52a9524
Branch: refs/heads/branch-2.0
Commit: f52a9524865b8c56058a65b29a1aaacffb709f69
Parents: e69639f
Author: Reynold Xin
Authored: Fri May 27 11:13:09 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:13:17 2016 -0700
--
 .../main/scala/org/apache/spark/sql/SparkSession.scala | 12
 .../test/scala/org/apache/spark/sql/DatasetSuite.scala |  6 ++
 2 files changed, 18 insertions(+)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/f52a9524/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index aa60048..c9276cf 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -224,6 +224,18 @@ class SparkSession private(

   /**
    * :: Experimental ::
+   * Creates a new [[Dataset]] of type T containing zero elements.
+   *
+   * @return 2.0.0
+   */
+  @Experimental
+  def emptyDataset[T: Encoder]: Dataset[T] = {
+    val encoder = implicitly[Encoder[T]]
+    new Dataset(self, LocalRelation(encoder.schema.toAttributes), encoder)
+  }
+
+  /**
+   * :: Experimental ::
    * Creates a [[DataFrame]] from an RDD of Product (e.g. case classes, tuples).
    *
    * @group dataframes

http://git-wip-us.apache.org/repos/asf/spark/blob/f52a9524/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index 2a65916..e395007 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -46,6 +46,12 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
       1, 1, 1)
   }

+  test("emptyDataset") {
+    val ds = spark.emptyDataset[Int]
+    assert(ds.count() == 0L)
+    assert(ds.collect() sameElements Array.empty[Int])
+  }
+
   test("range") {
     assert(spark.range(10).map(_ + 1).reduce(_ + _) == 55)
     assert(spark.range(10).map{ case i: java.lang.Long => i + 1 }.reduce(_ + _) == 55)
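Typical use, mirroring the new test above (the session's implicits supply the Int encoder; the union at the end is an assumed but common use of an empty Dataset as a fold seed):

  import spark.implicits._
  val ds = spark.emptyDataset[Int]             // Dataset[Int] with zero rows
  assert(ds.count() == 0L)
  val seeded = ds.union(Seq(1, 2, 3).toDS())   // e.g. as a neutral element for unions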
spark git commit: [SPARK-15597][SQL] Add SparkSession.emptyDataset
Repository: spark
Updated Branches: refs/heads/master 635fb30f8 -> a52e68133

[SPARK-15597][SQL] Add SparkSession.emptyDataset

The identical patch as the branch-2.0 commit above; this is the original master commit it was cherry-picked from.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a52e6813
Branch: refs/heads/master
Commit: a52e6813392ba4bdb1b818694b7ced8f6caa6a2b
Parents: 635fb30
Author: Reynold Xin
Authored: Fri May 27 11:13:09 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:13:09 2016 -0700
spark git commit: [SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession
Repository: spark
Updated Branches: refs/heads/branch-2.0 a14c88acc -> e69639f43

[SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession

## What changes were proposed in this pull request?

Adds API docs and usage examples for the 3 `createDataset` calls in `SparkSession`.

## How was this patch tested?

N/A

Author: Sameer Agarwal

Closes #13345 from sameeragarwal/dataset-doc.

(cherry picked from commit 635fb30f83a66cc56f5fecfed5bff77873bf49a6)
Signed-off-by: Andrew Or

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e69639f4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e69639f4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e69639f4
Branch: refs/heads/branch-2.0
Commit: e69639f4334aae3ace5e50452603dd667467ea9a
Parents: a14c88a
Author: Sameer Agarwal
Authored: Fri May 27 11:11:31 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:11:40 2016 -0700
--
 .../org/apache/spark/sql/SparkSession.scala | 63
 1 file changed, 63 insertions(+)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/e69639f4/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
index 5dabe0e..aa60048 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/SparkSession.scala
@@ -376,6 +376,40 @@ class SparkSession private(
     Dataset.ofRows(self, LogicalRelation(baseRelation))
   }

+  /* ------------------------------- *
+   |  Methods for creating DataSets  |
+   * ------------------------------- */
+
+  /**
+   * :: Experimental ::
+   * Creates a [[Dataset]] from a local Seq of data of a given type. This method requires an
+   * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation)
+   * that is generally created automatically through implicits from a `SparkSession`, or can be
+   * created explicitly by calling static methods on [[Encoders]].
+   *
+   * == Example ==
+   *
+   * {{{
+   *
+   *   import spark.implicits._
+   *   case class Person(name: String, age: Long)
+   *   val data = Seq(Person("Michael", 29), Person("Andy", 30), Person("Justin", 19))
+   *   val ds = spark.createDataset(data)
+   *
+   *   ds.show()
+   *   // +-------+---+
+   *   // |   name|age|
+   *   // +-------+---+
+   *   // |Michael| 29|
+   *   // |   Andy| 30|
+   *   // | Justin| 19|
+   *   // +-------+---+
+   * }}}
+   *
+   * @since 2.0.0
+   * @group dataset
+   */
+  @Experimental
   def createDataset[T : Encoder](data: Seq[T]): Dataset[T] = {
     val enc = encoderFor[T]
     val attributes = enc.schema.toAttributes
@@ -384,6 +418,17 @@ class SparkSession private(
     Dataset[T](self, plan)
   }

+  /**
+   * :: Experimental ::
+   * Creates a [[Dataset]] from an RDD of a given type. This method requires an
+   * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation)
+   * that is generally created automatically through implicits from a `SparkSession`, or can be
+   * created explicitly by calling static methods on [[Encoders]].
+   *
+   * @since 2.0.0
+   * @group dataset
+   */
+  @Experimental
   def createDataset[T : Encoder](data: RDD[T]): Dataset[T] = {
     val enc = encoderFor[T]
     val attributes = enc.schema.toAttributes
@@ -392,6 +437,24 @@ class SparkSession private(
     Dataset[T](self, plan)
   }

+  /**
+   * :: Experimental ::
+   * Creates a [[Dataset]] from a [[java.util.List]] of a given type. This method requires an
+   * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL representation)
+   * that is generally created automatically through implicits from a `SparkSession`, or can be
+   * created explicitly by calling static methods on [[Encoders]].
+   *
+   * == Java Example ==
+   *
+   * {{{
+   *   List<String> data = Arrays.asList("hello", "world");
+   *   Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
+   * }}}
+   *
+   * @since 2.0.0
+   * @group dataset
+   */
+  @Experimental
   def createDataset[T : Encoder](data: java.util.List[T]): Dataset[T] = {
     createDataset(data.asScala)
   }
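Of the three overloads, only the RDD variant is documented above without an inline example; a short sketch analogous to the Seq case (the sample data is illustrative):

  import spark.implicits._
  val rdd = spark.sparkContext.parallelize(Seq(1L, 2L, 3L))
  val ds = spark.createDataset(rdd)   // Dataset[Long]; schema comes from the implicit encoder
  ds.show()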
spark git commit: [SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession
Repository: spark
Updated Branches: refs/heads/master 4538443e2 -> 635fb30f8

[SPARK-15599][SQL][DOCS] API docs for `createDataset` functions in SparkSession

The identical scaladoc patch as the branch-2.0 commit above; this is the original master commit it was cherry-picked from.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/635fb30f
Branch: refs/heads/master
Commit: 635fb30f83a66cc56f5fecfed5bff77873bf49a6
Parents: 4538443
Author: Sameer Agarwal
Authored: Fri May 27 11:11:31 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:11:31 2016 -0700
spark git commit: [SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties
Repository: spark
Updated Branches: refs/heads/master d24e25157 -> 4538443e2

[SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties

## What changes were proposed in this pull request?

This PR replaces `spark.sql.sources.` strings with `CreateDataSourceTableUtils.*` constant variables.

## How was this patch tested?

Pass the existing Jenkins tests.

Author: Dongjoon Hyun

Closes #13349 from dongjoon-hyun/SPARK-15584.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4538443e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4538443e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4538443e
Branch: refs/heads/master
Commit: 4538443e276597530a27c6922e48503677b13956
Parents: d24e251
Author: Dongjoon Hyun
Authored: Fri May 27 11:10:31 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:10:31 2016 -0700
--
 .../spark/ml/source/libsvm/LibSVMRelation.scala |  3 +-
 .../command/createDataSourceTables.scala        | 28 +-
 .../spark/sql/execution/command/ddl.scala       | 19 +++
 .../spark/sql/execution/command/tables.scala    |  4 +-
 .../datasources/DataSourceStrategy.scala        |  2 +-
 .../execution/datasources/WriterContainer.scala | 10 ++--
 .../execution/datasources/csv/CSVRelation.scala |  3 +-
 .../datasources/json/JsonFileFormat.scala       |  5 +-
 .../datasources/parquet/ParquetFileFormat.scala |  4 +-
 .../datasources/text/TextFileFormat.scala       |  3 +-
 .../spark/sql/execution/command/DDLSuite.scala  | 10 ++--
 .../spark/sql/hive/HiveMetastoreCatalog.scala   | 18 +++---
 .../spark/sql/hive/orc/OrcFileFormat.scala      |  3 +-
 .../sql/hive/MetastoreDataSourcesSuite.scala    | 58 ++--
 .../sql/hive/execution/HiveCommandSuite.scala   | 16 +++---
 .../spark/sql/sources/SimpleTextRelation.scala  |  3 +-
 16 files changed, 95 insertions(+), 94 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/4538443e/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
index 64ebf0c..7629369 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/source/libsvm/LibSVMRelation.scala
@@ -34,6 +34,7 @@
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.AttributeReference
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
+import org.apache.spark.sql.execution.command.CreateDataSourceTableUtils
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types._
@@ -51,7 +52,7 @@
     new TextOutputFormat[NullWritable, Text]() {
       override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
         val configuration = context.getConfiguration
-        val uniqueWriteJobId = configuration.get("spark.sql.sources.writeJobUUID")
+        val uniqueWriteJobId = configuration.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID)
         val taskAttemptId = context.getTaskAttemptID
         val split = taskAttemptId.getTaskID.getId
         new Path(path, f"part-r-$split%05d-$uniqueWriteJobId$extension")

http://git-wip-us.apache.org/repos/asf/spark/blob/4538443e/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
index deedb68..4b9aab6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/createDataSourceTables.scala
@@ -256,15 +256,15 @@ case class CreateDataSourceTableAsSelectCommand(

 object CreateDataSourceTableUtils extends Logging {

-  // TODO: Actually replace usages with these variables (SPARK-15584)
-
   val DATASOURCE_PREFIX = "spark.sql.sources."
   val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider"
   val DATASOURCE_WRITEJOBUUID = DATASOURCE_PREFIX + "writeJobUUID"
   val DATASOURCE_OUTPUTPATH = DATASOURCE_PREFIX + "output.path"
-  val DATASOURCE_SCHEMA_PREFIX = DATASOURCE_PREFIX + "schema."
+  val
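The mechanical shape of the cleanup, sketched standalone (the constant names are from the diff above; the call site is illustrative):

  object CreateDataSourceTableUtils {
    val DATASOURCE_PREFIX = "spark.sql.sources."
    val DATASOURCE_PROVIDER = DATASOURCE_PREFIX + "provider"
    val DATASOURCE_WRITEJOBUUID = DATASOURCE_PREFIX + "writeJobUUID"
  }

  // A call site switches from re-typing the raw string...
  //   configuration.get("spark.sql.sources.writeJobUUID")
  // ...to referencing the single definition:
  //   configuration.get(CreateDataSourceTableUtils.DATASOURCE_WRITEJOBUUID)

Centralizing the property names means a typo in one call site becomes a compile error instead of a silently-missing configuration key.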
spark git commit: [SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties
Repository: spark
Updated Branches: refs/heads/branch-2.0 a355edeef -> a14c88acc

[SPARK-15584][SQL] Abstract duplicate code: `spark.sql.sources.` properties

The identical patch as the master commit above, cherry-picked to branch-2.0.

(cherry picked from commit 4538443e276597530a27c6922e48503677b13956)
Signed-off-by: Andrew Or

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a14c88ac
Branch: refs/heads/branch-2.0
Commit: a14c88acce0733f3db8b0508ae8b0417822e08d8
Parents: a355ede
Author: Dongjoon Hyun
Authored: Fri May 27 11:10:31 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:10:39 2016 -0700
spark git commit: [SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib
Repository: spark Updated Branches: refs/heads/branch-2.0 2cb84dd23 -> a355edeef

[SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib

## What changes were proposed in this pull request?

This PR replaces all deprecated `SQLContext` occurrences with `SparkSession` in the `ML/MLLib` module, except for the following two classes, which use `SQLContext` in their function signatures:

- ReadWrite.scala
- TreeModels.scala

## How was this patch tested?

Pass the existing Jenkins tests.

Author: Dongjoon Hyun

Closes #13352 from dongjoon-hyun/SPARK-15603.

(cherry picked from commit d24e251572d39a453293cabfe14e4aed25a55208)
Signed-off-by: Andrew Or

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a355edee
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a355edee
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a355edee
Branch: refs/heads/branch-2.0
Commit: a355edeefa16988da8b05d2539a91277e75e823c
Parents: 2cb84dd
Author: Dongjoon Hyun
Authored: Fri May 27 11:09:15 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:09:33 2016 -0700
--
 .../spark/ml/clustering/GaussianMixture.scala   |  7 ++--
 .../spark/ml/feature/SQLTransformer.scala       | 14
 .../org/apache/spark/ml/feature/Word2Vec.scala  | 16 -
 .../spark/mllib/api/python/PythonMLLibAPI.scala | 23 +++--
 .../classification/LogisticRegression.scala     | 19 +--
 .../spark/mllib/classification/NaiveBayes.scala | 24 ++---
 .../impl/GLMClassificationModel.scala           | 18 +-
 .../mllib/clustering/BisectingKMeansModel.scala | 12 +++
 .../mllib/clustering/GaussianMixtureModel.scala | 16 -
 .../spark/mllib/clustering/KMeansModel.scala    | 13 ---
 .../spark/mllib/clustering/LDAModel.scala       | 36 +---
 .../clustering/PowerIterationClustering.scala   | 12 +++
 .../spark/mllib/feature/ChiSqSelector.scala     | 13 ---
 .../apache/spark/mllib/feature/Word2Vec.scala   | 13 +++
 .../org/apache/spark/mllib/fpm/FPGrowth.scala   | 10 +++---
 .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 10 +++---
 .../MatrixFactorizationModel.scala              | 12 +++
 .../mllib/regression/IsotonicRegression.scala   | 12 +++
 .../regression/impl/GLMRegressionModel.scala    | 18 +-
 .../mllib/tree/model/DecisionTreeModel.scala    | 20 +--
 .../mllib/tree/model/treeEnsembleModels.scala   | 17 +
 .../ml/feature/QuantileDiscretizerSuite.scala   | 14
 .../mllib/util/MLlibTestSparkContext.scala      |  6 ++--
 23 files changed, 160 insertions(+), 195 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/a355edee/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 88b6b27..773e50e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -20,7 +20,6 @@ package org.apache.spark.ml.clustering
 import breeze.linalg.{DenseVector => BDV}
 import org.apache.hadoop.fs.Path
-import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.impl.Utils.EPSILON
@@ -33,7 +32,7 @@ import org.apache.spark.mllib.clustering.{GaussianMixture => MLlibGM}
 import org.apache.spark.mllib.linalg.{Matrices => OldMatrices, Matrix => OldMatrix, Vector => OldVector, Vectors => OldVectors, VectorUDT => OldVectorUDT}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}
+import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
 import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.sql.types.{IntegerType, StructType}
@@ -134,9 +133,7 @@ class GaussianMixtureModel private[ml] (
     val modelGaussians = gaussians.map { gaussian =>
       (OldVectors.fromML(gaussian.mean), OldMatrices.fromML(gaussian.cov))
     }
-    val sc = SparkContext.getOrCreate()
-    val sqlContext = SQLContext.getOrCreate(sc)
-    sqlContext.createDataFrame(modelGaussians).toDF("mean", "cov")
+    SparkSession.builder().getOrCreate().createDataFrame(modelGaussians).toDF("mean", "cov")
   }

 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/a355edee/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
--
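For readers migrating similar code, here is a minimal standalone sketch of the pattern this commit applies; the object, app, and column names are illustrative, not from the Spark source:

```scala
import org.apache.spark.sql.SparkSession

// Sketch only: the deprecated two-step setup
//   val sc = SparkContext.getOrCreate()
//   val sqlContext = SQLContext.getOrCreate(sc)
// collapses into a single builder call that creates or reuses the session.
object SqlContextToSparkSession {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("sqlcontext-to-sparksession")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    // createDataFrame/toDF behave on SparkSession as they did on SQLContext.
    val df = Seq((1.0, "mean"), (2.0, "cov")).toDF("value", "name")
    df.show()

    spark.stop()
  }
}
```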
spark git commit: [SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib
Repository: spark Updated Branches: refs/heads/master c17272902 -> d24e25157

[SPARK-15603][MLLIB] Replace SQLContext with SparkSession in ML/MLLib

## What changes were proposed in this pull request?

This PR replaces all deprecated `SQLContext` occurrences with `SparkSession` in the `ML/MLLib` module, except for the following two classes, which use `SQLContext` in their function signatures:

- ReadWrite.scala
- TreeModels.scala

## How was this patch tested?

Pass the existing Jenkins tests.

Author: Dongjoon Hyun

Closes #13352 from dongjoon-hyun/SPARK-15603.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d24e2515
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d24e2515
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d24e2515
Branch: refs/heads/master
Commit: d24e251572d39a453293cabfe14e4aed25a55208
Parents: c172729
Author: Dongjoon Hyun
Authored: Fri May 27 11:09:15 2016 -0700
Committer: Andrew Or
Committed: Fri May 27 11:09:15 2016 -0700
--
 .../spark/ml/clustering/GaussianMixture.scala   |  7 ++--
 .../spark/ml/feature/SQLTransformer.scala       | 14
 .../org/apache/spark/ml/feature/Word2Vec.scala  | 16 -
 .../spark/mllib/api/python/PythonMLLibAPI.scala | 23 +++--
 .../classification/LogisticRegression.scala     | 19 +--
 .../spark/mllib/classification/NaiveBayes.scala | 24 ++---
 .../impl/GLMClassificationModel.scala           | 18 +-
 .../mllib/clustering/BisectingKMeansModel.scala | 12 +++
 .../mllib/clustering/GaussianMixtureModel.scala | 16 -
 .../spark/mllib/clustering/KMeansModel.scala    | 13 ---
 .../spark/mllib/clustering/LDAModel.scala       | 36 +---
 .../clustering/PowerIterationClustering.scala   | 12 +++
 .../spark/mllib/feature/ChiSqSelector.scala     | 13 ---
 .../apache/spark/mllib/feature/Word2Vec.scala   | 13 +++
 .../org/apache/spark/mllib/fpm/FPGrowth.scala   | 10 +++---
 .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 10 +++---
 .../MatrixFactorizationModel.scala              | 12 +++
 .../mllib/regression/IsotonicRegression.scala   | 12 +++
 .../regression/impl/GLMRegressionModel.scala    | 18 +-
 .../mllib/tree/model/DecisionTreeModel.scala    | 20 +--
 .../mllib/tree/model/treeEnsembleModels.scala   | 17 +
 .../ml/feature/QuantileDiscretizerSuite.scala   | 14
 .../mllib/util/MLlibTestSparkContext.scala      |  6 ++--
 23 files changed, 160 insertions(+), 195 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/d24e2515/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 88b6b27..773e50e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -20,7 +20,6 @@ package org.apache.spark.ml.clustering
 import breeze.linalg.{DenseVector => BDV}
 import org.apache.hadoop.fs.Path
-import org.apache.spark.SparkContext
 import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.impl.Utils.EPSILON
@@ -33,7 +32,7 @@ import org.apache.spark.mllib.clustering.{GaussianMixture => MLlibGM}
 import org.apache.spark.mllib.linalg.{Matrices => OldMatrices, Matrix => OldMatrix, Vector => OldVector, Vectors => OldVectors, VectorUDT => OldVectorUDT}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Dataset, Row, SQLContext}
+import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
 import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.sql.types.{IntegerType, StructType}
@@ -134,9 +133,7 @@ class GaussianMixtureModel private[ml] (
     val modelGaussians = gaussians.map { gaussian =>
       (OldVectors.fromML(gaussian.mean), OldMatrices.fromML(gaussian.cov))
     }
-    val sc = SparkContext.getOrCreate()
-    val sqlContext = SQLContext.getOrCreate(sc)
-    sqlContext.createDataFrame(modelGaussians).toDF("mean", "cov")
+    SparkSession.builder().getOrCreate().createDataFrame(modelGaussians).toDF("mean", "cov")
   }

 /**

http://git-wip-us.apache.org/repos/asf/spark/blob/d24e2515/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/SQLTransformer.scala
spark git commit: [SPARK-15565][SQL] Add the File Scheme to the Default Value of WAREHOUSE_PATH
Repository: spark Updated Branches: refs/heads/branch-2.0 b430aa98c -> 2cb84dd23

[SPARK-15565][SQL] Add the File Scheme to the Default Value of WAREHOUSE_PATH

## What changes were proposed in this pull request?

The default value of `spark.sql.warehouse.dir` is `System.getProperty("user.dir")/spark-warehouse`. Since `System.getProperty("user.dir")` is a local dir, we should explicitly set the scheme to the local filesystem.

cc yhuai

## How was this patch tested?

Added two test cases.

Author: gatorsmile

Closes #13348 from gatorsmile/addSchemeToDefaultWarehousePath.

(cherry picked from commit c17272902c95290beca274ee6316a8a98fd7a725)
Signed-off-by: Yin Huai

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2cb84dd2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2cb84dd2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2cb84dd2
Branch: refs/heads/branch-2.0
Commit: 2cb84dd2356e782b9e606cd126057726fcf6f228
Parents: b430aa9
Author: gatorsmile
Authored: Fri May 27 09:54:31 2016 -0700
Committer: Yin Huai
Committed: Fri May 27 09:54:43 2016 -0700
--
 .../org/apache/spark/sql/internal/SQLConf.scala |  2 +-
 .../spark/sql/execution/command/DDLSuite.scala  | 25
 .../spark/sql/internal/SQLConfSuite.scala       | 12 ++
 3 files changed, 38 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/2cb84dd2/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 4efefda..d1db0dd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -55,7 +55,7 @@ object SQLConf {
   val WAREHOUSE_PATH = SQLConfigBuilder("spark.sql.warehouse.dir")
     .doc("The default location for managed databases and tables.")
     .stringConf
-    .createWithDefault("${system:user.dir}/spark-warehouse")
+    .createWithDefault("file:${system:user.dir}/spark-warehouse")

   val OPTIMIZER_MAX_ITERATIONS = SQLConfigBuilder("spark.sql.optimizer.maxIterations")
     .internal()

http://git-wip-us.apache.org/repos/asf/spark/blob/2cb84dd2/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index e32521a..e975756 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -171,6 +171,31 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
     }
   }

+  test("Create Database using Default Warehouse Path") {
+    withSQLConf(SQLConf.WAREHOUSE_PATH.key -> "") {
+      // Will use the default location if and only if we unset the conf
+      spark.conf.unset(SQLConf.WAREHOUSE_PATH.key)
+      val catalog = spark.sessionState.catalog
+      val dbName = "db1"
+      try {
+        sql(s"CREATE DATABASE $dbName")
+        val db1 = catalog.getDatabaseMetadata(dbName)
+        val expectedLocation =
+          "file:" + appendTrailingSlash(System.getProperty("user.dir")) +
+            s"spark-warehouse/$dbName.db"
+        assert(db1 == CatalogDatabase(
+          dbName,
+          "",
+          expectedLocation,
+          Map.empty))
+        sql(s"DROP DATABASE $dbName CASCADE")
+        assert(!catalog.databaseExists(dbName))
+      } finally {
+        catalog.reset()
+      }
+    }
+  }
+
   test("Create/Drop Database - location") {
     val catalog = spark.sessionState.catalog
     val databaseNames = Seq("db1", "`database`")

http://git-wip-us.apache.org/repos/asf/spark/blob/2cb84dd2/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
index ad5365a..3d4fc75 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
@@ -207,4 +207,16 @@ class SQLConfSuite extends QueryTest with SharedSQLContext {
   }
 }

+  test("default value of WAREHOUSE_PATH") {
+    val
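As context for the new default, here is a minimal sketch of pinning the warehouse location explicitly with a `file:` scheme, which is what the patched default now does implicitly; the object name, app name, and database name are illustrative, not from the patch:

```scala
import org.apache.spark.sql.SparkSession

object WarehousePathSketch {
  def main(args: Array[String]): Unit = {
    // Build the path the way the new default does: the local user.dir plus an
    // explicit file: scheme, so the path is not resolved against a remote
    // default filesystem such as HDFS.
    val userDir = System.getProperty("user.dir")
    val spark = SparkSession.builder()
      .appName("warehouse-path-sketch")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", s"file:$userDir/spark-warehouse")
      .getOrCreate()

    // Managed databases and tables now resolve under that local URI.
    spark.sql("CREATE DATABASE IF NOT EXISTS db1")
    println(spark.conf.get("spark.sql.warehouse.dir"))

    spark.sql("DROP DATABASE IF EXISTS db1 CASCADE")
    spark.stop()
  }
}
```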
spark git commit: [SPARK-15565][SQL] Add the File Scheme to the Default Value of WAREHOUSE_PATH
Repository: spark Updated Branches: refs/heads/master 6f95c6c03 -> c17272902

[SPARK-15565][SQL] Add the File Scheme to the Default Value of WAREHOUSE_PATH

## What changes were proposed in this pull request?

The default value of `spark.sql.warehouse.dir` is `System.getProperty("user.dir")/spark-warehouse`. Since `System.getProperty("user.dir")` is a local dir, we should explicitly set the scheme to the local filesystem.

cc yhuai

## How was this patch tested?

Added two test cases.

Author: gatorsmile

Closes #13348 from gatorsmile/addSchemeToDefaultWarehousePath.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c1727290
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c1727290
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c1727290
Branch: refs/heads/master
Commit: c17272902c95290beca274ee6316a8a98fd7a725
Parents: 6f95c6c
Author: gatorsmile
Authored: Fri May 27 09:54:31 2016 -0700
Committer: Yin Huai
Committed: Fri May 27 09:54:31 2016 -0700
--
 .../org/apache/spark/sql/internal/SQLConf.scala |  2 +-
 .../spark/sql/execution/command/DDLSuite.scala  | 25
 .../spark/sql/internal/SQLConfSuite.scala       | 12 ++
 3 files changed, 38 insertions(+), 1 deletion(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/c1727290/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 4efefda..d1db0dd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -55,7 +55,7 @@ object SQLConf {
   val WAREHOUSE_PATH = SQLConfigBuilder("spark.sql.warehouse.dir")
     .doc("The default location for managed databases and tables.")
     .stringConf
-    .createWithDefault("${system:user.dir}/spark-warehouse")
+    .createWithDefault("file:${system:user.dir}/spark-warehouse")

   val OPTIMIZER_MAX_ITERATIONS = SQLConfigBuilder("spark.sql.optimizer.maxIterations")
     .internal()

http://git-wip-us.apache.org/repos/asf/spark/blob/c1727290/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
index e32521a..e975756 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala
@@ -171,6 +171,31 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach {
     }
   }

+  test("Create Database using Default Warehouse Path") {
+    withSQLConf(SQLConf.WAREHOUSE_PATH.key -> "") {
+      // Will use the default location if and only if we unset the conf
+      spark.conf.unset(SQLConf.WAREHOUSE_PATH.key)
+      val catalog = spark.sessionState.catalog
+      val dbName = "db1"
+      try {
+        sql(s"CREATE DATABASE $dbName")
+        val db1 = catalog.getDatabaseMetadata(dbName)
+        val expectedLocation =
+          "file:" + appendTrailingSlash(System.getProperty("user.dir")) +
+            s"spark-warehouse/$dbName.db"
+        assert(db1 == CatalogDatabase(
+          dbName,
+          "",
+          expectedLocation,
+          Map.empty))
+        sql(s"DROP DATABASE $dbName CASCADE")
+        assert(!catalog.databaseExists(dbName))
+      } finally {
+        catalog.reset()
+      }
+    }
+  }
+
   test("Create/Drop Database - location") {
     val catalog = spark.sessionState.catalog
     val databaseNames = Seq("db1", "`database`")

http://git-wip-us.apache.org/repos/asf/spark/blob/c1727290/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
index ad5365a..3d4fc75 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
@@ -207,4 +207,16 @@ class SQLConfSuite extends QueryTest with SharedSQLContext {
   }
 }

+  test("default value of WAREHOUSE_PATH") {
+    val original = spark.conf.get(SQLConf.WAREHOUSE_PATH)
+    try {
+      // to get the default value, always unset it
+
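The truncated `SQLConfSuite` hunk above uses a save/unset/restore idiom to observe a conf's default value. A self-contained sketch of that idiom follows; the object and app names are illustrative, not the suite's code:

```scala
import org.apache.spark.sql.SparkSession

object ConfDefaultSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("conf-default-sketch")
      .master("local[*]")
      .getOrCreate()

    val key = "spark.sql.warehouse.dir"
    val original = spark.conf.get(key)    // save the caller's value
    try {
      spark.conf.unset(key)               // fall back to the built-in default
      println(spark.conf.get(key))        // with this patch, starts with "file:"
    } finally {
      spark.conf.set(key, original)       // always restore the saved value
      spark.stop()
    }
  }
}
```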
spark git commit: [SPARK-15431][SQL][HOTFIX] ignore 'list' command testcase from CliSuite for now
Repository: spark Updated Branches: refs/heads/branch-2.0 b3845fede -> b430aa98c

[SPARK-15431][SQL][HOTFIX] ignore 'list' command testcase from CliSuite for now

## What changes were proposed in this pull request?

The test cases for the `list` command added to `CliSuite` by PR #13212 cannot run in some Jenkins jobs after being merged. However, some Jenkins jobs can pass:

https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.6/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.4/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.2/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.3/

Others failed on this test case, and those failures occur at slightly different checkpoints across jobs, so it seems that CliSuite's output capture is flaky when checking `list` commands for expected output. Test cases already exist in `HiveQuerySuite` and `SparkContextSuite` to cover these cases, so I am ignoring the 2 test cases added by PR #13212.

Author: Xin Wu

Closes #13276 from xwu0226/SPARK-15431-clisuite.

(cherry picked from commit 6f95c6c030db0057de213733c2bd3453463bc6f2)
Signed-off-by: Yin Huai

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b430aa98
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b430aa98
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b430aa98
Branch: refs/heads/branch-2.0
Commit: b430aa98caa16978cd53dd354423cac45410c284
Parents: b3845fe
Author: Xin Wu
Authored: Fri May 27 08:54:14 2016 -0700
Committer: Yin Huai
Committed: Fri May 27 08:54:54 2016 -0700
--
 .../scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/b430aa98/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
--
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index 2bf0221..656fe97 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -239,7 +239,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging {
     "" -> "This is a test for Spark-11624")
   }

-  test("list jars") {
+  ignore("list jars") {
     val jarFile = Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar")
     runCliWithin(2.minute)(
       s"ADD JAR $jarFile" -> "",
@@ -248,7 +248,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging {
     )
   }

-  test("list files") {
+  ignore("list files") {
     val dataFilePath = Thread.currentThread().getContextClassLoader
       .getResource("data/files/small_kv.txt")
     runCliWithin(2.minute)(
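For reference, a minimal sketch of the `test` → `ignore` switch in ScalaTest's `FunSuite` (which `SparkFunSuite` extends); the suite name and assertions are illustrative:

```scala
import org.scalatest.FunSuite

// Sketch only: ignore(...) keeps the body compiled and reports the case as
// ignored, so a flaky test can be re-enabled later by renaming it back to
// test(...) with no other changes.
class FlakyCliSketch extends FunSuite {
  test("stable case still runs") {
    assert(1 + 1 === 2)
  }

  ignore("flaky case is skipped but still compiles") {
    assert(sys.env.contains("PATH"))
  }
}
```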
spark git commit: [SPARK-15431][SQL][HOTFIX] ignore 'list' command testcase from CliSuite for now
Repository: spark Updated Branches: refs/heads/master d5911d117 -> 6f95c6c03

[SPARK-15431][SQL][HOTFIX] ignore 'list' command testcase from CliSuite for now

## What changes were proposed in this pull request?

The test cases for the `list` command added to `CliSuite` by PR #13212 cannot run in some Jenkins jobs after being merged. However, some Jenkins jobs can pass:

https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.6/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.4/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.2/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.7/
https://amplab.cs.berkeley.edu/jenkins/job/spark-master-test-sbt-hadoop-2.3/

Others failed on this test case, and those failures occur at slightly different checkpoints across jobs, so it seems that CliSuite's output capture is flaky when checking `list` commands for expected output. Test cases already exist in `HiveQuerySuite` and `SparkContextSuite` to cover these cases, so I am ignoring the 2 test cases added by PR #13212.

Author: Xin Wu

Closes #13276 from xwu0226/SPARK-15431-clisuite.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f95c6c0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f95c6c0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f95c6c0
Branch: refs/heads/master
Commit: 6f95c6c030db0057de213733c2bd3453463bc6f2
Parents: d5911d1
Author: Xin Wu
Authored: Fri May 27 08:54:14 2016 -0700
Committer: Yin Huai
Committed: Fri May 27 08:54:14 2016 -0700
--
 .../scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/6f95c6c0/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
--
diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
index 2bf0221..656fe97 100644
--- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
+++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/CliSuite.scala
@@ -239,7 +239,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging {
     "" -> "This is a test for Spark-11624")
   }

-  test("list jars") {
+  ignore("list jars") {
     val jarFile = Thread.currentThread().getContextClassLoader.getResource("TestUDTF.jar")
     runCliWithin(2.minute)(
       s"ADD JAR $jarFile" -> "",
@@ -248,7 +248,7 @@ class CliSuite extends SparkFunSuite with BeforeAndAfterAll with Logging {
     )
   }

-  test("list files") {
+  ignore("list files") {
     val dataFilePath = Thread.currentThread().getContextClassLoader
       .getResource("data/files/small_kv.txt")
     runCliWithin(2.minute)(