spark git commit: [SPARK-7050] [BUILD] Fix Python Kafka test assembly jar not found issue under Maven build
Repository: spark Updated Branches: refs/heads/master 351a36d0c - 8a9d9cc15 [SPARK-7050] [BUILD] Fix Python Kafka test assembly jar not found issue under Maven build To fix Spark Streaming unit test with maven build. Previously the name and path of maven generated jar is different from sbt, which will lead to following exception. This fix keep the same behavior with both Maven and sbt build. ``` Failed to find Spark Streaming Kafka assembly jar in /home/xyz/spark/external/kafka-assembly You need to build Spark with 'build/sbt assembly/assembly streaming-kafka-assembly/assembly' or 'build/mvn package' before running this program ``` Author: jerryshao saisai.s...@intel.com Closes #5632 from jerryshao/SPARK-7050 and squashes the following commits: 74b068d [jerryshao] Fix mvn build issue Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8a9d9cc1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8a9d9cc1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8a9d9cc1 Branch: refs/heads/master Commit: 8a9d9cc1561cf157793c90db6700ffa6f1f00a69 Parents: 351a36d Author: jerryshao saisai.s...@intel.com Authored: Wed Jul 8 12:23:32 2015 +0100 Committer: Sean Owen so...@cloudera.com Committed: Wed Jul 8 12:23:32 2015 +0100 -- external/kafka-assembly/pom.xml | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8a9d9cc1/external/kafka-assembly/pom.xml -- diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index 8059c44..977514f 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -58,6 +58,7 @@ artifactIdmaven-shade-plugin/artifactId configuration shadedArtifactAttachedfalse/shadedArtifactAttached + outputFile${project.build.directory}/scala-${scala.binary.version}/spark-streaming-kafka-assembly-${project.version}.jar/outputFile artifactSet includes include*:*/include - To unsubscribe, e-mail: 
commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8872] [MLLIB] added verification results from R for FPGrowthSuite
Repository: spark Updated Branches: refs/heads/master 8a9d9cc15 - 3bb217750 [SPARK-8872] [MLLIB] added verification results from R for FPGrowthSuite Author: Kashif Rasul kashif.ra...@gmail.com Closes #7269 from kashif/SPARK-8872 and squashes the following commits: 2d5457f [Kashif Rasul] added R code for FP Int type 3de6808 [Kashif Rasul] added verification results from R for FPGrowthSuite Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3bb21775 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3bb21775 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3bb21775 Branch: refs/heads/master Commit: 3bb217750ada18a49c40d974ac57050ef2abfd2c Parents: 8a9d9cc Author: Kashif Rasul kashif.ra...@gmail.com Authored: Wed Jul 8 08:44:58 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Jul 8 08:44:58 2015 -0700 -- .../apache/spark/mllib/fpm/FPGrowthSuite.scala | 114 +++ 1 file changed, 114 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3bb21775/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala index 66ae354..ddc296a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala @@ -39,6 +39,22 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { .setMinSupport(0.9) .setNumPartitions(1) .run(rdd) + +/* Verify results using the `R` code: + transactions = as(sapply( + list(r z h k p, + z y x w v u t s, + s x o n r, + x z y m t s q e, + z, + x z y r q t p), + FUN=function(x) strsplit(x, ,fixed=TRUE)), + transactions) +eclat(transactions, parameter = list(support = 0.9)) + ... 
+ eclat - zero frequent items + set of 0 itemsets + */ assert(model6.freqItemsets.count() === 0) val model3 = fpg @@ -48,6 +64,33 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { val freqItemsets3 = model3.freqItemsets.collect().map { itemset = (itemset.items.toSet, itemset.freq) } + +/* Verify results using the `R` code: + fp = eclat(transactions, parameter = list(support = 0.5)) + fpDF = as(sort(fp), data.frame) + fpDF$support = fpDF$support * length(transactions) + names(fpDF)[names(fpDF) == support] = freq +fpDF + items freq + 13 {z}5 + 14 {x}4 + 1 {s,x}3 + 2 {t,x,y,z}3 + 3{t,y,z}3 + 4{t,x,y}3 + 5{x,y,z}3 + 6 {y,z}3 + 7 {x,y}3 + 8 {t,y}3 + 9{t,x,z}3 + 10 {t,z}3 + 11 {t,x}3 + 12 {x,z}3 + 15 {t}3 + 16 {y}3 + 17 {s}3 + 18 {r}3 + */ val expected = Set( (Set(s), 3L), (Set(z), 5L), (Set(x), 4L), (Set(t), 3L), (Set(y), 3L), (Set(r), 3L), @@ -62,12 +105,30 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { .setMinSupport(0.3) .setNumPartitions(4) .run(rdd) + +/* Verify results using the `R` code: + fp = eclat(transactions, parameter = list(support = 0.3)) + fpDF = as(fp, data.frame) + fpDF$support = fpDF$support * length(transactions) + names(fpDF)[names(fpDF) == support] = freq +nrow(fpDF) + [1] 54 + */ assert(model2.freqItemsets.count() === 54) val model1 = fpg .setMinSupport(0.1) .setNumPartitions(8) .run(rdd) + +/* Verify results using the `R` code: + fp = eclat(transactions, parameter = list(support = 0.1)) + fpDF = as(fp, data.frame) + fpDF$support = fpDF$support * length(transactions) + names(fpDF)[names(fpDF) == support] = freq +nrow(fpDF) + [1] 625 + */ assert(model1.freqItemsets.count() === 625) } @@ -89,6 +150,23 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { .setMinSupport(0.9) .setNumPartitions(1) .run(rdd) + +/* Verify results using the `R` code: + transactions = as(sapply( + list(1 2 3, + 1 2 3 4, + 5 4 3 2 1, + 6 5 4 3 2 1, + 2 4, + 1 3, + 1 7), + FUN=function(x) strsplit(x, 
,fixed=TRUE)), + transactions) +eclat(transactions,
spark git commit: [SPARK-8657] [YARN] [HOTFIX] Fail to upload resource to viewfs
Repository: spark Updated Branches: refs/heads/branch-1.4 e91d87e66 - e4313db38 [SPARK-8657] [YARN] [HOTFIX] Fail to upload resource to viewfs Fail to upload resource to viewfs in spark-1.4 JIRA Link: https://issues.apache.org/jira/browse/SPARK-8657 Author: Tao Li li...@sogou-inc.com Closes #7125 from litao-buptsse/SPARK-8657-for-master and squashes the following commits: 65b13f4 [Tao Li] [SPARK-8657] [YARN] Fail to upload resource to viewfs (cherry picked from commit 26d9b6b8cae9ac6593f78ab98dd45a25d03cf71c) Signed-off-by: Sean Owen so...@cloudera.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e4313db3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e4313db3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e4313db3 Branch: refs/heads/branch-1.4 Commit: e4313db38e81f6288f1704c22e17d0c6e81b4d75 Parents: e91d87e Author: Tao Li li...@sogou-inc.com Authored: Wed Jul 8 19:02:24 2015 +0100 Committer: Sean Owen so...@cloudera.com Committed: Wed Jul 8 19:23:29 2015 +0100 -- .../org/apache/spark/deploy/yarn/Client.scala | 57 ++-- 1 file changed, 4 insertions(+), 53 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e4313db3/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 38e5926..cc0aa45 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -304,57 +304,6 @@ private[spark] class Client( } /** - * Distribute a file to the cluster. - * - * If the file's path is a local: URI, it's actually not distributed. Other files are copied - * to HDFS (if not already there) and added to the application's distributed cache. - * - * @param path URI of the file to distribute. - * @param resType Type of resource being distributed. 
- * @param destName Name of the file in the distributed cache. - * @param targetDir Subdirectory where to place the file. - * @param appMasterOnly Whether to distribute only to the AM. - * @return A 2-tuple. First item is whether the file is a local: URI. Second item is the - * localized path for non-local paths, or the input `path` for local paths. - * The localized path will be null if the URI has already been added to the cache. - */ -def distribute( -path: String, -resType: LocalResourceType = LocalResourceType.FILE, -destName: Option[String] = None, -targetDir: Option[String] = None, -appMasterOnly: Boolean = false): (Boolean, String) = { - val localURI = new URI(path.trim()) - if (localURI.getScheme != LOCAL_SCHEME) { -if (addDistributedUri(localURI)) { - val localPath = getQualifiedLocalPath(localURI, hadoopConf) - val linkname = targetDir.map(_ + /).getOrElse() + - destName.orElse(Option(localURI.getFragment())).getOrElse(localPath.getName()) - val destPath = copyFileToRemote(dst, localPath, replication) - distCacheMgr.addResource( -fs, hadoopConf, destPath, localResources, resType, linkname, statCache, -appMasterOnly = appMasterOnly) - (false, linkname) -} else { - (false, null) -} - } else { -(true, path.trim()) - } -} - -// If we passed in a keytab, make sure we copy the keytab to the staging directory on -// HDFS, and setup the relevant environment vars, so the AM can login again. -if (loginFromKeytab) { - logInfo(To enable the AM to login from keytab, credentials are being copied over to the AM + - via the YARN Secure Distributed Cache.) - val (_, localizedPath) = distribute(args.keytab, -destName = Some(sparkConf.get(spark.yarn.keytab)), -appMasterOnly = true) - require(localizedPath != null, Keytab file already distributed.) -} - -/** * Copy the given main resource to the distributed cache if the scheme is not local. * Otherwise, set the corresponding key in our SparkConf to handle it downstream. 
* Each resource is represented by a 3-tuple of: @@ -389,7 +338,8 @@ private[spark] class Client( createConfArchive().foreach { file = require(addDistributedUri(file.toURI())) val destPath = copyFileToRemote(dst, new Path(file.toURI()), replication) - distCacheMgr.addResource(fs, hadoopConf, destPath, localResources, LocalResourceType.ARCHIVE, + val destFs = FileSystem.get(destPath.toUri(), hadoopConf) +
spark git commit: [SPARK-8785] [SQL] Improve Parquet schema merging
Repository: spark Updated Branches: refs/heads/master bf02e3771 - 6722aca80 [SPARK-8785] [SQL] Improve Parquet schema merging JIRA: https://issues.apache.org/jira/browse/SPARK-8785 Currently, the parquet schema merging (`ParquetRelation2.readSchema`) may spend much time to merge duplicate schema. We can select only non duplicate schema and merge them later. Author: Liang-Chi Hsieh vii...@gmail.com Author: Liang-Chi Hsieh vii...@appier.com Closes #7182 from viirya/improve_parquet_merging and squashes the following commits: 5cf934f [Liang-Chi Hsieh] Refactor it to make it faster. f3411ea [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into improve_parquet_merging a63c3ff [Liang-Chi Hsieh] Improve Parquet schema merging. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6722aca8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6722aca8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6722aca8 Branch: refs/heads/master Commit: 6722aca809ddc28aa20abf3bbb2e0de8629a9903 Parents: bf02e37 Author: Liang-Chi Hsieh vii...@gmail.com Authored: Wed Jul 8 10:09:50 2015 -0700 Committer: Cheng Lian l...@databricks.com Committed: Wed Jul 8 10:09:50 2015 -0700 -- .../apache/spark/sql/parquet/newParquet.scala | 82 1 file changed, 48 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6722aca8/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala index 6bc69c6..ce456e7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala @@ -21,6 +21,7 @@ import java.net.URI import java.util.{List = JList} import scala.collection.JavaConversions._ +import scala.collection.mutable import 
scala.util.Try import com.google.common.base.Objects @@ -30,8 +31,9 @@ import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.parquet.filter2.predicate.FilterApi import org.apache.parquet.hadoop._ -import org.apache.parquet.hadoop.metadata.CompressionCodecName +import org.apache.parquet.hadoop.metadata.{FileMetaData, CompressionCodecName} import org.apache.parquet.hadoop.util.ContextUtil +import org.apache.parquet.schema.MessageType import org.apache.spark.broadcast.Broadcast import org.apache.spark.deploy.SparkHadoopUtil @@ -508,44 +510,56 @@ private[sql] object ParquetRelation2 extends Logging { private[parquet] def readSchema( footers: Seq[Footer], sqlContext: SQLContext): Option[StructType] = { -footers.map { footer = + +def parseParquetSchema(schema: MessageType): StructType = { + StructType.fromAttributes( +// TODO Really no need to use `Attribute` here, we only need to know the data type. +ParquetTypesConverter.convertToAttributes( + schema, + sqlContext.conf.isParquetBinaryAsString, + sqlContext.conf.isParquetINT96AsTimestamp)) +} + +val seen = mutable.HashSet[String]() +val finalSchemas: Seq[StructType] = footers.flatMap { footer = val metadata = footer.getParquetMetadata.getFileMetaData - val parquetSchema = metadata.getSchema - val maybeSparkSchema = metadata + val serializedSchema = metadata .getKeyValueMetaData .toMap .get(RowReadSupport.SPARK_METADATA_KEY) -.flatMap { serializedSchema = - // Don't throw even if we failed to parse the serialized Spark schema. Just fallback to - // whatever is available. - Try(DataType.fromJson(serializedSchema)) -.recover { case _: Throwable = - logInfo( -sSerialized Spark schema in Parquet key-value metadata is not in JSON format, + - falling back to the deprecated DataType.fromCaseClassString parser.) 
- DataType.fromCaseClassString(serializedSchema) -} -.recover { case cause: Throwable = - logWarning( -sFailed to parse serialized Spark schema in Parquet key-value metadata: - |\t$serializedSchema - .stripMargin, -cause) -} -.map(_.asInstanceOf[StructType]) -.toOption -} - - maybeSparkSchema.getOrElse { -// Falls back to Parquet schema if Spark SQL schema is absent. -StructType.fromAttributes( - // TODO Really no need to use `Attribute` here, we only need to know the data type. -
spark git commit: [SPARK-6912] [SQL] Throw an AnalysisException when unsupported Java Map<K, V> types used in Hive UDF
Repository: spark Updated Branches: refs/heads/master 6722aca80 - 3e831a269 [SPARK-6912] [SQL] Throw an AnalysisException when unsupported Java MapK,V types used in Hive UDF To make UDF developers understood, throw an exception when unsupported MapK,V types used in Hive UDF. This fix is the same with #7248. Author: Takeshi YAMAMURO linguin@gmail.com Closes #7257 from maropu/ThrowExceptionWhenMapUsed and squashes the following commits: 916099a [Takeshi YAMAMURO] Fix style errors 7886dcc [Takeshi YAMAMURO] Throw an exception when Map used in Hive UDF Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3e831a26 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3e831a26 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3e831a26 Branch: refs/heads/master Commit: 3e831a26965a5e92210431f9ad6935f70aa01b48 Parents: 6722aca Author: Takeshi YAMAMURO linguin@gmail.com Authored: Wed Jul 8 10:33:27 2015 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Wed Jul 8 10:33:27 2015 -0700 -- .../apache/spark/sql/hive/HiveInspectors.scala | 6 .../sql/hive/execution/UDFToIntIntMap.java | 35 .../sql/hive/execution/UDFToStringIntMap.java | 35 .../spark/sql/hive/execution/HiveUDFSuite.scala | 32 ++ 4 files changed, 108 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3e831a26/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 924e2ba..4cba175 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -226,6 +226,12 @@ private[hive] trait HiveInspectors { List type in java is unsupported because + JVM type erasure makes spark fail to catch a component type in List) +// java map type 
unsupported +case c: Class[_] if c == classOf[java.util.Map[_, _]] = + throw new AnalysisException( +Map type in java is unsupported because + +JVM type erasure makes spark fail to catch key and value types in Map) + case c = throw new AnalysisException(sUnsupported java type $c) } http://git-wip-us.apache.org/repos/asf/spark/blob/3e831a26/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToIntIntMap.java -- diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToIntIntMap.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToIntIntMap.java new file mode 100644 index 000..b3e8bcb --- /dev/null +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToIntIntMap.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.hive.execution; + +import org.apache.hadoop.hive.ql.exec.UDF; + +import java.util.HashMap; +import java.util.Map; + +public class UDFToIntIntMap extends UDF { +public MapInteger, Integer evaluate(Object o) { +return new HashMapInteger, Integer() { +{ +put(1, 1); +put(2, 1); +put(3, 1); +} +}; +} +} http://git-wip-us.apache.org/repos/asf/spark/blob/3e831a26/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToStringIntMap.java -- diff --git a/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToStringIntMap.java b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToStringIntMap.java new file mode 100644 index 000..9eea5c9 --- /dev/null +++ b/sql/hive/src/test/java/org/apache/spark/sql/hive/execution/UDFToStringIntMap.java @@ -0,0 +1,35 @@ +/* + *
spark git commit: [SPARK-8894] [SPARKR] [DOC] Example code errors in SparkR documentation.
Repository: spark Updated Branches: refs/heads/master 3bb217750 - bf02e3771 [SPARK-8894] [SPARKR] [DOC] Example code errors in SparkR documentation. Author: Sun Rui rui@intel.com Closes #7287 from sun-rui/SPARK-8894 and squashes the following commits: da63898 [Sun Rui] [SPARK-8894][SPARKR][DOC] Example code errors in SparkR documentation. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bf02e377 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bf02e377 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bf02e377 Branch: refs/heads/master Commit: bf02e377168f39459d5c216e939097ae5705f573 Parents: 3bb2177 Author: Sun Rui rui@intel.com Authored: Wed Jul 8 09:48:16 2015 -0700 Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu Committed: Wed Jul 8 09:48:16 2015 -0700 -- docs/sparkr.md| 2 +- docs/sql-programming-guide.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bf02e377/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 095ea43..1197c0d 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -116,7 +116,7 @@ sql(hiveContext, CREATE TABLE IF NOT EXISTS src (key INT, value STRING)) sql(hiveContext, LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src) # Queries can be expressed in HiveQL. 
-results - hiveContext.sql(FROM src SELECT key, value) +results - sql(hiveContext, FROM src SELECT key, value) # results is now a DataFrame head(results) http://git-wip-us.apache.org/repos/asf/spark/blob/bf02e377/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 7c25578..26f6ba8 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1637,7 +1637,7 @@ sql(sqlContext, CREATE TABLE IF NOT EXISTS src (key INT, value STRING)) sql(sqlContext, LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src) # Queries can be expressed in HiveQL. -results = sqlContext.sql(FROM src SELECT key, value).collect() +results - collect(sql(sqlContext, FROM src SELECT key, value)) {% endhighlight %} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8894] [SPARKR] [DOC] Example code errors in SparkR documentation.
Repository: spark Updated Branches: refs/heads/branch-1.4 d3d5f2ab2 - de49916ab [SPARK-8894] [SPARKR] [DOC] Example code errors in SparkR documentation. Author: Sun Rui rui@intel.com Closes #7287 from sun-rui/SPARK-8894 and squashes the following commits: da63898 [Sun Rui] [SPARK-8894][SPARKR][DOC] Example code errors in SparkR documentation. (cherry picked from commit bf02e377168f39459d5c216e939097ae5705f573) Signed-off-by: Shivaram Venkataraman shiva...@cs.berkeley.edu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/de49916a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/de49916a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/de49916a Branch: refs/heads/branch-1.4 Commit: de49916ab61a88566a352a6af02319e44d92930f Parents: d3d5f2a Author: Sun Rui rui@intel.com Authored: Wed Jul 8 09:48:16 2015 -0700 Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu Committed: Wed Jul 8 09:48:29 2015 -0700 -- docs/sparkr.md| 2 +- docs/sql-programming-guide.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/de49916a/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 095ea43..1197c0d 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -116,7 +116,7 @@ sql(hiveContext, CREATE TABLE IF NOT EXISTS src (key INT, value STRING)) sql(hiveContext, LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src) # Queries can be expressed in HiveQL. 
-results - hiveContext.sql(FROM src SELECT key, value) +results - sql(hiveContext, FROM src SELECT key, value) # results is now a DataFrame head(results) http://git-wip-us.apache.org/repos/asf/spark/blob/de49916a/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 572c678..9c689e9 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1518,7 +1518,7 @@ sql(sqlContext, CREATE TABLE IF NOT EXISTS src (key INT, value STRING)) sql(sqlContext, LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src) # Queries can be expressed in HiveQL. -results = sqlContext.sql(FROM src SELECT key, value).collect() +results - collect(sql(sqlContext, FROM src SELECT key, value)) {% endhighlight %} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8753][SQL] Create an IntervalType data type
Repository: spark Updated Branches: refs/heads/master 74335b310 - 0ba98c04c [SPARK-8753][SQL] Create an IntervalType data type We need a new data type to represent time intervals. Because we can't determine how many days in a month, so we need 2 values for interval: a int `months`, a long `microseconds`. The interval literal syntax looks like: `interval 3 years -4 month 4 weeks 3 second` Because we use number of 100ns as value of `TimestampType`, so it may not makes sense to support nano second unit. Author: Wenchen Fan cloud0...@outlook.com Closes #7226 from cloud-fan/interval and squashes the following commits: 632062d [Wenchen Fan] address comments ac348c3 [Wenchen Fan] use case class 0342d2e [Wenchen Fan] use array byte df9256c [Wenchen Fan] fix style fd6f18a [Wenchen Fan] address comments 1856af3 [Wenchen Fan] support interval type Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0ba98c04 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0ba98c04 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0ba98c04 Branch: refs/heads/master Commit: 0ba98c04c726a827df8cb19b0db17c352a647960 Parents: 74335b3 Author: Wenchen Fan cloud0...@outlook.com Authored: Wed Jul 8 10:51:32 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 10:51:32 2015 -0700 -- .../org/apache/spark/sql/types/DataTypes.java | 5 ++ .../apache/spark/sql/catalyst/SqlParser.scala | 86 +++- .../apache/spark/sql/types/IntervalType.scala | 37 + .../apache/spark/sql/types/TimestampType.scala | 2 +- .../org/apache/spark/sql/sources/ddl.scala | 3 + .../org/apache/spark/sql/SQLQuerySuite.scala| 25 ++ .../org/apache/spark/unsafe/types/Interval.java | 47 +++ 7 files changed, 185 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0ba98c04/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java -- diff --git 
a/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java b/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java index e457542..d22ad67 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/types/DataTypes.java @@ -50,6 +50,11 @@ public class DataTypes { public static final DataType TimestampType = TimestampType$.MODULE$; /** + * Gets the IntervalType object. + */ + public static final DataType IntervalType = IntervalType$.MODULE$; + + /** * Gets the DoubleType object. */ public static final DataType DoubleType = DoubleType$.MODULE$; http://git-wip-us.apache.org/repos/asf/spark/blob/0ba98c04/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala index e8e9b98..dedd8c8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala @@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.Interval /** * A very simple SQL parser. 
Based loosely on: @@ -72,6 +73,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser { protected val INNER = Keyword(INNER) protected val INSERT = Keyword(INSERT) protected val INTERSECT = Keyword(INTERSECT) + protected val INTERVAL = Keyword(INTERVAL) protected val INTO = Keyword(INTO) protected val IS = Keyword(IS) protected val JOIN = Keyword(JOIN) @@ -279,12 +281,12 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser { throw new AnalysisException(sinvalid function approximate $udfName) } } -| APPROXIMATE ~ ( ~ floatLit ~ ) ~ ident ~ ( ~ DISTINCT ~ expression ~ ) ^^ +| APPROXIMATE ~ ( ~ unsignedFloat ~ ) ~ ident ~ ( ~ DISTINCT ~ expression ~ ) ^^ { case s ~ _ ~ udfName ~ _ ~ _ ~ exp = if (lexical.normalizeKeyword(udfName) == count) { ApproxCountDistinct(exp, s.toDouble) } else { - throw new AnalysisException(sinvalid function approximate($floatLit) $udfName) + throw new AnalysisException(sinvalid function approximate($s) $udfName) } } | CASE ~ whenThenElse ^^ CaseWhen @@ -309,6 +311,7 @@ class SqlParser extends AbstractSparkSQLParser with DataTypeParser {
spark git commit: [SPARK-5707] [SQL] fix serialization of generated projection
Repository: spark Updated Branches: refs/heads/master 3e831a269 - 74335b310 [SPARK-5707] [SQL] fix serialization of generated projection Author: Davies Liu dav...@databricks.com Closes #7272 from davies/fix_projection and squashes the following commits: 075ef76 [Davies Liu] fix codegen with BroadcastHashJion Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/74335b31 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/74335b31 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/74335b31 Branch: refs/heads/master Commit: 74335b31072951244967f878d8b766cd1bfc2ac6 Parents: 3e831a2 Author: Davies Liu dav...@databricks.com Authored: Wed Jul 8 10:43:00 2015 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Wed Jul 8 10:43:00 2015 -0700 -- .../apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala | 3 +-- .../org/apache/spark/sql/execution/joins/HashOuterJoin.scala | 2 +- .../org/apache/spark/sql/execution/joins/HashedRelation.scala | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/74335b31/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala index 06c244f..ab757fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/BroadcastHashOuterJoin.scala @@ -79,8 +79,7 @@ case class BroadcastHashOuterJoin( // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[InternalRow] = buildPlan.execute().map(_.copy()).collect() // buildHashTable uses code-generated rows as keys, which are not serializable -val hashed = - 
buildHashTable(input.iterator, new InterpretedProjection(buildKeys, buildPlan.output)) +val hashed = buildHashTable(input.iterator, newProjection(buildKeys, buildPlan.output)) sparkContext.broadcast(hashed) }(BroadcastHashOuterJoin.broadcastHashOuterJoinExecutionContext) http://git-wip-us.apache.org/repos/asf/spark/blob/74335b31/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala index 3337451..0522ee8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashOuterJoin.scala @@ -171,7 +171,7 @@ override def outputPartitioning: Partitioning = joinType match { var existingMatchList = hashTable.get(rowKey) if (existingMatchList == null) { existingMatchList = new CompactBuffer[InternalRow]() -hashTable.put(rowKey, existingMatchList) +hashTable.put(rowKey.copy(), existingMatchList) } existingMatchList += currentRow.copy() http://git-wip-us.apache.org/repos/asf/spark/blob/74335b31/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala index de062c7..6b51f5d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/joins/HashedRelation.scala @@ -125,7 +125,7 @@ private[joins] object HashedRelation { val existingMatchList = hashTable.get(rowKey) val matchList = if (existingMatchList == null) { val newMatchList = new CompactBuffer[InternalRow]() - hashTable.put(rowKey, newMatchList) + hashTable.put(rowKey.copy(), newMatchList) newMatchList } else { keyIsUnique = 
false - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8888][SQL] Use java.util.HashMap in DynamicPartitionWriterContainer.
Repository: spark Updated Branches: refs/heads/master 0ba98c04c - f61c989b4 [SPARK-8888][SQL] Use java.util.HashMap in DynamicPartitionWriterContainer. Just a baby step towards making it more efficient. Author: Reynold Xin r...@databricks.com Closes #7282 from rxin/SPARK-8888 and squashes the following commits: 3da51ae [Reynold Xin] [SPARK-8888][SQL] Use java.util.HashMap in DynamicPartitionWriterContainer. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f61c989b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f61c989b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f61c989b Branch: refs/heads/master Commit: f61c989b404808f79a58b6503cf3835cf602528a Parents: 0ba98c0 Author: Reynold Xin r...@databricks.com Authored: Wed Jul 8 10:56:31 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 10:56:31 2015 -0700 -- .../org/apache/spark/sql/sources/commands.scala | 36 +--- 1 file changed, 23 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f61c989b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala index a97142d..ecbc889 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/commands.scala @@ -19,8 +19,6 @@ package org.apache.spark.sql.sources import java.util.{Date, UUID} -import scala.collection.mutable - import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.output.{FileOutputCommitter = MapReduceFileOutputCommitter, FileOutputFormat} @@ -110,7 +108,7 @@ private[sql] case class InsertIntoHadoopFsRelation( !exists } // If we are appending data to an existing dir. 
-val isAppend = (pathExists) (mode == SaveMode.Append) +val isAppend = pathExists (mode == SaveMode.Append) if (doInsertion) { val job = new Job(hadoopConf) @@ -142,9 +140,12 @@ private[sql] case class InsertIntoHadoopFsRelation( } } -Seq.empty[InternalRow] +Seq.empty[Row] } + /** + * Inserts the content of the [[DataFrame]] into a table without any partitioning columns. + */ private def insert(writerContainer: BaseWriterContainer, df: DataFrame): Unit = { // Uses local vals for serialization val needsConversion = relation.needConversion @@ -188,6 +189,9 @@ private[sql] case class InsertIntoHadoopFsRelation( } } + /** + * Inserts the content of the [[DataFrame]] into a table with partitioning columns. + */ private def insertWithDynamicPartitions( sqlContext: SQLContext, writerContainer: BaseWriterContainer, @@ -497,13 +501,14 @@ private[sql] class DynamicPartitionWriterContainer( extends BaseWriterContainer(relation, job, isAppend) { // All output writers are created on executor side. - @transient protected var outputWriters: mutable.Map[String, OutputWriter] = _ + @transient protected var outputWriters: java.util.HashMap[String, OutputWriter] = _ override protected def initWriters(): Unit = { -outputWriters = mutable.Map.empty[String, OutputWriter] +outputWriters = new java.util.HashMap[String, OutputWriter] } override def outputWriterForRow(row: Row): OutputWriter = { +// TODO (SPARK-): zip and all the stuff happening here is very inefficient. 
val partitionPath = partitionColumns.zip(row.toSeq).map { case (col, rawValue) = val string = if (rawValue == null) null else String.valueOf(rawValue) val valueString = if (string == null || string.isEmpty) { @@ -514,18 +519,23 @@ private[sql] class DynamicPartitionWriterContainer( s/$col=$valueString }.mkString.stripPrefix(Path.SEPARATOR) -outputWriters.getOrElseUpdate(partitionPath, { +val writer = outputWriters.get(partitionPath) +if (writer.eq(null)) { val path = new Path(getWorkPath, partitionPath) - taskAttemptContext.getConfiguration.set( -spark.sql.sources.output.path, + taskAttemptContext.getConfiguration.set(spark.sql.sources.output.path, new Path(outputPath, partitionPath).toString) - outputWriterFactory.newInstance(path.toString, dataSchema, taskAttemptContext) -}) + val newWriter = outputWriterFactory.newInstance(path.toString, dataSchema, taskAttemptContext) + outputWriters.put(partitionPath, newWriter) + newWriter +} else { + writer +} } private def clearOutputWriters(): Unit = { -if (outputWriters.nonEmpty) { -
spark git commit: [SPARK-8657] [YARN] Fail to upload resource to viewfs
Repository: spark Updated Branches: refs/heads/master f61c989b4 - 26d9b6b8c [SPARK-8657] [YARN] Fail to upload resource to viewfs Fail to upload resource to viewfs in spark-1.4 JIRA Link: https://issues.apache.org/jira/browse/SPARK-8657 Author: Tao Li li...@sogou-inc.com Closes #7125 from litao-buptsse/SPARK-8657-for-master and squashes the following commits: 65b13f4 [Tao Li] [SPARK-8657] [YARN] Fail to upload resource to viewfs Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/26d9b6b8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/26d9b6b8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/26d9b6b8 Branch: refs/heads/master Commit: 26d9b6b8cae9ac6593f78ab98dd45a25d03cf71c Parents: f61c989 Author: Tao Li li...@sogou-inc.com Authored: Wed Jul 8 19:02:24 2015 +0100 Committer: Sean Owen so...@cloudera.com Committed: Wed Jul 8 19:02:24 2015 +0100 -- yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/26d9b6b8/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index 67a5c95..4d52ae7 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -321,8 +321,9 @@ private[spark] class Client( val linkname = targetDir.map(_ + /).getOrElse() + destName.orElse(Option(localURI.getFragment())).getOrElse(localPath.getName()) val destPath = copyFileToRemote(dst, localPath, replication) + val destFs = FileSystem.get(destPath.toUri(), hadoopConf) distCacheMgr.addResource( -fs, hadoopConf, destPath, localResources, resType, linkname, statCache, +destFs, hadoopConf, destPath, localResources, resType, linkname, statCache, appMasterOnly = appMasterOnly) 
(false, linkname) } else { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [HOTFIX] Fix style error introduced in e4313db38e81f6288f1704c22e17d0c6e81b4d75
Repository: spark Updated Branches: refs/heads/branch-1.4 e4313db38 - 898b0739e [HOTFIX] Fix style error introduced in e4313db38e81f6288f1704c22e17d0c6e81b4d75 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/898b0739 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/898b0739 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/898b0739 Branch: refs/heads/branch-1.4 Commit: 898b0739ea9ab5669ca99fed05b4f9bdeb920d9f Parents: e4313db Author: Josh Rosen joshro...@databricks.com Authored: Wed Jul 8 12:22:18 2015 -0700 Committer: Josh Rosen joshro...@databricks.com Committed: Wed Jul 8 12:22:18 2015 -0700 -- yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/898b0739/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala index cc0aa45..e33b822 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/Client.scala @@ -339,8 +339,8 @@ private[spark] class Client( require(addDistributedUri(file.toURI())) val destPath = copyFileToRemote(dst, new Path(file.toURI()), replication) val destFs = FileSystem.get(destPath.toUri(), hadoopConf) - distCacheMgr.addResource(destFs, hadoopConf, destPath, localResources, LocalResourceType.ARCHIVE, -LOCALIZED_HADOOP_CONF_DIR, statCache, appMasterOnly = true) + distCacheMgr.addResource(destFs, hadoopConf, destPath, localResources, +LocalResourceType.ARCHIVE, LOCALIZED_HADOOP_CONF_DIR, statCache, appMasterOnly = true) } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8700][ML] Disable feature scaling in Logistic Regression
Repository: spark Updated Branches: refs/heads/master 00b265f12 - 57221934e [SPARK-8700][ML] Disable feature scaling in Logistic Regression All compressed sensing applications, and some of the regression use-cases, will have better results by turning the feature scaling off. However, if we implement this naively by training the dataset without doing any standardization, the rate of convergence will not be good. This can be implemented by still standardizing the training dataset but penalizing each component differently to get effectively the same objective function but a better numerical problem. As a result, columns with high variances will be penalized less, and vice versa. Without this, since all the features are standardized, they will all be penalized the same. In R, there is an option for this. `standardize` Logical flag for x variable standardization, prior to fitting the model sequence. The coefficients are always returned on the original scale. Default is standardize=TRUE. If variables are in the same units already, you might not wish to standardize. See details below for y standardization with family=gaussian. 
+cc holdenk mengxr jkbradley Author: DB Tsai d...@netflix.com Closes #7080 from dbtsai/lors and squashes the following commits: 877e6c7 [DB Tsai] repahse the doc 7cf45f2 [DB Tsai] address feedback 78d75c9 [DB Tsai] small change c2c9e60 [DB Tsai] style 6e1a8e0 [DB Tsai] first commit Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/57221934 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/57221934 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/57221934 Branch: refs/heads/master Commit: 57221934e0376e5bb8421dc35d4bf91db4deeca1 Parents: 00b265f Author: DB Tsai d...@netflix.com Authored: Wed Jul 8 15:21:58 2015 -0700 Committer: DB Tsai d...@netflix.com Committed: Wed Jul 8 15:21:58 2015 -0700 -- .../ml/classification/LogisticRegression.scala | 89 ++-- .../ml/param/shared/SharedParamsCodeGen.scala | 3 +- .../spark/ml/param/shared/sharedParams.scala| 4 +- .../LogisticRegressionSuite.scala | 403 ++- project/MimaExcludes.scala | 2 + 5 files changed, 384 insertions(+), 117 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/57221934/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 3967151..8fc9199 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.classification import scala.collection.mutable -import breeze.linalg.{DenseVector = BDV, norm = brzNorm} +import breeze.linalg.{DenseVector = BDV} import breeze.optimize.{CachedDiffFunction, DiffFunction, LBFGS = BreezeLBFGS, OWLQN = BreezeOWLQN} import org.apache.spark.{Logging, SparkException} @@ -41,7 +41,7 @@ import 
org.apache.spark.storage.StorageLevel */ private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol - with HasThreshold + with HasThreshold with HasStandardization /** * :: Experimental :: @@ -98,6 +98,18 @@ class LogisticRegression(override val uid: String) def setFitIntercept(value: Boolean): this.type = set(fitIntercept, value) setDefault(fitIntercept - true) + /** + * Whether to standardize the training features before fitting the model. + * The coefficients of models will be always returned on the original scale, + * so it will be transparent for users. Note that when no regularization, + * with or without standardization, the models should be always converged to + * the same solution. + * Default is true. + * @group setParam + * */ + def setStandardization(value: Boolean): this.type = set(standardization, value) + setDefault(standardization - true) + /** @group setParam */ def setThreshold(value: Double): this.type = set(threshold, value) setDefault(threshold - 0.5) @@ -149,15 +161,28 @@ class LogisticRegression(override val uid: String) val regParamL1 = $(elasticNetParam) * $(regParam) val regParamL2 = (1.0 - $(elasticNetParam)) * $(regParam) -val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), +val costFun = new LogisticCostFun(instances, numClasses, $(fitIntercept), $(standardization), featuresStd,
spark git commit: [SPARK-7785] [MLLIB] [PYSPARK] Add __str__ and __repr__ to Matrices
Repository: spark Updated Branches: refs/heads/master 374c8a8a4 - 2b40365d7 [SPARK-7785] [MLLIB] [PYSPARK] Add __str__ and __repr__ to Matrices Adding __str__ and __repr__ to DenseMatrix and SparseMatrix Author: MechCoder manojkumarsivaraj...@gmail.com Closes #6342 from MechCoder/spark-7785 and squashes the following commits: 7b9a82c [MechCoder] Add tests for greater than 16 elements b88e9dd [MechCoder] Increment limit to 16 1425a01 [MechCoder] Change tests 36bd166 [MechCoder] Change str and repr representation 97f0da9 [MechCoder] zip is same as izip in python3 94ca4b2 [MechCoder] Added doctests and iterate over values instead of colPtrs b26fa89 [MechCoder] minor 394dde9 [MechCoder] [SPARK-7785] Add __str__ and __repr__ to Matrices Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2b40365d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2b40365d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2b40365d Branch: refs/heads/master Commit: 2b40365d76b7d9d382ad5077cdf979906bca17f2 Parents: 374c8a8 Author: MechCoder manojkumarsivaraj...@gmail.com Authored: Wed Jul 8 13:19:27 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Jul 8 13:19:27 2015 -0700 -- python/pyspark/mllib/linalg.py | 127 python/pyspark/mllib/tests.py | 52 ++- 2 files changed, 178 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2b40365d/python/pyspark/mllib/linalg.py -- diff --git a/python/pyspark/mllib/linalg.py b/python/pyspark/mllib/linalg.py index 12d8dbb..51ac198 100644 --- a/python/pyspark/mllib/linalg.py +++ b/python/pyspark/mllib/linalg.py @@ -31,6 +31,7 @@ if sys.version = '3': xrange = range import copyreg as copy_reg else: +from itertools import izip as zip import copy_reg import numpy as np @@ -116,6 +117,10 @@ def _format_float(f, digits=4): return s +def _format_float_list(l): +return [_format_float(x) for x in l] + + class VectorUDT(UserDefinedType): 
SQL user-defined type (UDT) for Vector. @@ -870,6 +875,50 @@ class DenseMatrix(Matrix): self.numRows, self.numCols, self.values.tostring(), int(self.isTransposed)) +def __str__(self): + +Pretty printing of a DenseMatrix + + dm = DenseMatrix(2, 2, range(4)) + print(dm) +DenseMatrix([[ 0., 2.], + [ 1., 3.]]) + dm = DenseMatrix(2, 2, range(4), isTransposed=True) + print(dm) +DenseMatrix([[ 0., 1.], + [ 2., 3.]]) + +# Inspired by __repr__ in scipy matrices. +array_lines = repr(self.toArray()).splitlines() + +# We need to adjust six spaces which is the difference in number +# of letters between DenseMatrix and array +x = '\n'.join([( * 6 + line) for line in array_lines[1:]]) +return array_lines[0].replace(array, DenseMatrix) + \n + x + +def __repr__(self): + +Representation of a DenseMatrix + + dm = DenseMatrix(2, 2, range(4)) + dm +DenseMatrix(2, 2, [0.0, 1.0, 2.0, 3.0], False) + +# If the number of values are less than seventeen then return as it is. +# Else return first eight values and last eight values. +if len(self.values) 17: +entries = _format_float_list(self.values) +else: +entries = ( +_format_float_list(self.values[:8]) + +[...] + +_format_float_list(self.values[-8:]) +) + +entries = , .join(entries) +return DenseMatrix({0}, {1}, [{2}], {3}).format( +self.numRows, self.numCols, entries, self.isTransposed) + def toArray(self): Return an numpy.ndarray @@ -946,6 +995,84 @@ class SparseMatrix(Matrix): raise ValueError(Expected rowIndices of length %d, got %d. 
% (self.rowIndices.size, self.values.size)) +def __str__(self): + +Pretty printing of a SparseMatrix + + sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4]) + print(sm1) +2 X 2 CSCMatrix +(0,0) 2.0 +(1,0) 3.0 +(1,1) 4.0 + sm1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True) + print(sm1) +2 X 2 CSRMatrix +(0,0) 2.0 +(0,1) 3.0 +(1,1) 4.0 + +spstr = {0} X {1} .format(self.numRows, self.numCols) +if self.isTransposed: +spstr += CSRMatrix\n +else: +spstr += CSCMatrix\n + +cur_col = 0 +smlist = [] + +# Display first 16
spark git commit: [SPARK-8457] [ML] NGram Documentation
Repository: spark Updated Branches: refs/heads/master f03154378 - c5532e2fe [SPARK-8457] [ML] NGram Documentation Add documentation for NGram feature transformer. Author: Feynman Liang fli...@databricks.com Closes #7244 from feynmanliang/SPARK-8457 and squashes the following commits: 5aface9 [Feynman Liang] Pretty print Scala output and add API doc to each codetab 60d5ac0 [Feynman Liang] Inline API doc and fix indentation 736ccbc [Feynman Liang] NGram feature transformer documentation Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5532e2f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5532e2f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5532e2f Branch: refs/heads/master Commit: c5532e2fe700978da4bdfdb54a522f5934c3db55 Parents: f031543 Author: Feynman Liang fli...@databricks.com Authored: Wed Jul 8 14:49:52 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Wed Jul 8 14:49:52 2015 -0700 -- docs/ml-features.md | 88 1 file changed, 88 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c5532e2f/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index f88c024..54068de 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -288,6 +288,94 @@ for words_label in wordsDataFrame.select(words, label).take(3): /div +## $n$-gram + +An [n-gram](https://en.wikipedia.org/wiki/N-gram) is a sequence of $n$ tokens (typically words) for some integer $n$. The `NGram` class can be used to transform input features into $n$-grams. + +`NGram` takes as input a sequence of strings (e.g. the output of a [Tokenizer](ml-features.html#tokenizer). The parameter `n` is used to determine the number of terms in each $n$-gram. The output will consist of a sequence of $n$-grams where each $n$-gram is represented by a space-delimited string of $n$ consecutive words. 
If the input sequence contains fewer than `n` strings, no output is produced. + +div class=codetabs +div data-lang=scala markdown=1 +div class=codetabs + +div data-lang=scala markdown=1 + +[`NGram`](api/scala/index.html#org.apache.spark.ml.feature.NGram) takes an input column name, an output column name, and an optional length parameter n (n=2 by default). + +{% highlight scala %} +import org.apache.spark.ml.feature.NGram + +val wordDataFrame = sqlContext.createDataFrame(Seq( + (0, Array(Hi, I, heard, about, Spark)), + (1, Array(I, wish, Java, could, use, case, classes)), + (2, Array(Logistic, regression, models, are, neat)) +)).toDF(label, words) + +val ngram = new NGram().setInputCol(words).setOutputCol(ngrams) +val ngramDataFrame = ngram.transform(wordDataFrame) +ngramDataFrame.take(3).map(_.getAs[Stream[String]](ngrams).toList).foreach(println) +{% endhighlight %} +/div + +div data-lang=java markdown=1 + +[`NGram`](api/java/org/apache/spark/ml/feature/NGram.html) takes an input column name, an output column name, and an optional length parameter n (n=2 by default). 
+ +{% highlight java %} +import com.google.common.collect.Lists; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.ml.feature.NGram; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +JavaRDDRow jrdd = jsc.parallelize(Lists.newArrayList( + RowFactory.create(0D, Lists.newArrayList(Hi, I, heard, about, Spark)), + RowFactory.create(1D, Lists.newArrayList(I, wish, Java, could, use, case, classes)), + RowFactory.create(2D, Lists.newArrayList(Logistic, regression, models, are, neat)) +)); +StructType schema = new StructType(new StructField[]{ + new StructField(label, DataTypes.DoubleType, false, Metadata.empty()), + new StructField(words, DataTypes.createArrayType(DataTypes.StringType), false, Metadata.empty()) +}); +DataFrame wordDataFrame = sqlContext.createDataFrame(jrdd, schema); +NGram ngramTransformer = new NGram().setInputCol(words).setOutputCol(ngrams); +DataFrame ngramDataFrame = ngramTransformer.transform(wordDataFrame); +for (Row r : ngramDataFrame.select(ngrams, label).take(3)) { + java.util.ListString ngrams = r.getList(0); + for (String ngram : ngrams) System.out.print(ngram + --- ); + System.out.println(); +} +{% endhighlight %} +/div + +div data-lang=python markdown=1 + +[`NGram`](api/python/pyspark.ml.html#pyspark.ml.feature.NGram) takes an input column name, an output column name, and an optional length parameter n (n=2 by
spark git commit: [SPARK-8908] [SQL] Add () to distinct definition in dataframe
Repository: spark Updated Branches: refs/heads/master 8f3cd9327 - 00b265f12 [SPARK-8908] [SQL] Add () to distinct definition in dataframe Adding `()` to the definition of `distinct` in DataFrame allows distinct to be called with parentheses, which is consistent with `dropDuplicates`. Author: Cheolsoo Park cheols...@netflix.com Closes #7298 from piaozhexiu/SPARK-8908 and squashes the following commits: 7f0d923 [Cheolsoo Park] Add () to distinct definition in dataframe Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00b265f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00b265f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00b265f1 Branch: refs/heads/master Commit: 00b265f12c0f0271b7036f831fee09b694908b29 Parents: 8f3cd93 Author: Cheolsoo Park cheols...@netflix.com Authored: Wed Jul 8 15:18:24 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 15:18:24 2015 -0700 -- sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00b265f1/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index 6014229..f33e19a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -1415,7 +1415,7 @@ class DataFrame private[sql]( * @group dfops * @since 1.3.0 */ - override def distinct: DataFrame = dropDuplicates() + override def distinct(): DataFrame = dropDuplicates() /** * @group basic - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8900] [SPARKR] Fix sparkPackages in init documentation
Repository: spark Updated Branches: refs/heads/branch-1.4 898b0739e - 512786350 [SPARK-8900] [SPARKR] Fix sparkPackages in init documentation cc pwendell Author: Shivaram Venkataraman shiva...@cs.berkeley.edu Closes #7293 from shivaram/sparkr-packages-doc and squashes the following commits: c91471d [Shivaram Venkataraman] Fix sparkPackages in init documentation (cherry picked from commit 374c8a8a4a8ac4171d312a6c31080a6724e55c60) Signed-off-by: Shivaram Venkataraman shiva...@cs.berkeley.edu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/51278635 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/51278635 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/51278635 Branch: refs/heads/branch-1.4 Commit: 512786350a2178b441071d29a3a33f723f4c8f87 Parents: 898b073 Author: Shivaram Venkataraman shiva...@cs.berkeley.edu Authored: Wed Jul 8 12:39:32 2015 -0700 Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu Committed: Wed Jul 8 12:39:41 2015 -0700 -- docs/sparkr.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/51278635/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 1197c0d..4385a4e 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -68,7 +68,7 @@ you can specify the packages with the `packages` argument. div data-lang=r markdown=1 {% highlight r %} -sc - sparkR.init(packages=com.databricks:spark-csv_2.11:1.0.3) +sc - sparkR.init(sparkPackages=com.databricks:spark-csv_2.11:1.0.3) sqlContext - sparkRSQL.init(sc) {% endhighlight %} /div - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8909][Documentation] Change the scala example in sql-programmi…
Repository: spark Updated Branches: refs/heads/branch-1.4 512786350 - 4df0f1b1b [SPARK-8909][Documentation] Change the scala example in sql-programming-guide#Manually Specifying Options to be in sync with java,python, R version Author: Alok Singh “sing...@us.ibm.com” Closes #7299 from aloknsingh/aloknsingh_SPARK-8909 and squashes the following commits: d3c20ba [Alok Singh] fix the file to .parquet from .json d476140 [Alok Singh] [SPARK-8909][Documentation] Change the scala example in sql-programming-guide#Manually Specifying Options to be in sync with java,python, R version (cherry picked from commit 8f3cd93278337dc10b9dd3a344d6f7b51ba9960d) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4df0f1b1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4df0f1b1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4df0f1b1 Branch: refs/heads/branch-1.4 Commit: 4df0f1b1bbc994c4e538e22c4580c62f4fed9c45 Parents: 5127863 Author: Alok Singh “sing...@us.ibm.com” Authored: Wed Jul 8 14:51:18 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 14:51:25 2015 -0700 -- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4df0f1b1/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index 9c689e9..79111a7 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -828,7 +828,7 @@ using this syntax. 
{% highlight scala %} val df = sqlContext.read.format(json).load(examples/src/main/resources/people.json) -df.select(name, age).write.format(json).save(namesAndAges.json) +df.select(name, age).write.format(parquet).save(namesAndAges.parquet) {% endhighlight %} /div - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8900] [SPARKR] Fix sparkPackages in init documentation
Repository: spark Updated Branches: refs/heads/master 26d9b6b8c - 374c8a8a4 [SPARK-8900] [SPARKR] Fix sparkPackages in init documentation cc pwendell Author: Shivaram Venkataraman shiva...@cs.berkeley.edu Closes #7293 from shivaram/sparkr-packages-doc and squashes the following commits: c91471d [Shivaram Venkataraman] Fix sparkPackages in init documentation Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/374c8a8a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/374c8a8a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/374c8a8a Branch: refs/heads/master Commit: 374c8a8a4a8ac4171d312a6c31080a6724e55c60 Parents: 26d9b6b Author: Shivaram Venkataraman shiva...@cs.berkeley.edu Authored: Wed Jul 8 12:39:32 2015 -0700 Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu Committed: Wed Jul 8 12:39:32 2015 -0700 -- docs/sparkr.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/374c8a8a/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index 1197c0d..4385a4e 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -68,7 +68,7 @@ you can specify the packages with the `packages` argument. div data-lang=r markdown=1 {% highlight r %} -sc - sparkR.init(packages=com.databricks:spark-csv_2.11:1.0.3) +sc - sparkR.init(sparkPackages=com.databricks:spark-csv_2.11:1.0.3) sqlContext - sparkRSQL.init(sc) {% endhighlight %} /div - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8883][SQL]Remove the OverrideFunctionRegistry
Repository: spark Updated Branches: refs/heads/master 08192a1b8 - 351a36d0c [SPARK-8883][SQL]Remove the OverrideFunctionRegistry Remove the `OverrideFunctionRegistry` from Spark SQL, as the subclasses of `FunctionRegistry` have their own way to delegate to the right underlying `FunctionRegistry`. Author: Cheng Hao hao.ch...@intel.com Closes #7260 from chenghao-intel/override and squashes the following commits: 164d093 [Cheng Hao] enable the function registry 2ca8459 [Cheng Hao] remove the OverrideFunctionRegistry Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/351a36d0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/351a36d0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/351a36d0 Branch: refs/heads/master Commit: 351a36d0c54d2f995df956ffb0a4236e12f89aad Parents: 08192a1 Author: Cheng Hao hao.ch...@intel.com Authored: Wed Jul 8 00:10:24 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 00:10:24 2015 -0700 -- .../spark/sql/catalyst/analysis/FunctionRegistry.scala | 13 - .../main/scala/org/apache/spark/sql/SQLContext.scala | 3 +-- .../scala/org/apache/spark/sql/hive/HiveContext.scala | 2 +- .../scala/org/apache/spark/sql/hive/hiveUDFs.scala | 2 +- 4 files changed, 3 insertions(+), 17 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/351a36d0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index fef2763..5c25181 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -35,19 +35,6 @@ trait FunctionRegistry { def lookupFunction(name: String, children: 
Seq[Expression]): Expression } -class OverrideFunctionRegistry(underlying: FunctionRegistry) extends FunctionRegistry { - - private val functionBuilders = StringKeyHashMap[FunctionBuilder](caseSensitive = false) - - override def registerFunction(name: String, builder: FunctionBuilder): Unit = { -functionBuilders.put(name, builder) - } - - override def lookupFunction(name: String, children: Seq[Expression]): Expression = { - functionBuilders.get(name).map(_(children)).getOrElse(underlying.lookupFunction(name, children)) - } -} - class SimpleFunctionRegistry extends FunctionRegistry { private val functionBuilders = StringKeyHashMap[FunctionBuilder](caseSensitive = false) http://git-wip-us.apache.org/repos/asf/spark/blob/351a36d0/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala index e81371e..079f31a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala @@ -139,8 +139,7 @@ class SQLContext(@transient val sparkContext: SparkContext) // TODO how to handle the temp function per user session? 
@transient - protected[sql] lazy val functionRegistry: FunctionRegistry = -new OverrideFunctionRegistry(FunctionRegistry.builtin) + protected[sql] lazy val functionRegistry: FunctionRegistry = FunctionRegistry.builtin @transient protected[sql] lazy val analyzer: Analyzer = http://git-wip-us.apache.org/repos/asf/spark/blob/351a36d0/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala index b91242a..439d8ca 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveContext.scala @@ -371,7 +371,7 @@ class HiveContext(sc: SparkContext) extends SQLContext(sc) { // Note that HiveUDFs will be overridden by functions registered in this context. @transient override protected[sql] lazy val functionRegistry: FunctionRegistry = -new OverrideFunctionRegistry(new HiveFunctionRegistry(FunctionRegistry.builtin)) +new HiveFunctionRegistry(FunctionRegistry.builtin) /* An analyzer that uses the Hive metastore. */ @transient
spark git commit: [SPARK-8783] [SQL] CTAS with WITH clause does not work
Repository: spark Updated Branches: refs/heads/master 2b40365d7 - f03154378 [SPARK-8783] [SQL] CTAS with WITH clause does not work Currently, CTESubstitution only handles the case that WITH is on the top of the plan. I think it SHOULD handle the case that WITH is child of CTAS. This patch simply changes 'match' to 'transform' for recursive search of WITH in the plan. Author: Keuntae Park sir...@apache.org Closes #7180 from sirpkt/SPARK-8783 and squashes the following commits: e4428f0 [Keuntae Park] Merge remote-tracking branch 'upstream/master' into CTASwithWITH 1671c77 [Keuntae Park] WITH clause can be inside CTAS Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f0315437 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f0315437 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f0315437 Branch: refs/heads/master Commit: f031543782e8f0f5b6a4471ba1c1d5c53efbe5cd Parents: 2b40365 Author: Keuntae Park sir...@apache.org Authored: Wed Jul 8 14:29:52 2015 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Wed Jul 8 14:29:52 2015 -0700 -- .../spark/sql/catalyst/analysis/Analyzer.scala| 2 +- .../spark/sql/hive/execution/SQLQuerySuite.scala | 18 ++ 2 files changed, 19 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f0315437/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 15e84e6..3fdc6d6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -85,7 +85,7 @@ class Analyzer( */ object CTESubstitution extends Rule[LogicalPlan] { // TODO allow subquery to define CTE -def apply(plan: LogicalPlan): 
LogicalPlan = plan match { +def apply(plan: LogicalPlan): LogicalPlan = plan transform { case With(child, relations) = substituteCTE(child, relations) case other = other } http://git-wip-us.apache.org/repos/asf/spark/blob/f0315437/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala index bf9f2ec..05a1f00 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala @@ -159,6 +159,24 @@ class SQLQuerySuite extends QueryTest { checkAnswer(query, Row(1, 1) :: Nil) } + test(CTAS with WITH clause) { +val df = Seq((1, 1)).toDF(c1, c2) +df.registerTempTable(table1) + +sql( + +|CREATE TABLE with_table1 AS +|WITH T AS ( +| SELECT * +| FROM table1 +|) +|SELECT * +|FROM T + .stripMargin) +val query = sql(SELECT * FROM with_table1) +checkAnswer(query, Row(1, 1) :: Nil) + } + test(explode nested Field) { Seq(NestedArray1(NestedArray2(Seq(1, 2, 3.toDF.registerTempTable(nestedArray) checkAnswer( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8932] Support copy() for UnsafeRows that do not use ObjectPools
Repository: spark Updated Branches: refs/heads/master a29081487 - b55499a44 [SPARK-8932] Support copy() for UnsafeRows that do not use ObjectPools We call Row.copy() in many places throughout SQL but UnsafeRow currently throws UnsupportedOperationException when copy() is called. Supporting copying when ObjectPool is used may be difficult, since we may need to handle deep-copying of objects in the pool. In addition, this copy() method needs to produce a self-contained row object which may be passed around / buffered by downstream code which does not understand the UnsafeRow format. In the long run, we'll need to figure out how to handle the ObjectPool corner cases, but this may be unnecessary if other changes are made. Therefore, in order to unblock my sort patch (#6444) I propose that we support copy() for the cases where UnsafeRow does not use an ObjectPool and continue to throw UnsupportedOperationException when an ObjectPool is used. This patch accomplishes this by modifying UnsafeRow so that it knows the size of the row's backing data in order to be able to copy it into a byte array. Author: Josh Rosen joshro...@databricks.com Closes #7306 from JoshRosen/SPARK-8932 and squashes the following commits: 338e6bf [Josh Rosen] Support copy for UnsafeRows that do not use ObjectPools. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b55499a4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b55499a4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b55499a4 Branch: refs/heads/master Commit: b55499a44ab74e33378211fb0d6940905d7c6318 Parents: a290814 Author: Josh Rosen joshro...@databricks.com Authored: Wed Jul 8 20:28:05 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 20:28:05 2015 -0700 -- .../UnsafeFixedWidthAggregationMap.java | 12 +++-- .../sql/catalyst/expressions/UnsafeRow.java | 32 +++- .../expressions/UnsafeRowConverter.scala| 10 +++- .../expressions/UnsafeRowConverterSuite.scala | 52 +++- 4 files changed, 87 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b55499a4/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java -- diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java index 1e79f4b..79d55b3 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeFixedWidthAggregationMap.java @@ -120,9 +120,11 @@ public final class UnsafeFixedWidthAggregationMap { this.bufferPool = new ObjectPool(initialCapacity); InternalRow initRow = initProjection.apply(emptyRow); -this.emptyBuffer = new byte[bufferConverter.getSizeRequirement(initRow)]; +int emptyBufferSize = bufferConverter.getSizeRequirement(initRow); +this.emptyBuffer = new byte[emptyBufferSize]; int writtenLength = bufferConverter.writeRow( - initRow, emptyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, bufferPool); + initRow, emptyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, emptyBufferSize, + 
bufferPool); assert (writtenLength == emptyBuffer.length): Size requirement calculation was wrong!; // re-use the empty buffer only when there is no object saved in pool. reuseEmptyBuffer = bufferPool.size() == 0; @@ -142,6 +144,7 @@ public final class UnsafeFixedWidthAggregationMap { groupingKey, groupingKeyConversionScratchSpace, PlatformDependent.BYTE_ARRAY_OFFSET, + groupingKeySize, keyPool); assert (groupingKeySize == actualGroupingKeySize) : Size requirement calculation was wrong!; @@ -157,7 +160,7 @@ public final class UnsafeFixedWidthAggregationMap { // There is some objects referenced by emptyBuffer, so generate a new one InternalRow initRow = initProjection.apply(emptyRow); bufferConverter.writeRow(initRow, emptyBuffer, PlatformDependent.BYTE_ARRAY_OFFSET, - bufferPool); + groupingKeySize, bufferPool); } loc.putNewKey( groupingKeyConversionScratchSpace, @@ -175,6 +178,7 @@ public final class UnsafeFixedWidthAggregationMap { address.getBaseObject(), address.getBaseOffset(), bufferConverter.numFields(), + loc.getValueLength(), bufferPool ); return currentBuffer; @@ -214,12 +218,14 @@ public final class UnsafeFixedWidthAggregationMap {
spark git commit: [SPARK-8937] [TEST] A setting `spark.unsafe.exceptionOnMemoryLeak` is missing in ScalaTest config.
Repository: spark Updated Branches: refs/heads/branch-1.4 12c1c36d9 - c04f0a5cf [SPARK-8937] [TEST] A setting `spark.unsafe.exceptionOnMemoryLeak ` is missing in ScalaTest config. `spark.unsafe.exceptionOnMemoryLeak` is present in the config of surefire. ``` !-- Surefire runs all Java tests -- plugin groupIdorg.apache.maven.plugins/groupId artifactIdmaven-surefire-plugin/artifactId version2.18.1/version !-- Note config is repeated in scalatest config -- ... spark.unsafe.exceptionOnMemoryLeaktrue/spark.unsafe.exceptionOnMemoryLeak /systemProperties ... ``` but is absent in the config ScalaTest. Author: Kousuke Saruta saru...@oss.nttdata.co.jp Closes #7308 from sarutak/add-setting-for-memory-leak and squashes the following commits: 95644e7 [Kousuke Saruta] Added a setting for memory leak (cherry picked from commit aba5784dab24c03ddad89f7a1b5d3d0dc8d109be) Signed-off-by: Kousuke Saruta saru...@oss.nttdata.co.jp Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c04f0a5c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c04f0a5c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c04f0a5c Branch: refs/heads/branch-1.4 Commit: c04f0a5cf4d23405d96d945dc67a8d6d6495e560 Parents: 12c1c36 Author: Kousuke Saruta saru...@oss.nttdata.co.jp Authored: Thu Jul 9 13:28:17 2015 +0900 Committer: Kousuke Saruta saru...@oss.nttdata.co.jp Committed: Thu Jul 9 13:28:53 2015 +0900 -- pom.xml | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c04f0a5c/pom.xml -- diff --git a/pom.xml b/pom.xml index 9953ad0..512bc87 100644 --- a/pom.xml +++ b/pom.xml @@ -1294,6 +1294,7 @@ spark.ui.enabledfalse/spark.ui.enabled spark.ui.showConsoleProgressfalse/spark.ui.showConsoleProgress spark.driver.allowMultipleContextstrue/spark.driver.allowMultipleContexts + spark.unsafe.exceptionOnMemoryLeaktrue/spark.unsafe.exceptionOnMemoryLeak /systemProperties /configuration executions - To 
unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8937] [TEST] A setting `spark.unsafe.exceptionOnMemoryLeak` is missing in ScalaTest config.
Repository: spark Updated Branches: refs/heads/master 47ef423f8 - aba5784da [SPARK-8937] [TEST] A setting `spark.unsafe.exceptionOnMemoryLeak ` is missing in ScalaTest config. `spark.unsafe.exceptionOnMemoryLeak` is present in the config of surefire. ``` !-- Surefire runs all Java tests -- plugin groupIdorg.apache.maven.plugins/groupId artifactIdmaven-surefire-plugin/artifactId version2.18.1/version !-- Note config is repeated in scalatest config -- ... spark.unsafe.exceptionOnMemoryLeaktrue/spark.unsafe.exceptionOnMemoryLeak /systemProperties ... ``` but is absent in the config ScalaTest. Author: Kousuke Saruta saru...@oss.nttdata.co.jp Closes #7308 from sarutak/add-setting-for-memory-leak and squashes the following commits: 95644e7 [Kousuke Saruta] Added a setting for memory leak Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aba5784d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aba5784d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aba5784d Branch: refs/heads/master Commit: aba5784dab24c03ddad89f7a1b5d3d0dc8d109be Parents: 47ef423 Author: Kousuke Saruta saru...@oss.nttdata.co.jp Authored: Thu Jul 9 13:28:17 2015 +0900 Committer: Kousuke Saruta saru...@oss.nttdata.co.jp Committed: Thu Jul 9 13:28:17 2015 +0900 -- pom.xml | 1 + 1 file changed, 1 insertion(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aba5784d/pom.xml -- diff --git a/pom.xml b/pom.xml index 9cf2471..529e47f 100644 --- a/pom.xml +++ b/pom.xml @@ -1339,6 +1339,7 @@ spark.ui.enabledfalse/spark.ui.enabled spark.ui.showConsoleProgressfalse/spark.ui.showConsoleProgress spark.driver.allowMultipleContextstrue/spark.driver.allowMultipleContexts + spark.unsafe.exceptionOnMemoryLeaktrue/spark.unsafe.exceptionOnMemoryLeak /systemProperties /configuration executions - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8928] [SQL] Makes CatalystSchemaConverter stick to 1.4.x- when handling Parquet LISTs in compatible mode
Repository: spark Updated Branches: refs/heads/master c056484c0 - 851e247ca [SPARK-8928] [SQL] Makes CatalystSchemaConverter sticking to 1.4.x- when handling Parquet LISTs in compatible mode This PR is based on #7209 authored by Sephiroth-Lin. Author: Weizhong Lin linweizh...@huawei.com Closes #7314 from liancheng/spark-8928 and squashes the following commits: 75267fe [Cheng Lian] Makes CatalystSchemaConverter sticking to 1.4.x- when handling LISTs in compatible mode Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/851e247c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/851e247c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/851e247c Branch: refs/heads/master Commit: 851e247caad0977cfd4998254d9602624e06539f Parents: c056484 Author: Weizhong Lin linweizh...@huawei.com Authored: Wed Jul 8 22:18:39 2015 -0700 Committer: Cheng Lian l...@databricks.com Committed: Wed Jul 8 22:19:19 2015 -0700 -- .../spark/sql/parquet/CatalystSchemaConverter.scala | 6 -- .../org/apache/spark/sql/parquet/ParquetSchemaSuite.scala | 10 +- 2 files changed, 9 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/851e247c/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala index de3a72d..1ea6926 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala @@ -461,7 +461,8 @@ private[parquet] class CatalystSchemaConverter( field.name, Types .buildGroup(REPEATED) -.addField(convertField(StructField(element, elementType, nullable))) +// array_element is the name chosen by parquet-hive (1.7.0 and prior version) 
+.addField(convertField(StructField(array_element, elementType, nullable))) .named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME)) // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level @@ -474,7 +475,8 @@ private[parquet] class CatalystSchemaConverter( ConversionPatterns.listType( repetition, field.name, - convertField(StructField(element, elementType, nullable), REPEATED)) + // array is the name chosen by parquet-avro (1.7.0 and prior version) + convertField(StructField(array, elementType, nullable), REPEATED)) // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`. http://git-wip-us.apache.org/repos/asf/spark/blob/851e247c/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 35d3c33..fa62939 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -174,7 +174,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional group _1 (LIST) { - |repeated int32 element; + |repeated int32 array; | } |} .stripMargin) @@ -198,7 +198,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional group _1 (LIST) { |repeated group bag { - | optional int32 element; + | optional int32 array_element; |} | } |} @@ -267,7 +267,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |optional binary _1 (UTF8); |optional group _2 (LIST) { | repeated group bag { - |optional group element { + |optional group array_element { | required int32 _1; | required double _2; |} @@ -616,7 +616,7 @@ class ParquetSchemaSuite extends 
ParquetSchemaTest { message root { | optional group f1 (LIST) { |repeated group bag { - | optional int32 element; + | optional int32 array_element; |} | } |} @@ -648,7 +648,7 @@ class ParquetSchemaSuite extends
spark git commit: [SPARK-8866][SQL] use 1us precision for timestamp type
Repository: spark Updated Branches: refs/heads/master 28fa01e2b - a29081487 [SPARK-8866][SQL] use 1us precision for timestamp type JIRA: https://issues.apache.org/jira/browse/SPARK-8866 Author: Yijie Shen henry.yijies...@gmail.com Closes #7283 from yijieshen/micro_timestamp and squashes the following commits: dc735df [Yijie Shen] update CastSuite to avoid round error 714eaea [Yijie Shen] add timestamp_udf into blacklist due to precision lose c3ca2f4 [Yijie Shen] fix unhandled case in CurrentTimestamp 8d4aa6b [Yijie Shen] use 1us precision for timestamp type Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a2908148 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a2908148 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a2908148 Branch: refs/heads/master Commit: a290814877308c6fa9b0f78b1a81145db7651ca4 Parents: 28fa01e Author: Yijie Shen henry.yijies...@gmail.com Authored: Wed Jul 8 20:20:17 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 20:20:17 2015 -0700 -- python/pyspark/sql/types.py | 2 +- .../spark/sql/catalyst/expressions/Cast.scala | 18 +- .../expressions/datetimeFunctions.scala | 2 +- .../spark/sql/catalyst/util/DateTimeUtils.scala | 38 ++-- .../sql/catalyst/expressions/CastSuite.scala| 10 +++--- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 8 ++--- .../apache/spark/sql/json/JacksonParser.scala | 4 +-- .../org/apache/spark/sql/json/JsonRDD.scala | 6 ++-- .../org/apache/spark/sql/jdbc/JDBCSuite.scala | 2 +- .../hive/execution/HiveCompatibilitySuite.scala | 6 ++-- .../apache/spark/sql/hive/HiveInspectors.scala | 4 +-- 11 files changed, 50 insertions(+), 50 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a2908148/python/pyspark/sql/types.py -- diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py index 7e64cb0..fecfe6d 100644 --- a/python/pyspark/sql/types.py +++ b/python/pyspark/sql/types.py @@ -775,7 +775,7 @@ 
def _python_to_sql_converter(dataType): if dt: seconds = (calendar.timegm(dt.utctimetuple()) if dt.tzinfo else time.mktime(dt.timetuple())) -return int(seconds * 1e7 + dt.microsecond * 10) +return int(seconds * 1e6 + dt.microsecond) return to_posix_timstamp else: http://git-wip-us.apache.org/repos/asf/spark/blob/a2908148/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala index 662ceec..567feca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala @@ -186,7 +186,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w case ByteType = buildCast[Byte](_, b = longToTimestamp(b.toLong)) case DateType = - buildCast[Int](_, d = DateTimeUtils.daysToMillis(d) * 1) + buildCast[Int](_, d = DateTimeUtils.daysToMillis(d) * 1000) // TimestampWritable.decimalToTimestamp case DecimalType() = buildCast[Decimal](_, d = decimalToTimestamp(d)) @@ -207,16 +207,16 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w } private[this] def decimalToTimestamp(d: Decimal): Long = { -(d.toBigDecimal * 1000L).longValue() +(d.toBigDecimal * 100L).longValue() } - // converting milliseconds to 100ns - private[this] def longToTimestamp(t: Long): Long = t * 1L - // converting 100ns to seconds - private[this] def timestampToLong(ts: Long): Long = math.floor(ts.toDouble / 1000L).toLong - // converting 100ns to seconds in double + // converting milliseconds to us + private[this] def longToTimestamp(t: Long): Long = t * 1000L + // converting us to seconds + private[this] def timestampToLong(ts: Long): Long = math.floor(ts.toDouble / 100L).toLong + // converting us to seconds in double private[this] def 
timestampToDouble(ts: Long): Double = { -ts / 1000.0 +ts / 100.0 } // DateConverter @@ -229,7 +229,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w case TimestampType = // throw valid precision more than seconds, according to Hive. // Timestamp.nanos is in 0 to 999,999,999, no more than a second. - buildCast[Long](_, t =
spark git commit: [SPARK-8928] [SQL] Makes CatalystSchemaConverter stick to 1.4.x- when handling Parquet LISTs in compatible mode
Repository: spark Updated Branches: refs/heads/master a240bf3b4 - 3dab0da42 [SPARK-8928] [SQL] Makes CatalystSchemaConverter sticking to 1.4.x- when handling Parquet LISTs in compatible mode This PR is based on #7209 authored by Sephiroth-Lin. Author: Weizhong Lin linweizh...@huawei.com Closes #7304 from liancheng/spark-8928 and squashes the following commits: 75267fe [Cheng Lian] Makes CatalystSchemaConverter sticking to 1.4.x- when handling LISTs in compatible mode Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3dab0da4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3dab0da4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3dab0da4 Branch: refs/heads/master Commit: 3dab0da42940a46f0c4aa4853bdb5c64c4cb2613 Parents: a240bf3 Author: Cheng Lian l...@databricks.com Authored: Wed Jul 8 22:09:12 2015 -0700 Committer: Cheng Lian l...@databricks.com Committed: Wed Jul 8 22:09:14 2015 -0700 -- .../spark/sql/parquet/CatalystSchemaConverter.scala | 6 -- .../org/apache/spark/sql/parquet/ParquetSchemaSuite.scala | 10 +- 2 files changed, 9 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3dab0da4/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala index de3a72d..1ea6926 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala @@ -461,7 +461,8 @@ private[parquet] class CatalystSchemaConverter( field.name, Types .buildGroup(REPEATED) -.addField(convertField(StructField(element, elementType, nullable))) +// array_element is the name chosen by parquet-hive (1.7.0 and prior version) 
+.addField(convertField(StructField(array_element, elementType, nullable))) .named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME)) // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level @@ -474,7 +475,8 @@ private[parquet] class CatalystSchemaConverter( ConversionPatterns.listType( repetition, field.name, - convertField(StructField(element, elementType, nullable), REPEATED)) + // array is the name chosen by parquet-avro (1.7.0 and prior version) + convertField(StructField(array, elementType, nullable), REPEATED)) // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`. http://git-wip-us.apache.org/repos/asf/spark/blob/3dab0da4/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index 35d3c33..fa62939 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -174,7 +174,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional group _1 (LIST) { - |repeated int32 element; + |repeated int32 array; | } |} .stripMargin) @@ -198,7 +198,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional group _1 (LIST) { |repeated group bag { - | optional int32 element; + | optional int32 array_element; |} | } |} @@ -267,7 +267,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |optional binary _1 (UTF8); |optional group _2 (LIST) { | repeated group bag { - |optional group element { + |optional group array_element { | required int32 _1; | required double _2; |} @@ -616,7 +616,7 @@ class ParquetSchemaSuite extends 
ParquetSchemaTest { message root { | optional group f1 (LIST) { |repeated group bag { - | optional int32 element; + | optional int32 array_element; |} | } |} @@ -648,7 +648,7 @@ class ParquetSchemaSuite extends
spark git commit: [SPARK-8927] [DOCS] Format wrong for some config descriptions
Repository: spark Updated Branches: refs/heads/branch-1.4 5bc19a1a9 - 2fb2ef0ee [SPARK-8927] [DOCS] Format wrong for some config descriptions A couple descriptions were not inside `td/td` and were being displayed immediately under the section title instead of in their row. Author: Jonathan Alter jonal...@users.noreply.github.com Closes #7292 from jonalter/docs-config and squashes the following commits: 5ce1570 [Jonathan Alter] [DOCS] Format wrong for some config descriptions (cherry picked from commit 28fa01e2ba146e823489f6d81c5eb3a76b20c71f) Signed-off-by: Sean Owen so...@cloudera.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2fb2ef0e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2fb2ef0e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2fb2ef0e Branch: refs/heads/branch-1.4 Commit: 2fb2ef0ee7414cb5f7342cd80e0b943c1ad52cec Parents: 5bc19a1 Author: Jonathan Alter jonal...@users.noreply.github.com Authored: Thu Jul 9 03:28:51 2015 +0100 Committer: Sean Owen so...@cloudera.com Committed: Thu Jul 9 03:29:13 2015 +0100 -- docs/configuration.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2fb2ef0e/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index affcd21..19f3b7e 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1007,9 +1007,9 @@ Apart from these, the following properties are also available, and may be useful tr tdcodespark.rpc.numRetries/code/td td3/td + td Number of times to retry before an RPC task gives up. An RPC task will run at most times of this number. - td /td /tr tr @@ -1029,8 +1029,8 @@ Apart from these, the following properties are also available, and may be useful tr tdcodespark.rpc.lookupTimeout/code/td td120s/td -Duration for an RPC remote endpoint lookup operation to wait before timing out. 
td +Duration for an RPC remote endpoint lookup operation to wait before timing out. /td /tr /table - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8927] [DOCS] Format wrong for some config descriptions
Repository: spark Updated Branches: refs/heads/master 74d8d3d92 - 28fa01e2b [SPARK-8927] [DOCS] Format wrong for some config descriptions A couple descriptions were not inside `td/td` and were being displayed immediately under the section title instead of in their row. Author: Jonathan Alter jonal...@users.noreply.github.com Closes #7292 from jonalter/docs-config and squashes the following commits: 5ce1570 [Jonathan Alter] [DOCS] Format wrong for some config descriptions Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/28fa01e2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/28fa01e2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/28fa01e2 Branch: refs/heads/master Commit: 28fa01e2ba146e823489f6d81c5eb3a76b20c71f Parents: 74d8d3d Author: Jonathan Alter jonal...@users.noreply.github.com Authored: Thu Jul 9 03:28:51 2015 +0100 Committer: Sean Owen so...@cloudera.com Committed: Thu Jul 9 03:28:51 2015 +0100 -- docs/configuration.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/28fa01e2/docs/configuration.md -- diff --git a/docs/configuration.md b/docs/configuration.md index bebaf6f..892c02b 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1007,9 +1007,9 @@ Apart from these, the following properties are also available, and may be useful tr tdcodespark.rpc.numRetries/code/td td3/td + td Number of times to retry before an RPC task gives up. An RPC task will run at most times of this number. - td /td /tr tr @@ -1029,8 +1029,8 @@ Apart from these, the following properties are also available, and may be useful tr tdcodespark.rpc.lookupTimeout/code/td td120s/td -Duration for an RPC remote endpoint lookup operation to wait before timing out. td +Duration for an RPC remote endpoint lookup operation to wait before timing out. 
/td /tr /table - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8910] Fix MiMa flaky due to port contention issue
Repository: spark Updated Branches: refs/heads/master b55499a44 - 47ef423f8 [SPARK-8910] Fix MiMa flaky due to port contention issue Due to the way MiMa works, we currently start a `SQLContext` pretty early on. This causes us to start a `SparkUI` that attempts to bind to port 4040. Because many tests run in parallel on the Jenkins machines, this causes port contention sometimes and fails the MiMa tests. Note that we already disabled the SparkUI for scalatests. However, the MiMa test is run before we even have a chance to load the default scalatest settings, so we need to explicitly disable the UI ourselves. Author: Andrew Or and...@databricks.com Closes #7300 from andrewor14/mima-flaky and squashes the following commits: b55a547 [Andrew Or] Do not enable SparkUI during tests Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/47ef423f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/47ef423f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/47ef423f Branch: refs/heads/master Commit: 47ef423f860c3109d50c7e321616b267f4296e34 Parents: b55499a Author: Andrew Or and...@databricks.com Authored: Wed Jul 8 20:29:08 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 20:29:08 2015 -0700 -- .../scala/org/apache/spark/sql/test/TestSQLContext.scala | 8 .../main/scala/org/apache/spark/sql/hive/test/TestHive.scala | 7 --- 2 files changed, 8 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/47ef423f/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala index 9fa3945..b3a4231 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -26,10 +26,10 @@ import 
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan /** A SQLContext that can be used for local testing. */ class LocalSQLContext extends SQLContext( -new SparkContext( - local[2], - TestSQLContext, - new SparkConf().set(spark.sql.testkey, true))) { +new SparkContext(local[2], TestSQLContext, new SparkConf() + .set(spark.sql.testkey, true) + // SPARK-8910 + .set(spark.ui.enabled, false))) { override protected[sql] def createSession(): SQLSession = { new this.SQLSession() http://git-wip-us.apache.org/repos/asf/spark/blob/47ef423f/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index 7978fda..0f217bc 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -53,9 +53,10 @@ object TestHive TestSQLContext, new SparkConf() .set(spark.sql.test, ) -.set( - spark.sql.hive.metastore.barrierPrefixes, - org.apache.spark.sql.hive.execution.PairSerDe))) +.set(spark.sql.hive.metastore.barrierPrefixes, + org.apache.spark.sql.hive.execution.PairSerDe) +// SPARK-8910 +.set(spark.ui.enabled, false))) /** * A locally running test instance of Spark's Hive execution engine. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8910] Fix MiMa flaky due to port contention issue
Repository: spark Updated Branches: refs/heads/branch-1.4 2fb2ef0ee - 12c1c36d9 [SPARK-8910] Fix MiMa flaky due to port contention issue Due to the way MiMa works, we currently start a `SQLContext` pretty early on. This causes us to start a `SparkUI` that attempts to bind to port 4040. Because many tests run in parallel on the Jenkins machines, this causes port contention sometimes and fails the MiMa tests. Note that we already disabled the SparkUI for scalatests. However, the MiMa test is run before we even have a chance to load the default scalatest settings, so we need to explicitly disable the UI ourselves. Author: Andrew Or and...@databricks.com Closes #7300 from andrewor14/mima-flaky and squashes the following commits: b55a547 [Andrew Or] Do not enable SparkUI during tests (cherry picked from commit 47ef423f860c3109d50c7e321616b267f4296e34) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/12c1c36d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/12c1c36d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/12c1c36d Branch: refs/heads/branch-1.4 Commit: 12c1c36d90ea44ca3141104fa1fe450ab5c0a107 Parents: 2fb2ef0 Author: Andrew Or and...@databricks.com Authored: Wed Jul 8 20:29:08 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 20:29:15 2015 -0700 -- .../scala/org/apache/spark/sql/test/TestSQLContext.scala | 8 .../main/scala/org/apache/spark/sql/hive/test/TestHive.scala | 7 --- 2 files changed, 8 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/12c1c36d/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala index 356a610..e374dd0 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/test/TestSQLContext.scala @@ -26,10 +26,10 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan /** A SQLContext that can be used for local testing. */ class LocalSQLContext extends SQLContext( -new SparkContext( - local[2], - TestSQLContext, - new SparkConf().set(spark.sql.testkey, true))) { +new SparkContext(local[2], TestSQLContext, new SparkConf() + .set(spark.sql.testkey, true) + // SPARK-8910 + .set(spark.ui.enabled, false))) { override protected[sql] def createSession(): SQLSession = { new this.SQLSession() http://git-wip-us.apache.org/repos/asf/spark/blob/12c1c36d/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala index e337744..1b6c32a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/test/TestHive.scala @@ -53,9 +53,10 @@ object TestHive TestSQLContext, new SparkConf() .set(spark.sql.test, ) -.set( - spark.sql.hive.metastore.barrierPrefixes, - org.apache.spark.sql.hive.execution.PairSerDe))) +.set(spark.sql.hive.metastore.barrierPrefixes, + org.apache.spark.sql.hive.execution.PairSerDe) +// SPARK-8910 +.set(spark.ui.enabled, false))) /** * A locally running test instance of Spark's Hive execution engine. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8926][SQL] Good errors for ExpectsInputType expressions
Repository: spark Updated Branches: refs/heads/master aba5784da - 768907eb7 [SPARK-8926][SQL] Good errors for ExpectsInputType expressions For example: `cannot resolve 'testfunction(null)' due to data type mismatch: argument 1 is expected to be of type int, however, null is of type datetype.` Author: Michael Armbrust mich...@databricks.com Closes #7303 from marmbrus/expectsTypeErrors and squashes the following commits: c654a0e [Michael Armbrust] fix udts and make errors pretty 137160d [Michael Armbrust] style 5428fda [Michael Armbrust] style 10fac82 [Michael Armbrust] [SPARK-8926][SQL] Good errors for ExpectsInputType expressions Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/768907eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/768907eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/768907eb Branch: refs/heads/master Commit: 768907eb7b0d3c11a420ef281454e36167011c89 Parents: aba5784 Author: Michael Armbrust mich...@databricks.com Authored: Wed Jul 8 22:05:58 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 22:05:58 2015 -0700 -- .../catalyst/analysis/HiveTypeCoercion.scala| 12 +- .../expressions/ExpectsInputTypes.scala | 13 +- .../spark/sql/types/AbstractDataType.scala | 30 +++- .../org/apache/spark/sql/types/ArrayType.scala | 8 +- .../org/apache/spark/sql/types/DataType.scala | 4 +- .../apache/spark/sql/types/DecimalType.scala| 8 +- .../org/apache/spark/sql/types/MapType.scala| 8 +- .../org/apache/spark/sql/types/StructType.scala | 8 +- .../spark/sql/types/UserDefinedType.scala | 5 +- .../catalyst/analysis/AnalysisErrorSuite.scala | 167 +++ .../sql/catalyst/analysis/AnalysisSuite.scala | 126 ++ .../analysis/HiveTypeCoercionSuite.scala| 8 + .../org/apache/spark/sql/hive/HiveContext.scala | 2 +- 13 files changed, 256 insertions(+), 143 deletions(-) -- 
http://git-wip-us.apache.org/repos/asf/spark/blob/768907eb/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index 5367b7f..8cb7199 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -702,11 +702,19 @@ object HiveTypeCoercion { @Nullable val ret: Expression = (inType, expectedType) match { // If the expected type is already a parent of the input type, no need to cast. -case _ if expectedType.isParentOf(inType) = e +case _ if expectedType.isSameType(inType) = e // Cast null type (usually from null literals) into target types case (NullType, target) = Cast(e, target.defaultConcreteType) +// If the function accepts any numeric type (i.e. the ADT `NumericType`) and the input is +// already a number, leave it as is. +case (_: NumericType, NumericType) = e + +// If the function accepts any numeric type and the input is a string, we follow the hive +// convention and cast that input into a double +case (StringType, NumericType) = Cast(e, NumericType.defaultConcreteType) + // Implicit cast among numeric types // If input is a numeric type but not decimal, and we expect a decimal type, // cast the input to unlimited precision decimal. @@ -732,7 +740,7 @@ object HiveTypeCoercion { // First see if we can find our input type in the type collection. If we can, then just // use the current expression; otherwise, find the first one we can implicitly cast. 
case (_, TypeCollection(types)) = - if (types.exists(_.isParentOf(inType))) { + if (types.exists(_.isSameType(inType))) { e } else { types.flatMap(implicitCast(e, _)).headOption.orNull http://git-wip-us.apache.org/repos/asf/spark/blob/768907eb/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala index 916e301..986cc09 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/ExpectsInputTypes.scala +++
spark git commit: Revert [SPARK-8928] [SQL] Makes CatalystSchemaConverter sticking to 1.4.x- when handling Parquet LISTs in compatible mode
Repository: spark Updated Branches: refs/heads/master 3dab0da42 - c056484c0 Revert [SPARK-8928] [SQL] Makes CatalystSchemaConverter sticking to 1.4.x- when handling Parquet LISTs in compatible mode This reverts commit 3dab0da42940a46f0c4aa4853bdb5c64c4cb2613. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c056484c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c056484c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c056484c Branch: refs/heads/master Commit: c056484c0741e2a03d4a916538e1b9e3e65e71c3 Parents: 3dab0da Author: Cheng Lian l...@databricks.com Authored: Wed Jul 8 22:14:38 2015 -0700 Committer: Cheng Lian l...@databricks.com Committed: Wed Jul 8 22:14:38 2015 -0700 -- .../spark/sql/parquet/CatalystSchemaConverter.scala | 6 ++ .../org/apache/spark/sql/parquet/ParquetSchemaSuite.scala | 10 +- 2 files changed, 7 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c056484c/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala index 1ea6926..de3a72d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/CatalystSchemaConverter.scala @@ -461,8 +461,7 @@ private[parquet] class CatalystSchemaConverter( field.name, Types .buildGroup(REPEATED) -// array_element is the name chosen by parquet-hive (1.7.0 and prior version) -.addField(convertField(StructField(array_element, elementType, nullable))) +.addField(convertField(StructField(element, elementType, nullable))) .named(CatalystConverter.ARRAY_CONTAINS_NULL_BAG_SCHEMA_NAME)) // Spark 1.4.x and prior versions convert ArrayType with non-nullable elements into a 2-level @@ -475,8 +474,7 @@ 
private[parquet] class CatalystSchemaConverter( ConversionPatterns.listType( repetition, field.name, - // array is the name chosen by parquet-avro (1.7.0 and prior version) - convertField(StructField(array, elementType, nullable), REPEATED)) + convertField(StructField(element, elementType, nullable), REPEATED)) // Spark 1.4.x and prior versions convert MapType into a 3-level group annotated by // MAP_KEY_VALUE. This is covered by `convertGroupField(field: GroupType): DataType`. http://git-wip-us.apache.org/repos/asf/spark/blob/c056484c/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala index fa62939..35d3c33 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetSchemaSuite.scala @@ -174,7 +174,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional group _1 (LIST) { - |repeated int32 array; + |repeated int32 element; | } |} .stripMargin) @@ -198,7 +198,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |message root { | optional group _1 (LIST) { |repeated group bag { - | optional int32 array_element; + | optional int32 element; |} | } |} @@ -267,7 +267,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest { |optional binary _1 (UTF8); |optional group _2 (LIST) { | repeated group bag { - |optional group array_element { + |optional group element { | required int32 _1; | required double _2; |} @@ -616,7 +616,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { message root { | optional group f1 (LIST) { |repeated group bag { - | optional int32 array_element; + | optional int32 element; |} | } |} @@ -648,7 +648,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest { nullable = true))), message root 
{ | optional group f1 (LIST) { - |repeated int32 array; + |repeated int32 element; | } |} .stripMargin)
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc4 [created] dbaa5c294 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[2/2] spark git commit: Preparing development version 1.4.2-SNAPSHOT
Preparing development version 1.4.2-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5bc19a1a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5bc19a1a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5bc19a1a Branch: refs/heads/branch-1.4 Commit: 5bc19a1a90ad42b6df5ecf3da8896a7b94cf0a40 Parents: dbaa5c2 Author: Patrick Wendell pwend...@gmail.com Authored: Wed Jul 8 15:40:54 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Wed Jul 8 15:40:54 2015 -0700 -- assembly/pom.xml | 2 +- bagel/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml| 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml | 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml| 2 +- extras/kinesis-asl/pom.xml| 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib/pom.xml | 2 +- network/common/pom.xml| 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml| 2 +- yarn/pom.xml | 2 +- 30 files changed, 30 insertions(+), 30 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5bc19a1a/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index ba233e7..228db59 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.4.1/version +version1.4.2-SNAPSHOT/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/5bc19a1a/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index c5e9183..ce791a6 100644 --- 
a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.4.1/version +version1.4.2-SNAPSHOT/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/5bc19a1a/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index f0d236d..176ea9b 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.4.1/version +version1.4.2-SNAPSHOT/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/5bc19a1a/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index e9a9cc2..877c2fb 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.4.1/version +version1.4.2-SNAPSHOT/version relativePath../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/5bc19a1a/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index 7eae7a7..ad431fa 100644 --- a/external/flume-sink/pom.xml +++ b/external/flume-sink/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.4.1/version +version1.4.2-SNAPSHOT/version relativePath../../pom.xml/relativePath /parent http://git-wip-us.apache.org/repos/asf/spark/blob/5bc19a1a/external/flume/pom.xml -- diff --git a/external/flume/pom.xml b/external/flume/pom.xml index b3ad09a..9789435 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ parent groupIdorg.apache.spark/groupId artifactIdspark-parent_2.10/artifactId -version1.4.1/version +
spark git commit: [SPARK-8914][SQL] Remove RDDApi
Repository: spark Updated Branches: refs/heads/master f472b8cdc - 2a4f88b6c [SPARK-8914][SQL] Remove RDDApi As rxin suggested in #7298, we should consider removing `RDDApi`. Author: Kousuke Saruta saru...@oss.nttdata.co.jp Closes #7302 from sarutak/remove-rddapi and squashes the following commits: e495d35 [Kousuke Saruta] Fixed mima cb7ebb9 [Kousuke Saruta] Removed overriding RDDApi Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a4f88b6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a4f88b6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a4f88b6 Branch: refs/heads/master Commit: 2a4f88b6c16f2991e63b17c0e103bcd79f04dbbc Parents: f472b8c Author: Kousuke Saruta saru...@oss.nttdata.co.jp Authored: Wed Jul 8 18:09:39 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Wed Jul 8 18:09:39 2015 -0700 -- project/MimaExcludes.scala | 5 ++ .../scala/org/apache/spark/sql/DataFrame.scala | 39 ++-- .../scala/org/apache/spark/sql/RDDApi.scala | 67 3 files changed, 24 insertions(+), 87 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2a4f88b6/project/MimaExcludes.scala -- diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 7346d80..57a86bf 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -70,7 +70,12 @@ object MimaExcludes { org.apache.spark.mllib.linalg.Matrix.numNonzeros), ProblemFilters.exclude[MissingMethodProblem]( org.apache.spark.mllib.linalg.Matrix.numActives) + ) ++ Seq( +// SPARK-8914 Remove RDDApi +ProblemFilters.exclude[MissingClassProblem]( +org.apache.spark.sql.RDDApi) ) + case v if v.startsWith(1.4) = Seq( MimaBuild.excludeSparkPackage(deploy), http://git-wip-us.apache.org/repos/asf/spark/blob/2a4f88b6/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala index f33e19a..eeefc85 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala @@ -115,8 +115,7 @@ private[sql] object DataFrame { @Experimental class DataFrame private[sql]( @transient val sqlContext: SQLContext, -@DeveloperApi @transient val queryExecution: SQLContext#QueryExecution) - extends RDDApi[Row] with Serializable { +@DeveloperApi @transient val queryExecution: SQLContext#QueryExecution) extends Serializable { /** * A constructor that automatically analyzes the logical plan. @@ -1320,14 +1319,14 @@ class DataFrame private[sql]( * @group action * @since 1.3.0 */ - override def first(): Row = head() + def first(): Row = head() /** * Returns a new RDD by applying a function to all rows of this DataFrame. * @group rdd * @since 1.3.0 */ - override def map[R: ClassTag](f: Row = R): RDD[R] = rdd.map(f) + def map[R: ClassTag](f: Row = R): RDD[R] = rdd.map(f) /** * Returns a new RDD by first applying a function to all rows of this [[DataFrame]], @@ -1335,14 +1334,14 @@ class DataFrame private[sql]( * @group rdd * @since 1.3.0 */ - override def flatMap[R: ClassTag](f: Row = TraversableOnce[R]): RDD[R] = rdd.flatMap(f) + def flatMap[R: ClassTag](f: Row = TraversableOnce[R]): RDD[R] = rdd.flatMap(f) /** * Returns a new RDD by applying a function to each partition of this DataFrame. * @group rdd * @since 1.3.0 */ - override def mapPartitions[R: ClassTag](f: Iterator[Row] = Iterator[R]): RDD[R] = { + def mapPartitions[R: ClassTag](f: Iterator[Row] = Iterator[R]): RDD[R] = { rdd.mapPartitions(f) } @@ -1351,49 +1350,49 @@ class DataFrame private[sql]( * @group rdd * @since 1.3.0 */ - override def foreach(f: Row = Unit): Unit = rdd.foreach(f) + def foreach(f: Row = Unit): Unit = rdd.foreach(f) /** * Applies a function f to each partition of this [[DataFrame]]. 
* @group rdd * @since 1.3.0 */ - override def foreachPartition(f: Iterator[Row] = Unit): Unit = rdd.foreachPartition(f) + def foreachPartition(f: Iterator[Row] = Unit): Unit = rdd.foreachPartition(f) /** * Returns the first `n` rows in the [[DataFrame]]. * @group action * @since 1.3.0 */ - override def take(n: Int): Array[Row] = head(n) + def take(n: Int): Array[Row] = head(n) /** * Returns an array that contains all of [[Row]]s in this
spark git commit: [SPARK-8902] Correctly print hostname in error
Repository: spark Updated Branches: refs/heads/branch-1.4 3f6e6e0e2 - df763495f [SPARK-8902] Correctly print hostname in error With `+`, the strings are separate expressions, and format() is called only on the last string before concatenation (so the {0} placeholder in the first string is never substituted). Without `+`, the adjacent string literals are merged first by the parser, so format() is called on the complete string. Should I make a JIRA for this? Author: Daniel Darabos darabos.dan...@gmail.com Closes #7288 from darabos/patch-2 and squashes the following commits: be0d3b7 [Daniel Darabos] Correctly print hostname in error (cherry picked from commit 5687f76552369fa20b3a4385eab4810214653aa7) Signed-off-by: Kousuke Saruta saru...@oss.nttdata.co.jp Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/df763495 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/df763495 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/df763495 Branch: refs/heads/branch-1.4 Commit: df763495f48e67995434019b943df97e7b39194d Parents: 3f6e6e0 Author: Daniel Darabos darabos.dan...@gmail.com Authored: Thu Jul 9 07:34:02 2015 +0900 Committer: Kousuke Saruta saru...@oss.nttdata.co.jp Committed: Thu Jul 9 07:34:51 2015 +0900 -- ec2/spark_ec2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/df763495/ec2/spark_ec2.py -- diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index 91f0a24..3880c2d 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -,8 +,8 @@ def ssh(host, opts, command): # If this was an ssh failure, provide the user with hints. 
if e.returncode == 255: raise UsageError( -Failed to SSH to remote host {0}.\n + -Please check that you have provided the correct --identity-file and + +Failed to SSH to remote host {0}.\n +Please check that you have provided the correct --identity-file and --key-pair parameters and try again..format(host)) else: raise e - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-5016] [MLLIB] Distribute GMM mixture components to executors
Repository: spark Updated Branches: refs/heads/master 8c32b2e87 - f472b8cdc [SPARK-5016] [MLLIB] Distribute GMM mixture components to executors Distribute expensive portions of computation for Gaussian mixture components (in particular, pre-computation of `MultivariateGaussian.rootSigmaInv`, the inverse covariance matrix and covariance determinant) across executors. Repost of PR#4654. Notes for reviewers: * What should be the policy for when to distribute computation? Always? When numClusters > threshold? User-specified param? TODO: * Performance testing and comparison for large number of clusters Author: Feynman Liang fli...@databricks.com Closes #7166 from feynmanliang/GMM_parallel_mixtures and squashes the following commits: 4f351fa [Feynman Liang] Update heuristic and scaladoc 5ea947e [Feynman Liang] Fix parallelization logic 00eb7db [Feynman Liang] Add helper method for GMM's M step, remove distributeGaussians flag e7c8127 [Feynman Liang] Add distributeGaussians flag and tests 1da3c7f [Feynman Liang] Distribute mixtures Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f472b8cd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f472b8cd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f472b8cd Branch: refs/heads/master Commit: f472b8cdc00839780dc79be0bbe53a098cde230c Parents: 8c32b2e Author: Feynman Liang fli...@databricks.com Authored: Wed Jul 8 16:32:00 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Jul 8 16:32:00 2015 -0700 -- .../mllib/clustering/GaussianMixture.scala | 44 1 file changed, 36 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f472b8cd/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index fc509d2..e459367 100644 
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -140,6 +140,10 @@ class GaussianMixture private ( // Get length of the input vectors val d = breezeData.first().length +// Heuristic to distribute the computation of the [[MultivariateGaussian]]s, approximately when +// d 25 except for when k is very small +val distributeGaussians = ((k - 1.0) / k) * d 25 + // Determine initial weights and corresponding Gaussians. // If the user supplied an initial GMM, we use those values, otherwise // we start with uniform weights, a random mean from the data, and @@ -171,14 +175,25 @@ class GaussianMixture private ( // Create new distributions based on the partial assignments // (often referred to as the M step in literature) val sumWeights = sums.weights.sum - var i = 0 - while (i k) { -val mu = sums.means(i) / sums.weights(i) -BLAS.syr(-sums.weights(i), Vectors.fromBreeze(mu), - Matrices.fromBreeze(sums.sigmas(i)).asInstanceOf[DenseMatrix]) -weights(i) = sums.weights(i) / sumWeights -gaussians(i) = new MultivariateGaussian(mu, sums.sigmas(i) / sums.weights(i)) -i = i + 1 + + if (distributeGaussians) { +val numPartitions = math.min(k, 1024) +val tuples = + Seq.tabulate(k)(i = (sums.means(i), sums.sigmas(i), sums.weights(i))) +val (ws, gs) = sc.parallelize(tuples, numPartitions).map { case (mean, sigma, weight) = + updateWeightsAndGaussians(mean, sigma, weight, sumWeights) +}.collect.unzip +Array.copy(ws, 0, weights, 0, ws.length) +Array.copy(gs, 0, gaussians, 0, gs.length) + } else { +var i = 0 +while (i k) { + val (weight, gaussian) = +updateWeightsAndGaussians(sums.means(i), sums.sigmas(i), sums.weights(i), sumWeights) + weights(i) = weight + gaussians(i) = gaussian + i = i + 1 +} } llhp = llh // current becomes previous @@ -192,6 +207,19 @@ class GaussianMixture private ( /** Java-friendly version of [[run()]] */ def run(data: JavaRDD[Vector]): 
GaussianMixtureModel = run(data.rdd) + private def updateWeightsAndGaussians( + mean: BDV[Double], + sigma: BreezeMatrix[Double], + weight: Double, + sumWeights: Double): (Double, MultivariateGaussian) = { +val mu = (mean /= weight) +BLAS.syr(-weight, Vectors.fromBreeze(mu), + Matrices.fromBreeze(sigma).asInstanceOf[DenseMatrix]) +val newWeight = weight / sumWeights +val newGaussian = new
spark git commit: [SPARK-8450] [SQL] [PYSARK] cleanup type converter for Python DataFrame
Repository: spark Updated Branches: refs/heads/master 2a4f88b6c - 74d8d3d92 [SPARK-8450] [SQL] [PYSARK] cleanup type converter for Python DataFrame This PR fixes the converter for Python DataFrame, especially for DecimalType Closes #7106 Author: Davies Liu dav...@databricks.com Closes #7131 from davies/decimal_python and squashes the following commits: 4d3c234 [Davies Liu] Merge branch 'master' of github.com:apache/spark into decimal_python 20531d6 [Davies Liu] Merge branch 'master' of github.com:apache/spark into decimal_python 7d73168 [Davies Liu] fix conflit 6cdd86a [Davies Liu] Merge branch 'master' of github.com:apache/spark into decimal_python 7104e97 [Davies Liu] improve type infer 9cd5a21 [Davies Liu] run python tests with SPARK_PREPEND_CLASSES 829a05b [Davies Liu] fix UDT in python c99e8c5 [Davies Liu] fix mima c46814a [Davies Liu] convert decimal for Python DataFrames Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/74d8d3d9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/74d8d3d9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/74d8d3d9 Branch: refs/heads/master Commit: 74d8d3d928cc9a7386b68588ac89ae042847d146 Parents: 2a4f88b Author: Davies Liu dav...@databricks.com Authored: Wed Jul 8 18:22:53 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Wed Jul 8 18:22:53 2015 -0700 -- .../apache/spark/mllib/linalg/Matrices.scala| 10 +-- .../org/apache/spark/mllib/linalg/Vectors.scala | 16 +--- project/MimaExcludes.scala | 5 +- python/pyspark/sql/tests.py | 13 +++ python/pyspark/sql/types.py | 4 + python/run-tests.py | 3 +- .../scala/org/apache/spark/sql/DataFrame.scala | 4 +- .../scala/org/apache/spark/sql/SQLContext.scala | 28 +- .../apache/spark/sql/execution/pythonUDFs.scala | 95 +++- 9 files changed, 84 insertions(+), 94 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/74d8d3d9/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- 
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 75e7004..0df0766 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -24,9 +24,9 @@ import scala.collection.mutable.{ArrayBuilder = MArrayBuilder, HashSet = MHash import breeze.linalg.{CSCMatrix = BSM, DenseMatrix = BDM, Matrix = BM} import org.apache.spark.annotation.DeveloperApi -import org.apache.spark.sql.Row -import org.apache.spark.sql.types._ import org.apache.spark.sql.catalyst.expressions.GenericMutableRow +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.types._ /** * Trait for a local matrix. @@ -147,7 +147,7 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { )) } - override def serialize(obj: Any): Row = { + override def serialize(obj: Any): InternalRow = { val row = new GenericMutableRow(7) obj match { case sm: SparseMatrix = @@ -173,9 +173,7 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { override def deserialize(datum: Any): Matrix = { datum match { - // TODO: something wrong with UDT serialization, should never happen. 
- case m: Matrix = m - case row: Row = + case row: InternalRow = require(row.length == 7, sMatrixUDT.deserialize given row with length ${row.length} but requires length == 7) val tpe = row.getByte(0) http://git-wip-us.apache.org/repos/asf/spark/blob/74d8d3d9/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala index c9c2742..e048b01 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Vectors.scala @@ -28,7 +28,7 @@ import breeze.linalg.{DenseVector = BDV, SparseVector = BSV, Vector = BV} import org.apache.spark.SparkException import org.apache.spark.annotation.DeveloperApi import org.apache.spark.mllib.util.NumericParser -import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.types._ @@ -175,7 +175,7 @@ private[spark] class VectorUDT extends UserDefinedType[Vector] { StructField(values, ArrayType(DoubleType,
spark git commit: [SPARK-8903] Fix bug in cherry-pick of SPARK-8803
Repository: spark Updated Branches: refs/heads/branch-1.4 4df0f1b1b -> 3f6e6e0e2 [SPARK-8903] Fix bug in cherry-pick of SPARK-8803 This fixes a bug introduced in the cherry-pick of #7201 which led to a NullPointerException when cross-tabulating a data set that contains null values. Author: Josh Rosen joshro...@databricks.com Closes #7295 from JoshRosen/SPARK-8903 and squashes the following commits: 5489948 [Josh Rosen] [SPARK-8903] Fix bug in cherry-pick of SPARK-8803 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3f6e6e0e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3f6e6e0e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3f6e6e0e Branch: refs/heads/branch-1.4 Commit: 3f6e6e0e2668832af1a54f5cb95e5a4537c7bc5a Parents: 4df0f1b Author: Josh Rosen joshro...@databricks.com Authored: Wed Jul 8 15:33:14 2015 -0700 Committer: Josh Rosen joshro...@databricks.com Committed: Wed Jul 8 15:33:14 2015 -0700 -- .../org/apache/spark/sql/execution/stat/StatFunctions.scala| 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3f6e6e0e/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala index 5a0c9a6..3c68028 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala @@ -113,7 +113,7 @@ private[sql] object StatFunctions extends Logging { if (element == null) null else element.toString } // get the distinct values of column 2, so that we can make them the column names -val distinctCol2: Map[Any, Int] = +val distinctCol2: Map[String, Int] = counts.map(e => cleanElement(e.get(1))).distinct.zipWithIndex.toMap val columnSize 
= distinctCol2.size require(columnSize < 1e4, s"The number of distinct values for $col2, can't " + @@ -128,7 +128,7 @@ private[sql] object StatFunctions extends Logging { countsRow.setLong(columnIndex + 1, row.getLong(2)) } // the value of col1 is the first value, the rest are the counts - countsRow.setString(0, cleanElement(col1Item.toString)) + countsRow.setString(0, cleanElement(col1Item)) countsRow }.toSeq // Back ticks can't exist in DataFrame column names, therefore drop them. To be able to accept @@ -139,7 +139,7 @@ private[sql] object StatFunctions extends Logging { // In the map, the column names (._1) are not ordered by the index (._2). This was the bug in // SPARK-8681. We need to explicitly sort by the column index and assign the column names. val headerNames = distinctCol2.toSeq.sortBy(_._2).map { r => - StructField(cleanColumnName(r._1.toString), LongType) + StructField(cleanColumnName(r._1), LongType) } val schema = StructType(StructField(tableName, StringType) +: headerNames) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[2/4] spark git commit: [SPARK-6123] [SPARK-6775] [SPARK-6776] [SQL] Refactors Parquet read path for interoperability and backwards-compatibility
http://git-wip-us.apache.org/repos/asf/spark/blob/4ffc27ca/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java -- diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java new file mode 100644 index 000..326ae9d --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/ParquetThriftCompat.java @@ -0,0 +1,2808 @@ +/** + * Autogenerated by Thrift Compiler (0.9.2) + * + * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING + * @generated + */ +package org.apache.spark.sql.parquet.test.thrift; + +import org.apache.thrift.scheme.IScheme; +import org.apache.thrift.scheme.SchemeFactory; +import org.apache.thrift.scheme.StandardScheme; + +import org.apache.thrift.scheme.TupleScheme; +import org.apache.thrift.protocol.TTupleProtocol; +import org.apache.thrift.protocol.TProtocolException; +import org.apache.thrift.EncodingUtils; +import org.apache.thrift.TException; +import org.apache.thrift.async.AsyncMethodCallback; +import org.apache.thrift.server.AbstractNonblockingServer.*; +import java.util.List; +import java.util.ArrayList; +import java.util.Map; +import java.util.HashMap; +import java.util.EnumMap; +import java.util.Set; +import java.util.HashSet; +import java.util.EnumSet; +import java.util.Collections; +import java.util.BitSet; +import java.nio.ByteBuffer; +import java.util.Arrays; +import javax.annotation.Generated; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@SuppressWarnings({cast, rawtypes, serial, unchecked}) +/** + * This is a test struct for testing parquet-thrift compatibility. 
+ */ +@Generated(value = Autogenerated by Thrift Compiler (0.9.2), date = 2015-7-7) +public class ParquetThriftCompat implements org.apache.thrift.TBaseParquetThriftCompat, ParquetThriftCompat._Fields, java.io.Serializable, Cloneable, ComparableParquetThriftCompat { + private static final org.apache.thrift.protocol.TStruct STRUCT_DESC = new org.apache.thrift.protocol.TStruct(ParquetThriftCompat); + + private static final org.apache.thrift.protocol.TField BOOL_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(boolColumn, org.apache.thrift.protocol.TType.BOOL, (short)1); + private static final org.apache.thrift.protocol.TField BYTE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(byteColumn, org.apache.thrift.protocol.TType.BYTE, (short)2); + private static final org.apache.thrift.protocol.TField SHORT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(shortColumn, org.apache.thrift.protocol.TType.I16, (short)3); + private static final org.apache.thrift.protocol.TField INT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(intColumn, org.apache.thrift.protocol.TType.I32, (short)4); + private static final org.apache.thrift.protocol.TField LONG_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(longColumn, org.apache.thrift.protocol.TType.I64, (short)5); + private static final org.apache.thrift.protocol.TField DOUBLE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(doubleColumn, org.apache.thrift.protocol.TType.DOUBLE, (short)6); + private static final org.apache.thrift.protocol.TField BINARY_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(binaryColumn, org.apache.thrift.protocol.TType.STRING, (short)7); + private static final org.apache.thrift.protocol.TField STRING_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(stringColumn, org.apache.thrift.protocol.TType.STRING, (short)8); + private static final org.apache.thrift.protocol.TField ENUM_COLUMN_FIELD_DESC = new 
org.apache.thrift.protocol.TField(enumColumn, org.apache.thrift.protocol.TType.I32, (short)9); + private static final org.apache.thrift.protocol.TField MAYBE_BOOL_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(maybeBoolColumn, org.apache.thrift.protocol.TType.BOOL, (short)10); + private static final org.apache.thrift.protocol.TField MAYBE_BYTE_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(maybeByteColumn, org.apache.thrift.protocol.TType.BYTE, (short)11); + private static final org.apache.thrift.protocol.TField MAYBE_SHORT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(maybeShortColumn, org.apache.thrift.protocol.TType.I16, (short)12); + private static final org.apache.thrift.protocol.TField MAYBE_INT_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(maybeIntColumn, org.apache.thrift.protocol.TType.I32, (short)13); + private static final org.apache.thrift.protocol.TField MAYBE_LONG_COLUMN_FIELD_DESC = new org.apache.thrift.protocol.TField(maybeLongColumn, org.apache.thrift.protocol.TType.I64, (short)14); + private static final org.apache.thrift.protocol.TField MAYBE_DOUBLE_COLUMN_FIELD_DESC
[3/4] spark git commit: [SPARK-6123] [SPARK-6775] [SPARK-6776] [SQL] Refactors Parquet read path for interoperability and backwards-compatibility
http://git-wip-us.apache.org/repos/asf/spark/blob/4ffc27ca/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java -- diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java new file mode 100644 index 000..daec65a --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/CompatibilityTest.java @@ -0,0 +1,17 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.parquet.test.avro; + +@SuppressWarnings(all) +@org.apache.avro.specific.AvroGenerated +public interface CompatibilityTest { + public static final org.apache.avro.Protocol PROTOCOL = org.apache.avro.Protocol.parse({\protocol\:\CompatibilityTest\,\namespace\:\org.apache.spark.sql.parquet.test.avro\,\types\:[{\type\:\record\,\name\:\Nested\,\fields\:[{\name\:\nested_ints_column\,\type\:{\type\:\array\,\items\:\int\}},{\name\:\nested_string_column\,\type\:{\type\:\string\,\avro.java.string\:\String\}}]},{\type\:\record\,\name\:\ParquetAvroCompat\,\fields\:[{\name\:\bool_column\,\type\:\boolean\},{\name\:\int_column\,\type\:\int\},{\name\:\long_column\,\type\:\long\},{\name\:\float_column\,\type\:\float\},{\name\:\double_column\,\type\:\double\},{\name\:\binary_column\,\type\:\bytes\},{\name\:\string_column\,\type\:{\type\:\string\,\avro.java.string\:\String\}},{\name\:\maybe_bool_column\,\type\:[\null\,\boolean\]},{\name\:\maybe_int_column\,\type\:[\null\,\int\]},{\nam 
e\:\maybe_long_column\,\type\:[\null\,\long\]},{\name\:\maybe_float_column\,\type\:[\null\,\float\]},{\name\:\maybe_double_column\,\type\:[\null\,\double\]},{\name\:\maybe_binary_column\,\type\:[\null\,\bytes\]},{\name\:\maybe_string_column\,\type\:[\null\,{\type\:\string\,\avro.java.string\:\String\}]},{\name\:\strings_column\,\type\:{\type\:\array\,\items\:{\type\:\string\,\avro.java.string\:\String\}}},{\name\:\string_to_int_column\,\type\:{\type\:\map\,\values\:\int\,\avro.java.string\:\String\}},{\name\:\complex_column\,\type\:{\type\:\map\,\values\:{\type\:\array\,\items\:\Nested\},\avro.java.string\:\String\}}]}],\messages\:{}}); + + @SuppressWarnings(all) + public interface Callback extends CompatibilityTest { +public static final org.apache.avro.Protocol PROTOCOL = org.apache.spark.sql.parquet.test.avro.CompatibilityTest.PROTOCOL; + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/spark/blob/4ffc27ca/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java -- diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java new file mode 100644 index 000..051f1ee --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/avro/Nested.java @@ -0,0 +1,196 @@ +/** + * Autogenerated by Avro + * + * DO NOT EDIT DIRECTLY + */ +package org.apache.spark.sql.parquet.test.avro; +@SuppressWarnings(all) +@org.apache.avro.specific.AvroGenerated +public class Nested extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord { + public static final org.apache.avro.Schema SCHEMA$ = new 
org.apache.avro.Schema.Parser().parse({\type\:\record\,\name\:\Nested\,\namespace\:\org.apache.spark.sql.parquet.test.avro\,\fields\:[{\name\:\nested_ints_column\,\type\:{\type\:\array\,\items\:\int\}},{\name\:\nested_string_column\,\type\:{\type\:\string\,\avro.java.string\:\String\}}]}); + public static org.apache.avro.Schema getClassSchema() { return SCHEMA$; } + @Deprecated public java.util.Listjava.lang.Integer nested_ints_column; + @Deprecated public java.lang.String nested_string_column; + + /** + * Default constructor. Note that this does not initialize fields + * to their default values from the schema. If that is desired then + * one should use codenewBuilder()/code. + */ + public Nested() {} + + /** + * All-args constructor. + */ + public Nested(java.util.Listjava.lang.Integer nested_ints_column, java.lang.String nested_string_column) { +this.nested_ints_column = nested_ints_column; +this.nested_string_column = nested_string_column; + } + + public org.apache.avro.Schema getSchema() { return SCHEMA$; } + // Used by DatumWriter. Applications should not call. + public java.lang.Object get(int field$) { +switch (field$) { +case 0: return nested_ints_column; +case 1: return nested_string_column; +default: throw new org.apache.avro.AvroRuntimeException(Bad index); +} + } + // Used by DatumReader. Applications should not call. + @SuppressWarnings(value=unchecked) + public void put(int field$, java.lang.Object value$) { +switch
[1/4] spark git commit: [SPARK-6123] [SPARK-6775] [SPARK-6776] [SQL] Refactors Parquet read path for interoperability and backwards-compatibility
Repository: spark Updated Branches: refs/heads/master 5687f7655 - 4ffc27caa http://git-wip-us.apache.org/repos/asf/spark/blob/4ffc27ca/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java -- diff --git a/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java new file mode 100644 index 000..5315c6a --- /dev/null +++ b/sql/core/src/test/gen-java/org/apache/spark/sql/parquet/test/thrift/Suit.java @@ -0,0 +1,51 @@ +/** + * Autogenerated by Thrift Compiler (0.9.2) + * + * DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING + * @generated + */ +package org.apache.spark.sql.parquet.test.thrift; + + +import java.util.Map; +import java.util.HashMap; +import org.apache.thrift.TEnum; + +public enum Suit implements org.apache.thrift.TEnum { + SPADES(0), + HEARTS(1), + DIAMONDS(2), + CLUBS(3); + + private final int value; + + private Suit(int value) { +this.value = value; + } + + /** + * Get the integer value of this enum value, as defined in the Thrift IDL. + */ + public int getValue() { +return value; + } + + /** + * Find a the enum type by its integer value, as defined in the Thrift IDL. + * @return null if the value is not found. 
+ */ + public static Suit findByValue(int value) { +switch (value) { + case 0: +return SPADES; + case 1: +return HEARTS; + case 2: +return DIAMONDS; + case 3: +return CLUBS; + default: +return null; +} + } +} http://git-wip-us.apache.org/repos/asf/spark/blob/4ffc27ca/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala new file mode 100644 index 000..bfa4273 --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetAvroCompatibilitySuite.scala @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the License); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.parquet + +import java.nio.ByteBuffer +import java.util.{List = JList, Map = JMap} + +import scala.collection.JavaConversions._ + +import org.apache.hadoop.fs.Path +import org.apache.parquet.avro.AvroParquetWriter + +import org.apache.spark.sql.parquet.test.avro.{Nested, ParquetAvroCompat} +import org.apache.spark.sql.test.TestSQLContext +import org.apache.spark.sql.{Row, SQLContext} + +class ParquetAvroCompatibilitySuite extends ParquetCompatibilityTest { + import ParquetCompatibilityTest._ + + override val sqlContext: SQLContext = TestSQLContext + + override protected def beforeAll(): Unit = { +super.beforeAll() + +val writer = + new AvroParquetWriter[ParquetAvroCompat]( +new Path(parquetStore.getCanonicalPath), +ParquetAvroCompat.getClassSchema) + +(0 until 10).foreach(i = writer.write(makeParquetAvroCompat(i))) +writer.close() + } + + test(Read Parquet file generated by parquet-avro) { +logInfo( + sSchema of the Parquet file written by parquet-avro: + |${readParquetSchema(parquetStore.getCanonicalPath)} + .stripMargin) + +checkAnswer(sqlContext.read.parquet(parquetStore.getCanonicalPath), (0 until 10).map { i = + def nullable[T : AnyRef]: ( = T) = T = makeNullable[T](i) + + Row( +i % 2 == 0, +i, +i.toLong * 10, +i.toFloat + 0.1f, +i.toDouble + 0.2d, +sval_$i.getBytes, +sval_$i, + +nullable(i % 2 == 0: java.lang.Boolean), +nullable(i: Integer), +nullable(i.toLong: java.lang.Long), +nullable(i.toFloat + 0.1f: java.lang.Float), +nullable(i.toDouble + 0.2d: java.lang.Double), +nullable(sval_$i.getBytes), +nullable(sval_$i), + +Seq.tabulate(3)(n = sarr_${i + n}), +Seq.tabulate(3)(n = n.toString - (i + n: Integer)).toMap, +Seq.tabulate(3) { n = +
spark git commit: [SPARK-8877] [MLLIB] Public API for association rule generation
Repository: spark Updated Branches: refs/heads/master 381cb161b - 8c32b2e87 [SPARK-8877] [MLLIB] Public API for association rule generation Adds FPGrowth.generateAssociationRules to public API for generating association rules after mining frequent itemsets. Author: Feynman Liang fli...@databricks.com Closes #7271 from feynmanliang/SPARK-8877 and squashes the following commits: 83b8baf [Feynman Liang] Add API Doc 867abff [Feynman Liang] Add FPGrowth.generateAssociationRules and change access modifiers for AssociationRules Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8c32b2e8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8c32b2e8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8c32b2e8 Branch: refs/heads/master Commit: 8c32b2e870c7c250a63e838718df833edf6dea07 Parents: 381cb16 Author: Feynman Liang fli...@databricks.com Authored: Wed Jul 8 16:27:11 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Jul 8 16:27:11 2015 -0700 -- .../spark/mllib/fpm/AssociationRules.scala | 5 ++- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 11 - .../apache/spark/mllib/fpm/FPGrowthSuite.scala | 42 3 files changed, 55 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8c32b2e8/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 4a0f842..7e2bbfe 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -33,7 +33,7 @@ import org.apache.spark.rdd.RDD * association rules which have a single item as the consequent. 
*/ @Experimental -class AssociationRules private ( +class AssociationRules private[fpm] ( private var minConfidence: Double) extends Logging with Serializable { /** @@ -45,6 +45,7 @@ class AssociationRules private ( * Sets the minimal confidence (default: `0.8`). */ def setMinConfidence(minConfidence: Double): this.type = { +require(minConfidence = 0.0 minConfidence = 1.0) this.minConfidence = minConfidence this } @@ -91,7 +92,7 @@ object AssociationRules { * @tparam Item item type */ @Experimental - class Rule[Item] private[mllib] ( + class Rule[Item] private[fpm] ( val antecedent: Array[Item], val consequent: Array[Item], freqUnion: Double, http://git-wip-us.apache.org/repos/asf/spark/blob/8c32b2e8/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index 0da59e8..9cb9a00 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -40,7 +40,16 @@ import org.apache.spark.storage.StorageLevel * @tparam Item item type */ @Experimental -class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable +class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { + /** + * Generates association rules for the [[Item]]s in [[freqItemsets]]. 
+ * @param confidence minimal confidence of the rules produced + */ + def generateAssociationRules(confidence: Double): RDD[AssociationRules.Rule[Item]] = { +val associationRules = new AssociationRules(confidence) +associationRules.run(freqItemsets) + } +} /** * :: Experimental :: http://git-wip-us.apache.org/repos/asf/spark/blob/8c32b2e8/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala index ddc296a..4a9bfdb 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/fpm/FPGrowthSuite.scala @@ -132,6 +132,48 @@ class FPGrowthSuite extends SparkFunSuite with MLlibTestSparkContext { assert(model1.freqItemsets.count() === 625) } + test(FP-Growth String type association rule generation) { +val transactions = Seq( + r z h k p, + z y x w v u t s, + s x o n r, + x z y m t s q e, + z, + x z y r q t p) + .map(_.split( )) +val rdd = sc.parallelize(transactions,
spark git commit: [SPARK-8068] [MLLIB] Add confusionMatrix method at class MulticlassMetrics in pyspark/mllib
Repository: spark Updated Branches: refs/heads/master 4ffc27caa -> 381cb161b [SPARK-8068] [MLLIB] Add confusionMatrix method at class MulticlassMetrics in pyspark/mllib Add confusionMatrix method at class MulticlassMetrics in pyspark/mllib Author: Yanbo Liang yblia...@gmail.com Closes #7286 from yanboliang/spark-8068 and squashes the following commits: 6109fe1 [Yanbo Liang] Add confusionMatrix method at class MulticlassMetrics in pyspark/mllib Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/381cb161 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/381cb161 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/381cb161 Branch: refs/heads/master Commit: 381cb161ba4e3a30f2da3c4ef4ee19869d51f101 Parents: 4ffc27c Author: Yanbo Liang yblia...@gmail.com Authored: Wed Jul 8 16:21:28 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Wed Jul 8 16:21:28 2015 -0700 -- python/pyspark/mllib/evaluation.py | 11 +++ 1 file changed, 11 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/381cb161/python/pyspark/mllib/evaluation.py -- diff --git a/python/pyspark/mllib/evaluation.py b/python/pyspark/mllib/evaluation.py index c5cf3a4..f214037 100644 --- a/python/pyspark/mllib/evaluation.py +++ b/python/pyspark/mllib/evaluation.py @@ -152,6 +152,10 @@ class MulticlassMetrics(JavaModelWrapper): >>> predictionAndLabels = sc.parallelize([(0.0, 0.0), (0.0, 1.0), (0.0, 0.0), ... (1.0, 0.0), (1.0, 1.0), (1.0, 1.0), (1.0, 1.0), (2.0, 2.0), (2.0, 0.0)]) >>> metrics = MulticlassMetrics(predictionAndLabels) + >>> metrics.confusionMatrix().toArray() +array([[ 2., 1., 1.], + [ 1., 3., 0.], + [ 0., 0., 1.]]) >>> metrics.falsePositiveRate(0.0) 0.2... 
>>> metrics.precision(1.0) @@ -186,6 +190,13 @@ class MulticlassMetrics(JavaModelWrapper): java_model = java_class(df._jdf) super(MulticlassMetrics, self).__init__(java_model) +def confusionMatrix(self): +""" +Returns confusion matrix: predicted classes are in columns, +they are ordered by class label ascending, as in labels. +""" +return self.call("confusionMatrix") + def truePositiveRate(self, label): """ Returns true positive rate for a given label (category). - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org