spark git commit: [SPARK-7033] [SPARKR] Clean usage of split. Use partition instead where applicable.
Repository: spark
Updated Branches:
  refs/heads/master 6e57d57b3 -> ebb77b2af

[SPARK-7033] [SPARKR] Clean usage of split. Use partition instead where applicable.

Author: Sun Rui <rui@intel.com>

Closes #5628 from sun-rui/SPARK-7033 and squashes the following commits:

046bc9e [Sun Rui] Clean split usage in tests.
d531c86 [Sun Rui] [SPARK-7033][SPARKR] Clean usage of split. Use partition instead where applicable.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ebb77b2a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ebb77b2a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ebb77b2a

Branch: refs/heads/master
Commit: ebb77b2aff085e71906b5de9d266ded89051af82
Parents: 6e57d57
Author: Sun Rui <rui@intel.com>
Authored: Fri Apr 24 11:00:19 2015 -0700
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Fri Apr 24 11:00:19 2015 -0700

----------------------------------------------------------------------
 R/pkg/R/RDD.R               | 36 ++--
 R/pkg/R/context.R           | 20 ++--
 R/pkg/R/pairRDD.R           |  8
 R/pkg/R/utils.R             |  2 +-
 R/pkg/inst/tests/test_rdd.R | 12 ++--
 5 files changed, 39 insertions(+), 39 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/ebb77b2a/R/pkg/R/RDD.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index 1284313..cc09efb 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -91,8 +91,8 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val)
       # NOTE: We use prev_serializedMode to track the serialization mode of prev_JRDD
       # prev_serializedMode is used during the delayed computation of JRDD in getJRDD
     } else {
-      pipelinedFunc <- function(split, iterator) {
-        func(split, prev@func(split, iterator))
+      pipelinedFunc <- function(partIndex, part) {
+        func(partIndex, prev@func(partIndex, part))
       }
       .Object@func <- cleanClosure(pipelinedFunc)
       .Object@prev_jrdd <- prev@prev_jrdd # maintain the pipeline
@@ -306,7 +306,7 @@ setMethod("numPartitions",
           signature(x = "RDD"),
           function(x) {
             jrdd <- getJRDD(x)
-            partitions <- callJMethod(jrdd, "splits")
+            partitions <- callJMethod(jrdd, "partitions")
             callJMethod(partitions, "size")
           })
@@ -452,8 +452,8 @@ setMethod("countByValue",
 setMethod("lapply",
           signature(X = "RDD", FUN = "function"),
           function(X, FUN) {
-            func <- function(split, iterator) {
-              lapply(iterator, FUN)
+            func <- function(partIndex, part) {
+              lapply(part, FUN)
             }
             lapplyPartitionsWithIndex(X, func)
           })
@@ -538,8 +538,8 @@ setMethod("mapPartitions",
 #'\dontrun{
 #' sc <- sparkR.init()
 #' rdd <- parallelize(sc, 1:10, 5L)
-#' prod <- lapplyPartitionsWithIndex(rdd, function(split, part) {
-#'                                          split * Reduce("+", part) })
+#' prod <- lapplyPartitionsWithIndex(rdd, function(partIndex, part) {
+#'                                          partIndex * Reduce("+", part) })
 #' collect(prod, flatten = FALSE) # 0, 7, 22, 45, 76
 #'}
 #' @rdname lapplyPartitionsWithIndex
@@ -813,7 +813,7 @@ setMethod("distinct",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' rdd <- parallelize(sc, 1:10) # ensure each num is in its own split
+#' rdd <- parallelize(sc, 1:10)
 #' collect(sampleRDD(rdd, FALSE, 0.5, 1618L)) # ~5 distinct elements
 #' collect(sampleRDD(rdd, TRUE, 0.5, 9L)) # ~5 elements possibly with duplicates
 #'}
@@ -825,14 +825,14 @@ setMethod("sampleRDD",
           function(x, withReplacement, fraction, seed) {

             # The sampler: takes a partition and returns its sampled version.
-            samplingFunc <- function(split, part) {
+            samplingFunc <- function(partIndex, part) {
               set.seed(seed)
               res <- vector("list", length(part))
               len <- 0

               # Discards some random values to ensure each partition has a
               # different random seed.
-              runif(split)
+              runif(partIndex)

               for (elem in part) {
                 if (withReplacement) {
@@ -989,8 +989,8 @@ setMethod("coalesce",
           function(x, numPartitions, shuffle = FALSE) {
             numPartitions <- numToInt(numPartitions)
             if (shuffle || numPartitions > SparkR::numPartitions(x)) {
-              func <- function(s, part) {
-                set.seed(s)  # split as seed
+              func <- function(partIndex, part) {
+                set.seed(partIndex)  # partIndex as seed
                 start <- as.integer(sample(numPartitions, 1) - 1)
                 lapply(seq_along(part), function(i)
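The renamed arguments line up SparkR with the Scala core API, where RDD.mapPartitionsWithIndex passes the partition index (not a "split") as the first argument of the user function. For comparison, a minimal Scala sketch of the doc example above, using the standard Spark Core API; the object name and local master are illustrative:

    import org.apache.spark.{SparkConf, SparkContext}

    object PartitionIndexExample {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(
          new SparkConf().setAppName("partIndexExample").setMaster("local[2]"))
        // Five partitions of 1..10: (1,2), (3,4), (5,6), (7,8), (9,10).
        val rdd = sc.parallelize(1 to 10, 5)
        // Multiply each partition's sum by its 0-based partition index,
        // mirroring the SparkR doc example: 0, 7, 22, 45, 76.
        val prod = rdd.mapPartitionsWithIndex { (partIndex, part) =>
          Iterator(partIndex * part.sum)
        }
        println(prod.collect().mkString(", "))
        sc.stop()
      }
    }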
spark git commit: [SPARK-7115] [MLLIB] skip the very first 1 in poly expansion
Repository: spark
Updated Branches:
  refs/heads/master 8509519d8 -> 78b39c7e0

[SPARK-7115] [MLLIB] skip the very first 1 in poly expansion

yinxusen

Author: Xiangrui Meng <m...@databricks.com>

Closes #5681 from mengxr/SPARK-7115 and squashes the following commits:

9ac27cd [Xiangrui Meng] skip the very first 1 in poly expansion

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/78b39c7e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/78b39c7e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/78b39c7e

Branch: refs/heads/master
Commit: 78b39c7e0de8c9dc748cfbf8f78578a9524b6a94
Parents: 8509519
Author: Xiangrui Meng <m...@databricks.com>
Authored: Fri Apr 24 08:27:48 2015 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Apr 24 08:27:48 2015 -0700

----------------------------------------------------------------------
 .../spark/ml/feature/PolynomialExpansion.scala | 22
 .../ml/feature/PolynomialExpansionSuite.scala  | 22 ++-
 2 files changed, 24 insertions(+), 20 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/78b39c7e/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index c3a59a3..d855f04 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -87,7 +87,9 @@ object PolynomialExpansion {
     if (multiplier == 0.0) {
       // do nothing
     } else if (degree == 0 || lastIdx < 0) {
-      polyValues(curPolyIdx) = multiplier
+      if (curPolyIdx >= 0) { // skip the very first 1
+        polyValues(curPolyIdx) = multiplier
+      }
     } else {
       val v = values(lastIdx)
       val lastIdx1 = lastIdx - 1
@@ -116,8 +118,10 @@ object PolynomialExpansion {
     if (multiplier == 0.0) {
       // do nothing
     } else if (degree == 0 || lastIdx < 0) {
-      polyIndices += curPolyIdx
-      polyValues += multiplier
+      if (curPolyIdx >= 0) { // skip the very first 1
+        polyIndices += curPolyIdx
+        polyValues += multiplier
+      }
     } else {
       // Skip all zeros at the tail.
       val v = values(lastIdx)
@@ -139,8 +143,8 @@ object PolynomialExpansion {
   private def expand(dv: DenseVector, degree: Int): DenseVector = {
     val n = dv.size
     val polySize = getPolySize(n, degree)
-    val polyValues = new Array[Double](polySize)
-    expandDense(dv.values, n - 1, degree, 1.0, polyValues, 0)
+    val polyValues = new Array[Double](polySize - 1)
+    expandDense(dv.values, n - 1, degree, 1.0, polyValues, -1)
     new DenseVector(polyValues)
   }

@@ -149,12 +153,12 @@ object PolynomialExpansion {
     val nnz = sv.values.length
     val nnzPolySize = getPolySize(nnz, degree)
     val polyIndices = mutable.ArrayBuilder.make[Int]
-    polyIndices.sizeHint(nnzPolySize)
+    polyIndices.sizeHint(nnzPolySize - 1)
     val polyValues = mutable.ArrayBuilder.make[Double]
-    polyValues.sizeHint(nnzPolySize)
+    polyValues.sizeHint(nnzPolySize - 1)
     expandSparse(
-      sv.indices, sv.values, nnz - 1, sv.size - 1, degree, 1.0, polyIndices, polyValues, 0)
-    new SparseVector(polySize, polyIndices.result(), polyValues.result())
+      sv.indices, sv.values, nnz - 1, sv.size - 1, degree, 1.0, polyIndices, polyValues, -1)
+    new SparseVector(polySize - 1, polyIndices.result(), polyValues.result())
   }

   def expand(v: Vector, degree: Int): Vector = {

http://git-wip-us.apache.org/repos/asf/spark/blob/78b39c7e/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
index b0a537b..c1d64fb 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
@@ -44,11 +44,11 @@ class PolynomialExpansionSuite extends FunSuite with MLlibTestSparkContext {
     )

     val twoDegreeExpansion: Array[Vector] = Array(
-      Vectors.sparse(10, Array(0, 1, 2, 3, 4, 5), Array(1.0, -2.0, 4.0, 2.3, -4.6, 5.29)),
-      Vectors.dense(1.0, -2.0, 4.0, 2.3, -4.6, 5.29),
-      Vectors.dense(Array(1.0) ++ Array.fill[Double](9)(0.0)),
-      Vectors.dense(1.0, 0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
-      Vectors.sparse(10, Array(0), Array(1.0)))
+      Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0,
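The bookkeeping behind the `- 1`s: for n input features and degree d, the number of monomials of total degree at most d, including the constant term 1, is C(n + d, d); dropping that leading 1 leaves C(n + d, d) - 1 output entries, which is why the suite's 3-feature, degree-2 expansions shrink from size 10 to size 9. A small sketch of the count below; this is an illustrative reimplementation under that assumption, not MLlib's actual getPolySize:

    object PolySize {
      // Number of monomials of total degree <= degree in n variables,
      // including the constant term: C(n + degree, degree).
      def getPolySize(n: Int, degree: Int): Int = {
        // Small, overflow-unsafe binomial coefficient, for illustration only.
        def choose(a: Int, b: Int): Long =
          (1 to b).foldLeft(1L)((acc, i) => acc * (a - b + i) / i)
        choose(n + degree, degree).toInt
      }

      def main(args: Array[String]): Unit = {
        // 3 input features, degree 2: 10 monomials counting the constant 1;
        // the patch drops that leading 1, so the expanded vector has 9
        // entries, matching the suite's change from sparse(10, ...) to
        // sparse(9, ...).
        println(getPolySize(3, 2) - 1) // 9
      }
    }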
spark git commit: [SPARK-6528] [ML] Add IDF transformer
Repository: spark
Updated Branches:
  refs/heads/master 78b39c7e0 -> 6e57d57b3

[SPARK-6528] [ML] Add IDF transformer

See [SPARK-6528](https://issues.apache.org/jira/browse/SPARK-6528).

Add IDF transformer in ML package.

Author: Xusen Yin <yinxu...@gmail.com>

Closes #5266 from yinxusen/SPARK-6528 and squashes the following commits:

741db31 [Xusen Yin] get param from new paramMap
d169967 [Xusen Yin] add final to param and IDF class
c9c3759 [Xusen Yin] simplify test suite
5867c09 [Xusen Yin] refine IDF transformer with new interfaces
7727cae [Xusen Yin] Merge branch 'master' into SPARK-6528
4338a37 [Xusen Yin] Merge branch 'master' into SPARK-6528
aef2cdf [Xusen Yin] add doc and group for param
5760b49 [Xusen Yin] fix code style
2add691 [Xusen Yin] fix code style and test
03fbecb [Xusen Yin] remove duplicated code
2aa4be0 [Xusen Yin] clean test suite
4802c67 [Xusen Yin] add IDF transformer and test suite

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e57d57b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e57d57b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e57d57b

Branch: refs/heads/master
Commit: 6e57d57b32ba2aa0514692074897b5edd34e0dd6
Parents: 78b39c7
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Fri Apr 24 08:29:49 2015 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Apr 24 08:29:49 2015 -0700

----------------------------------------------------------------------
 .../scala/org/apache/spark/ml/feature/IDF.scala | 116 +++
 .../org/apache/spark/ml/feature/IDFSuite.scala  | 101
 2 files changed, 217 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/6e57d57b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
new file mode 100644
index 0000000..e6a62d9
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml._
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.mllib.feature
+import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
+import org.apache.spark.sql._
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Params for [[IDF]] and [[IDFModel]].
+ */
+private[feature] trait IDFBase extends Params with HasInputCol with HasOutputCol {
+
+  /**
+   * The minimum of documents in which a term should appear.
+   * @group param
+   */
+  final val minDocFreq = new IntParam(
+    this, "minDocFreq", "minimum of documents in which a term should appear for filtering")
+
+  setDefault(minDocFreq -> 0)
+
+  /** @group getParam */
+  def getMinDocFreq: Int = getOrDefault(minDocFreq)
+
+  /** @group setParam */
+  def setMinDocFreq(value: Int): this.type = set(minDocFreq, value)
+
+  /**
+   * Validate and transform the input schema.
+   */
+  protected def validateAndTransformSchema(schema: StructType, paramMap: ParamMap): StructType = {
+    val map = extractParamMap(paramMap)
+    SchemaUtils.checkColumnType(schema, map(inputCol), new VectorUDT)
+    SchemaUtils.appendColumn(schema, map(outputCol), new VectorUDT)
+  }
+}
+
+/**
+ * :: AlphaComponent ::
+ * Compute the Inverse Document Frequency (IDF) given a collection of documents.
+ */
+@AlphaComponent
+final class IDF extends Estimator[IDFModel] with IDFBase {
+
+  /** @group setParam */
+  def setInputCol(value: String): this.type = set(inputCol, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  override def fit(dataset: DataFrame, paramMap: ParamMap): IDFModel = {
+    transformSchema(dataset.schema, paramMap, logging = true)
+    val map =
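To see how the pieces fit together, a hedged usage sketch based only on the setters and the fit/transform signatures visible in this diff; the column names, the toy data, and the spark-shell-style `sc` are assumptions of the sketch, not part of the commit:

    import org.apache.spark.ml.feature.IDF
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.sql.SQLContext

    // Assumes a live SparkContext `sc` (e.g. in spark-shell);
    // "features" and "idfFeatures" are hypothetical column names.
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 2.0, 0.0)),
      (1, Vectors.dense(0.0, 1.0, 3.0))
    )).toDF("id", "features")

    val idf = new IDF()
      .setInputCol("features")
      .setOutputCol("idfFeatures")
      .setMinDocFreq(1)  // terms in fewer documents than this are filtered

    val model = idf.fit(df)     // estimates per-term IDF weights
    model.transform(df).show()  // appends the rescaled vector column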
spark git commit: [PySpark][Minor] Update sql example, so that can read file correctly
Repository: spark
Updated Branches:
  refs/heads/master 438859eb7 -> d874f8b54

[PySpark][Minor] Update sql example, so that can read file correctly

When running Spark, a path without a URI scheme is read from HDFS by default, so the example should qualify the local path with file:// (or accept a user-supplied path).

Author: linweizhong <linweizh...@huawei.com>

Closes #5684 from Sephiroth-Lin/pyspark_example_minor and squashes the following commits:

19fe145 [linweizhong] Update example sql.py, so that can read file correctly

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d874f8b5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d874f8b5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d874f8b5

Branch: refs/heads/master
Commit: d874f8b546d8fae95bc92d8461b8189e51cb731b
Parents: 438859e
Author: linweizhong <linweizh...@huawei.com>
Authored: Fri Apr 24 20:23:19 2015 -0700
Committer: Reynold Xin <r...@databricks.com>
Committed: Fri Apr 24 20:23:19 2015 -0700

----------------------------------------------------------------------
 examples/src/main/python/sql.py | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/d874f8b5/examples/src/main/python/sql.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py
index 87d7b08..2c18875 100644
--- a/examples/src/main/python/sql.py
+++ b/examples/src/main/python/sql.py
@@ -18,6 +18,7 @@
 from __future__ import print_function

 import os
+import sys

 from pyspark import SparkContext
 from pyspark.sql import SQLContext
@@ -50,7 +51,11 @@ if __name__ == "__main__":

     # A JSON dataset is pointed to by path.
     # The path can be either a single text file or a directory storing text files.
-    path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
+    if len(sys.argv) < 2:
+        path = "file://" + \
+            os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
+    else:
+        path = sys.argv[1]
     # Create a DataFrame from the file(s) pointed to by path
     people = sqlContext.jsonFile(path)
     # root
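The same pitfall exists outside PySpark: a schemeless path is resolved against the cluster's default filesystem (fs.defaultFS, typically HDFS). A hedged Scala sketch of the equivalent fallback logic; the object name and argument handling are illustrative, not part of this patch:

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.sql.SQLContext

    object JsonPathExample {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("jsonPath"))
        val sqlContext = new SQLContext(sc)
        // Assumes SPARK_HOME is set; the explicit file:// scheme forces the
        // local filesystem instead of fs.defaultFS.
        val path =
          if (args.isEmpty) {
            "file://" + sys.env("SPARK_HOME") + "/examples/src/main/resources/people.json"
          } else {
            args(0)
          }
        val people = sqlContext.jsonFile(path)
        people.printSchema()
        sc.stop()
      }
    }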
spark git commit: [SPARK-7136][Docs] Spark SQL and DataFrame Guide fix example file and paths
Repository: spark
Updated Branches:
  refs/heads/master d874f8b54 -> 59b7cfc41

[SPARK-7136][Docs] Spark SQL and DataFrame Guide fix example file and paths

Changes the example file for the Generic Load/Save Functions section to users.parquet rather than people.parquet, which doesn't exist unless a later example has already been executed. Also adds file paths.

Author: Deborah Siegel <deborah.sie...@gmail.com>
Author: DEBORAH SIEGEL <deborahsie...@d-140-142-0-49.dhcp4.washington.edu>
Author: DEBORAH SIEGEL <deborahsiegel@DEBORAHs-MacBook-Pro.local>
Author: DEBORAH SIEGEL <deborahsie...@d-69-91-154-197.dhcp4.washington.edu>

Closes #5693 from d3borah/master and squashes the following commits:

4d5e43b [Deborah Siegel] sparkSQL doc change
b15a497 [Deborah Siegel] Revert sparkSQL doc change
5a2863c [DEBORAH SIEGEL] Merge remote-tracking branch 'upstream/master'
91972fc [DEBORAH SIEGEL] sparkSQL doc change
f000e59 [DEBORAH SIEGEL] Merge remote-tracking branch 'upstream/master'
db54173 [DEBORAH SIEGEL] fixed aggregateMessages example in graphX doc

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/59b7cfc4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/59b7cfc4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/59b7cfc4

Branch: refs/heads/master
Commit: 59b7cfc41b2c06fbfbf6aca16c1619496a8d1d00
Parents: d874f8b
Author: Deborah Siegel <deborah.sie...@gmail.com>
Authored: Fri Apr 24 20:25:07 2015 -0700
Committer: Reynold Xin <r...@databricks.com>
Committed: Fri Apr 24 20:25:07 2015 -0700

----------------------------------------------------------------------
 docs/sql-programming-guide.md | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/59b7cfc4/docs/sql-programming-guide.md
----------------------------------------------------------------------
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 49b1e69..b8233ae 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -681,8 +681,8 @@ In the simplest form, the default data source (`parquet` unless otherwise config

 <div data-lang="scala" markdown="1">

 {% highlight scala %}
-val df = sqlContext.load("people.parquet")
-df.select("name", "age").save("namesAndAges.parquet")
+val df = sqlContext.load("examples/src/main/resources/users.parquet")
+df.select("name", "favorite_color").save("namesAndFavColors.parquet")
 {% endhighlight %}

 </div>

@@ -691,8 +691,8 @@ df.select("name", "age").save("namesAndAges.parquet")

 {% highlight java %}
-DataFrame df = sqlContext.load("people.parquet");
-df.select("name", "age").save("namesAndAges.parquet");
+DataFrame df = sqlContext.load("examples/src/main/resources/users.parquet");
+df.select("name", "favorite_color").save("namesAndFavColors.parquet");
 {% endhighlight %}

@@ -702,8 +702,8 @@ df.select("name", "age").save("namesAndAges.parquet");

 {% highlight python %}
-df = sqlContext.load("people.parquet")
-df.select("name", "age").save("namesAndAges.parquet")
+df = sqlContext.load("examples/src/main/resources/users.parquet")
+df.select("name", "favorite_color").save("namesAndFavColors.parquet")
 {% endhighlight %}

@@ -722,7 +722,7 @@ using this syntax.
 <div data-lang="scala" markdown="1">

 {% highlight scala %}
-val df = sqlContext.load("people.json", "json")
+val df = sqlContext.load("examples/src/main/resources/people.json", "json")
 df.select("name", "age").save("namesAndAges.parquet", "parquet")
 {% endhighlight %}

@@ -732,7 +732,7 @@ df.select("name", "age").save("namesAndAges.parquet", "parquet")

 {% highlight java %}
-DataFrame df = sqlContext.load("people.json", "json");
+DataFrame df = sqlContext.load("examples/src/main/resources/people.json", "json");
 df.select("name", "age").save("namesAndAges.parquet", "parquet");
 {% endhighlight %}

@@ -743,7 +743,7 @@ df.select("name", "age").save("namesAndAges.parquet", "parquet");

 {% highlight python %}
-df = sqlContext.load("people.json", "json")
+df = sqlContext.load("examples/src/main/resources/people.json", "json")
 df.select("name", "age").save("namesAndAges.parquet", "parquet")
 {% endhighlight %}
spark git commit: [SPARK-5894] [ML] Add polynomial mapper
Repository: spark
Updated Branches:
  refs/heads/master 4c722d77a -> 8509519d8

[SPARK-5894] [ML] Add polynomial mapper

See [SPARK-5894](https://issues.apache.org/jira/browse/SPARK-5894).

Author: Xusen Yin <yinxu...@gmail.com>
Author: Xiangrui Meng <m...@databricks.com>

Closes #5245 from yinxusen/SPARK-5894 and squashes the following commits:

dc461a6 [Xusen Yin] merge polynomial expansion v2
6d0c3cc [Xusen Yin] Merge branch 'SPARK-5894' of https://github.com/mengxr/spark into mengxr-SPARK-5894
57bfdd5 [Xusen Yin] Merge branch 'master' into SPARK-5894
3d02a7d [Xusen Yin] Merge branch 'master' into SPARK-5894
a067da2 [Xiangrui Meng] a new approach for poly expansion
0789d81 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into SPARK-5894
4e9aed0 [Xusen Yin] fix test suite
95d8fb9 [Xusen Yin] fix sparse vector indices
8d39674 [Xusen Yin] fix sparse vector expansion error
5998dd6 [Xusen Yin] fix dense vector fillin
fa3ade3 [Xusen Yin] change the functional code into imperative one to speedup
b70e7e1 [Xusen Yin] remove useless case class
6fa236f [Xusen Yin] fix vector slice error
daff601 [Xusen Yin] fix index error of sparse vector
6bd0a10 [Xusen Yin] merge repeated features
419f8a2 [Xusen Yin] need to merge same columns
4ebf34e [Xusen Yin] add test suite of polynomial expansion
372227c [Xusen Yin] add polynomial expansion

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8509519d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8509519d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8509519d

Branch: refs/heads/master
Commit: 8509519d8bcf99e2d1b5e21da514d51357f9116d
Parents: 4c722d7
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Fri Apr 24 00:39:29 2015 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Fri Apr 24 00:39:29 2015 -0700

----------------------------------------------------------------------
 .../spark/ml/feature/PolynomialExpansion.scala | 167 +++
 .../ml/feature/PolynomialExpansionSuite.scala  | 104
 2 files changed, 271 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/8509519d/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
new file mode 100644
index 0000000..c3a59a3
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import scala.collection.mutable
+
+import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml.UnaryTransformer
+import org.apache.spark.ml.param.{IntParam, ParamMap}
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.sql.types.DataType
+
+/**
+ * :: AlphaComponent ::
+ * Perform feature expansion in a polynomial space. As said in wikipedia of Polynomial Expansion,
+ * which is available at [[http://en.wikipedia.org/wiki/Polynomial_expansion]], "In mathematics, an
+ * expansion of a product of sums expresses it as a sum of products by using the fact that
+ * multiplication distributes over addition". Take a 2-variable feature vector as an example:
+ * `(x, y)`, if we want to expand it with degree 2, then we get `(x, y, x * x, x * y, y * y)`.
+ */
+@AlphaComponent
+class PolynomialExpansion extends UnaryTransformer[Vector, Vector, PolynomialExpansion] {
+
+  /**
+   * The polynomial degree to expand, which should be larger than 1.
+   * @group param
+   */
+  val degree = new IntParam(this, "degree", "the polynomial degree to expand")
+  setDefault(degree -> 2)
+
+  /** @group getParam */
+  def getDegree: Int = getOrDefault(degree)
+
+  /** @group setParam */
+  def setDegree(value: Int): this.type = set(degree, value)
+
+  override protected def createTransformFunc(paramMap: ParamMap): Vector => Vector = { v =>
+    val d = paramMap(degree)
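As a UnaryTransformer, the new class plugs straight into the DataFrame API. A hedged usage sketch under the assumptions of this snapshot's API (column names, toy data, and the spark-shell-style `sc` are made up for illustration):

    import org.apache.spark.ml.feature.PolynomialExpansion
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.sql.SQLContext

    // Assumes a live SparkContext `sc`; "features" and "polyFeatures"
    // are hypothetical column names.
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.createDataFrame(Seq(
      Tuple1(Vectors.dense(-2.0, 2.3))
    )).toDF("features")

    val poly = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(2)

    // A 2-variable vector (x, y) expands, up to ordering, to
    // (x, x*x, y, x*y, y*y).
    poly.transform(df).show()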
spark git commit: [SPARK-6852] [SPARKR] Accept numeric as numPartitions in SparkR.
Repository: spark
Updated Branches:
  refs/heads/master ebb77b2af -> caf0136ec

[SPARK-6852] [SPARKR] Accept numeric as numPartitions in SparkR.

Author: Sun Rui <rui@intel.com>

Closes #5613 from sun-rui/SPARK-6852 and squashes the following commits:

abaf02e [Sun Rui] Change the type of default numPartitions from integer to numeric in generics.R.
29d67c1 [Sun Rui] [SPARK-6852][SPARKR] Accept numeric as numPartitions in SparkR.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/caf0136e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/caf0136e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/caf0136e

Branch: refs/heads/master
Commit: caf0136ec5838cf5bf61f39a5b3474a505a6ae11
Parents: ebb77b2
Author: Sun Rui <rui@intel.com>
Authored: Fri Apr 24 12:52:07 2015 -0700
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Fri Apr 24 12:52:07 2015 -0700

----------------------------------------------------------------------
 R/pkg/R/RDD.R      |  2 +-
 R/pkg/R/generics.R | 12 ++--
 R/pkg/R/pairRDD.R  | 24
 3 files changed, 19 insertions(+), 19 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/caf0136e/R/pkg/R/RDD.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index cc09efb..1662d6b 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -967,7 +967,7 @@ setMethod("keyBy",
 setMethod("repartition",
           signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions) {
-            coalesce(x, numToInt(numPartitions), TRUE)
+            coalesce(x, numPartitions, TRUE)
           })

 #' Return a new RDD that is reduced into numPartitions partitions.

http://git-wip-us.apache.org/repos/asf/spark/blob/caf0136e/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 6c62333..34dbe84 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -60,7 +60,7 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") })

 #' @rdname distinct
 #' @export
-setGeneric("distinct", function(x, numPartitions = 1L) { standardGeneric("distinct") })
+setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })

 #' @rdname filterRDD
 #' @export
@@ -182,7 +182,7 @@ setGeneric("setName", function(x, name) { standardGeneric("setName") })
 #' @rdname sortBy
 #' @export
 setGeneric("sortBy",
-           function(x, func, ascending = TRUE, numPartitions = 1L) {
+           function(x, func, ascending = TRUE, numPartitions = 1) {
             standardGeneric("sortBy")
           })

@@ -244,7 +244,7 @@ setGeneric("flatMapValues", function(X, FUN) { standardGeneric("flatMapValues")

 #' @rdname intersection
 #' @export
-setGeneric("intersection", function(x, other, numPartitions = 1L) {
+setGeneric("intersection", function(x, other, numPartitions = 1) {
   standardGeneric("intersection") })

 #' @rdname keys
@@ -346,21 +346,21 @@ setGeneric("rightOuterJoin", function(x, y, numPartitions) { standardGeneric("ri
 #' @rdname sortByKey
 #' @export
 setGeneric("sortByKey",
-           function(x, ascending = TRUE, numPartitions = 1L) {
+           function(x, ascending = TRUE, numPartitions = 1) {
             standardGeneric("sortByKey")
           })

 #' @rdname subtract
 #' @export
 setGeneric("subtract",
-           function(x, other, numPartitions = 1L) {
+           function(x, other, numPartitions = 1) {
             standardGeneric("subtract")
           })

 #' @rdname subtractByKey
 #' @export
 setGeneric("subtractByKey",
-           function(x, other, numPartitions = 1L) {
+           function(x, other, numPartitions = 1) {
             standardGeneric("subtractByKey")
           })

http://git-wip-us.apache.org/repos/asf/spark/blob/caf0136e/R/pkg/R/pairRDD.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R
index f99b474..9791e55 100644
--- a/R/pkg/R/pairRDD.R
+++ b/R/pkg/R/pairRDD.R
@@ -190,7 +190,7 @@
setMethod("flatMapValues",
 #' @rdname partitionBy
 #' @aliases partitionBy,RDD,integer-method
 setMethod("partitionBy",
-          signature(x = "RDD", numPartitions = "integer"),
+          signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions, partitionFunc = hashCode) {

             #if (missing(partitionFunc)) {
@@ -211,7 +211,7 @@ setMethod("partitionBy",
             # the content (key-val pairs).
             pairwiseRRDD <- newJObject("org.apache.spark.api.r.PairwiseRRDD",
                                        callJMethod(jrdd, "rdd"),
-                                       as.integer(numPartitions),
+                                       numToInt(numPartitions),
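The reason a plain R number now works: R numeric literals are doubles, while the JVM-side constructors (like PairwiseRRDD above) expect a strict integer, so SparkR funnels the value through numToInt before crossing the boundary. A hypothetical Scala mirror of that coercion, for illustration only (the real helper is R code in R/pkg/R/utils.R):

    // Illustrative sketch, not SparkR's actual implementation.
    def numToInt(num: Double): Int = {
      if (num != math.floor(num)) {
        // SparkR warns in this case; here we just note it.
        println(s"coercing $num to integer, fractional part discarded")
      }
      num.toInt
    }

    numToInt(2)   // 2 -- what `numPartitions = 2` means in R (a double)
    numToInt(2.7) // 2, with a warning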
spark git commit: [SPARK-6122] [CORE] Upgrade tachyon-client version to 0.6.3
Repository: spark
Updated Branches:
  refs/heads/master caf0136ec -> 438859eb7

[SPARK-6122] [CORE] Upgrade tachyon-client version to 0.6.3

This is a reopening of #4867. A short summary of the issues resolved from the previous PR:

1. HTTPClient version mismatch: Selenium (used for UI tests) requires version 4.3.x, and Tachyon included 4.2.5 through a transitive dependency of its shaded thrift jar. To address this, Tachyon 0.6.3 will promote the transitive dependencies of the shaded jar so they can be excluded in Spark.

2. Jackson-Mapper-ASL version mismatch: In lower versions of hadoop-client (i.e. 1.0.4), version 1.0.1 is included. The parquet library used in Spark SQL requires version 1.8+. It's unclear to me why upgrading tachyon-client would cause this dependency to break. The solution was to exclude jackson-mapper-asl from hadoop-client. It seems that the dependency management in spark-parent will not work on transitive dependencies; one way to make sure jackson-mapper-asl is included with the correct version is to add it as a top-level dependency. The best solution would be to exclude the dependency in the modules which require a higher version, but that did not fix the unit tests. Any suggestions on the best way to solve this would be appreciated!

Author: Calvin Jia <jia.cal...@gmail.com>

Closes #5354 from calvinjia/upgrade_tachyon_0.6.3 and squashes the following commits:

0eefe4d [Calvin Jia] Handle httpclient version in maven dependency management. Remove httpclient version setting from profiles.
7c00dfa [Calvin Jia] Set httpclient version to 4.3.2 for selenium. Specify version of httpclient for sql/hive (previously 4.2.5 transitive dependency of libthrift).
9263097 [Calvin Jia] Merge master to test latest changes
dbfc1bd [Calvin Jia] Use Tachyon 0.6.4 for cleaner dependencies.
e2ff80a [Calvin Jia] Exclude the jetty and curator promoted dependencies from tachyon-client.
a3a29da [Calvin Jia] Update tachyon-client exclusions.
0ae6c97 [Calvin Jia] Change tachyon version to 0.6.3
a204df9 [Calvin Jia] Update make distribution tachyon version.
a93c94f [Calvin Jia] Exclude jackson-mapper-asl from hadoop client since it has a lower version than spark's expected version.
a8a923c [Calvin Jia] Exclude httpcomponents from Tachyon
910fabd [Calvin Jia] Update to master
eed9230 [Calvin Jia] Update tachyon version to 0.6.1.
11907b3 [Calvin Jia] Use TachyonURI for tachyon paths instead of strings.
71bf441 [Calvin Jia] Upgrade Tachyon client version to 0.6.0.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/438859eb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/438859eb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/438859eb

Branch: refs/heads/master
Commit: 438859eb7c4e605bb4041d9a547a16be9c827c75
Parents: caf0136
Author: Calvin Jia <jia.cal...@gmail.com>
Authored: Fri Apr 24 17:57:41 2015 -0400
Committer: Sean Owen <so...@cloudera.com>
Committed: Fri Apr 24 17:57:41 2015 -0400

----------------------------------------------------------------------
 assembly/pom.xml                                    | 10 --
 core/pom.xml                                        |  6 +-
 .../apache/spark/storage/TachyonBlockManager.scala  | 16
 .../main/scala/org/apache/spark/util/Utils.scala    |  4 +++-
 examples/pom.xml                                    |  5 -
 launcher/pom.xml                                    |  6 ++
 make-distribution.sh                                |  2 +-
 pom.xml                                             | 12 +++-
 sql/hive/pom.xml                                    |  5 +
 9 files changed, 39 insertions(+), 27 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/438859eb/assembly/pom.xml
----------------------------------------------------------------------
diff --git a/assembly/pom.xml b/assembly/pom.xml
index f1f8b0d..20593e7 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -213,16 +213,6 @@
         </plugins>
       </build>
     </profile>
-    <profile>
-      <id>kinesis-asl</id>
-      <dependencies>
-        <dependency>
-          <groupId>org.apache.httpcomponents</groupId>
-          <artifactId>httpclient</artifactId>
-          <version>${commons.httpclient.version}</version>
-        </dependency>
-      </dependencies>
-    </profile>
     <!-- Profiles that disable inclusion of certain dependencies. -->
     <profile>

http://git-wip-us.apache.org/repos/asf/spark/blob/438859eb/core/pom.xml
----------------------------------------------------------------------
diff --git a/core/pom.xml b/core/pom.xml
index e80829b..5e89d54 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -74,6 +74,10 @@
         <exclusion>
           <groupId>javax.servlet</groupId>
          <artifactId>servlet-api</artifactId>
        </exclusion>
+        <exclusion>
+          <groupId>org.codehaus.jackson</groupId>
+