spark git commit: [SPARK-7033] [SPARKR] Clean usage of split. Use partition instead where applicable.

2015-04-24 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master 6e57d57b3 -> ebb77b2af


[SPARK-7033] [SPARKR] Clean usage of split. Use partition instead where 
applicable.

Author: Sun Rui rui@intel.com

Closes #5628 from sun-rui/SPARK-7033 and squashes the following commits:

046bc9e [Sun Rui] Clean split usage in tests.
d531c86 [Sun Rui] [SPARK-7033][SPARKR] Clean usage of split. Use partition 
instead where applicable.
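
For readers following along in Scala, here is a minimal sketch of the partition-index
idiom that the renamed arguments (partIndex, part) describe. mapPartitionsWithIndex is
the Scala counterpart of SparkR's lapplyPartitionsWithIndex; the RDD contents below are
illustrative only and mirror the roxygen example updated in the diff.

// Illustrative sketch, not part of the patch: sc is an existing SparkContext.
val rdd = sc.parallelize(1 to 10, 5)
val prod = rdd.mapPartitionsWithIndex { (partIndex, part) =>
  // partIndex is the partition index, part is an iterator over its elements,
  // e.g. partition 1 contributes 1 * (3 + 4) = 7
  Iterator(partIndex * part.sum)
}
prod.collect()  // Array(0, 7, 22, 45, 76)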


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ebb77b2a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ebb77b2a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ebb77b2a

Branch: refs/heads/master
Commit: ebb77b2aff085e71906b5de9d266ded89051af82
Parents: 6e57d57
Author: Sun Rui rui@intel.com
Authored: Fri Apr 24 11:00:19 2015 -0700
Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu
Committed: Fri Apr 24 11:00:19 2015 -0700

--
 R/pkg/R/RDD.R   | 36 ++--
 R/pkg/R/context.R   | 20 ++--
 R/pkg/R/pairRDD.R   |  8 
 R/pkg/R/utils.R |  2 +-
 R/pkg/inst/tests/test_rdd.R | 12 ++--
 5 files changed, 39 insertions(+), 39 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ebb77b2a/R/pkg/R/RDD.R
--
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index 1284313..cc09efb 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -91,8 +91,8 @@ setMethod("initialize", "PipelinedRDD", function(.Object, prev, func, jrdd_val)
 # NOTE: We use prev_serializedMode to track the serialization mode of 
prev_JRDD
 # prev_serializedMode is used during the delayed computation of JRDD in 
getJRDD
   } else {
-    pipelinedFunc <- function(split, iterator) {
-      func(split, prev@func(split, iterator))
+    pipelinedFunc <- function(partIndex, part) {
+      func(partIndex, prev@func(partIndex, part))
     }
     .Object@func <- cleanClosure(pipelinedFunc)
     .Object@prev_jrdd <- prev@prev_jrdd # maintain the pipeline
@@ -306,7 +306,7 @@ setMethod("numPartitions",
           signature(x = "RDD"),
           function(x) {
             jrdd <- getJRDD(x)
-            partitions <- callJMethod(jrdd, "splits")
+            partitions <- callJMethod(jrdd, "partitions")
             callJMethod(partitions, "size")
           })
 
@@ -452,8 +452,8 @@ setMethod("countByValue",
 setMethod("lapply",
           signature(X = "RDD", FUN = "function"),
           function(X, FUN) {
-            func <- function(split, iterator) {
-              lapply(iterator, FUN)
+            func <- function(partIndex, part) {
+              lapply(part, FUN)
             }
             lapplyPartitionsWithIndex(X, func)
           })
@@ -538,8 +538,8 @@ setMethod("mapPartitions",
 #'\dontrun{
 #' sc <- sparkR.init()
 #' rdd <- parallelize(sc, 1:10, 5L)
-#' prod <- lapplyPartitionsWithIndex(rdd, function(split, part) {
-#'                                          split * Reduce("+", part) })
+#' prod <- lapplyPartitionsWithIndex(rdd, function(partIndex, part) {
+#'                                          partIndex * Reduce("+", part) })
 #' collect(prod, flatten = FALSE) # 0, 7, 22, 45, 76
 #'}
 #' @rdname lapplyPartitionsWithIndex
@@ -813,7 +813,7 @@ setMethod("distinct",
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
-#' rdd <- parallelize(sc, 1:10) # ensure each num is in its own split
+#' rdd <- parallelize(sc, 1:10)
 #' collect(sampleRDD(rdd, FALSE, 0.5, 1618L)) # ~5 distinct elements
 #' collect(sampleRDD(rdd, TRUE, 0.5, 9L)) # ~5 elements possibly with duplicates
 #'}
@@ -825,14 +825,14 @@ setMethod("sampleRDD",
           function(x, withReplacement, fraction, seed) {
 
             # The sampler: takes a partition and returns its sampled version.
-            samplingFunc <- function(split, part) {
+            samplingFunc <- function(partIndex, part) {
               set.seed(seed)
               res <- vector("list", length(part))
               len <- 0
 
   # Discards some random values to ensure each partition has a
   # different random seed.
-  runif(split)
+  runif(partIndex)
 
   for (elem in part) {
 if (withReplacement) {
@@ -989,8 +989,8 @@ setMethod("coalesce",
           function(x, numPartitions, shuffle = FALSE) {
             numPartitions <- numToInt(numPartitions)
             if (shuffle || numPartitions > SparkR::numPartitions(x)) {
-              func <- function(s, part) {
-                set.seed(s)  # split as seed
+              func <- function(partIndex, part) {
+                set.seed(partIndex)  # partIndex as seed
                 start <- as.integer(sample(numPartitions, 1) - 1)
                 lapply(seq_along(part),
                        function(i)

spark git commit: [SPARK-7115] [MLLIB] skip the very first 1 in poly expansion

2015-04-24 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master 8509519d8 -> 78b39c7e0


[SPARK-7115] [MLLIB] skip the very first 1 in poly expansion

yinxusen

Author: Xiangrui Meng m...@databricks.com

Closes #5681 from mengxr/SPARK-7115 and squashes the following commits:

9ac27cd [Xiangrui Meng] skip the very first 1 in poly expansion
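
A hedged illustration of what changes: before this patch, a degree-2 expansion of a
2-element vector carried a leading constant 1; afterwards that term is dropped. The
helper below is a simplified stand-in for the real expand logic, shown only to make
the updated test expectations concrete.

// Simplified, illustrative only: degree-2 expansion of a 2-element dense vector.
// The real PolynomialExpansion handles arbitrary vector sizes and degrees.
def expandDegree2(x: Double, y: Double): Array[Double] =
  Array(x, x * x, y, x * y, y * y)   // previously: Array(1.0, x, x * x, y, x * y, y * y)

expandDegree2(-2.0, 2.3)  // Array(-2.0, 4.0, 2.3, -4.6, 5.29), matching the updated test below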


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/78b39c7e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/78b39c7e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/78b39c7e

Branch: refs/heads/master
Commit: 78b39c7e0de8c9dc748cfbf8f78578a9524b6a94
Parents: 8509519
Author: Xiangrui Meng m...@databricks.com
Authored: Fri Apr 24 08:27:48 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Fri Apr 24 08:27:48 2015 -0700

--
 .../spark/ml/feature/PolynomialExpansion.scala  | 22 
 .../ml/feature/PolynomialExpansionSuite.scala   | 22 ++--
 2 files changed, 24 insertions(+), 20 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/78b39c7e/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
index c3a59a3..d855f04 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -87,7 +87,9 @@ object PolynomialExpansion {
     if (multiplier == 0.0) {
       // do nothing
     } else if (degree == 0 || lastIdx < 0) {
-      polyValues(curPolyIdx) = multiplier
+      if (curPolyIdx >= 0) { // skip the very first 1
+        polyValues(curPolyIdx) = multiplier
+      }
     } else {
       val v = values(lastIdx)
       val lastIdx1 = lastIdx - 1
@@ -116,8 +118,10 @@ object PolynomialExpansion {
     if (multiplier == 0.0) {
       // do nothing
     } else if (degree == 0 || lastIdx < 0) {
-      polyIndices += curPolyIdx
-      polyValues += multiplier
+      if (curPolyIdx >= 0) { // skip the very first 1
+        polyIndices += curPolyIdx
+        polyValues += multiplier
+      }
     } else {
       // Skip all zeros at the tail.
       val v = values(lastIdx)
@@ -139,8 +143,8 @@ object PolynomialExpansion {
   private def expand(dv: DenseVector, degree: Int): DenseVector = {
 val n = dv.size
 val polySize = getPolySize(n, degree)
-val polyValues = new Array[Double](polySize)
-expandDense(dv.values, n - 1, degree, 1.0, polyValues, 0)
+val polyValues = new Array[Double](polySize - 1)
+expandDense(dv.values, n - 1, degree, 1.0, polyValues, -1)
 new DenseVector(polyValues)
   }
 
@@ -149,12 +153,12 @@ object PolynomialExpansion {
 val nnz = sv.values.length
 val nnzPolySize = getPolySize(nnz, degree)
 val polyIndices = mutable.ArrayBuilder.make[Int]
-polyIndices.sizeHint(nnzPolySize)
+polyIndices.sizeHint(nnzPolySize - 1)
 val polyValues = mutable.ArrayBuilder.make[Double]
-polyValues.sizeHint(nnzPolySize)
+polyValues.sizeHint(nnzPolySize - 1)
 expandSparse(
-  sv.indices, sv.values, nnz - 1, sv.size - 1, degree, 1.0, polyIndices, 
polyValues, 0)
-new SparseVector(polySize, polyIndices.result(), polyValues.result())
+  sv.indices, sv.values, nnz - 1, sv.size - 1, degree, 1.0, polyIndices, 
polyValues, -1)
+new SparseVector(polySize - 1, polyIndices.result(), polyValues.result())
   }
 
   def expand(v: Vector, degree: Int): Vector = {

http://git-wip-us.apache.org/repos/asf/spark/blob/78b39c7e/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
index b0a537b..c1d64fb 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/feature/PolynomialExpansionSuite.scala
@@ -44,11 +44,11 @@ class PolynomialExpansionSuite extends FunSuite with 
MLlibTestSparkContext {
 )
 
 val twoDegreeExpansion: Array[Vector] = Array(
-      Vectors.sparse(10, Array(0, 1, 2, 3, 4, 5), Array(1.0, -2.0, 4.0, 2.3, -4.6, 5.29)),
-  Vectors.dense(1.0, -2.0, 4.0, 2.3, -4.6, 5.29),
-  Vectors.dense(Array(1.0) ++ Array.fill[Double](9)(0.0)),
-  Vectors.dense(1.0, 0.6, 0.36, -1.1, -0.66, 1.21, -3.0, -1.8, 3.3, 9.0),
-  Vectors.sparse(10, Array(0), Array(1.0)))
+  Vectors.sparse(9, Array(0, 1, 2, 3, 4), Array(-2.0, 4.0, 

spark git commit: [SPARK-6528] [ML] Add IDF transformer

2015-04-24 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master 78b39c7e0 -> 6e57d57b3


[SPARK-6528] [ML] Add IDF transformer

See [SPARK-6528](https://issues.apache.org/jira/browse/SPARK-6528). Add IDF 
transformer in ML package.

Author: Xusen Yin yinxu...@gmail.com

Closes #5266 from yinxusen/SPARK-6528 and squashes the following commits:

741db31 [Xusen Yin] get param from new paramMap
d169967 [Xusen Yin] add final to param and IDF class
c9c3759 [Xusen Yin] simplify test suite
5867c09 [Xusen Yin] refine IDF transformer with new interfaces
7727cae [Xusen Yin] Merge branch 'master' into SPARK-6528
4338a37 [Xusen Yin] Merge branch 'master' into SPARK-6528
aef2cdf [Xusen Yin] add doc and group for param
5760b49 [Xusen Yin] fix code style
2add691 [Xusen Yin] fix code style and test
03fbecb [Xusen Yin] remove duplicated code
2aa4be0 [Xusen Yin] clean test suite
4802c67 [Xusen Yin] add IDF transformer and test suite
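
A minimal usage sketch of the new estimator: the DataFrame and column names are
assumptions made for illustration, and the parameter-map style follows the fit
signature visible in the diff below (IDFModel.transform is assumed to take the same
standard (dataset, paramMap) form).

// Illustrative only; termFreqDF is assumed to hold a Vector column "tf" of term frequencies.
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.param.ParamMap

val idf = new IDF()
  .setInputCol("tf")
  .setOutputCol("tfidf")
  .setMinDocFreq(1)                                     // keep terms that appear in at least one document

val idfModel = idf.fit(termFreqDF, ParamMap.empty)      // estimate IDF weights from the corpus
val rescaled = idfModel.transform(termFreqDF, ParamMap.empty)  // append the "tfidf" column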


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6e57d57b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6e57d57b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6e57d57b

Branch: refs/heads/master
Commit: 6e57d57b32ba2aa0514692074897b5edd34e0dd6
Parents: 78b39c7
Author: Xusen Yin yinxu...@gmail.com
Authored: Fri Apr 24 08:29:49 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Fri Apr 24 08:29:49 2015 -0700

--
 .../scala/org/apache/spark/ml/feature/IDF.scala | 116 +++
 .../org/apache/spark/ml/feature/IDFSuite.scala  | 101 
 2 files changed, 217 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/6e57d57b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
new file mode 100644
index 000..e6a62d9
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml._
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util.SchemaUtils
+import org.apache.spark.mllib.feature
+import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
+import org.apache.spark.sql._
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Params for [[IDF]] and [[IDFModel]].
+ */
+private[feature] trait IDFBase extends Params with HasInputCol with 
HasOutputCol {
+
+  /**
+   * The minimum of documents in which a term should appear.
+   * @group param
+   */
+  final val minDocFreq = new IntParam(
+    this, "minDocFreq", "minimum of documents in which a term should appear for filtering")
+
+  setDefault(minDocFreq -> 0)
+
+  /** @group getParam */
+  def getMinDocFreq: Int = getOrDefault(minDocFreq)
+
+  /** @group setParam */
+  def setMinDocFreq(value: Int): this.type = set(minDocFreq, value)
+
+  /**
+   * Validate and transform the input schema.
+   */
+  protected def validateAndTransformSchema(schema: StructType, paramMap: 
ParamMap): StructType = {
+val map = extractParamMap(paramMap)
+SchemaUtils.checkColumnType(schema, map(inputCol), new VectorUDT)
+SchemaUtils.appendColumn(schema, map(outputCol), new VectorUDT)
+  }
+}
+
+/**
+ * :: AlphaComponent ::
+ * Compute the Inverse Document Frequency (IDF) given a collection of 
documents.
+ */
+@AlphaComponent
+final class IDF extends Estimator[IDFModel] with IDFBase {
+
+  /** @group setParam */
+  def setInputCol(value: String): this.type = set(inputCol, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  override def fit(dataset: DataFrame, paramMap: ParamMap): IDFModel = {
+transformSchema(dataset.schema, paramMap, logging = true)
+val map = 

spark git commit: [PySpark][Minor] Update sql example, so that can read file correctly

2015-04-24 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master 438859eb7 -> d874f8b54


[PySpark][Minor] Update sql example, so that can read file correctly

When running this example, Spark will by default read the file from HDFS unless a scheme (e.g. file://) is given in the path.

Author: linweizhong linweizh...@huawei.com

Closes #5684 from Sephiroth-Lin/pyspark_example_minor and squashes the 
following commits:

19fe145 [linweizhong] Update example sql.py, so that can read file correctly


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d874f8b5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d874f8b5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d874f8b5

Branch: refs/heads/master
Commit: d874f8b546d8fae95bc92d8461b8189e51cb731b
Parents: 438859e
Author: linweizhong linweizh...@huawei.com
Authored: Fri Apr 24 20:23:19 2015 -0700
Committer: Reynold Xin r...@databricks.com
Committed: Fri Apr 24 20:23:19 2015 -0700

--
 examples/src/main/python/sql.py | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d874f8b5/examples/src/main/python/sql.py
--
diff --git a/examples/src/main/python/sql.py b/examples/src/main/python/sql.py
index 87d7b08..2c18875 100644
--- a/examples/src/main/python/sql.py
+++ b/examples/src/main/python/sql.py
@@ -18,6 +18,7 @@
 from __future__ import print_function
 
 import os
+import sys
 
 from pyspark import SparkContext
 from pyspark.sql import SQLContext
@@ -50,7 +51,11 @@ if __name__ == "__main__":
 
     # A JSON dataset is pointed to by path.
    # The path can be either a single text file or a directory storing text files.
-    path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
+    if len(sys.argv) < 2:
+        path = "file://" + \
+            os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json")
+    else:
+        path = sys.argv[1]
 # Create a DataFrame from the file(s) pointed to by path
 people = sqlContext.jsonFile(path)
 # root




spark git commit: [SPARK-7136][Docs] Spark SQL and DataFrame Guide fix example file and paths

2015-04-24 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master d874f8b54 -> 59b7cfc41


[SPARK-7136][Docs] Spark SQL and DataFrame Guide fix example file and paths

Changes the example file for Generic Load/Save Functions to users.parquet rather 
than people.parquet, which doesn't exist unless a later example has already been 
executed. Also adds file paths.

Author: Deborah Siegel deborah.sie...@gmail.com
Author: DEBORAH SIEGEL deborahsie...@d-140-142-0-49.dhcp4.washington.edu
Author: DEBORAH SIEGEL deborahsiegel@DEBORAHs-MacBook-Pro.local
Author: DEBORAH SIEGEL deborahsie...@d-69-91-154-197.dhcp4.washington.edu

Closes #5693 from d3borah/master and squashes the following commits:

4d5e43b [Deborah Siegel] sparkSQL doc change
b15a497 [Deborah Siegel] Revert sparkSQL doc change
5a2863c [DEBORAH SIEGEL] Merge remote-tracking branch 'upstream/master'
91972fc [DEBORAH SIEGEL] sparkSQL doc change
f000e59 [DEBORAH SIEGEL] Merge remote-tracking branch 'upstream/master'
db54173 [DEBORAH SIEGEL] fixed aggregateMessages example in graphX doc


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/59b7cfc4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/59b7cfc4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/59b7cfc4

Branch: refs/heads/master
Commit: 59b7cfc41b2c06fbfbf6aca16c1619496a8d1d00
Parents: d874f8b
Author: Deborah Siegel deborah.sie...@gmail.com
Authored: Fri Apr 24 20:25:07 2015 -0700
Committer: Reynold Xin r...@databricks.com
Committed: Fri Apr 24 20:25:07 2015 -0700

--
 docs/sql-programming-guide.md | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/59b7cfc4/docs/sql-programming-guide.md
--
diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index 49b1e69..b8233ae 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -681,8 +681,8 @@ In the simplest form, the default data source (`parquet` 
unless otherwise config
 <div data-lang="scala"  markdown="1">
 
 {% highlight scala %}
-val df = sqlContext.load("people.parquet")
-df.select("name", "age").save("namesAndAges.parquet")
+val df = sqlContext.load("examples/src/main/resources/users.parquet")
+df.select("name", "favorite_color").save("namesAndFavColors.parquet")
 {% endhighlight %}
 
 </div>
@@ -691,8 +691,8 @@ df.select("name", "age").save("namesAndAges.parquet")
 
 {% highlight java %}
 
-DataFrame df = sqlContext.load("people.parquet");
-df.select("name", "age").save("namesAndAges.parquet");
+DataFrame df = sqlContext.load("examples/src/main/resources/users.parquet");
+df.select("name", "favorite_color").save("namesAndFavColors.parquet");
 
 {% endhighlight %}
 
@@ -702,8 +702,8 @@ df.select("name", "age").save("namesAndAges.parquet");
 
 {% highlight python %}
 
-df = sqlContext.load("people.parquet")
-df.select("name", "age").save("namesAndAges.parquet")
+df = sqlContext.load("examples/src/main/resources/users.parquet")
+df.select("name", "favorite_color").save("namesAndFavColors.parquet")
 
 {% endhighlight %}
 
@@ -722,7 +722,7 @@ using this syntax.
 <div data-lang="scala"  markdown="1">
 
 {% highlight scala %}
-val df = sqlContext.load("people.json", "json")
+val df = sqlContext.load("examples/src/main/resources/people.json", "json")
 df.select("name", "age").save("namesAndAges.parquet", "parquet")
 {% endhighlight %}
 
@@ -732,7 +732,7 @@ df.select("name", "age").save("namesAndAges.parquet", "parquet")
 
 {% highlight java %}
 
-DataFrame df = sqlContext.load("people.json", "json");
+DataFrame df = sqlContext.load("examples/src/main/resources/people.json", "json");
 df.select("name", "age").save("namesAndAges.parquet", "parquet");
 
 {% endhighlight %}
@@ -743,7 +743,7 @@ df.select("name", "age").save("namesAndAges.parquet", "parquet");
 
 {% highlight python %}
 
-df = sqlContext.load("people.json", "json")
+df = sqlContext.load("examples/src/main/resources/people.json", "json")
 df.select("name", "age").save("namesAndAges.parquet", "parquet")
 
 {% endhighlight %}




spark git commit: [SPARK-5894] [ML] Add polynomial mapper

2015-04-24 Thread meng
Repository: spark
Updated Branches:
  refs/heads/master 4c722d77a -> 8509519d8


[SPARK-5894] [ML] Add polynomial mapper

See [SPARK-5894](https://issues.apache.org/jira/browse/SPARK-5894).

Author: Xusen Yin yinxu...@gmail.com
Author: Xiangrui Meng m...@databricks.com

Closes #5245 from yinxusen/SPARK-5894 and squashes the following commits:

dc461a6 [Xusen Yin] merge polynomial expansion v2
6d0c3cc [Xusen Yin] Merge branch 'SPARK-5894' of 
https://github.com/mengxr/spark into mengxr-SPARK-5894
57bfdd5 [Xusen Yin] Merge branch 'master' into SPARK-5894
3d02a7d [Xusen Yin] Merge branch 'master' into SPARK-5894
a067da2 [Xiangrui Meng] a new approach for poly expansion
0789d81 [Xiangrui Meng] Merge remote-tracking branch 'apache/master' into 
SPARK-5894
4e9aed0 [Xusen Yin] fix test suite
95d8fb9 [Xusen Yin] fix sparse vector indices
8d39674 [Xusen Yin] fix sparse vector expansion error
5998dd6 [Xusen Yin] fix dense vector fillin
fa3ade3 [Xusen Yin] change the functional code into imperative one to speedup
b70e7e1 [Xusen Yin] remove useless case class
6fa236f [Xusen Yin] fix vector slice error
daff601 [Xusen Yin] fix index error of sparse vector
6bd0a10 [Xusen Yin] merge repeated features
419f8a2 [Xusen Yin] need to merge same columns
4ebf34e [Xusen Yin] add test suite of polynomial expansion
372227c [Xusen Yin] add polynomial expansion
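
A hedged usage sketch of the new transformer: the input DataFrame and column names
below are assumed for illustration and are not part of the patch; the expansion shown
in the comment follows the class scaladoc in the diff below.

// Illustrative only; featuresDF is assumed to hold a Vector column named "features".
import org.apache.spark.ml.feature.PolynomialExpansion
import org.apache.spark.ml.param.ParamMap

val polyExpander = new PolynomialExpansion()
  .setInputCol("features")
  .setOutputCol("polyFeatures")
  .setDegree(2)            // expand (x, y) with degree 2, per the class scaladoc below

val expanded = polyExpander.transform(featuresDF, ParamMap.empty)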


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8509519d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8509519d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8509519d

Branch: refs/heads/master
Commit: 8509519d8bcf99e2d1b5e21da514d51357f9116d
Parents: 4c722d7
Author: Xusen Yin yinxu...@gmail.com
Authored: Fri Apr 24 00:39:29 2015 -0700
Committer: Xiangrui Meng m...@databricks.com
Committed: Fri Apr 24 00:39:29 2015 -0700

--
 .../spark/ml/feature/PolynomialExpansion.scala  | 167 +++
 .../ml/feature/PolynomialExpansionSuite.scala   | 104 
 2 files changed, 271 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8509519d/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
new file mode 100644
index 000..c3a59a3
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PolynomialExpansion.scala
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.feature
+
+import scala.collection.mutable
+
+import org.apache.spark.annotation.AlphaComponent
+import org.apache.spark.ml.UnaryTransformer
+import org.apache.spark.ml.param.{IntParam, ParamMap}
+import org.apache.spark.mllib.linalg._
+import org.apache.spark.sql.types.DataType
+
+/**
+ * :: AlphaComponent ::
+ * Perform feature expansion in a polynomial space. As said in wikipedia of 
Polynomial Expansion,
+ * which is available at 
[[http://en.wikipedia.org/wiki/Polynomial_expansion]], In mathematics, an
+ * expansion of a product of sums expresses it as a sum of products by using 
the fact that
+ * multiplication distributes over addition. Take a 2-variable feature vector 
as an example:
+ * `(x, y)`, if we want to expand it with degree 2, then we get `(x, y, x * x, 
x * y, y * y)`.
+ */
+@AlphaComponent
+class PolynomialExpansion extends UnaryTransformer[Vector, Vector, 
PolynomialExpansion] {
+
+  /**
+   * The polynomial degree to expand, which should be larger than 1.
+   * @group param
+   */
+  val degree = new IntParam(this, "degree", "the polynomial degree to expand")
+  setDefault(degree -> 2)
+
+  /** @group getParam */
+  def getDegree: Int = getOrDefault(degree)
+
+  /** @group setParam */
+  def setDegree(value: Int): this.type = set(degree, value)
+
+  override protected def createTransformFunc(paramMap: ParamMap): Vector => Vector = { v =>
+val d = paramMap(degree)
+

spark git commit: [SPARK-6852] [SPARKR] Accept numeric as numPartitions in SparkR.

2015-04-24 Thread shivaram
Repository: spark
Updated Branches:
  refs/heads/master ebb77b2af -> caf0136ec


[SPARK-6852] [SPARKR] Accept numeric as numPartitions in SparkR.

Author: Sun Rui rui@intel.com

Closes #5613 from sun-rui/SPARK-6852 and squashes the following commits:

abaf02e [Sun Rui] Change the type of default numPartitions from integer to 
numeric in generics.R.
29d67c1 [Sun Rui] [SPARK-6852][SPARKR] Accept numeric as numPartitions in 
SparkR.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/caf0136e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/caf0136e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/caf0136e

Branch: refs/heads/master
Commit: caf0136ec5838cf5bf61f39a5b3474a505a6ae11
Parents: ebb77b2
Author: Sun Rui rui@intel.com
Authored: Fri Apr 24 12:52:07 2015 -0700
Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu
Committed: Fri Apr 24 12:52:07 2015 -0700

--
 R/pkg/R/RDD.R  |  2 +-
 R/pkg/R/generics.R | 12 ++--
 R/pkg/R/pairRDD.R  | 24 
 3 files changed, 19 insertions(+), 19 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/caf0136e/R/pkg/R/RDD.R
--
diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R
index cc09efb..1662d6b 100644
--- a/R/pkg/R/RDD.R
+++ b/R/pkg/R/RDD.R
@@ -967,7 +967,7 @@ setMethod("keyBy",
 setMethod("repartition",
           signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions) {
-            coalesce(x, numToInt(numPartitions), TRUE)
+            coalesce(x, numPartitions, TRUE)
           })
 
 #' Return a new RDD that is reduced into numPartitions partitions.

http://git-wip-us.apache.org/repos/asf/spark/blob/caf0136e/R/pkg/R/generics.R
--
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index 6c62333..34dbe84 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -60,7 +60,7 @@ setGeneric("countByValue", function(x) { standardGeneric("countByValue") })
 
 #' @rdname distinct
 #' @export
-setGeneric("distinct", function(x, numPartitions = 1L) { standardGeneric("distinct") })
+setGeneric("distinct", function(x, numPartitions = 1) { standardGeneric("distinct") })
 
 #' @rdname filterRDD
 #' @export
@@ -182,7 +182,7 @@ setGeneric("setName", function(x, name) { standardGeneric("setName") })
 #' @rdname sortBy
 #' @export
 setGeneric("sortBy",
-           function(x, func, ascending = TRUE, numPartitions = 1L) {
+           function(x, func, ascending = TRUE, numPartitions = 1) {
             standardGeneric("sortBy")
           })
 
@@ -244,7 +244,7 @@ setGeneric("flatMapValues", function(X, FUN) { standardGeneric("flatMapValues")
 
 #' @rdname intersection
 #' @export
-setGeneric("intersection", function(x, other, numPartitions = 1L) {
+setGeneric("intersection", function(x, other, numPartitions = 1) {
             standardGeneric("intersection") })
 
 #' @rdname keys
@@ -346,21 +346,21 @@ setGeneric("rightOuterJoin", function(x, y, numPartitions) { standardGeneric("ri
 #' @rdname sortByKey
 #' @export
 setGeneric("sortByKey",
-           function(x, ascending = TRUE, numPartitions = 1L) {
+           function(x, ascending = TRUE, numPartitions = 1) {
             standardGeneric("sortByKey")
           })
 
 #' @rdname subtract
 #' @export
 setGeneric("subtract",
-           function(x, other, numPartitions = 1L) {
+           function(x, other, numPartitions = 1) {
             standardGeneric("subtract")
           })
 
 #' @rdname subtractByKey
 #' @export
 setGeneric("subtractByKey",
-           function(x, other, numPartitions = 1L) {
+           function(x, other, numPartitions = 1) {
             standardGeneric("subtractByKey")
           })
 

http://git-wip-us.apache.org/repos/asf/spark/blob/caf0136e/R/pkg/R/pairRDD.R
--
diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R
index f99b474..9791e55 100644
--- a/R/pkg/R/pairRDD.R
+++ b/R/pkg/R/pairRDD.R
@@ -190,7 +190,7 @@ setMethod("flatMapValues",
 #' @rdname partitionBy
 #' @aliases partitionBy,RDD,integer-method
 setMethod("partitionBy",
-          signature(x = "RDD", numPartitions = "integer"),
+          signature(x = "RDD", numPartitions = "numeric"),
           function(x, numPartitions, partitionFunc = hashCode) {
 
             #if (missing(partitionFunc)) {
@@ -211,7 +211,7 @@ setMethod("partitionBy",
             # the content (key-val pairs).
             pairwiseRRDD <- newJObject("org.apache.spark.api.r.PairwiseRRDD",
                                        callJMethod(jrdd, "rdd"),
-                                       as.integer(numPartitions),
+                                       numToInt(numPartitions),

spark git commit: [SPARK-6122] [CORE] Upgrade tachyon-client version to 0.6.3

2015-04-24 Thread srowen
Repository: spark
Updated Branches:
  refs/heads/master caf0136ec -> 438859eb7


[SPARK-6122] [CORE] Upgrade tachyon-client version to 0.6.3

This is a reopening of #4867.
A short summary of the issues resolved from the previous PR:

1. HTTPClient version mismatch: Selenium (used for UI tests) requires version 
4.3.x, and Tachyon included 4.2.5 through a transitive dependency of its shaded 
thrift jar. To address this, Tachyon 0.6.3 will promote the transitive 
dependencies of the shaded jar so they can be excluded in Spark.

2. Jackson-Mapper-ASL version mismatch: In lower versions of hadoop-client (i.e. 
1.0.4), version 1.0.1 is included. The parquet library used in Spark SQL 
requires version 1.8+. It's unclear to me why upgrading tachyon-client would 
cause this dependency to break. The solution was to exclude jackson-mapper-asl 
from hadoop-client.

It seems that the dependency management in spark-parent will not work on 
transitive dependencies. One way to make sure jackson-mapper-asl is included 
with the correct version is to add it as a top-level dependency. The best 
solution would be to exclude the dependency in the modules which require a 
higher version, but that did not fix the unit tests. Any suggestions on the 
best way to solve this would be appreciated!

Author: Calvin Jia jia.cal...@gmail.com

Closes #5354 from calvinjia/upgrade_tachyon_0.6.3 and squashes the following 
commits:

0eefe4d [Calvin Jia] Handle httpclient version in maven dependency management. 
Remove httpclient version setting from profiles.
7c00dfa [Calvin Jia] Set httpclient version to 4.3.2 for selenium. Specify 
version of httpclient for sql/hive (previously 4.2.5 transitive dependency of 
libthrift).
9263097 [Calvin Jia] Merge master to test latest changes
dbfc1bd [Calvin Jia] Use Tachyon 0.6.4 for cleaner dependencies.
e2ff80a [Calvin Jia] Exclude the jetty and curator promoted dependencies from 
tachyon-client.
a3a29da [Calvin Jia] Update tachyon-client exclusions.
0ae6c97 [Calvin Jia] Change tachyon version to 0.6.3
a204df9 [Calvin Jia] Update make distribution tachyon version.
a93c94f [Calvin Jia] Exclude jackson-mapper-asl from hadoop client since it has 
a lower version than spark's expected version.
a8a923c [Calvin Jia] Exclude httpcomponents from Tachyon
910fabd [Calvin Jia] Update to master
eed9230 [Calvin Jia] Update tachyon version to 0.6.1.
11907b3 [Calvin Jia] Use TachyonURI for tachyon paths instead of strings.
71bf441 [Calvin Jia] Upgrade Tachyon client version to 0.6.0.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/438859eb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/438859eb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/438859eb

Branch: refs/heads/master
Commit: 438859eb7c4e605bb4041d9a547a16be9c827c75
Parents: caf0136
Author: Calvin Jia jia.cal...@gmail.com
Authored: Fri Apr 24 17:57:41 2015 -0400
Committer: Sean Owen so...@cloudera.com
Committed: Fri Apr 24 17:57:41 2015 -0400

--
 assembly/pom.xml| 10 --
 core/pom.xml|  6 +-
 .../apache/spark/storage/TachyonBlockManager.scala  | 16 
 .../main/scala/org/apache/spark/util/Utils.scala|  4 +++-
 examples/pom.xml|  5 -
 launcher/pom.xml|  6 ++
 make-distribution.sh|  2 +-
 pom.xml | 12 +++-
 sql/hive/pom.xml|  5 +
 9 files changed, 39 insertions(+), 27 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/438859eb/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index f1f8b0d..20593e7 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -213,16 +213,6 @@
       </plugins>
     </build>
   </profile>
-  <profile>
-    <id>kinesis-asl</id>
-    <dependencies>
-      <dependency>
-        <groupId>org.apache.httpcomponents</groupId>
-        <artifactId>httpclient</artifactId>
-        <version>${commons.httpclient.version}</version>
-      </dependency>
-    </dependencies>
-  </profile>
 
   <!-- Profiles that disable inclusion of certain dependencies. -->
   <profile>

http://git-wip-us.apache.org/repos/asf/spark/blob/438859eb/core/pom.xml
--
diff --git a/core/pom.xml b/core/pom.xml
index e80829b..5e89d54 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -74,6 +74,10 @@
          <groupId>javax.servlet</groupId>
          <artifactId>servlet-api</artifactId>
        </exclusion>
+       <exclusion>
+         <groupId>org.codehaus.jackson</groupId>
+