spark git commit: [SPARK-10238] [MLLIB] update since versions in mllib.linalg
Repository: spark Updated Branches: refs/heads/master 8668ead2e - ab431f8a9 [SPARK-10238] [MLLIB] update since versions in mllib.linalg Same as #8421 but for `mllib.linalg`. cc dbtsai Author: Xiangrui Meng m...@databricks.com Closes #8440 from mengxr/SPARK-10238 and squashes the following commits: b38437e [Xiangrui Meng] update since versions in mllib.linalg Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab431f8a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab431f8a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab431f8a Branch: refs/heads/master Commit: ab431f8a970b85fba34ccb506c0f8815e55c63bf Parents: 8668ead Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 20:07:56 2015 -0700 Committer: DB Tsai d...@netflix.com Committed: Tue Aug 25 20:07:56 2015 -0700 -- .../apache/spark/mllib/linalg/Matrices.scala| 44 +--- .../linalg/SingularValueDecomposition.scala | 1 + .../org/apache/spark/mllib/linalg/Vectors.scala | 25 --- .../mllib/linalg/distributed/BlockMatrix.scala | 10 +++-- .../linalg/distributed/CoordinateMatrix.scala | 4 +- .../linalg/distributed/DistributedMatrix.scala | 2 + .../linalg/distributed/IndexedRowMatrix.scala | 4 +- .../mllib/linalg/distributed/RowMatrix.scala| 5 ++- 8 files changed, 64 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ab431f8a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 28b5b46..c02ba42 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -32,18 +32,23 @@ import org.apache.spark.sql.types._ * Trait for a local matrix. */ @SQLUserDefinedType(udt = classOf[MatrixUDT]) +@Since(1.0.0) sealed trait Matrix extends Serializable { /** Number of rows. */ + @Since(1.0.0) def numRows: Int /** Number of columns. */ + @Since(1.0.0) def numCols: Int /** Flag that keeps track whether the matrix is transposed or not. False by default. */ + @Since(1.3.0) val isTransposed: Boolean = false /** Converts to a dense array in column major. */ + @Since(1.0.0) def toArray: Array[Double] = { val newArray = new Array[Double](numRows * numCols) foreachActive { (i, j, v) = @@ -56,6 +61,7 @@ sealed trait Matrix extends Serializable { private[mllib] def toBreeze: BM[Double] /** Gets the (i, j)-th element. */ + @Since(1.3.0) def apply(i: Int, j: Int): Double /** Return the index for the (i, j)-th element in the backing array. */ @@ -65,12 +71,15 @@ sealed trait Matrix extends Serializable { private[mllib] def update(i: Int, j: Int, v: Double): Unit /** Get a deep copy of the matrix. */ + @Since(1.2.0) def copy: Matrix /** Transpose the Matrix. Returns a new `Matrix` instance sharing the same underlying data. */ + @Since(1.3.0) def transpose: Matrix /** Convenience method for `Matrix`-`DenseMatrix` multiplication. */ + @Since(1.2.0) def multiply(y: DenseMatrix): DenseMatrix = { val C: DenseMatrix = DenseMatrix.zeros(numRows, y.numCols) BLAS.gemm(1.0, this, y, 0.0, C) @@ -78,11 +87,13 @@ sealed trait Matrix extends Serializable { } /** Convenience method for `Matrix`-`DenseVector` multiplication. For binary compatibility. */ + @Since(1.2.0) def multiply(y: DenseVector): DenseVector = { multiply(y.asInstanceOf[Vector]) } /** Convenience method for `Matrix`-`Vector` multiplication. 
*/ + @Since(1.4.0) def multiply(y: Vector): DenseVector = { val output = new DenseVector(new Array[Double](numRows)) BLAS.gemv(1.0, this, y, 0.0, output) @@ -93,6 +104,7 @@ sealed trait Matrix extends Serializable { override def toString: String = toBreeze.toString() /** A human readable representation of the matrix with maximum lines and width */ + @Since(1.4.0) def toString(maxLines: Int, maxLineWidth: Int): String = toBreeze.toString(maxLines, maxLineWidth) /** Map the values of this matrix using a function. Generates a new matrix. Performs the @@ -118,11 +130,13 @@ sealed trait Matrix extends Serializable { /** * Find the number of non-zero active values. */ + @Since(1.5.0) def numNonzeros: Int /** * Find the number of values stored explicitly. These values can be zero as well. */ + @Since(1.5.0) def numActives: Int } @@ -230,11 +244,11 @@ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { */ @Since(1.0.0)
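(A reading note for the diffs in this digest: the archive rendering drops quotes from Scala string literals and collapses arrows, so annotations like @Since(1.0.0) read @Since("1.0.0") in source, and = / - often stand for => / ->.) A minimal sketch of the two annotation patterns this patch series applies; the trait and class names below are illustrative stand-ins, not Spark's:

import org.apache.spark.annotation.Since

// Pattern 1 (the Matrix trait above): the type and each public member
// carry their own version tag.
@Since("1.0.0")
sealed trait LocalMatrix extends Serializable {
  @Since("1.0.0") def numRows: Int
  @Since("1.3.0") val isTransposed: Boolean = false
}

// Pattern 2 (model classes later in this digest): the annotation is attached
// to the primary constructor itself, and each public val gets its own tag,
// which may predate the constructor's.
class ExampleModel @Since("1.3.0") (
    @Since("1.3.0") val weights: Array[Double])
  extends Serializable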
spark git commit: [SPARK-10238] [MLLIB] update since versions in mllib.linalg
Repository: spark Updated Branches: refs/heads/branch-1.5 af98e51f2 - 46750b912 [SPARK-10238] [MLLIB] update since versions in mllib.linalg Same as #8421 but for `mllib.linalg`. cc dbtsai Author: Xiangrui Meng m...@databricks.com Closes #8440 from mengxr/SPARK-10238 and squashes the following commits: b38437e [Xiangrui Meng] update since versions in mllib.linalg (cherry picked from commit ab431f8a970b85fba34ccb506c0f8815e55c63bf) Signed-off-by: DB Tsai d...@netflix.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/46750b91 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/46750b91 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/46750b91 Branch: refs/heads/branch-1.5 Commit: 46750b912781433b6ce0845ac22805cde975361e Parents: af98e51 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 20:07:56 2015 -0700 Committer: DB Tsai d...@netflix.com Committed: Tue Aug 25 20:08:09 2015 -0700 -- .../apache/spark/mllib/linalg/Matrices.scala| 44 +--- .../linalg/SingularValueDecomposition.scala | 1 + .../org/apache/spark/mllib/linalg/Vectors.scala | 25 --- .../mllib/linalg/distributed/BlockMatrix.scala | 10 +++-- .../linalg/distributed/CoordinateMatrix.scala | 4 +- .../linalg/distributed/DistributedMatrix.scala | 2 + .../linalg/distributed/IndexedRowMatrix.scala | 4 +- .../mllib/linalg/distributed/RowMatrix.scala| 5 ++- 8 files changed, 64 insertions(+), 31 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/46750b91/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 28b5b46..c02ba42 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -32,18 +32,23 @@ import org.apache.spark.sql.types._ * Trait for a local matrix. */ @SQLUserDefinedType(udt = classOf[MatrixUDT]) +@Since(1.0.0) sealed trait Matrix extends Serializable { /** Number of rows. */ + @Since(1.0.0) def numRows: Int /** Number of columns. */ + @Since(1.0.0) def numCols: Int /** Flag that keeps track whether the matrix is transposed or not. False by default. */ + @Since(1.3.0) val isTransposed: Boolean = false /** Converts to a dense array in column major. */ + @Since(1.0.0) def toArray: Array[Double] = { val newArray = new Array[Double](numRows * numCols) foreachActive { (i, j, v) = @@ -56,6 +61,7 @@ sealed trait Matrix extends Serializable { private[mllib] def toBreeze: BM[Double] /** Gets the (i, j)-th element. */ + @Since(1.3.0) def apply(i: Int, j: Int): Double /** Return the index for the (i, j)-th element in the backing array. */ @@ -65,12 +71,15 @@ sealed trait Matrix extends Serializable { private[mllib] def update(i: Int, j: Int, v: Double): Unit /** Get a deep copy of the matrix. */ + @Since(1.2.0) def copy: Matrix /** Transpose the Matrix. Returns a new `Matrix` instance sharing the same underlying data. */ + @Since(1.3.0) def transpose: Matrix /** Convenience method for `Matrix`-`DenseMatrix` multiplication. */ + @Since(1.2.0) def multiply(y: DenseMatrix): DenseMatrix = { val C: DenseMatrix = DenseMatrix.zeros(numRows, y.numCols) BLAS.gemm(1.0, this, y, 0.0, C) @@ -78,11 +87,13 @@ sealed trait Matrix extends Serializable { } /** Convenience method for `Matrix`-`DenseVector` multiplication. For binary compatibility. 
*/ + @Since(1.2.0) def multiply(y: DenseVector): DenseVector = { multiply(y.asInstanceOf[Vector]) } /** Convenience method for `Matrix`-`Vector` multiplication. */ + @Since(1.4.0) def multiply(y: Vector): DenseVector = { val output = new DenseVector(new Array[Double](numRows)) BLAS.gemv(1.0, this, y, 0.0, output) @@ -93,6 +104,7 @@ sealed trait Matrix extends Serializable { override def toString: String = toBreeze.toString() /** A human readable representation of the matrix with maximum lines and width */ + @Since(1.4.0) def toString(maxLines: Int, maxLineWidth: Int): String = toBreeze.toString(maxLines, maxLineWidth) /** Map the values of this matrix using a function. Generates a new matrix. Performs the @@ -118,11 +130,13 @@ sealed trait Matrix extends Serializable { /** * Find the number of non-zero active values. */ + @Since(1.5.0) def numNonzeros: Int /** * Find the number of values stored explicitly. These values can be zero as well. */ + @Since(1.5.0) def numActives: Int } @@ -230,11
spark git commit: [SPARK-10234] [MLLIB] update since version in mllib.clustering
Repository: spark Updated Branches: refs/heads/master c3a54843c - d703372f8 [SPARK-10234] [MLLIB] update since version in mllib.clustering Same as #8421 but for `mllib.clustering`. cc feynmanliang yu-iskw Author: Xiangrui Meng m...@databricks.com Closes #8435 from mengxr/SPARK-10234. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d703372f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d703372f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d703372f Branch: refs/heads/master Commit: d703372f86d6a59383ba8569fcd9d379849cffbf Parents: c3a5484 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 22:33:48 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 25 22:33:48 2015 -0700 -- .../mllib/clustering/GaussianMixture.scala | 1 + .../mllib/clustering/GaussianMixtureModel.scala | 8 +++--- .../apache/spark/mllib/clustering/KMeans.scala | 1 + .../spark/mllib/clustering/KMeansModel.scala| 4 +-- .../spark/mllib/clustering/LDAModel.scala | 28 +++- .../clustering/PowerIterationClustering.scala | 10 --- .../mllib/clustering/StreamingKMeans.scala | 15 ++- 7 files changed, 44 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala index daa947e..f82bd82 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala @@ -53,6 +53,7 @@ import org.apache.spark.util.Utils * @param maxIterations The maximum number of iterations to perform */ @Experimental +@Since(1.3.0) class GaussianMixture private ( private var k: Int, private var convergenceTol: Double, http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 1a10a8b..7f6163e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -46,9 +46,9 @@ import org.apache.spark.sql.{SQLContext, Row} */ @Since(1.3.0) @Experimental -class GaussianMixtureModel( - val weights: Array[Double], - val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable { +class GaussianMixtureModel @Since(1.3.0) ( + @Since(1.3.0) val weights: Array[Double], + @Since(1.3.0) val gaussians: Array[MultivariateGaussian]) extends Serializable with Saveable { require(weights.length == gaussians.length, Length of weight and Gaussian arrays must match) @@ -178,7 +178,7 @@ object GaussianMixtureModel extends Loader[GaussianMixtureModel] { (weight, new MultivariateGaussian(mu, sigma)) }.unzip - return new GaussianMixtureModel(weights.toArray, gaussians.toArray) + new GaussianMixtureModel(weights.toArray, gaussians.toArray) } } http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 3e9545a..46920ff 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -37,6 +37,7 @@ import org.apache.spark.util.random.XORShiftRandom * This is an iterative algorithm that will make multiple passes over the data, so any RDDs given * to it should be cached by the user. */ +@Since(0.8.0) class KMeans private ( private var k: Int, private var maxIterations: Int, http://git-wip-us.apache.org/repos/asf/spark/blob/d703372f/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index e425ecd..a741584 100644 ---
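Besides adding annotations, the GaussianMixtureModel hunk above drops a redundant `return`: in Scala the last expression of a method is its result. A hedged sketch of building the model through the now-annotated public constructor (all values are made up):

import org.apache.spark.mllib.clustering.GaussianMixtureModel
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.stat.distribution.MultivariateGaussian

// A toy two-component univariate mixture; weights must sum to 1 and match
// the gaussians array in length.
val gaussians = Array(
  new MultivariateGaussian(Vectors.dense(0.0), Matrices.dense(1, 1, Array(1.0))),
  new MultivariateGaussian(Vectors.dense(5.0), Matrices.dense(1, 1, Array(2.0))))
val gmm = new GaussianMixtureModel(Array(0.5, 0.5), gaussians)
println(gmm.k) // 2 components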
spark git commit: [SPARK-10235] [MLLIB] update since versions in mllib.regression
Repository: spark Updated Branches: refs/heads/master fb7e12fe2 - 4657fa1f3 [SPARK-10235] [MLLIB] update since versions in mllib.regression Same as #8421 but for `mllib.regression`. cc freeman-lab dbtsai Author: Xiangrui Meng m...@databricks.com Closes #8426 from mengxr/SPARK-10235 and squashes the following commits: 6cd28e4 [Xiangrui Meng] update since versions in mllib.regression Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4657fa1f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4657fa1f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4657fa1f Branch: refs/heads/master Commit: 4657fa1f37d41dd4c7240a960342b68c7c591f48 Parents: fb7e12f Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 22:49:33 2015 -0700 Committer: DB Tsai d...@netflix.com Committed: Tue Aug 25 22:49:33 2015 -0700 -- .../regression/GeneralizedLinearAlgorithm.scala | 6 -- .../spark/mllib/regression/IsotonicRegression.scala | 16 +--- .../spark/mllib/regression/LabeledPoint.scala | 5 +++-- .../org/apache/spark/mllib/regression/Lasso.scala | 9 ++--- .../spark/mllib/regression/LinearRegression.scala | 9 ++--- .../spark/mllib/regression/RidgeRegression.scala| 12 +++- .../mllib/regression/StreamingLinearAlgorithm.scala | 8 +++- .../StreamingLinearRegressionWithSGD.scala | 11 +-- 8 files changed, 47 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4657fa1f/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 509f6a2..7e3b4d5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -38,7 +38,9 @@ import org.apache.spark.storage.StorageLevel */ @Since(0.8.0) @DeveloperApi -abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double) +abstract class GeneralizedLinearModel @Since(1.0.0) ( +@Since(1.0.0) val weights: Vector, +@Since(0.8.0) val intercept: Double) extends Serializable { /** @@ -107,7 +109,7 @@ abstract class GeneralizedLinearAlgorithm[M : GeneralizedLinearModel] * The optimizer to solve the problem. * */ - @Since(1.0.0) + @Since(0.8.0) def optimizer: Optimizer /** Whether to add intercept (default: false). 
*/ http://git-wip-us.apache.org/repos/asf/spark/blob/4657fa1f/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala index 31ca7c2..877d31b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala @@ -50,10 +50,10 @@ import org.apache.spark.sql.SQLContext */ @Since(1.3.0) @Experimental -class IsotonicRegressionModel ( -val boundaries: Array[Double], -val predictions: Array[Double], -val isotonic: Boolean) extends Serializable with Saveable { +class IsotonicRegressionModel @Since(1.3.0) ( +@Since(1.3.0) val boundaries: Array[Double], +@Since(1.3.0) val predictions: Array[Double], +@Since(1.3.0) val isotonic: Boolean) extends Serializable with Saveable { private val predictionOrd = if (isotonic) Ordering[Double] else Ordering[Double].reverse @@ -63,7 +63,6 @@ class IsotonicRegressionModel ( /** * A Java-friendly constructor that takes two Iterable parameters and one Boolean parameter. - * */ @Since(1.4.0) def this(boundaries: java.lang.Iterable[Double], @@ -214,8 +213,6 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { } } - /** - */ @Since(1.4.0) override def load(sc: SparkContext, path: String): IsotonicRegressionModel = { implicit val formats = DefaultFormats @@ -256,6 +253,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression (Wikipedia)]] */ @Experimental +@Since(1.3.0) class IsotonicRegression private (private var isotonic: Boolean) extends Serializable { /** @@ -263,6 +261,7 @@ class IsotonicRegression private (private var isotonic: Boolean)
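Two details in this hunk are easy to miss: constructor parameters may carry older versions than the constructor itself (`intercept` dates to 0.8.0, `weights` to 1.0.0), and the `optimizer` member's tag is corrected from 1.0.0 back to 0.8.0. A sketch against the re-annotated public IsotonicRegressionModel constructor, with made-up values (boundaries must be sorted and predictions monotone when isotonic = true):

import org.apache.spark.mllib.regression.IsotonicRegressionModel

val model = new IsotonicRegressionModel(
  boundaries = Array(0.0, 1.0, 2.0),
  predictions = Array(1.0, 2.0, 4.0),
  isotonic = true)

// predict() interpolates linearly between boundary points:
println(model.predict(1.5)) // 3.0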
spark git commit: [SPARK-10235] [MLLIB] update since versions in mllib.regression
Repository: spark Updated Branches: refs/heads/branch-1.5 6d8ebc801 - 08d390f45 [SPARK-10235] [MLLIB] update since versions in mllib.regression Same as #8421 but for `mllib.regression`. cc freeman-lab dbtsai Author: Xiangrui Meng m...@databricks.com Closes #8426 from mengxr/SPARK-10235 and squashes the following commits: 6cd28e4 [Xiangrui Meng] update since versions in mllib.regression (cherry picked from commit 4657fa1f37d41dd4c7240a960342b68c7c591f48) Signed-off-by: DB Tsai d...@netflix.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/08d390f4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/08d390f4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/08d390f4 Branch: refs/heads/branch-1.5 Commit: 08d390f457f80ffdc2dfce61ea579d9026047f12 Parents: 6d8ebc8 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 22:49:33 2015 -0700 Committer: DB Tsai d...@netflix.com Committed: Tue Aug 25 22:49:46 2015 -0700 -- .../regression/GeneralizedLinearAlgorithm.scala | 6 -- .../spark/mllib/regression/IsotonicRegression.scala | 16 +--- .../spark/mllib/regression/LabeledPoint.scala | 5 +++-- .../org/apache/spark/mllib/regression/Lasso.scala | 9 ++--- .../spark/mllib/regression/LinearRegression.scala | 9 ++--- .../spark/mllib/regression/RidgeRegression.scala| 12 +++- .../mllib/regression/StreamingLinearAlgorithm.scala | 8 +++- .../StreamingLinearRegressionWithSGD.scala | 11 +-- 8 files changed, 47 insertions(+), 29 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/08d390f4/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala index 509f6a2..7e3b4d5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/GeneralizedLinearAlgorithm.scala @@ -38,7 +38,9 @@ import org.apache.spark.storage.StorageLevel */ @Since(0.8.0) @DeveloperApi -abstract class GeneralizedLinearModel(val weights: Vector, val intercept: Double) +abstract class GeneralizedLinearModel @Since(1.0.0) ( +@Since(1.0.0) val weights: Vector, +@Since(0.8.0) val intercept: Double) extends Serializable { /** @@ -107,7 +109,7 @@ abstract class GeneralizedLinearAlgorithm[M : GeneralizedLinearModel] * The optimizer to solve the problem. * */ - @Since(1.0.0) + @Since(0.8.0) def optimizer: Optimizer /** Whether to add intercept (default: false). 
*/ http://git-wip-us.apache.org/repos/asf/spark/blob/08d390f4/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala index 31ca7c2..877d31b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/IsotonicRegression.scala @@ -50,10 +50,10 @@ import org.apache.spark.sql.SQLContext */ @Since(1.3.0) @Experimental -class IsotonicRegressionModel ( -val boundaries: Array[Double], -val predictions: Array[Double], -val isotonic: Boolean) extends Serializable with Saveable { +class IsotonicRegressionModel @Since(1.3.0) ( +@Since(1.3.0) val boundaries: Array[Double], +@Since(1.3.0) val predictions: Array[Double], +@Since(1.3.0) val isotonic: Boolean) extends Serializable with Saveable { private val predictionOrd = if (isotonic) Ordering[Double] else Ordering[Double].reverse @@ -63,7 +63,6 @@ class IsotonicRegressionModel ( /** * A Java-friendly constructor that takes two Iterable parameters and one Boolean parameter. - * */ @Since(1.4.0) def this(boundaries: java.lang.Iterable[Double], @@ -214,8 +213,6 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { } } - /** - */ @Since(1.4.0) override def load(sc: SparkContext, path: String): IsotonicRegressionModel = { implicit val formats = DefaultFormats @@ -256,6 +253,7 @@ object IsotonicRegressionModel extends Loader[IsotonicRegressionModel] { * @see [[http://en.wikipedia.org/wiki/Isotonic_regression Isotonic regression (Wikipedia)]] */ @Experimental +@Since(1.3.0) class IsotonicRegression private (private var isotonic: Boolean)
spark git commit: [SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mlilb.random and mllib.stat
Repository: spark Updated Branches: refs/heads/master ab431f8a9 - c3a54843c [SPARK-10240] [SPARK-10242] [MLLIB] update since versions in mlilb.random and mllib.stat The same as #8241 but for `mllib.stat` and `mllib.random`. cc feynmanliang Author: Xiangrui Meng m...@databricks.com Closes #8439 from mengxr/SPARK-10242. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c3a54843 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c3a54843 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c3a54843 Branch: refs/heads/master Commit: c3a54843c0c8a14059da4e6716c1ad45c69bbe6c Parents: ab431f8 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 22:31:23 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 25 22:31:23 2015 -0700 -- .../mllib/random/RandomDataGenerator.scala | 43 ++-- .../apache/spark/mllib/random/RandomRDDs.scala | 69 +--- .../distribution/MultivariateGaussian.scala | 6 +- .../spark/mllib/stat/test/TestResult.scala | 24 --- 4 files changed, 117 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c3a54843/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala index 9349eca..a2d85a6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/random/RandomDataGenerator.scala @@ -20,7 +20,7 @@ package org.apache.spark.mllib.random import org.apache.commons.math3.distribution.{ExponentialDistribution, GammaDistribution, LogNormalDistribution, PoissonDistribution} -import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.annotation.{Since, DeveloperApi} import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom} /** @@ -28,17 +28,20 @@ import org.apache.spark.util.random.{XORShiftRandom, Pseudorandom} * Trait for random data generators that generate i.i.d. data. */ @DeveloperApi +@Since(1.1.0) trait RandomDataGenerator[T] extends Pseudorandom with Serializable { /** * Returns an i.i.d. sample as a generic type from an underlying distribution. */ + @Since(1.1.0) def nextValue(): T /** * Returns a copy of the RandomDataGenerator with a new instance of the rng object used in the * class when applicable for non-locking concurrent usage. */ + @Since(1.1.0) def copy(): RandomDataGenerator[T] } @@ -47,17 +50,21 @@ trait RandomDataGenerator[T] extends Pseudorandom with Serializable { * Generates i.i.d. samples from U[0.0, 1.0] */ @DeveloperApi +@Since(1.1.0) class UniformGenerator extends RandomDataGenerator[Double] { // XORShiftRandom for better performance. Thread safety isn't necessary here. private val random = new XORShiftRandom() + @Since(1.1.0) override def nextValue(): Double = { random.nextDouble() } + @Since(1.1.0) override def setSeed(seed: Long): Unit = random.setSeed(seed) + @Since(1.1.0) override def copy(): UniformGenerator = new UniformGenerator() } @@ -66,17 +73,21 @@ class UniformGenerator extends RandomDataGenerator[Double] { * Generates i.i.d. samples from the standard normal distribution. */ @DeveloperApi +@Since(1.1.0) class StandardNormalGenerator extends RandomDataGenerator[Double] { // XORShiftRandom for better performance. Thread safety isn't necessary here. 
private val random = new XORShiftRandom() + @Since(1.1.0) override def nextValue(): Double = { random.nextGaussian() } + @Since(1.1.0) override def setSeed(seed: Long): Unit = random.setSeed(seed) + @Since(1.1.0) override def copy(): StandardNormalGenerator = new StandardNormalGenerator() } @@ -87,16 +98,21 @@ class StandardNormalGenerator extends RandomDataGenerator[Double] { * @param mean mean for the Poisson distribution. */ @DeveloperApi -class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] { +@Since(1.1.0) +class PoissonGenerator @Since(1.1.0) ( +@Since(1.1.0) val mean: Double) extends RandomDataGenerator[Double] { private val rng = new PoissonDistribution(mean) + @Since(1.1.0) override def nextValue(): Double = rng.sample() + @Since(1.1.0) override def setSeed(seed: Long) { rng.reseedRandomGenerator(seed) } + @Since(1.1.0) override def copy(): PoissonGenerator = new PoissonGenerator(mean) } @@ -107,16 +123,21 @@ class PoissonGenerator(val mean: Double) extends RandomDataGenerator[Double] { * @param mean mean for
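Because RandomDataGenerator is a public DeveloperApi trait, these tags document a contract that user code can implement. A sketch of a custom generator under that contract — the class itself is hypothetical, and note that copy() must return an instance backed by a fresh RNG so concurrent partitions don't share state:

import java.util.Random

import org.apache.spark.mllib.random.RandomDataGenerator

// i.i.d. draws from the uniform distribution on [lo, hi).
class UniformRangeGenerator(lo: Double, hi: Double)
  extends RandomDataGenerator[Double] {

  private val rng = new Random()

  override def nextValue(): Double = lo + (hi - lo) * rng.nextDouble()
  override def setSeed(seed: Long): Unit = rng.setSeed(seed)
  override def copy(): UniformRangeGenerator = new UniformRangeGenerator(lo, hi)
}

Such a generator can then be handed to RandomRDDs.randomRDD to produce an RDD of samples.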
spark git commit: [SPARK-10243] [MLLIB] update since versions in mllib.tree
Repository: spark Updated Branches: refs/heads/master d703372f8 - fb7e12fe2 [SPARK-10243] [MLLIB] update since versions in mllib.tree Same as #8421 but for `mllib.tree`. cc jkbradley Author: Xiangrui Meng m...@databricks.com Closes #8442 from mengxr/SPARK-10236. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb7e12fe Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb7e12fe Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb7e12fe Branch: refs/heads/master Commit: fb7e12fe2e14af8de4c206ca8096b2e8113bfddc Parents: d703372 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 22:35:49 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 25 22:35:49 2015 -0700 -- .../apache/spark/mllib/tree/DecisionTree.scala | 3 +- .../spark/mllib/tree/GradientBoostedTrees.scala | 2 +- .../spark/mllib/tree/configuration/Algo.scala | 2 ++ .../tree/configuration/BoostingStrategy.scala | 12 .../mllib/tree/configuration/FeatureType.scala | 2 ++ .../tree/configuration/QuantileStrategy.scala | 2 ++ .../mllib/tree/configuration/Strategy.scala | 29 ++-- .../mllib/tree/model/DecisionTreeModel.scala| 5 +++- .../apache/spark/mllib/tree/model/Node.scala| 18 ++-- .../apache/spark/mllib/tree/model/Predict.scala | 6 ++-- .../apache/spark/mllib/tree/model/Split.scala | 8 +++--- .../mllib/tree/model/treeEnsembleModels.scala | 12 12 files changed, 57 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 9728410..4a77d4a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -46,7 +46,8 @@ import org.apache.spark.util.random.XORShiftRandom */ @Since(1.0.0) @Experimental -class DecisionTree (private val strategy: Strategy) extends Serializable with Logging { +class DecisionTree @Since(1.0.0) (private val strategy: Strategy) + extends Serializable with Logging { strategy.assertValid() http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index e750408..95ed48c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -51,7 +51,7 @@ import org.apache.spark.storage.StorageLevel */ @Since(1.2.0) @Experimental -class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) +class GradientBoostedTrees @Since(1.2.0) (private val boostingStrategy: BoostingStrategy) extends Serializable with Logging { /** http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala index 8301ad1..853c731 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala @@ 
-26,7 +26,9 @@ import org.apache.spark.annotation.{Experimental, Since} @Since(1.0.0) @Experimental object Algo extends Enumeration { + @Since(1.0.0) type Algo = Value + @Since(1.0.0) val Classification, Regression = Value private[mllib] def fromString(name: String): Algo = name match { http://git-wip-us.apache.org/repos/asf/spark/blob/fb7e12fe/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index 7c56998..b5c72fb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala @@ -41,14 +41,14 @@ import
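One wrinkle in this patch: @Since also lands on Enumeration members (Algo.scala) and on constructors whose only parameter is private (DecisionTree), where it documents the constructor rather than the field. A small usage sketch of the now-annotated tree configuration API; the parameter values are illustrative, not recommendations:

import org.apache.spark.mllib.tree.configuration.{Algo, BoostingStrategy}

// defaultParams accepts "Classification" or "Regression".
val boosting = BoostingStrategy.defaultParams("Classification")
boosting.treeStrategy.maxDepth = 3

println(boosting.treeStrategy.algo == Algo.Classification) // true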
spark git commit: [SPARK-10243] [MLLIB] update since versions in mllib.tree
Repository: spark Updated Branches: refs/heads/branch-1.5 be0c9915c - 6d8ebc801 [SPARK-10243] [MLLIB] update since versions in mllib.tree Same as #8421 but for `mllib.tree`. cc jkbradley Author: Xiangrui Meng m...@databricks.com Closes #8442 from mengxr/SPARK-10236. (cherry picked from commit fb7e12fe2e14af8de4c206ca8096b2e8113bfddc) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6d8ebc80 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6d8ebc80 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6d8ebc80 Branch: refs/heads/branch-1.5 Commit: 6d8ebc801799714d297c83be6935b37e26dc2df7 Parents: be0c991 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 22:35:49 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 25 22:35:56 2015 -0700 -- .../apache/spark/mllib/tree/DecisionTree.scala | 3 +- .../spark/mllib/tree/GradientBoostedTrees.scala | 2 +- .../spark/mllib/tree/configuration/Algo.scala | 2 ++ .../tree/configuration/BoostingStrategy.scala | 12 .../mllib/tree/configuration/FeatureType.scala | 2 ++ .../tree/configuration/QuantileStrategy.scala | 2 ++ .../mllib/tree/configuration/Strategy.scala | 29 ++-- .../mllib/tree/model/DecisionTreeModel.scala| 5 +++- .../apache/spark/mllib/tree/model/Node.scala| 18 ++-- .../apache/spark/mllib/tree/model/Predict.scala | 6 ++-- .../apache/spark/mllib/tree/model/Split.scala | 8 +++--- .../mllib/tree/model/treeEnsembleModels.scala | 12 12 files changed, 57 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala index 9728410..4a77d4a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala @@ -46,7 +46,8 @@ import org.apache.spark.util.random.XORShiftRandom */ @Since(1.0.0) @Experimental -class DecisionTree (private val strategy: Strategy) extends Serializable with Logging { +class DecisionTree @Since(1.0.0) (private val strategy: Strategy) + extends Serializable with Logging { strategy.assertValid() http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index e750408..95ed48c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -51,7 +51,7 @@ import org.apache.spark.storage.StorageLevel */ @Since(1.2.0) @Experimental -class GradientBoostedTrees(private val boostingStrategy: BoostingStrategy) +class GradientBoostedTrees @Since(1.2.0) (private val boostingStrategy: BoostingStrategy) extends Serializable with Logging { /** http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala index 8301ad1..853c731 100644 --- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Algo.scala @@ -26,7 +26,9 @@ import org.apache.spark.annotation.{Experimental, Since} @Since(1.0.0) @Experimental object Algo extends Enumeration { + @Since(1.0.0) type Algo = Value + @Since(1.0.0) val Classification, Regression = Value private[mllib] def fromString(name: String): Algo = name match { http://git-wip-us.apache.org/repos/asf/spark/blob/6d8ebc80/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index 7c56998..b5c72fb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++
spark git commit: [SPARK-10178] [SQL] HiveComparisonTest should print out dependent tables
Repository: spark Updated Branches: refs/heads/branch-1.5 c99f4160b -> 2239a2036 [SPARK-10178] [SQL] HiveComparisonTest should print out dependent tables In `HiveComparisonTest`s it is possible to fail a query of the form `SELECT * FROM dest1`, where `dest1` is the query that is actually computing the incorrect results. To aid debugging, this patch improves the harness to also print these query plans and their results. Author: Michael Armbrust mich...@databricks.com Closes #8388 from marmbrus/generatedTables. (cherry picked from commit 5175ca0c85b10045d12c3fb57b1e52278a413ecf) Signed-off-by: Reynold Xin r...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2239a203 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2239a203 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2239a203 Branch: refs/heads/branch-1.5 Commit: 2239a20368b7833ffe0059941478924c7be87bbe Parents: c99f416 Author: Michael Armbrust mich...@databricks.com Authored: Mon Aug 24 23:15:27 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Mon Aug 24 23:15:34 2015 -0700 -- .../sql/hive/execution/HiveComparisonTest.scala | 36 1 file changed, 36 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2239a203/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 2bdb0e1..4d45249 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.hive.execution import java.io._ +import scala.util.control.NonFatal + import org.scalatest.{BeforeAndAfterAll, GivenWhenThen} import org.apache.spark.{Logging, SparkFunSuite} @@ -386,11 +388,45 @@ abstract class HiveComparisonTest hiveCacheFiles.foreach(_.delete()) } + // If this query is reading other tables that were created during this test run + // also print out the query plans and results for those. + val computedTablesMessages: String = try { + val tablesRead = new TestHive.QueryExecution(query).executedPlan.collect { + case ts: HiveTableScan => ts.relation.tableName + }.toSet + + TestHive.reset() + val executions = queryList.map(new TestHive.QueryExecution(_)) + executions.foreach(_.toRdd) + val tablesGenerated = queryList.zip(executions).flatMap { + case (q, e) => e.executedPlan.collect { + case i: InsertIntoHiveTable if tablesRead contains i.table.tableName => (q, e, i) + } + } + + tablesGenerated.map { case (hiveql, execution, insert) => + s""" |=== Generated Table === |$hiveql |$execution |== Results == |${insert.child.execute().collect().mkString("\n")} """.stripMargin + }.mkString("\n") + } catch { + case NonFatal(e) => + logError("Failed to compute generated tables", e) + s"Couldn't compute dependent tables: $e" + } + val errorMessage = s""" |Results do not match for $testCaseName: |$hiveQuery\n${hiveQuery.analyzed.output.map(_.name).mkString("\t")} |$resultComparison + |$computedTablesMessages """.stripMargin stringToFile(new File(wrongDirectory, testCaseName), errorMessage + consoleTestCase)
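The try/NonFatal wrapper above is the load-bearing part: computing the extra diagnostics re-runs queries and can itself fail, and that failure must not mask the original mismatch. A generic sketch of the same pattern:

import scala.util.control.NonFatal

// Evaluate an expensive diagnostic lazily; on any non-fatal error, degrade
// to a note instead of throwing past the real test failure.
def bestEffort(diagnostic: => String): String =
  try diagnostic catch {
    case NonFatal(e) => s"Couldn't compute diagnostics: $e"
  }

println(bestEffort(sys.error("boom"))) // Couldn't compute diagnostics: java.lang.RuntimeException: boom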
spark git commit: [SPARK-10178] [SQL] HiveComparisonTest should print out dependent tables
Repository: spark Updated Branches: refs/heads/master a0c0aae1d -> 5175ca0c8 [SPARK-10178] [SQL] HiveComparisonTest should print out dependent tables In `HiveComparisonTest`s it is possible to fail a query of the form `SELECT * FROM dest1`, where `dest1` is the query that is actually computing the incorrect results. To aid debugging, this patch improves the harness to also print these query plans and their results. Author: Michael Armbrust mich...@databricks.com Closes #8388 from marmbrus/generatedTables. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5175ca0c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5175ca0c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5175ca0c Branch: refs/heads/master Commit: 5175ca0c85b10045d12c3fb57b1e52278a413ecf Parents: a0c0aae Author: Michael Armbrust mich...@databricks.com Authored: Mon Aug 24 23:15:27 2015 -0700 Committer: Reynold Xin r...@databricks.com Committed: Mon Aug 24 23:15:27 2015 -0700 -- .../sql/hive/execution/HiveComparisonTest.scala | 36 1 file changed, 36 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5175ca0c/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala index 2bdb0e1..4d45249 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.hive.execution import java.io._ +import scala.util.control.NonFatal + import org.scalatest.{BeforeAndAfterAll, GivenWhenThen} import org.apache.spark.{Logging, SparkFunSuite} @@ -386,11 +388,45 @@ abstract class HiveComparisonTest hiveCacheFiles.foreach(_.delete()) } + // If this query is reading other tables that were created during this test run + // also print out the query plans and results for those. + val computedTablesMessages: String = try { + val tablesRead = new TestHive.QueryExecution(query).executedPlan.collect { + case ts: HiveTableScan => ts.relation.tableName + }.toSet + + TestHive.reset() + val executions = queryList.map(new TestHive.QueryExecution(_)) + executions.foreach(_.toRdd) + val tablesGenerated = queryList.zip(executions).flatMap { + case (q, e) => e.executedPlan.collect { + case i: InsertIntoHiveTable if tablesRead contains i.table.tableName => (q, e, i) + } + } + + tablesGenerated.map { case (hiveql, execution, insert) => + s""" |=== Generated Table === |$hiveql |$execution |== Results == |${insert.child.execute().collect().mkString("\n")} """.stripMargin + }.mkString("\n") + } catch { + case NonFatal(e) => + logError("Failed to compute generated tables", e) + s"Couldn't compute dependent tables: $e" + } + val errorMessage = s""" |Results do not match for $testCaseName: |$hiveQuery\n${hiveQuery.analyzed.output.map(_.name).mkString("\t")} |$resultComparison + |$computedTablesMessages """.stripMargin stringToFile(new File(wrongDirectory, testCaseName), errorMessage + consoleTestCase)
spark git commit: [SPARK-9786] [STREAMING] [KAFKA] fix backpressure so it works with defa…
Repository: spark Updated Branches: refs/heads/branch-1.5 2239a2036 -> 88991dc4f [SPARK-9786] [STREAMING] [KAFKA] fix backpressure so it works with defa… …ult maxRatePerPartition setting of 0 Author: cody koeninger c...@koeninger.org Closes #8413 from koeninger/backpressure-testing-master. (cherry picked from commit d9c25dec87e6da7d66a47ff94e7eefa008081b9d) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/88991dc4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/88991dc4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/88991dc4 Branch: refs/heads/branch-1.5 Commit: 88991dc4f04b0c88466c6eab5ada43506c981341 Parents: 2239a20 Author: cody koeninger c...@koeninger.org Authored: Mon Aug 24 23:26:14 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 23:26:27 2015 -0700 -- .../spark/streaming/kafka/DirectKafkaInputDStream.scala | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/88991dc4/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index 8a17707..194 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -95,8 +95,13 @@ class DirectKafkaInputDStream[ val effectiveRateLimitPerPartition = estimatedRateLimit .filter(_ > 0) - .map(limit => Math.min(maxRateLimitPerPartition, (limit / numPartitions))) - .getOrElse(maxRateLimitPerPartition) + .map { limit => + if (maxRateLimitPerPartition > 0) { + Math.min(maxRateLimitPerPartition, (limit / numPartitions)) + } else { + limit / numPartitions + } + }.getOrElse(maxRateLimitPerPartition) if (effectiveRateLimitPerPartition > 0) { val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000
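Restated outside the DStream, the fixed computation looks like the function below (a sketch; the names mirror the diff, not a public API). The point of the patch: with the default spark.streaming.kafka.maxRatePerPartition of 0, the old code clamped the estimated rate to 0 and effectively disabled backpressure, while the new code treats 0 as "no cap":

// Per-partition records/sec, given an optional backpressure estimate.
def effectiveRate(
    estimatedRateLimit: Option[Int],
    maxRateLimitPerPartition: Int,
    numPartitions: Int): Int = {
  estimatedRateLimit
    .filter(_ > 0)
    .map { limit =>
      if (maxRateLimitPerPartition > 0) {
        math.min(maxRateLimitPerPartition, limit / numPartitions)
      } else {
        limit / numPartitions // 0 means "uncapped", not "rate 0"
      }
    }
    .getOrElse(maxRateLimitPerPartition)
}

assert(effectiveRate(Some(1000), 0, 4) == 250)   // estimate honored with no cap set
assert(effectiveRate(Some(1000), 100, 4) == 100) // an explicit cap still wins
assert(effectiveRate(None, 0, 4) == 0)           // no estimate, no cap: unlimited sentinel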
spark git commit: [SPARK-10196] [SQL] Correctly saving decimals in internal rows to JSON.
Repository: spark Updated Branches: refs/heads/branch-1.5 bb1357f36 - 0b425ed3d [SPARK-10196] [SQL] Correctly saving decimals in internal rows to JSON. https://issues.apache.org/jira/browse/SPARK-10196 Author: Yin Huai yh...@databricks.com Closes #8408 from yhuai/DecimalJsonSPARK-10196. (cherry picked from commit df7041d02d3fd44b08a859f5d77bf6fb726895f0) Signed-off-by: Davies Liu davies@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0b425ed3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0b425ed3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0b425ed3 Branch: refs/heads/branch-1.5 Commit: 0b425ed3d55f7e9c39a259ce4b8d86a41a7bd403 Parents: bb1357f Author: Yin Huai yh...@databricks.com Authored: Mon Aug 24 23:38:32 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Mon Aug 24 23:38:42 2015 -0700 -- .../datasources/json/JacksonGenerator.scala | 2 +- .../sql/sources/JsonHadoopFsRelationSuite.scala | 27 2 files changed, 28 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0b425ed3/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala index 99ac773..330ba90 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala @@ -95,7 +95,7 @@ private[sql] object JacksonGenerator { case (FloatType, v: Float) = gen.writeNumber(v) case (DoubleType, v: Double) = gen.writeNumber(v) case (LongType, v: Long) = gen.writeNumber(v) - case (DecimalType(), v: java.math.BigDecimal) = gen.writeNumber(v) + case (DecimalType(), v: Decimal) = gen.writeNumber(v.toJavaBigDecimal) case (ByteType, v: Byte) = gen.writeNumber(v.toInt) case (BinaryType, v: Array[Byte]) = gen.writeBinary(v) case (BooleanType, v: Boolean) = gen.writeBoolean(v) http://git-wip-us.apache.org/repos/asf/spark/blob/0b425ed3/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala index ed6d512..8ca3a17 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.sources +import java.math.BigDecimal + import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil @@ -75,4 +77,29 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { ) } } + + test(SPARK-10196: save decimal type to JSON) { +withTempDir { file = + file.delete() + + val schema = +new StructType() + .add(decimal, DecimalType(7, 2)) + + val data = +Row(new BigDecimal(10.02)) :: + Row(new BigDecimal(2.99)) :: + Row(new BigDecimal(1)) :: Nil + val df = createDataFrame(sparkContext.parallelize(data), schema) + + // Write the data out. + df.write.format(dataSourceName).save(file.getCanonicalPath) + + // Read it back and check the result. 
+ checkAnswer( read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df + ) } }
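The root cause is a type mismatch: values inside an InternalRow arrive as Spark's own Decimal wrapper, so the writer's old case (DecimalType(), v: java.math.BigDecimal) branch never matched. A small sketch of the conversion the one-line fix relies on:

import org.apache.spark.sql.types.Decimal

// Internal rows carry Spark's Decimal; Jackson wants the java type.
val d: Decimal = Decimal(BigDecimal("10.02"), 7, 2)
val j: java.math.BigDecimal = d.toJavaBigDecimal
println(j) // 10.02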
spark git commit: [SPARK-10137] [STREAMING] Avoid to restart receivers if scheduleReceivers returns balanced results
Repository: spark Updated Branches: refs/heads/branch-1.5 88991dc4f - bb1357f36 [SPARK-10137] [STREAMING] Avoid to restart receivers if scheduleReceivers returns balanced results This PR fixes the following cases for `ReceiverSchedulingPolicy`. 1) Assume there are 4 executors: host1, host2, host3, host4, and 5 receivers: r1, r2, r3, r4, r5. Then `ReceiverSchedulingPolicy.scheduleReceivers` will return (r1 - host1, r2 - host2, r3 - host3, r4 - host4, r5 - host1). Let's assume r1 starts at first on `host1` as `scheduleReceivers` suggested, and try to register with ReceiverTracker. But the previous `ReceiverSchedulingPolicy.rescheduleReceiver` will return (host2, host3, host4) according to the current executor weights (host1 - 1.0, host2 - 0.5, host3 - 0.5, host4 - 0.5), so ReceiverTracker will reject `r1`. This is unexpected since r1 is starting exactly where `scheduleReceivers` suggested. This case can be fixed by ignoring the information of the receiver that is rescheduling in `receiverTrackingInfoMap`. 2) Assume there are 3 executors (host1, host2, host3) and each executors has 3 cores, and 3 receivers: r1, r2, r3. Assume r1 is running on host1. Now r2 is restarting, the previous `ReceiverSchedulingPolicy.rescheduleReceiver` will always return (host1, host2, host3). So it's possible that r2 will be scheduled to host1 by TaskScheduler. r3 is similar. Then at last, it's possible that there are 3 receivers running on host1, while host2 and host3 are idle. This issue can be fixed by returning only executors that have the minimum wight rather than returning at least 3 executors. Author: zsxwing zsxw...@gmail.com Closes #8340 from zsxwing/fix-receiver-scheduling. (cherry picked from commit f023aa2fcc1d1dbb82aee568be0a8f2457c309ae) Signed-off-by: Tathagata Das tathagata.das1...@gmail.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bb1357f3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bb1357f3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bb1357f3 Branch: refs/heads/branch-1.5 Commit: bb1357f362cdd96b854c2a0a193496ce709cdbdd Parents: 88991dc Author: zsxwing zsxw...@gmail.com Authored: Mon Aug 24 23:34:50 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 23:35:02 2015 -0700 -- .../scheduler/ReceiverSchedulingPolicy.scala| 58 +++--- .../streaming/scheduler/ReceiverTracker.scala | 106 --- .../ReceiverSchedulingPolicySuite.scala | 13 +-- 3 files changed, 120 insertions(+), 57 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bb1357f3/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala index ef5b687..10b5a7f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala @@ -22,6 +22,36 @@ import scala.collection.mutable import org.apache.spark.streaming.receiver.Receiver +/** + * A class that tries to schedule receivers with evenly distributed. There are two phases for + * scheduling receivers. + * + * - The first phase is global scheduling when ReceiverTracker is starting and we need to schedule + * all receivers at the same time. 
ReceiverTracker will call `scheduleReceivers` at this phase. + * It will try to schedule receivers with evenly distributed. ReceiverTracker should update its + * receiverTrackingInfoMap according to the results of `scheduleReceivers`. + * `ReceiverTrackingInfo.scheduledExecutors` for each receiver will set to an executor list that + * contains the scheduled locations. Then when a receiver is starting, it will send a register + * request and `ReceiverTracker.registerReceiver` will be called. In + * `ReceiverTracker.registerReceiver`, if a receiver's scheduled executors is set, it should check + * if the location of this receiver is one of the scheduled executors, if not, the register will + * be rejected. + * - The second phase is local scheduling when a receiver is restarting. There are two cases of + * receiver restarting: + * - If a receiver is restarting because it's rejected due to the real location and the scheduled + * executors mismatching, in other words, it fails to start in one of the locations that + * `scheduleReceivers` suggested, `ReceiverTracker` should firstly choose the executors that are + * still alive in the list of
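A sketch of the "only the least-loaded executors" rule that fixes case 2; the names are simplified stand-ins for what ReceiverSchedulingPolicy.rescheduleReceiver does internally:

// Return only executors tied for the minimum weight, rather than padding the
// candidate list to a fixed size (which let TaskScheduler pick busy hosts).
def leastLoaded(executorWeights: Map[String, Double]): Seq[String] = {
  if (executorWeights.isEmpty) {
    Seq.empty
  } else {
    val minWeight = executorWeights.values.min
    executorWeights.collect {
      case (host, weight) if weight == minWeight => host
    }.toSeq
  }
}

// host1 already runs a receiver (weight 1.0), so a restarting receiver is
// only offered the idle hosts:
println(leastLoaded(Map("host1" -> 1.0, "host2" -> 0.5, "host3" -> 0.5)))
// List(host2, host3)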
spark git commit: [SPARK-9786] [STREAMING] [KAFKA] fix backpressure so it works with defa…
Repository: spark Updated Branches: refs/heads/master 5175ca0c8 -> d9c25dec8 [SPARK-9786] [STREAMING] [KAFKA] fix backpressure so it works with defa… …ult maxRatePerPartition setting of 0 Author: cody koeninger c...@koeninger.org Closes #8413 from koeninger/backpressure-testing-master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d9c25dec Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d9c25dec Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d9c25dec Branch: refs/heads/master Commit: d9c25dec87e6da7d66a47ff94e7eefa008081b9d Parents: 5175ca0 Author: cody koeninger c...@koeninger.org Authored: Mon Aug 24 23:26:14 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 23:26:14 2015 -0700 -- .../spark/streaming/kafka/DirectKafkaInputDStream.scala | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d9c25dec/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala -- diff --git a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala index 8a17707..194 100644 --- a/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala +++ b/external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala @@ -95,8 +95,13 @@ class DirectKafkaInputDStream[ val effectiveRateLimitPerPartition = estimatedRateLimit .filter(_ > 0) - .map(limit => Math.min(maxRateLimitPerPartition, (limit / numPartitions))) - .getOrElse(maxRateLimitPerPartition) + .map { limit => + if (maxRateLimitPerPartition > 0) { + Math.min(maxRateLimitPerPartition, (limit / numPartitions)) + } else { + limit / numPartitions + } + }.getOrElse(maxRateLimitPerPartition) if (effectiveRateLimitPerPartition > 0) { val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000
spark git commit: [SPARK-10137] [STREAMING] Avoid to restart receivers if scheduleReceivers returns balanced results
Repository: spark Updated Branches: refs/heads/master d9c25dec8 - f023aa2fc [SPARK-10137] [STREAMING] Avoid to restart receivers if scheduleReceivers returns balanced results This PR fixes the following cases for `ReceiverSchedulingPolicy`. 1) Assume there are 4 executors: host1, host2, host3, host4, and 5 receivers: r1, r2, r3, r4, r5. Then `ReceiverSchedulingPolicy.scheduleReceivers` will return (r1 - host1, r2 - host2, r3 - host3, r4 - host4, r5 - host1). Let's assume r1 starts at first on `host1` as `scheduleReceivers` suggested, and try to register with ReceiverTracker. But the previous `ReceiverSchedulingPolicy.rescheduleReceiver` will return (host2, host3, host4) according to the current executor weights (host1 - 1.0, host2 - 0.5, host3 - 0.5, host4 - 0.5), so ReceiverTracker will reject `r1`. This is unexpected since r1 is starting exactly where `scheduleReceivers` suggested. This case can be fixed by ignoring the information of the receiver that is rescheduling in `receiverTrackingInfoMap`. 2) Assume there are 3 executors (host1, host2, host3) and each executors has 3 cores, and 3 receivers: r1, r2, r3. Assume r1 is running on host1. Now r2 is restarting, the previous `ReceiverSchedulingPolicy.rescheduleReceiver` will always return (host1, host2, host3). So it's possible that r2 will be scheduled to host1 by TaskScheduler. r3 is similar. Then at last, it's possible that there are 3 receivers running on host1, while host2 and host3 are idle. This issue can be fixed by returning only executors that have the minimum wight rather than returning at least 3 executors. Author: zsxwing zsxw...@gmail.com Closes #8340 from zsxwing/fix-receiver-scheduling. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f023aa2f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f023aa2f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f023aa2f Branch: refs/heads/master Commit: f023aa2fcc1d1dbb82aee568be0a8f2457c309ae Parents: d9c25de Author: zsxwing zsxw...@gmail.com Authored: Mon Aug 24 23:34:50 2015 -0700 Committer: Tathagata Das tathagata.das1...@gmail.com Committed: Mon Aug 24 23:34:50 2015 -0700 -- .../scheduler/ReceiverSchedulingPolicy.scala| 58 +++--- .../streaming/scheduler/ReceiverTracker.scala | 106 --- .../ReceiverSchedulingPolicySuite.scala | 13 +-- 3 files changed, 120 insertions(+), 57 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f023aa2f/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala -- diff --git a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala index ef5b687..10b5a7f 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/scheduler/ReceiverSchedulingPolicy.scala @@ -22,6 +22,36 @@ import scala.collection.mutable import org.apache.spark.streaming.receiver.Receiver +/** + * A class that tries to schedule receivers with evenly distributed. There are two phases for + * scheduling receivers. + * + * - The first phase is global scheduling when ReceiverTracker is starting and we need to schedule + * all receivers at the same time. ReceiverTracker will call `scheduleReceivers` at this phase. + * It will try to schedule receivers with evenly distributed. 
ReceiverTracker should update its + receiverTrackingInfoMap according to the results of `scheduleReceivers`. + `ReceiverTrackingInfo.scheduledExecutors` for each receiver will be set to an executor list that + contains the scheduled locations. Then when a receiver is starting, it will send a register + request and `ReceiverTracker.registerReceiver` will be called. In + `ReceiverTracker.registerReceiver`, if a receiver's scheduled executors are set, it should check + whether the location of this receiver is one of the scheduled executors; if not, the registration + will be rejected. + - The second phase is local scheduling when a receiver is restarting. There are two cases of + receiver restarting: + - If a receiver is restarting because it was rejected due to a mismatch between its real + location and its scheduled executors, in other words, it failed to start in one of the locations + that `scheduleReceivers` suggested, `ReceiverTracker` should first choose the executors that are + still alive in the list of scheduled executors, then use them to launch the receiver job. + - If a receiver is restarting without a scheduled executors list,
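A minimal sketch of the min-weight selection that fixes case 2) above, assuming a simplified per-executor weight map; the names below are illustrative, not the actual ReceiverSchedulingPolicy API:

// Hypothetical helper illustrating the fix for case 2): return only the
// executors tied for the minimum scheduled weight, instead of padding the
// result out to at least 3 executors.
def leastLoadedExecutors(executorWeights: Map[String, Double]): Seq[String] = {
  if (executorWeights.isEmpty) {
    Seq.empty
  } else {
    val minWeight = executorWeights.values.min
    executorWeights.collect { case (host, weight) if weight == minWeight => host }.toSeq
  }
}

// With r1 already running on host1, Map("host1" -> 1.0, "host2" -> 0.0, "host3" -> 0.0)
// yields Seq("host2", "host3"), so a restarting r2 can no longer land on host1.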
spark git commit: [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration
Repository: spark Updated Branches: refs/heads/branch-1.5 742c82ed9 - c740f5dd2 [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration See [discussion](https://github.com/apache/spark/pull/8254#discussion_r37837770) CC jkbradley Author: Feynman Liang fli...@databricks.com Closes #8422 from feynmanliang/SPARK-10230. (cherry picked from commit 881208a8e849facf54166bdd69d3634407f952e7) Signed-off-by: Joseph K. Bradley jos...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c740f5dd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c740f5dd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c740f5dd Branch: refs/heads/branch-1.5 Commit: c740f5dd20459b491a8c088383c19c11a76c225d Parents: 742c82e Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 25 11:58:47 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 25 11:58:55 2015 -0700 -- .../spark/mllib/clustering/LDAOptimizer.scala | 16 .../apache/spark/mllib/clustering/LDASuite.scala| 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c740f5dd/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 5c2aae6..38486e9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -258,7 +258,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { private var tau0: Double = 1024 private var kappa: Double = 0.51 private var miniBatchFraction: Double = 0.05 - private var optimizeAlpha: Boolean = false + private var optimizeDocConcentration: Boolean = false // internal data structure private var docs: RDD[(Long, Vector)] = null @@ -335,20 +335,20 @@ final class OnlineLDAOptimizer extends LDAOptimizer { } /** - * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution) - * will be optimized during training. + * Optimize docConcentration, indicates whether docConcentration (Dirichlet parameter for + * document-topic distribution) will be optimized during training. */ @Since(1.5.0) - def getOptimzeAlpha: Boolean = this.optimizeAlpha + def getOptimizeDocConcentration: Boolean = this.optimizeDocConcentration /** - * Sets whether to optimize alpha parameter during training. + * Sets whether to optimize docConcentration parameter during training. 
* * Default: false */ @Since(1.5.0) - def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = { -this.optimizeAlpha = optimizeAlpha + def setOptimizeDocConcentration(optimizeDocConcentration: Boolean): this.type = { +this.optimizeDocConcentration = optimizeDocConcentration this } @@ -458,7 +458,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { // Note that this is an optimization to avoid batch.count updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt) -if (optimizeDocConcentration) updateAlpha(gammat) this } http://git-wip-us.apache.org/repos/asf/spark/blob/c740f5dd/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 8a714f9..746a76a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -423,7 +423,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { val k = 2 val docs = sc.parallelize(toyData) val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51) - .setGammaShape(100).setOptimzeAlpha(true).setSampleWithReplacement(false) + .setGammaShape(100).setOptimizeDocConcentration(true).setSampleWithReplacement(false) val lda = new LDA().setK(k) .setDocConcentration(1D / k) .setTopicConcentration(0.01)
spark git commit: [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration
Repository: spark Updated Branches: refs/heads/master b37f0cc1b - 881208a8e [SPARK-10230] [MLLIB] Rename optimizeAlpha to optimizeDocConcentration See [discussion](https://github.com/apache/spark/pull/8254#discussion_r37837770) CC jkbradley Author: Feynman Liang fli...@databricks.com Closes #8422 from feynmanliang/SPARK-10230. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/881208a8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/881208a8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/881208a8 Branch: refs/heads/master Commit: 881208a8e849facf54166bdd69d3634407f952e7 Parents: b37f0cc Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 25 11:58:47 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 25 11:58:47 2015 -0700 -- .../spark/mllib/clustering/LDAOptimizer.scala | 16 .../apache/spark/mllib/clustering/LDASuite.scala| 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/881208a8/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 5c2aae6..38486e9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -258,7 +258,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { private var tau0: Double = 1024 private var kappa: Double = 0.51 private var miniBatchFraction: Double = 0.05 - private var optimizeAlpha: Boolean = false + private var optimizeDocConcentration: Boolean = false // internal data structure private var docs: RDD[(Long, Vector)] = null @@ -335,20 +335,20 @@ final class OnlineLDAOptimizer extends LDAOptimizer { } /** - * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution) - * will be optimized during training. + * Optimize docConcentration, indicates whether docConcentration (Dirichlet parameter for + * document-topic distribution) will be optimized during training. */ @Since(1.5.0) - def getOptimzeAlpha: Boolean = this.optimizeAlpha + def getOptimizeDocConcentration: Boolean = this.optimizeDocConcentration /** - * Sets whether to optimize alpha parameter during training. + * Sets whether to optimize docConcentration parameter during training. 
* * Default: false */ @Since(1.5.0) - def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = { -this.optimizeAlpha = optimizeAlpha + def setOptimizeDocConcentration(optimizeDocConcentration: Boolean): this.type = { +this.optimizeDocConcentration = optimizeDocConcentration this } @@ -458,7 +458,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { // Note that this is an optimization to avoid batch.count updateLambda(batchResult, (miniBatchFraction * corpusSize).ceil.toInt) -if (optimizeDocConcentration) updateAlpha(gammat) this } http://git-wip-us.apache.org/repos/asf/spark/blob/881208a8/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 8a714f9..746a76a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -423,7 +423,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { val k = 2 val docs = sc.parallelize(toyData) val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51) - .setGammaShape(100).setOptimzeAlpha(true).setSampleWithReplacement(false) + .setGammaShape(100).setOptimizeDocConcentration(true).setSampleWithReplacement(false) val lda = new LDA().setK(k) .setDocConcentration(1D / k) .setTopicConcentration(0.01)
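For reference, a small usage sketch of the renamed API on Spark 1.5; the optimizer settings besides the rename are arbitrary:

import org.apache.spark.mllib.clustering.{LDA, OnlineLDAOptimizer}

// The renamed setter in use; before this commit the (misspelled) name was
// setOptimzeAlpha. Parameter values here are arbitrary placeholders.
val optimizer = new OnlineLDAOptimizer()
  .setMiniBatchFraction(0.05)
  .setOptimizeDocConcentration(true)
val lda = new LDA().setK(10).setOptimizer(optimizer)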
spark git commit: [SPARK-10231] [MLLIB] update @Since annotation for mllib.classification
Repository: spark Updated Branches: refs/heads/master 881208a8e - 16a2be1a8 [SPARK-10231] [MLLIB] update @Since annotation for mllib.classification Update `Since` annotation in `mllib.classification`: 1. add version to classes, objects, constructors, and public variables declared in constructors 2. correct some versions 3. remove `Since` on `toString` MechCoder dbtsai Author: Xiangrui Meng m...@databricks.com Closes #8421 from mengxr/SPARK-10231 and squashes the following commits: b2dce80 [Xiangrui Meng] update @Since annotation for mllib.classification Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/16a2be1a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/16a2be1a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/16a2be1a Branch: refs/heads/master Commit: 16a2be1a84c0a274a60c0a584faaf58b55d4942b Parents: 881208a Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 12:16:23 2015 -0700 Committer: DB Tsai d...@netflix.com Committed: Tue Aug 25 12:16:23 2015 -0700 -- .../classification/ClassificationModel.scala| 7 ++--- .../classification/LogisticRegression.scala | 20 +- .../spark/mllib/classification/NaiveBayes.scala | 28 +++- .../apache/spark/mllib/classification/SVM.scala | 15 +++ .../StreamingLogisticRegressionWithSGD.scala| 9 ++- 5 files changed, 58 insertions(+), 21 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/16a2be1a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala index a29b425..85a4132 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/ClassificationModel.scala @@ -30,6 +30,7 @@ import org.apache.spark.rdd.RDD * belongs. The categories are represented by double values: 0.0, 1.0, 2.0, etc. */ @Experimental +@Since(0.8.0) trait ClassificationModel extends Serializable { /** * Predict values for the given data set using the model trained. 
@@ -37,7 +38,7 @@ trait ClassificationModel extends Serializable { * @param testData RDD representing data points to be predicted * @return an RDD[Double] where each entry contains the corresponding prediction */ - @Since(0.8.0) + @Since(1.0.0) def predict(testData: RDD[Vector]): RDD[Double] /** @@ -46,7 +47,7 @@ trait ClassificationModel extends Serializable { * @param testData array representing a single data point * @return predicted category from the trained model */ - @Since(0.8.0) + @Since(1.0.0) def predict(testData: Vector): Double /** @@ -54,7 +55,7 @@ trait ClassificationModel extends Serializable { * @param testData JavaRDD representing data points to be predicted * @return a JavaRDD[java.lang.Double] where each entry contains the corresponding prediction */ - @Since(0.8.0) + @Since(1.0.0) def predict(testData: JavaRDD[Vector]): JavaRDD[java.lang.Double] = predict(testData.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Double]] } http://git-wip-us.apache.org/repos/asf/spark/blob/16a2be1a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala index e03e662..5ceff5b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/LogisticRegression.scala @@ -41,11 +41,12 @@ import org.apache.spark.rdd.RDD * Multinomial Logistic Regression. By default, it is binary logistic regression * so numClasses will be set to 2. */ -class LogisticRegressionModel ( -override val weights: Vector, -override val intercept: Double, -val numFeatures: Int, -val numClasses: Int) +@Since(0.8.0) +class LogisticRegressionModel @Since(1.3.0) ( +@Since(1.0.0) override val weights: Vector, +@Since(1.0.0) override val intercept: Double, +@Since(1.3.0) val numFeatures: Int, +@Since(1.3.0) val numClasses: Int) extends GeneralizedLinearModel(weights, intercept) with ClassificationModel with Serializable with Saveable with PMMLExportable { @@ -75,6 +76,7 @@ class LogisticRegressionModel ( /** * Constructs a
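The annotation pattern this commit applies, shown on a made-up model class; only the placement of @Since on the class, the primary constructor, and the constructor vals mirrors the diff. Note that Since is private[spark], so this pattern compiles only inside Spark's own source tree:

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.Vector

// MyModel and its fields are hypothetical; the point is where the
// version annotations go: class, constructor, and each public val.
@Since("0.8.0")
class MyModel @Since("1.3.0") (
    @Since("1.0.0") val weights: Vector,
    @Since("1.0.0") val intercept: Double)
  extends Serializable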
spark git commit: [DOC] add missing parameters in SparkContext.scala for scala doc
Repository: spark Updated Branches: refs/heads/master 0e6368ffa - 5c1489015 [DOC] add missing parameters in SparkContext.scala for scala doc Author: Zhang, Liye liye.zh...@intel.com Closes #8412 from liyezhang556520/minorDoc. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c148901 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c148901 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c148901 Branch: refs/heads/master Commit: 5c14890159a5711072bf395f662b2433a389edf9 Parents: 0e6368f Author: Zhang, Liye liye.zh...@intel.com Authored: Tue Aug 25 11:48:55 2015 +0100 Committer: Sean Owen so...@cloudera.com Committed: Tue Aug 25 11:48:55 2015 +0100 -- .../main/scala/org/apache/spark/SparkContext.scala | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5c148901/core/src/main/scala/org/apache/spark/SparkContext.scala -- diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala index 1ddaca8..9849aff 100644 --- a/core/src/main/scala/org/apache/spark/SparkContext.scala +++ b/core/src/main/scala/org/apache/spark/SparkContext.scala @@ -114,6 +114,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * :: DeveloperApi :: * Alternative constructor for setting preferred locations where Spark will create executors. * + * @param config a [[org.apache.spark.SparkConf]] object specifying other Spark parameters * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] * from a list of input files or InputFormats for the application. @@ -145,6 +146,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @param jars Collection of JARs to send to the cluster. These can be paths on the local file * system or HDFS, HTTP, HTTPS, or FTP URLs. * @param environment Environment variables to set on worker nodes. + * @param preferredNodeLocationData used in YARN mode to select nodes to launch containers on. + * Can be generated using [[org.apache.spark.scheduler.InputFormatInfo.computePreferredLocations]] + * from a list of input files or InputFormats for the application. */ def this( master: String, @@ -841,6 +845,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Small files are preferred, large file is also allowable, but may cause bad performance. * @note On some filesystems, `.../path/#42;` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param minPartitions A suggestion value of the minimal splitting number for input data. */ def wholeTextFiles( @@ -889,6 +896,9 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * @note Small files are preferred; very large files may cause bad performance. * @note On some filesystems, `.../path/#42;` can be a more efficient way to read all files * in a directory rather than `.../path/` or `.../path` + * + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param minPartitions A suggestion value of the minimal splitting number for input data. 
*/ @Experimental @@ -918,8 +928,11 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli * '''Note:''' We ensure that the byte array for each record in the resulting RDD * has the provided record length. * - * @param path Directory to the input data files + * @param path Directory to the input data files, the path can be comma separated paths as the + * list of inputs. * @param recordLength The length at which to split the records + * @param conf Configuration for setting up the dataset. + * * @return An RDD of data with values, represented as byte arrays */ @Experimental
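A usage sketch of the methods whose parameters this commit documents; all paths are placeholders:

import org.apache.spark.{SparkConf, SparkContext}

// Placeholder context; master/app name are arbitrary.
val sc = new SparkContext(new SparkConf().setAppName("scaladoc-example").setMaster("local[2]"))

// `path` accepts a comma-separated list of directories; `minPartitions` is a
// suggested minimum number of input splits.
val files = sc.wholeTextFiles("/data/dir1,/data/dir2", minPartitions = 8)

// `recordLength` is the fixed size of each returned byte array; `conf`
// defaults to the context's Hadoop configuration.
val records = sc.binaryRecords("/data/fixed.bin", recordLength = 512)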
spark git commit: [SPARK-10245] [SQL] Fix decimal literals with precision < scale
Repository: spark Updated Branches: refs/heads/master 00ae4be97 - ec89bd840 [SPARK-10245] [SQL] Fix decimal literals with precision < scale In BigDecimal or java.math.BigDecimal, the precision could be smaller than the scale; for example, BigDecimal(0.001) has precision = 1 and scale = 3. But DecimalType requires that the precision be at least the scale, so we should use the maximum of precision and scale when inferring the schema from a decimal literal. Author: Davies Liu dav...@databricks.com Closes #8428 from davies/smaller_decimal. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec89bd84 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec89bd84 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec89bd84 Branch: refs/heads/master Commit: ec89bd840a6862751999d612f586a962cae63f6d Parents: 00ae4be Author: Davies Liu dav...@databricks.com Authored: Tue Aug 25 14:55:34 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 25 14:55:34 2015 -0700 -- .../apache/spark/sql/catalyst/expressions/literals.scala | 7 --- .../sql/catalyst/expressions/LiteralExpressionSuite.scala | 8 +--- .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 10 ++ 3 files changed, 19 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ec89bd84/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 34bad23..8c0c5d5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -36,9 +36,10 @@ object Literal { case s: Short = Literal(s, ShortType) case s: String = Literal(UTF8String.fromString(s), StringType) case b: Boolean = Literal(b, BooleanType) -case d: BigDecimal = Literal(Decimal(d), DecimalType(d.precision, d.scale)) -case d: java.math.BigDecimal = Literal(Decimal(d), DecimalType(d.precision(), d.scale())) -case d: Decimal = Literal(d, DecimalType(d.precision, d.scale)) +case d: BigDecimal = Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) +case d: java.math.BigDecimal = + Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) +case d: Decimal = Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp = Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date = Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] = Literal(a, BinaryType) http://git-wip-us.apache.org/repos/asf/spark/blob/ec89bd84/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index f6404d2..015eb18 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -83,12 +83,14 @@ class LiteralExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { } test(decimal) { -List(0.0, 1.2, 1., 5).foreach
{ d = checkEvaluation(Literal(Decimal(d)), Decimal(d)) checkEvaluation(Literal(Decimal(d.toInt)), Decimal(d.toInt)) checkEvaluation(Literal(Decimal(d.toLong)), Decimal(d.toLong)) - checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 1)), -Decimal((d * 1000L).toLong, 10, 1)) + checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 3)), +Decimal((d * 1000L).toLong, 10, 3)) + checkEvaluation(Literal(BigDecimal(d.toString)), Decimal(d)) + checkEvaluation(Literal(new java.math.BigDecimal(d.toString)), Decimal(d)) } } http://git-wip-us.apache.org/repos/asf/spark/blob/ec89bd84/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index dcb4e83..aa07665 100644 ---
spark git commit: [SPARK-10245] [SQL] Fix decimal literals with precision < scale
Repository: spark Updated Branches: refs/heads/branch-1.5 6f05b7aeb - 8925896b1 [SPARK-10245] [SQL] Fix decimal literals with precision < scale In BigDecimal or java.math.BigDecimal, the precision could be smaller than the scale; for example, BigDecimal(0.001) has precision = 1 and scale = 3. But DecimalType requires that the precision be at least the scale, so we should use the maximum of precision and scale when inferring the schema from a decimal literal. Author: Davies Liu dav...@databricks.com Closes #8428 from davies/smaller_decimal. (cherry picked from commit ec89bd840a6862751999d612f586a962cae63f6d) Signed-off-by: Yin Huai yh...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8925896b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8925896b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8925896b Branch: refs/heads/branch-1.5 Commit: 8925896b1eb0a13d723d38fb263d3bec0a01ec10 Parents: 6f05b7a Author: Davies Liu dav...@databricks.com Authored: Tue Aug 25 14:55:34 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 25 14:55:45 2015 -0700 -- .../apache/spark/sql/catalyst/expressions/literals.scala | 7 --- .../sql/catalyst/expressions/LiteralExpressionSuite.scala | 8 +--- .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 10 ++ 3 files changed, 19 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8925896b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala index 34bad23..8c0c5d5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala @@ -36,9 +36,10 @@ object Literal { case s: Short = Literal(s, ShortType) case s: String = Literal(UTF8String.fromString(s), StringType) case b: Boolean = Literal(b, BooleanType) -case d: BigDecimal = Literal(Decimal(d), DecimalType(d.precision, d.scale)) -case d: java.math.BigDecimal = Literal(Decimal(d), DecimalType(d.precision(), d.scale())) -case d: Decimal = Literal(d, DecimalType(d.precision, d.scale)) +case d: BigDecimal = Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale)) +case d: java.math.BigDecimal = + Literal(Decimal(d), DecimalType(Math.max(d.precision, d.scale), d.scale())) +case d: Decimal = Literal(d, DecimalType(Math.max(d.precision, d.scale), d.scale)) case t: Timestamp = Literal(DateTimeUtils.fromJavaTimestamp(t), TimestampType) case d: Date = Literal(DateTimeUtils.fromJavaDate(d), DateType) case a: Array[Byte] = Literal(a, BinaryType) http://git-wip-us.apache.org/repos/asf/spark/blob/8925896b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala index f6404d2..015eb18 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/LiteralExpressionSuite.scala @@ -83,12 +83,14 @@ class LiteralExpressionSuite extends SparkFunSuite with
ExpressionEvalHelper { } test(decimal) { -List(0.0, 1.2, 1., 5).foreach { d = +List(-0.0001, 0.0, 0.001, 1.2, 1., 5).foreach { d = checkEvaluation(Literal(Decimal(d)), Decimal(d)) checkEvaluation(Literal(Decimal(d.toInt)), Decimal(d.toInt)) checkEvaluation(Literal(Decimal(d.toLong)), Decimal(d.toLong)) - checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 1)), -Decimal((d * 1000L).toLong, 10, 1)) + checkEvaluation(Literal(Decimal((d * 1000L).toLong, 10, 3)), +Decimal((d * 1000L).toLong, 10, 3)) + checkEvaluation(Literal(BigDecimal(d.toString)), Decimal(d)) + checkEvaluation(Literal(new java.math.BigDecimal(d.toString)), Decimal(d)) } } http://git-wip-us.apache.org/repos/asf/spark/blob/8925896b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
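A small standalone illustration of the inference rule both copies of this commit introduce; DecimalType itself is not constructed here, only the (precision, scale) pair the literal's type would be built from:

// java.math.BigDecimal can report precision < scale (e.g. "0.001"), but
// Spark's DecimalType needs precision >= scale, hence Math.max.
val d = new java.math.BigDecimal("0.001")
val precision = math.max(d.precision, d.scale) // 3, although d.precision == 1
val scale = d.scale                            // 3
println(s"DecimalType($precision, $scale)")    // prints: DecimalType(3, 3)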
spark git commit: [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive)
Repository: spark Updated Branches: refs/heads/master ec89bd840 - 7467b52ed [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive) Follow the rule in Hive for decimal division. see https://github.com/apache/hive/blob/ac755ebe26361a4647d53db2a28500f71697b276/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPDivide.java#L113 cc chenghao-intel Author: Davies Liu dav...@databricks.com Closes #8415 from davies/decimal_div2. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7467b52e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7467b52e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7467b52e Branch: refs/heads/master Commit: 7467b52ed07f174d93dfc4cb544dc4b69a2c2826 Parents: ec89bd8 Author: Davies Liu dav...@databricks.com Authored: Tue Aug 25 15:19:41 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 25 15:20:24 2015 -0700 -- .../catalyst/analysis/HiveTypeCoercion.scala| 10 ++-- .../sql/catalyst/analysis/AnalysisSuite.scala | 9 --- .../analysis/DecimalPrecisionSuite.scala| 8 +++ .../org/apache/spark/sql/SQLQuerySuite.scala| 25 ++-- 4 files changed, 39 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7467b52e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index a1aa2a2..87c11ab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -396,8 +396,14 @@ object HiveTypeCoercion { resultType) case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) = - val resultType = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 1), -max(6, s1 + p2 + 1)) + var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2) + var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1)) + val diff = (intDig + decDig) - DecimalType.MAX_SCALE + if (diff 0) { +decDig -= diff / 2 + 1 +intDig = DecimalType.MAX_SCALE - decDig + } + val resultType = DecimalType.bounded(intDig + decDig, decDig) val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Divide(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) http://git-wip-us.apache.org/repos/asf/spark/blob/7467b52e/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 1e0cc81..820b336 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -17,15 +17,14 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ -import org.apache.spark.sql.catalyst.SimpleCatalystConf -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.dsl.plans._ class 
AnalysisSuite extends AnalysisTest { - import TestRelations._ + import org.apache.spark.sql.catalyst.analysis.TestRelations._ test(union project *) { val plan = (1 to 100) @@ -96,7 +95,7 @@ class AnalysisSuite extends AnalysisTest { assert(pl(1).dataType == DoubleType) assert(pl(2).dataType == DoubleType) // StringType will be promoted into Decimal(38, 18) -assert(pl(3).dataType == DecimalType(38, 29)) +assert(pl(3).dataType == DecimalType(38, 22)) assert(pl(4).dataType == DoubleType) } http://git-wip-us.apache.org/repos/asf/spark/blob/7467b52e/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala
spark git commit: [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive)
Repository: spark Updated Branches: refs/heads/branch-1.5 8925896b1 - ab7d46d1d [SPARK-10215] [SQL] Fix precision of division (follow the rule in Hive) Follow the rule in Hive for decimal division. see https://github.com/apache/hive/blob/ac755ebe26361a4647d53db2a28500f71697b276/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFOPDivide.java#L113 cc chenghao-intel Author: Davies Liu dav...@databricks.com Closes #8415 from davies/decimal_div2. (cherry picked from commit 7467b52ed07f174d93dfc4cb544dc4b69a2c2826) Signed-off-by: Yin Huai yh...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ab7d46d1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ab7d46d1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ab7d46d1 Branch: refs/heads/branch-1.5 Commit: ab7d46d1d6e7e6705a3348a0cab2d05fe62951cf Parents: 8925896 Author: Davies Liu dav...@databricks.com Authored: Tue Aug 25 15:19:41 2015 -0700 Committer: Yin Huai yh...@databricks.com Committed: Tue Aug 25 15:20:42 2015 -0700 -- .../catalyst/analysis/HiveTypeCoercion.scala| 10 ++-- .../sql/catalyst/analysis/AnalysisSuite.scala | 9 --- .../analysis/DecimalPrecisionSuite.scala| 8 +++ .../org/apache/spark/sql/SQLQuerySuite.scala| 25 ++-- 4 files changed, 39 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ab7d46d1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala index a1aa2a2..87c11ab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/HiveTypeCoercion.scala @@ -396,8 +396,14 @@ object HiveTypeCoercion { resultType) case Divide(e1 @ DecimalType.Expression(p1, s1), e2 @ DecimalType.Expression(p2, s2)) = - val resultType = DecimalType.bounded(p1 - s1 + s2 + max(6, s1 + p2 + 1), -max(6, s1 + p2 + 1)) + var intDig = min(DecimalType.MAX_SCALE, p1 - s1 + s2) + var decDig = min(DecimalType.MAX_SCALE, max(6, s1 + p2 + 1)) + val diff = (intDig + decDig) - DecimalType.MAX_SCALE + if (diff 0) { +decDig -= diff / 2 + 1 +intDig = DecimalType.MAX_SCALE - decDig + } + val resultType = DecimalType.bounded(intDig + decDig, decDig) val widerType = widerDecimalType(p1, s1, p2, s2) CheckOverflow(Divide(promotePrecision(e1, widerType), promotePrecision(e2, widerType)), resultType) http://git-wip-us.apache.org/repos/asf/spark/blob/ab7d46d1/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala -- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 1e0cc81..820b336 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ -17,15 +17,14 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.types._ -import 
org.apache.spark.sql.catalyst.SimpleCatalystConf -import org.apache.spark.sql.catalyst.dsl.expressions._ -import org.apache.spark.sql.catalyst.dsl.plans._ class AnalysisSuite extends AnalysisTest { - import TestRelations._ + import org.apache.spark.sql.catalyst.analysis.TestRelations._ test(union project *) { val plan = (1 to 100) @@ -96,7 +95,7 @@ class AnalysisSuite extends AnalysisTest { assert(pl(1).dataType == DoubleType) assert(pl(2).dataType == DoubleType) // StringType will be promoted into Decimal(38, 18) -assert(pl(3).dataType == DecimalType(38, 29)) +assert(pl(3).dataType == DecimalType(38, 22)) assert(pl(4).dataType == DoubleType) } http://git-wip-us.apache.org/repos/asf/spark/blob/ab7d46d1/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/DecimalPrecisionSuite.scala -- diff --git
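The adopted Hive rule, restated as a standalone function. In the source, the diff's condition reads `if (diff > 0)` (the '>' does not survive the archive rendering); MAX below stands in for DecimalType.MAX_SCALE, which is 38 in Spark 1.5:

// Sketch of the division typing rule: cap the ideal integer and fractional
// digit counts at MAX, and when their sum still overflows, shave digits
// (mostly from the fractional side) until the result fits.
val MAX = 38
def divideResultType(p1: Int, s1: Int, p2: Int, s2: Int): (Int, Int) = {
  var intDig = math.min(MAX, p1 - s1 + s2)             // digits before the point
  var decDig = math.min(MAX, math.max(6, s1 + p2 + 1)) // digits after the point
  val diff = (intDig + decDig) - MAX
  if (diff > 0) {
    decDig -= diff / 2 + 1
    intDig = MAX - decDig
  }
  (intDig + decDig, decDig)                            // (precision, scale)
}

// e.g. Decimal(38, 18) / Decimal(10, 0) gives (38, 23) under this rule.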
spark git commit: [SPARK-10048] [SPARKR] Support arbitrary nested Java array in serde.
Repository: spark Updated Branches: refs/heads/master 16a2be1a8 - 71a138cd0 [SPARK-10048] [SPARKR] Support arbitrary nested Java array in serde. This PR: 1. supports transferring arbitrary nested arrays from the JVM to the R side in SerDe; 2. based on 1, the collect() implementation is improved. Now it can support collecting data of complex types from a DataFrame. Author: Sun Rui rui@intel.com Closes #8276 from sun-rui/SPARK-10048. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/71a138cd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/71a138cd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/71a138cd Branch: refs/heads/master Commit: 71a138cd0e0a14e8426f97877e3b52a562bbd02c Parents: 16a2be1 Author: Sun Rui rui@intel.com Authored: Tue Aug 25 13:14:10 2015 -0700 Committer: Shivaram Venkataraman shiva...@cs.berkeley.edu Committed: Tue Aug 25 13:14:10 2015 -0700 -- R/pkg/R/DataFrame.R | 55 ++--- R/pkg/R/deserialize.R | 72 +++- R/pkg/R/serialize.R | 10 +-- R/pkg/inst/tests/test_Serde.R | 77 ++ R/pkg/inst/worker/worker.R | 4 +- .../apache/spark/api/r/RBackendHandler.scala| 7 ++ .../scala/org/apache/spark/api/r/SerDe.scala| 86 .../org/apache/spark/sql/api/r/SQLUtils.scala | 32 +--- 8 files changed, 216 insertions(+), 127 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/71a138cd/R/pkg/R/DataFrame.R -- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 10f3c4e..ae1d912 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -652,18 +652,49 @@ setMethod(dim, setMethod(collect, signature(x = DataFrame), function(x, stringsAsFactors = FALSE) { -# listCols is a list of raw vectors, one per column -listCols - callJStatic(org.apache.spark.sql.api.r.SQLUtils, dfToCols, x@sdf) -cols - lapply(listCols, function(col) { - objRaw - rawConnection(col) - numRows - readInt(objRaw) - col - readCol(objRaw, numRows) - close(objRaw) - col -}) -names(cols) - columns(x) -do.call(cbind.data.frame, list(cols, stringsAsFactors = stringsAsFactors)) - }) +names - columns(x) +ncol - length(names) +if (ncol = 0) { + # empty data.frame with 0 columns and 0 rows + data.frame() +} else { + # listCols is a list of columns + listCols - callJStatic(org.apache.spark.sql.api.r.SQLUtils, dfToCols, x@sdf) + stopifnot(length(listCols) == ncol) + + # An empty data.frame with 0 columns and number of rows as collected + nrow - length(listCols[[1]]) + if (nrow = 0) { +df - data.frame() + } else { +df - data.frame(row.names = 1 : nrow) + } + + # Append columns one by one + for (colIndex in 1 : ncol) { +# Note: appending a column of list type into a data.frame so that +# data of complex type can be held. But getting a cell from a column +# of list type returns a list instead of a vector. So for columns of +# non-complex type, append them as vector. +col - listCols[[colIndex]] +if (length(col) = 0) { + df[[names[colIndex]]] - col +} else { + # TODO: more robust check on column of primitive types + vec - do.call(c, col) + if (class(vec) != list) { +df[[names[colIndex]]] - vec + } else { +# For columns of complex type, be careful to access them. +# Get a column of complex type returns a list. +# Get a cell from a column of complex type returns a list instead of a vector.
+df[[names[colIndex]]] - col + } + } +} +df + } +}) #' Limit #' http://git-wip-us.apache.org/repos/asf/spark/blob/71a138cd/R/pkg/R/deserialize.R -- diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 33bf13e..6cf628e 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -48,6 +48,7 @@ readTypedObject - function(con, type) { r = readRaw(con),
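A generic JVM-side sketch of the idea behind arbitrary nesting: serialize an array by writing its length and then recursing into each element, so arrays of arrays need no special-casing. This is an illustration only, with made-up tag bytes; it is not Spark's actual SerDe API:

import java.io.{ByteArrayOutputStream, DataOutputStream}

// Hypothetical recursive writer; the tag bytes ('l', 'i', 'd', 'c') are
// invented for this sketch.
def writeNested(out: DataOutputStream, value: Any): Unit = value match {
  case arr: Array[_] =>
    out.writeByte('l')                 // "list" tag, then length, then elements
    out.writeInt(arr.length)
    arr.foreach(writeNested(out, _))   // recursion handles any nesting depth
  case i: Int =>
    out.writeByte('i'); out.writeInt(i)
  case d: Double =>
    out.writeByte('d'); out.writeDouble(d)
  case s: String =>
    val bytes = s.getBytes("UTF-8")
    out.writeByte('c'); out.writeInt(bytes.length); out.write(bytes)
  case other =>
    sys.error(s"unsupported type: ${other.getClass}")
}

val bout = new ByteArrayOutputStream()
writeNested(new DataOutputStream(bout), Array(Array(1, 2), Array(3)))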
spark git commit: [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias
Repository: spark Updated Branches: refs/heads/branch-1.5 5a32ed75c - 95e44b4df [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias * Adds doc for alias of runMiniBatchSGD documenting default value for convergenceTol * Cleans up a note in code Author: Feynman Liang fli...@databricks.com Closes #8425 from feynmanliang/SPARK-9800. (cherry picked from commit c0e9ff1588b4d9313cc6ec6e00e5c7663eb67910) Signed-off-by: Joseph K. Bradley jos...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/95e44b4d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/95e44b4d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/95e44b4d Branch: refs/heads/branch-1.5 Commit: 95e44b4df81b09803be2fde8c4e2566be0c8fdbc Parents: 5a32ed7 Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 25 13:21:05 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 25 13:21:16 2015 -0700 -- .../org/apache/spark/mllib/optimization/GradientDescent.scala | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/95e44b4d/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index 8f0d1e4..3b663b5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -235,7 +235,7 @@ object GradientDescent extends Logging { if (miniBatchSize 0) { /** - * NOTE(Xinghao): lossSum is computed using the weights from the previous iteration + * lossSum is computed using the weights from the previous iteration * and regVal is the regularization value computed in the previous iteration as well. */ stochasticLossHistory.append(lossSum / miniBatchSize + regVal) @@ -264,6 +264,9 @@ object GradientDescent extends Logging { } + /** + * Alias of [[runMiniBatchSGD]] with convergenceTol set to default value of 0.001. + */ def runMiniBatchSGD( data: RDD[(Double, Vector)], gradient: Gradient,
spark git commit: [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias
Repository: spark Updated Branches: refs/heads/master 71a138cd0 - c0e9ff158 [SPARK-9800] Adds docs for GradientDescent$.runMiniBatchSGD alias * Adds doc for alias of runMiniBatchSGD documenting default value for convergenceTol * Cleans up a note in code Author: Feynman Liang fli...@databricks.com Closes #8425 from feynmanliang/SPARK-9800. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0e9ff15 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0e9ff15 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0e9ff15 Branch: refs/heads/master Commit: c0e9ff1588b4d9313cc6ec6e00e5c7663eb67910 Parents: 71a138c Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 25 13:21:05 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 25 13:21:05 2015 -0700 -- .../org/apache/spark/mllib/optimization/GradientDescent.scala | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c0e9ff15/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala index 8f0d1e4..3b663b5 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/GradientDescent.scala @@ -235,7 +235,7 @@ object GradientDescent extends Logging { if (miniBatchSize 0) { /** - * NOTE(Xinghao): lossSum is computed using the weights from the previous iteration + * lossSum is computed using the weights from the previous iteration * and regVal is the regularization value computed in the previous iteration as well. */ stochasticLossHistory.append(lossSum / miniBatchSize + regVal) @@ -264,6 +264,9 @@ object GradientDescent extends Logging { } + /** + * Alias of [[runMiniBatchSGD]] with convergenceTol set to default value of 0.001. + */ def runMiniBatchSGD( data: RDD[(Double, Vector)], gradient: Gradient,
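The documented alias, reduced to its essence: the shorter overload forwards to the full one with convergenceTol fixed at 0.001. The signatures below are simplified stand-ins, not the real MLlib ones, which also take gradient, updater, step size, and related parameters:

// Minimal sketch of the overload-delegation pattern the new doc describes.
object MiniBatchSGDLike {
  def run(data: Seq[(Double, Double)], numIterations: Int): Array[Double] =
    run(data, numIterations, convergenceTol = 0.001)

  def run(data: Seq[(Double, Double)], numIterations: Int,
      convergenceTol: Double): Array[Double] = {
    // The real optimizer loop lives here; elided because only the
    // delegation (and the 0.001 default) is of interest.
    Array.empty[Double]
  }
}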
spark git commit: [SPARK-10237] [MLLIB] update since versions in mllib.fpm
Repository: spark Updated Branches: refs/heads/branch-1.5 95e44b4df - 186326df2 [SPARK-10237] [MLLIB] update since versions in mllib.fpm Same as #8421 but for `mllib.fpm`. cc feynmanliang Author: Xiangrui Meng m...@databricks.com Closes #8429 from mengxr/SPARK-10237. (cherry picked from commit c619c7552f22d28cfa321ce671fc9ca854dd655f) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/186326df Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/186326df Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/186326df Branch: refs/heads/branch-1.5 Commit: 186326df21daf8d8271a522f2569eb5cd7be1442 Parents: 95e44b4 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 13:22:38 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 25 13:22:45 2015 -0700 -- .../spark/mllib/fpm/AssociationRules.scala | 7 -- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 9 ++-- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 23 +--- 3 files changed, 32 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/186326df/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index ba3b447..95c688c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -82,12 +82,15 @@ class AssociationRules private[fpm] ( }.filter(_.confidence = minConfidence) } + /** Java-friendly version of [[run]]. */ + @Since(1.5.0) def run[Item](freqItemsets: JavaRDD[FreqItemset[Item]]): JavaRDD[Rule[Item]] = { val tag = fakeClassTag[Item] run(freqItemsets.rdd)(tag) } } +@Since(1.5.0) object AssociationRules { /** @@ -104,8 +107,8 @@ object AssociationRules { @Since(1.5.0) @Experimental class Rule[Item] private[fpm] ( - val antecedent: Array[Item], - val consequent: Array[Item], + @Since(1.5.0) val antecedent: Array[Item], + @Since(1.5.0) val consequent: Array[Item], freqUnion: Double, freqAntecedent: Double) extends Serializable { http://git-wip-us.apache.org/repos/asf/spark/blob/186326df/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index e37f806..aea5c4f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -42,7 +42,8 @@ import org.apache.spark.storage.StorageLevel */ @Since(1.3.0) @Experimental -class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { +class FPGrowthModel[Item: ClassTag] @Since(1.3.0) ( +@Since(1.3.0) val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { /** * Generates association rules for the [[Item]]s in [[freqItemsets]]. * @param confidence minimal confidence of the rules produced @@ -126,6 +127,8 @@ class FPGrowth private ( new FPGrowthModel(freqItemsets) } + /** Java-friendly version of [[run]]. 
*/ + @Since(1.3.0) def run[Item, Basket : JavaIterable[Item]](data: JavaRDD[Basket]): FPGrowthModel[Item] = { implicit val tag = fakeClassTag[Item] run(data.rdd.map(_.asScala.toArray)) @@ -226,7 +229,9 @@ object FPGrowth { * */ @Since(1.3.0) - class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable { + class FreqItemset[Item] @Since(1.3.0) ( + @Since(1.3.0) val items: Array[Item], + @Since(1.3.0) val freq: Long) extends Serializable { /** * Returns items in a Java List. http://git-wip-us.apache.org/repos/asf/spark/blob/186326df/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index dc4ae1d..97916da 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import
spark git commit: [SPARK-10237] [MLLIB] update since versions in mllib.fpm
Repository: spark Updated Branches: refs/heads/master c0e9ff158 - c619c7552 [SPARK-10237] [MLLIB] update since versions in mllib.fpm Same as #8421 but for `mllib.fpm`. cc feynmanliang Author: Xiangrui Meng m...@databricks.com Closes #8429 from mengxr/SPARK-10237. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c619c755 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c619c755 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c619c755 Branch: refs/heads/master Commit: c619c7552f22d28cfa321ce671fc9ca854dd655f Parents: c0e9ff1 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 13:22:38 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 25 13:22:38 2015 -0700 -- .../spark/mllib/fpm/AssociationRules.scala | 7 -- .../org/apache/spark/mllib/fpm/FPGrowth.scala | 9 ++-- .../org/apache/spark/mllib/fpm/PrefixSpan.scala | 23 +--- 3 files changed, 32 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c619c755/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index ba3b447..95c688c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -82,12 +82,15 @@ class AssociationRules private[fpm] ( }.filter(_.confidence = minConfidence) } + /** Java-friendly version of [[run]]. */ + @Since(1.5.0) def run[Item](freqItemsets: JavaRDD[FreqItemset[Item]]): JavaRDD[Rule[Item]] = { val tag = fakeClassTag[Item] run(freqItemsets.rdd)(tag) } } +@Since(1.5.0) object AssociationRules { /** @@ -104,8 +107,8 @@ object AssociationRules { @Since(1.5.0) @Experimental class Rule[Item] private[fpm] ( - val antecedent: Array[Item], - val consequent: Array[Item], + @Since(1.5.0) val antecedent: Array[Item], + @Since(1.5.0) val consequent: Array[Item], freqUnion: Double, freqAntecedent: Double) extends Serializable { http://git-wip-us.apache.org/repos/asf/spark/blob/c619c755/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala index e37f806..aea5c4f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/FPGrowth.scala @@ -42,7 +42,8 @@ import org.apache.spark.storage.StorageLevel */ @Since(1.3.0) @Experimental -class FPGrowthModel[Item: ClassTag](val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { +class FPGrowthModel[Item: ClassTag] @Since(1.3.0) ( +@Since(1.3.0) val freqItemsets: RDD[FreqItemset[Item]]) extends Serializable { /** * Generates association rules for the [[Item]]s in [[freqItemsets]]. * @param confidence minimal confidence of the rules produced @@ -126,6 +127,8 @@ class FPGrowth private ( new FPGrowthModel(freqItemsets) } + /** Java-friendly version of [[run]]. 
*/ + @Since(1.3.0) def run[Item, Basket : JavaIterable[Item]](data: JavaRDD[Basket]): FPGrowthModel[Item] = { implicit val tag = fakeClassTag[Item] run(data.rdd.map(_.asScala.toArray)) @@ -226,7 +229,9 @@ object FPGrowth { * */ @Since(1.3.0) - class FreqItemset[Item](val items: Array[Item], val freq: Long) extends Serializable { + class FreqItemset[Item] @Since(1.3.0) ( + @Since(1.3.0) val items: Array[Item], + @Since(1.3.0) val freq: Long) extends Serializable { /** * Returns items in a Java List. http://git-wip-us.apache.org/repos/asf/spark/blob/c619c755/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala index dc4ae1d..97916da 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/PrefixSpan.scala @@ -25,7 +25,7 @@ import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.Logging -import org.apache.spark.annotation.Experimental +import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.api.java.JavaRDD import
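End-to-end usage of the annotated APIs, assuming an existing SparkContext `sc`; FPGrowth itself dates to 1.3.0, while rule generation is 1.5.0:

import org.apache.spark.mllib.fpm.FPGrowth
import org.apache.spark.rdd.RDD

// Toy transaction data; support/partition values are arbitrary.
val transactions: RDD[Array[String]] = sc.parallelize(Seq(
  Array("a", "b", "c"),
  Array("a", "b"),
  Array("b", "c")))

val model = new FPGrowth().setMinSupport(0.5).setNumPartitions(2).run(transactions)
model.freqItemsets.collect().foreach { itemset =>
  println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq)
}

// Association-rule generation was added in 1.5.0; 0.8 is the minimum confidence.
model.generateAssociationRules(0.8).collect().foreach(println)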
[1/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters
Repository: spark Updated Branches: refs/heads/master 7f1e507bf - 69c9c1771 http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index ccf753e..5f897cb 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -21,9 +21,7 @@ import java.util.Collections import java.util.concurrent._ import java.util.regex.Pattern -import org.apache.spark.util.Utils - -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet} import com.google.common.util.concurrent.ThreadFactoryBuilder @@ -39,8 +37,8 @@ import org.apache.log4j.{Level, Logger} import org.apache.spark.{Logging, SecurityManager, SparkConf} import org.apache.spark.deploy.yarn.YarnSparkHadoopUtil._ import org.apache.spark.rpc.RpcEndpointRef -import org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend import org.apache.spark.scheduler.cluster.CoarseGrainedClusterMessages._ +import org.apache.spark.util.Utils /** * YarnAllocator is charged with requesting containers from the YARN ResourceManager and deciding @@ -164,7 +162,7 @@ private[yarn] class YarnAllocator( * Number of container requests at the given location that have not yet been fulfilled. */ private def getNumPendingAtLocation(location: String): Int = -amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, resource).map(_.size).sum +amClient.getMatchingRequests(RM_REQUEST_PRIORITY, location, resource).asScala.map(_.size).sum /** * Request as many executors from the ResourceManager as needed to reach the desired total. If @@ -231,14 +229,14 @@ private[yarn] class YarnAllocator( numExecutorsRunning, allocateResponse.getAvailableResources)) - handleAllocatedContainers(allocatedContainers) + handleAllocatedContainers(allocatedContainers.asScala) } val completedContainers = allocateResponse.getCompletedContainersStatuses() if (completedContainers.size 0) { logDebug(Completed %d containers.format(completedContainers.size)) - processCompletedContainers(completedContainers) + processCompletedContainers(completedContainers.asScala) logDebug(Finished processing %d completed containers. Current running executor count: %d. .format(completedContainers.size, numExecutorsRunning)) @@ -271,7 +269,7 @@ private[yarn] class YarnAllocator( val request = createContainerRequest(resource, locality.nodes, locality.racks) amClient.addContainerRequest(request) val nodes = request.getNodes -val hostStr = if (nodes == null || nodes.isEmpty) Any else nodes.last +val hostStr = if (nodes == null || nodes.isEmpty) Any else nodes.asScala.last logInfo(sContainer request (host: $hostStr, capability: $resource)) } } else if (missing 0) { @@ -280,7 +278,8 @@ private[yarn] class YarnAllocator( val matchingRequests = amClient.getMatchingRequests(RM_REQUEST_PRIORITY, ANY_HOST, resource) if (!matchingRequests.isEmpty) { - matchingRequests.head.take(numToCancel).foreach(amClient.removeContainerRequest) +matchingRequests.iterator().next().asScala + .take(numToCancel).foreach(amClient.removeContainerRequest) } else { logWarning(Expected to find pending requests, but found none.) 
} @@ -459,7 +458,7 @@ private[yarn] class YarnAllocator( } } - if (allocatedContainerToHostMap.containsKey(containerId)) { + if (allocatedContainerToHostMap.contains(containerId)) { val host = allocatedContainerToHostMap.get(containerId).get val containerSet = allocatedHostToContainersMap.get(host).get http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala -- diff --git a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala index 4999f9c..df042bf 100644 --- a/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala +++ b/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnRMClient.scala @@ -19,17 +19,15 @@ package org.apache.spark.deploy.yarn import java.util.{List => JList} -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.{Map, Set} import scala.util.Try import org.apache.hadoop.conf.Configuration -import
[5/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters
[SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters Replace `JavaConversions` implicits with `JavaConverters` Most occurrences I've seen so far are necessary conversions; a few have been avoidable. None are in critical code as far as I see, yet. Author: Sean Owen so...@cloudera.com Closes #8033 from srowen/SPARK-9613. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69c9c177 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69c9c177 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69c9c177 Branch: refs/heads/master Commit: 69c9c177160e32a2fbc9b36ecc52156077fca6fc Parents: 7f1e507 Author: Sean Owen so...@cloudera.com Authored: Tue Aug 25 12:33:13 2015 +0100 Committer: Sean Owen so...@cloudera.com Committed: Tue Aug 25 12:33:13 2015 +0100 -- .../shuffle/unsafe/UnsafeShuffleWriter.java | 4 +- .../org/apache/spark/MapOutputTracker.scala | 4 +- .../scala/org/apache/spark/SSLOptions.scala | 11 +- .../scala/org/apache/spark/SparkContext.scala | 4 +- .../main/scala/org/apache/spark/TestUtils.scala | 9 +- .../apache/spark/api/java/JavaHadoopRDD.scala | 4 +- .../spark/api/java/JavaNewHadoopRDD.scala | 4 +- .../org/apache/spark/api/java/JavaPairRDD.scala | 19 ++-- .../org/apache/spark/api/java/JavaRDDLike.scala | 75 + .../spark/api/java/JavaSparkContext.scala | 20 ++-- .../spark/api/python/PythonHadoopUtil.scala | 28 ++--- .../org/apache/spark/api/python/PythonRDD.scala | 26 ++--- .../apache/spark/api/python/PythonUtils.scala | 15 ++- .../spark/api/python/PythonWorkerFactory.scala | 11 +- .../org/apache/spark/api/python/SerDeUtil.scala | 3 +- .../WriteInputFormatTestDataGenerator.scala | 8 +- .../scala/org/apache/spark/api/r/RRDD.scala | 13 ++- .../scala/org/apache/spark/api/r/RUtils.scala | 5 +- .../scala/org/apache/spark/api/r/SerDe.scala| 4 +- .../spark/broadcast/TorrentBroadcast.scala | 4 +- .../spark/deploy/ExternalShuffleService.scala | 8 +- .../org/apache/spark/deploy/PythonRunner.scala | 4 +- .../org/apache/spark/deploy/RPackageUtils.scala | 4 +- .../scala/org/apache/spark/deploy/RRunner.scala | 4 +- .../apache/spark/deploy/SparkCuratorUtil.scala | 4 +- .../apache/spark/deploy/SparkHadoopUtil.scala | 19 ++-- .../spark/deploy/SparkSubmitArguments.scala | 6 +- .../master/ZooKeeperPersistenceEngine.scala | 6 +- .../spark/deploy/worker/CommandUtils.scala | 5 +- .../spark/deploy/worker/DriverRunner.scala | 8 +- .../spark/deploy/worker/ExecutorRunner.scala| 7 +- .../org/apache/spark/deploy/worker/Worker.scala | 1 - .../org/apache/spark/executor/Executor.scala| 6 +- .../apache/spark/executor/ExecutorSource.scala | 4 +- .../spark/executor/MesosExecutorBackend.scala | 6 +- .../apache/spark/input/PortableDataStream.scala | 11 +- .../spark/input/WholeTextFileInputFormat.scala | 8 +- .../spark/launcher/WorkerCommandBuilder.scala | 4 +- .../apache/spark/metrics/MetricsConfig.scala| 22 ++-- .../network/netty/NettyBlockRpcServer.scala | 4 +- .../netty/NettyBlockTransferService.scala | 6 +- .../apache/spark/network/nio/Connection.scala | 4 +- .../spark/partial/GroupedCountEvaluator.scala | 10 +- .../spark/partial/GroupedMeanEvaluator.scala| 10 +- .../spark/partial/GroupedSumEvaluator.scala | 10 +- .../org/apache/spark/rdd/PairRDDFunctions.scala | 6 +- .../scala/org/apache/spark/rdd/PipedRDD.scala | 6 +- .../org/apache/spark/rdd/SubtractedRDD.scala| 4 +- .../spark/scheduler/InputFormatInfo.scala | 4 +- .../scala/org/apache/spark/scheduler/Pool.scala | 10 +- 
.../mesos/CoarseMesosSchedulerBackend.scala | 20 ++-- .../mesos/MesosClusterPersistenceEngine.scala | 4 +- .../cluster/mesos/MesosClusterScheduler.scala | 14 +-- .../cluster/mesos/MesosSchedulerBackend.scala | 22 ++-- .../cluster/mesos/MesosSchedulerUtils.scala | 25 ++--- .../spark/serializer/KryoSerializer.scala | 10 +- .../shuffle/FileShuffleBlockResolver.scala | 8 +- .../storage/BlockManagerMasterEndpoint.scala| 8 +- .../scala/org/apache/spark/util/AkkaUtils.scala | 4 +- .../org/apache/spark/util/ListenerBus.scala | 7 +- .../spark/util/MutableURLClassLoader.scala | 2 - .../apache/spark/util/TimeStampedHashMap.scala | 10 +- .../apache/spark/util/TimeStampedHashSet.scala | 4 +- .../scala/org/apache/spark/util/Utils.scala | 20 ++-- .../apache/spark/util/collection/Utils.scala| 4 +- .../java/org/apache/spark/JavaAPISuite.java | 6 +- .../scala/org/apache/spark/SparkConfSuite.scala | 7 +- .../spark/deploy/LogUrlsStandaloneSuite.scala | 1 - .../spark/deploy/RPackageUtilsSuite.scala | 8 +-
[3/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters
http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala -- diff --git a/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala b/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala index 91d63d4..a2ab320 100644 --- a/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala +++ b/external/flume/src/main/scala/org/apache/spark/streaming/flume/PollingFlumeTestUtils.scala @@ -18,9 +18,8 @@ package org.apache.spark.streaming.flume import java.util.concurrent._ -import java.util.{List => JList, Map => JMap} +import java.util.{Map => JMap, Collections} -import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import com.google.common.base.Charsets.UTF_8 @@ -77,7 +76,7 @@ private[flume] class PollingFlumeTestUtils { /** * Start 2 sinks and return the ports */ - def startMultipleSinks(): JList[Int] = { + def startMultipleSinks(): Seq[Int] = { channels.clear() sinks.clear() @@ -138,8 +137,7 @@ private[flume] class PollingFlumeTestUtils { /** * A Python-friendly method to assert the output */ - def assertOutput( - outputHeaders: JList[JMap[String, String]], outputBodies: JList[String]): Unit = { + def assertOutput(outputHeaders: Seq[JMap[String, String]], outputBodies: Seq[String]): Unit = { require(outputHeaders.size == outputBodies.size) val eventSize = outputHeaders.size if (eventSize != totalEventsPerChannel * channels.size) { @@ -149,12 +147,12 @@ private[flume] class PollingFlumeTestUtils { var counter = 0 for (k <- 0 until channels.size; i <- 0 until totalEventsPerChannel) { val eventBodyToVerify = s"${channels(k).getName}-$i" - val eventHeaderToVerify: JMap[String, String] = Map[String, String](s"test-$i" -> "header") + val eventHeaderToVerify: JMap[String, String] = Collections.singletonMap(s"test-$i", "header") var found = false var j = 0 while (j < eventSize && !found) { -if (eventBodyToVerify == outputBodies.get(j) && - eventHeaderToVerify == outputHeaders.get(j)) { +if (eventBodyToVerify == outputBodies(j) && + eventHeaderToVerify == outputHeaders(j)) { found = true counter += 1 } @@ -195,7 +193,7 @@ private[flume] class PollingFlumeTestUtils { tx.begin() for (j <- 0 until eventsPerBatch) { channel.put(EventBuilder.withBody(s"${channel.getName}-$t".getBytes(UTF_8), -Map[String, String](s"test-$t" -> "header"))) +Collections.singletonMap(s"test-$t", "header"))) t += 1 } tx.commit() http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala -- diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala index d5f9a0a..ff2fb8e 100644 --- a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala +++ b/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumePollingStreamSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.streaming.flume import java.net.InetSocketAddress -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer} import scala.concurrent.duration._ import scala.language.postfixOps @@ -116,9 +116,9 @@ class FlumePollingStreamSuite extends SparkFunSuite with BeforeAndAfter with Log // The eventually is required to ensure
that all data in the batch has been processed. eventually(timeout(10 seconds), interval(100 milliseconds)) { val flattenOutputBuffer = outputBuffer.flatten -val headers = flattenOutputBuffer.map(_.event.getHeaders.map { - case kv => (kv._1.toString, kv._2.toString) -}).map(mapAsJavaMap) +val headers = flattenOutputBuffer.map(_.event.getHeaders.asScala.map { + case (key, value) => (key.toString, value.toString) +}).map(_.asJava) val bodies = flattenOutputBuffer.map(e => new String(e.event.getBody.array(), UTF_8)) utils.assertOutput(headers, bodies) } http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala -- diff --git a/external/flume/src/test/scala/org/apache/spark/streaming/flume/FlumeStreamSuite.scala
[4/5] spark git commit: [SPARK-9613] [CORE] Ban use of JavaConversions and migrate all existing uses to JavaConverters
http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala -- diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala index b089da8..7c170a7 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockRpcServer.scala @@ -19,7 +19,7 @@ package org.apache.spark.network.netty import java.nio.ByteBuffer -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager @@ -55,7 +55,7 @@ class NettyBlockRpcServer( case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) -val streamId = streamManager.registerStream(blocks.iterator) +val streamId = streamManager.registerStream(blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala -- diff --git a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala index d650d5f..ff8aae9 100644 --- a/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala +++ b/core/src/main/scala/org/apache/spark/network/netty/NettyBlockTransferService.scala @@ -17,7 +17,7 @@ package org.apache.spark.network.netty -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.concurrent.{Future, Promise} import org.apache.spark.{SecurityManager, SparkConf} @@ -58,7 +58,7 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage securityManager.isSaslEncryptionEnabled())) } transportContext = new TransportContext(transportConf, rpcHandler) -clientFactory = transportContext.createClientFactory(clientBootstrap.toList) +clientFactory = transportContext.createClientFactory(clientBootstrap.toSeq.asJava) server = createServer(serverBootstrap.toList) appId = conf.getAppId logInfo("Server created on " + server.getPort) @@ -67,7 +67,7 @@ class NettyBlockTransferService(conf: SparkConf, securityManager: SecurityManage /** Creates and binds the TransportServer, possibly trying multiple ports.
*/ private def createServer(bootstraps: List[TransportServerBootstrap]): TransportServer = { def startService(port: Int): (TransportServer, Int) = { - val server = transportContext.createServer(port, bootstraps) + val server = transportContext.createServer(port, bootstraps.asJava) (server, server.getPort) } http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/network/nio/Connection.scala -- diff --git a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala index 1499da0..8d9ebad 100644 --- a/core/src/main/scala/org/apache/spark/network/nio/Connection.scala +++ b/core/src/main/scala/org/apache/spark/network/nio/Connection.scala @@ -23,7 +23,7 @@ import java.nio.channels._ import java.util.concurrent.ConcurrentLinkedQueue import java.util.LinkedList -import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ import scala.collection.mutable.{ArrayBuffer, HashMap} import scala.util.control.NonFatal @@ -145,7 +145,7 @@ abstract class Connection(val channel: SocketChannel, val selector: Selector, } def callOnExceptionCallbacks(e: Throwable) { -onExceptionCallbacks foreach { +onExceptionCallbacks.asScala.foreach { callback => try { callback(this, e) http://git-wip-us.apache.org/repos/asf/spark/blob/69c9c177/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala -- diff --git a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala index 91b07ce..5afce75 100644 --- a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala +++ b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala @@ -19,7 +19,7 @@ package org.apache.spark.partial import
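The practical difference driving this series: `JavaConversions` converts between Java and Scala collections implicitly and invisibly, while `JavaConverters` makes every boundary crossing explicit via `.asScala` / `.asJava`, which is why each call site above gains a conversion call. A self-contained sketch of the explicit style (names are illustrative, not taken from the patch):

```scala
import java.util.{ArrayList => JArrayList}

import scala.collection.JavaConverters._

object ConvertersExample {
  def main(args: Array[String]): Unit = {
    val jList = new JArrayList[String]()
    jList.add("spark")
    jList.add("yarn")

    // Java -> Scala: .asScala returns a wrapper view over the Java list.
    val totalLength = jList.asScala.map(_.length).sum
    println(s"total length: $totalLength")

    // Scala -> Java: .asJava wraps a Scala collection for a Java-only API.
    val javaView: java.util.List[Int] = Seq(1, 2, 3).asJava
    println(javaView.size())
  }
}
```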
spark git commit: [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value
Repository: spark Updated Branches: refs/heads/master c619c7552 - 920590787 [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value Adds default convergence tolerance (0.001, set in `GradientDescent.convergenceTol`) to `setConvergenceTol`'s scaladoc Author: Feynman Liang fli...@databricks.com Closes #8424 from feynmanliang/SPARK-9797. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/92059078 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/92059078 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/92059078 Branch: refs/heads/master Commit: 9205907876cf65695e56c2a94bedd83df3675c03 Parents: c619c75 Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 25 13:23:15 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 25 13:23:15 2015 -0700 -- .../spark/mllib/regression/StreamingLinearRegressionWithSGD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/92059078/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala index 537a052..26654e4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala @@ -93,7 +93,7 @@ class StreamingLinearRegressionWithSGD private[mllib] ( } /** - * Set the convergence tolerance. + * Set the convergence tolerance. Default: 0.001. */ def setConvergenceTol(tolerance: Double): this.type = { this.algorithm.optimizer.setConvergenceTol(tolerance)
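A short sketch of where the documented default fits in practice; the setup below is illustrative and the stream wiring is elided:

```scala
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD

// Leaving setConvergenceTol unset keeps the documented default of 0.001;
// the explicit call below tightens it purely for illustration.
val model = new StreamingLinearRegressionWithSGD()
  .setInitialWeights(Vectors.zeros(3))
  .setConvergenceTol(1e-4)
```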
spark git commit: [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value
Repository: spark Updated Branches: refs/heads/branch-1.5 186326df2 - 055387c08 [SPARK-9797] [MLLIB] [DOC] StreamingLinearRegressionWithSGD.setConvergenceTol default value Adds default convergence tolerance (0.001, set in `GradientDescent.convergenceTol`) to `setConvergenceTol`'s scaladoc Author: Feynman Liang fli...@databricks.com Closes #8424 from feynmanliang/SPARK-9797. (cherry picked from commit 9205907876cf65695e56c2a94bedd83df3675c03) Signed-off-by: Joseph K. Bradley jos...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/055387c0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/055387c0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/055387c0 Branch: refs/heads/branch-1.5 Commit: 055387c087989c8790b6761429b68416ecee3a33 Parents: 186326d Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 25 13:23:15 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 25 13:23:25 2015 -0700 -- .../spark/mllib/regression/StreamingLinearRegressionWithSGD.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/055387c0/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala index 537a052..26654e4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.scala @@ -93,7 +93,7 @@ class StreamingLinearRegressionWithSGD private[mllib] ( } /** - * Set the convergence tolerance. + * Set the convergence tolerance. Default: 0.001. */ def setConvergenceTol(tolerance: Double): this.type = { this.algorithm.optimizer.setConvergenceTol(tolerance)
spark git commit: [SPARK-9888] [MLLIB] User guide for new LDA features
Repository: spark Updated Branches: refs/heads/master 7467b52ed - 125205cdb [SPARK-9888] [MLLIB] User guide for new LDA features * Adds two new sections to LDA's user guide; one for each optimizer/model * Documents new features added to LDA (e.g. topXXXperXXX, asymmetric priors, hyperparam optimization) * Cleans up a TODO and sets a default parameter in LDA code jkbradley hhbyyh Author: Feynman Liang fli...@databricks.com Closes #8254 from feynmanliang/SPARK-9888. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/125205cd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/125205cd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/125205cd Branch: refs/heads/master Commit: 125205cdb35530cdb4a8fff3e1ee49cf4a299583 Parents: 7467b52 Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 25 17:39:20 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 25 17:39:20 2015 -0700 -- docs/mllib-clustering.md| 135 --- .../spark/mllib/clustering/LDAModel.scala | 1 - .../spark/mllib/clustering/LDASuite.scala | 1 + 3 files changed, 117 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/125205cd/docs/mllib-clustering.md -- diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index fd9ab25..3fb35d3 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -438,28 +438,125 @@ sameModel = PowerIterationClusteringModel.load(sc, "myModelPath") is a topic model which infers topics from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: -* Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. -* Topics and documents both exist in a feature space, where feature vectors are vectors of word counts. -* Rather than estimating a clustering using a traditional distance, LDA uses a function based - on a statistical model of how text documents are generated. - -LDA takes in a collection of documents as vectors of word counts. -It supports different inference algorithms via `setOptimizer` function. EMLDAOptimizer learns clustering using [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) -on the likelihood function and yields comprehensive results, while OnlineLDAOptimizer uses iterative mini-batch sampling for [online variational inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) and is generally memory friendly. After fitting on the documents, LDA provides: -* Topics: Inferred topics, each of which is a probability distribution over terms (words). -* Topic distributions for documents: For each non empty document in the training set, LDA gives a probability distribution over topics. (EM only). Note that for empty documents, we don't create the topic distributions. (EM only) +* Topics correspond to cluster centers, and documents correspond to +examples (rows) in a dataset. +* Topics and documents both exist in a feature space, where feature +vectors are vectors of word counts (bag of words). +* Rather than estimating a clustering using a traditional distance, LDA +uses a function based on a statistical model of how text documents are +generated. + +LDA supports different inference algorithms via `setOptimizer` function.
+`EMLDAOptimizer` learns clustering using +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +on the likelihood function and yields comprehensive results, while +`OnlineLDAOptimizer` uses iterative mini-batch sampling for [online +variational +inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) +and is generally memory friendly. -LDA takes the following parameters: +LDA takes in a collection of documents as vectors of word counts and the +following parameters (set using the builder pattern): * `k`: Number of topics (i.e., cluster centers) -* `maxIterations`: Limit on the number of iterations of EM used for learning -* `docConcentration`: Hyperparameter for prior over documents' distributions over topics. Currently must be > 1, where larger values encourage smoother inferred distributions. -* `topicConcentration`: Hyperparameter for prior over topics' distributions over terms (words). Currently must be > 1, where larger values encourage smoother inferred distributions. -* `checkpointInterval`: If using checkpointing (set in the Spark configuration), this parameter specifies the frequency with which checkpoints will be created. If `maxIterations` is large, using checkpointing can help reduce shuffle file sizes on disk and
spark git commit: [SPARK-9888] [MLLIB] User guide for new LDA features
Repository: spark Updated Branches: refs/heads/branch-1.5 4c03cb4da - 5cf266fde [SPARK-9888] [MLLIB] User guide for new LDA features * Adds two new sections to LDA's user guide; one for each optimizer/model * Documents new features added to LDA (e.g. topXXXperXXX, asymmetric priors, hyperparam optimization) * Cleans up a TODO and sets a default parameter in LDA code jkbradley hhbyyh Author: Feynman Liang fli...@databricks.com Closes #8254 from feynmanliang/SPARK-9888. (cherry picked from commit 125205cdb35530cdb4a8fff3e1ee49cf4a299583) Signed-off-by: Joseph K. Bradley jos...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5cf266fd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5cf266fd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5cf266fd Branch: refs/heads/branch-1.5 Commit: 5cf266fdeb6632622642e5d9bc056a76680b1970 Parents: 4c03cb4 Author: Feynman Liang fli...@databricks.com Authored: Tue Aug 25 17:39:20 2015 -0700 Committer: Joseph K. Bradley jos...@databricks.com Committed: Tue Aug 25 17:39:40 2015 -0700 -- docs/mllib-clustering.md| 135 --- .../spark/mllib/clustering/LDAModel.scala | 1 - .../spark/mllib/clustering/LDASuite.scala | 1 + 3 files changed, 117 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5cf266fd/docs/mllib-clustering.md -- diff --git a/docs/mllib-clustering.md b/docs/mllib-clustering.md index fd9ab25..3fb35d3 100644 --- a/docs/mllib-clustering.md +++ b/docs/mllib-clustering.md @@ -438,28 +438,125 @@ sameModel = PowerIterationClusteringModel.load(sc, "myModelPath") is a topic model which infers topics from a collection of text documents. LDA can be thought of as a clustering algorithm as follows: -* Topics correspond to cluster centers, and documents correspond to examples (rows) in a dataset. -* Topics and documents both exist in a feature space, where feature vectors are vectors of word counts. -* Rather than estimating a clustering using a traditional distance, LDA uses a function based - on a statistical model of how text documents are generated. - -LDA takes in a collection of documents as vectors of word counts. -It supports different inference algorithms via `setOptimizer` function. EMLDAOptimizer learns clustering using [expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) -on the likelihood function and yields comprehensive results, while OnlineLDAOptimizer uses iterative mini-batch sampling for [online variational inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) and is generally memory friendly. After fitting on the documents, LDA provides: -* Topics: Inferred topics, each of which is a probability distribution over terms (words). -* Topic distributions for documents: For each non empty document in the training set, LDA gives a probability distribution over topics. (EM only). Note that for empty documents, we don't create the topic distributions. (EM only) +* Topics correspond to cluster centers, and documents correspond to +examples (rows) in a dataset. +* Topics and documents both exist in a feature space, where feature +vectors are vectors of word counts (bag of words). +* Rather than estimating a clustering using a traditional distance, LDA +uses a function based on a statistical model of how text documents are +generated. + +LDA supports different inference algorithms via `setOptimizer` function.
+`EMLDAOptimizer` learns clustering using +[expectation-maximization](http://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) +on the likelihood function and yields comprehensive results, while +`OnlineLDAOptimizer` uses iterative mini-batch sampling for [online +variational +inference](https://www.cs.princeton.edu/~blei/papers/HoffmanBleiBach2010b.pdf) +and is generally memory friendly. -LDA takes the following parameters: +LDA takes in a collection of documents as vectors of word counts and the +following parameters (set using the builder pattern): * `k`: Number of topics (i.e., cluster centers) -* `maxIterations`: Limit on the number of iterations of EM used for learning -* `docConcentration`: Hyperparameter for prior over documents' distributions over topics. Currently must be > 1, where larger values encourage smoother inferred distributions. -* `topicConcentration`: Hyperparameter for prior over topics' distributions over terms (words). Currently must be > 1, where larger values encourage smoother inferred distributions. -* `checkpointInterval`: If using checkpointing (set in the Spark configuration), this parameter specifies the frequency with
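A compact sketch of the builder-pattern usage the revised guide describes. The corpus is assumed to be an existing RDD of (document id, term-count vector) pairs, and the parameter values are illustrative:

```scala
import org.apache.spark.mllib.clustering.{LDA, OnlineLDAOptimizer}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

// `corpus` is assumed to exist: one (id, word-count vector) pair per document.
def trainLda(corpus: RDD[(Long, Vector)]) = {
  new LDA()
    .setK(10)                               // number of topics
    .setMaxIterations(50)
    .setOptimizer(new OnlineLDAOptimizer()) // EM is the default optimizer
    .run(corpus)
}
```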
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc3 [deleted] 3e8ae3894
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc1 [deleted] 60e08e507
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc4 [deleted] dbaa5c294
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.4.1-rc2 [deleted] 07b95c7ad
spark git commit: [SPARK-10196] [SQL] Correctly saving decimals in internal rows to JSON.
Repository: spark Updated Branches: refs/heads/master f023aa2fc - df7041d02 [SPARK-10196] [SQL] Correctly saving decimals in internal rows to JSON. https://issues.apache.org/jira/browse/SPARK-10196 Author: Yin Huai yh...@databricks.com Closes #8408 from yhuai/DecimalJsonSPARK-10196. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/df7041d0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/df7041d0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/df7041d0 Branch: refs/heads/master Commit: df7041d02d3fd44b08a859f5d77bf6fb726895f0 Parents: f023aa2 Author: Yin Huai yh...@databricks.com Authored: Mon Aug 24 23:38:32 2015 -0700 Committer: Davies Liu davies@gmail.com Committed: Mon Aug 24 23:38:32 2015 -0700 -- .../datasources/json/JacksonGenerator.scala | 2 +- .../sql/sources/JsonHadoopFsRelationSuite.scala | 27 2 files changed, 28 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/df7041d0/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala index 99ac773..330ba90 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JacksonGenerator.scala @@ -95,7 +95,7 @@ private[sql] object JacksonGenerator { case (FloatType, v: Float) => gen.writeNumber(v) case (DoubleType, v: Double) => gen.writeNumber(v) case (LongType, v: Long) => gen.writeNumber(v) - case (DecimalType(), v: java.math.BigDecimal) => gen.writeNumber(v) + case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal) case (ByteType, v: Byte) => gen.writeNumber(v.toInt) case (BinaryType, v: Array[Byte]) => gen.writeBinary(v) case (BooleanType, v: Boolean) => gen.writeBoolean(v) http://git-wip-us.apache.org/repos/asf/spark/blob/df7041d0/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala index ed6d512..8ca3a17 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/sources/JsonHadoopFsRelationSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.sources +import java.math.BigDecimal + import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil @@ -75,4 +77,29 @@ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { ) } } + + test("SPARK-10196: save decimal type to JSON") { +withTempDir { file => + file.delete() + + val schema = +new StructType() + .add("decimal", DecimalType(7, 2)) + + val data = +Row(new BigDecimal("10.02")) :: + Row(new BigDecimal("2.99")) :: + Row(new BigDecimal("1")) :: Nil + val df = createDataFrame(sparkContext.parallelize(data), schema) + + // Write the data out. + df.write.format(dataSourceName).save(file.getCanonicalPath) + + // Read it back and check the result. + checkAnswer( +read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), +df + ) +} + }
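The root cause is that values inside Spark SQL's internal rows carry the engine's own `Decimal` wrapper rather than `java.math.BigDecimal`, so the old pattern match never fired. A tiny sketch of the conversion the fix relies on:

```scala
import org.apache.spark.sql.types.Decimal

// Decimal is Spark SQL's internal fixed-precision type; Jackson's
// writeNumber only accepts the Java type, hence the unwrap.
val internal: Decimal = Decimal("10.02")
val forJackson: java.math.BigDecimal = internal.toJavaBigDecimal
```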
[1/2] spark git commit: Preparing Spark release v1.5.0-rc2
Repository: spark Updated Branches: refs/heads/branch-1.5 ab7d46d1d - 4c03cb4da Preparing Spark release v1.5.0-rc2 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/72777135 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/72777135 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/72777135 Branch: refs/heads/branch-1.5 Commit: 727771352855dbb780008c449a877f5aaa5fc27a Parents: ab7d46d Author: Patrick Wendell pwend...@gmail.com Authored: Tue Aug 25 15:56:37 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Tue Aug 25 15:56:37 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index e9c6d26..3ef7d6f 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index ed5c37e..684e07b 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 4f79d71..6b082ad 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index e6884b0..9ef1eda 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index 561ed4b..aa7021d 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId>
-<version>1.5.0-SNAPSHOT</version> +<version>1.5.0</version> <relativePath>../../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/72777135/external/flume-sink/pom.xml -- diff --git
[2/2] spark git commit: Preparing development version 1.5.1-SNAPSHOT
Preparing development version 1.5.1-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4c03cb4d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4c03cb4d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4c03cb4d Branch: refs/heads/branch-1.5 Commit: 4c03cb4da846bf3ea4cd99f593d74c4a817a7d2d Parents: 7277713 Author: Patrick Wendell pwend...@gmail.com Authored: Tue Aug 25 15:56:44 2015 -0700 Committer: Patrick Wendell pwend...@gmail.com Committed: Tue Aug 25 15:56:44 2015 -0700 -- assembly/pom.xml| 2 +- bagel/pom.xml | 2 +- core/pom.xml| 2 +- examples/pom.xml| 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml | 2 +- external/kafka-assembly/pom.xml | 2 +- external/kafka/pom.xml | 2 +- external/mqtt-assembly/pom.xml | 2 +- external/mqtt/pom.xml | 2 +- external/twitter/pom.xml| 2 +- external/zeromq/pom.xml | 2 +- extras/java8-tests/pom.xml | 2 +- extras/kinesis-asl-assembly/pom.xml | 2 +- extras/kinesis-asl/pom.xml | 2 +- extras/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml | 2 +- launcher/pom.xml| 2 +- mllib/pom.xml | 2 +- network/common/pom.xml | 2 +- network/shuffle/pom.xml | 2 +- network/yarn/pom.xml| 2 +- pom.xml | 2 +- repl/pom.xml| 2 +- sql/catalyst/pom.xml| 2 +- sql/core/pom.xml| 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml| 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- unsafe/pom.xml | 2 +- yarn/pom.xml| 2 +- 33 files changed, 33 insertions(+), 33 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 3ef7d6f..7b41ebb 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.1-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/bagel/pom.xml -- diff --git a/bagel/pom.xml b/bagel/pom.xml index 684e07b..16bf17c 100644 --- a/bagel/pom.xml +++ b/bagel/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.1-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/core/pom.xml -- diff --git a/core/pom.xml b/core/pom.xml index 6b082ad..beb547f 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.1-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/examples/pom.xml -- diff --git a/examples/pom.xml b/examples/pom.xml index 9ef1eda..3926b79 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.1-SNAPSHOT</version> <relativePath>../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/external/flume-assembly/pom.xml -- diff --git a/external/flume-assembly/pom.xml b/external/flume-assembly/pom.xml index aa7021d..5eda12d 100644 --- a/external/flume-assembly/pom.xml +++ b/external/flume-assembly/pom.xml @@ -21,7 +21,7 @@ <parent> <groupId>org.apache.spark</groupId> <artifactId>spark-parent_2.10</artifactId> -<version>1.5.0</version> +<version>1.5.1-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath> </parent> http://git-wip-us.apache.org/repos/asf/spark/blob/4c03cb4d/external/flume-sink/pom.xml -- diff --git a/external/flume-sink/pom.xml b/external/flume-sink/pom.xml index
Git Push Summary
Repository: spark Updated Tags: refs/tags/v1.5.0-rc2 [created] 727771352
spark git commit: [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util
Repository: spark Updated Branches: refs/heads/branch-1.5 055387c08 - 6f05b7aeb [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util Same as #8421 but for `mllib.pmml` and `mllib.util`. cc dbtsai Author: Xiangrui Meng m...@databricks.com Closes #8430 from mengxr/SPARK-10239 and squashes the following commits: a189acf [Xiangrui Meng] update since versions in mllib.pmml and mllib.util (cherry picked from commit 00ae4be97f7b205432db2967ba6d506286ef2ca6) Signed-off-by: DB Tsai d...@netflix.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f05b7ae Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f05b7ae Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f05b7ae Branch: refs/heads/branch-1.5 Commit: 6f05b7aebd66a00e2556a29b35084e81ac526406 Parents: 055387c Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 14:11:38 2015 -0700 Committer: DB Tsai d...@netflix.com Committed: Tue Aug 25 14:11:50 2015 -0700 -- .../org/apache/spark/mllib/pmml/PMMLExportable.scala | 7 ++- .../org/apache/spark/mllib/util/DataValidators.scala | 7 +-- .../org/apache/spark/mllib/util/KMeansDataGenerator.scala | 5 - .../org/apache/spark/mllib/util/LinearDataGenerator.scala | 10 -- .../mllib/util/LogisticRegressionDataGenerator.scala | 5 - .../org/apache/spark/mllib/util/MFDataGenerator.scala | 4 +++- .../main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 ++ .../org/apache/spark/mllib/util/SVMDataGenerator.scala| 6 -- .../scala/org/apache/spark/mllib/util/modelSaveLoad.scala | 6 +- 9 files changed, 41 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6f05b7ae/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala index 5e882d4..274ac7c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala @@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult import org.jpmml.model.JAXBUtil import org.apache.spark.SparkContext -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory /** @@ -33,6 +33,7 @@ import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory * developed by the Data Mining Group (www.dmg.org). 
*/ @DeveloperApi +@Since("1.4.0") trait PMMLExportable { /** @@ -48,6 +49,7 @@ trait PMMLExportable { * Export the model to a local file in PMML format */ @Experimental + @Since("1.4.0") def toPMML(localPath: String): Unit = { toPMML(new StreamResult(new File(localPath))) } @@ -57,6 +59,7 @@ trait PMMLExportable { * Export the model to a directory on a distributed file system in PMML format */ @Experimental + @Since("1.4.0") def toPMML(sc: SparkContext, path: String): Unit = { val pmml = toPMML() sc.parallelize(Array(pmml), 1).saveAsTextFile(path) } @@ -67,6 +70,7 @@ trait PMMLExportable { * Export the model to the OutputStream in PMML format */ @Experimental + @Since("1.4.0") def toPMML(outputStream: OutputStream): Unit = { toPMML(new StreamResult(outputStream)) } @@ -76,6 +80,7 @@ trait PMMLExportable { * Export the model to a String in PMML format */ @Experimental + @Since("1.4.0") def toPMML(): String = { val writer = new StringWriter toPMML(new StreamResult(writer)) http://git-wip-us.apache.org/repos/asf/spark/blob/6f05b7ae/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala index be335a1..dffe6e7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala @@ -17,16 +17,17 @@ package org.apache.spark.mllib.util -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.Logging -import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: * A collection of methods used to validate data before applying ML algorithms. */ @DeveloperApi
spark git commit: [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util
Repository: spark Updated Branches: refs/heads/master 920590787 - 00ae4be97 [SPARK-10239] [SPARK-10244] [MLLIB] update since versions in mllib.pmml and mllib.util Same as #8421 but for `mllib.pmml` and `mllib.util`. cc dbtsai Author: Xiangrui Meng m...@databricks.com Closes #8430 from mengxr/SPARK-10239 and squashes the following commits: a189acf [Xiangrui Meng] update since versions in mllib.pmml and mllib.util Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00ae4be9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00ae4be9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00ae4be9 Branch: refs/heads/master Commit: 00ae4be97f7b205432db2967ba6d506286ef2ca6 Parents: 9205907 Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 14:11:38 2015 -0700 Committer: DB Tsai d...@netflix.com Committed: Tue Aug 25 14:11:38 2015 -0700 -- .../org/apache/spark/mllib/pmml/PMMLExportable.scala | 7 ++- .../org/apache/spark/mllib/util/DataValidators.scala | 7 +-- .../org/apache/spark/mllib/util/KMeansDataGenerator.scala | 5 - .../org/apache/spark/mllib/util/LinearDataGenerator.scala | 10 -- .../mllib/util/LogisticRegressionDataGenerator.scala | 5 - .../org/apache/spark/mllib/util/MFDataGenerator.scala | 4 +++- .../main/scala/org/apache/spark/mllib/util/MLUtils.scala | 2 ++ .../org/apache/spark/mllib/util/SVMDataGenerator.scala| 6 -- .../scala/org/apache/spark/mllib/util/modelSaveLoad.scala | 6 +- 9 files changed, 41 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00ae4be9/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala index 5e882d4..274ac7c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala @@ -23,7 +23,7 @@ import javax.xml.transform.stream.StreamResult import org.jpmml.model.JAXBUtil import org.apache.spark.SparkContext -import org.apache.spark.annotation.{DeveloperApi, Experimental} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory /** @@ -33,6 +33,7 @@ import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory * developed by the Data Mining Group (www.dmg.org). 
*/ @DeveloperApi +@Since("1.4.0") trait PMMLExportable { /** @@ -48,6 +49,7 @@ trait PMMLExportable { * Export the model to a local file in PMML format */ @Experimental + @Since("1.4.0") def toPMML(localPath: String): Unit = { toPMML(new StreamResult(new File(localPath))) } @@ -57,6 +59,7 @@ trait PMMLExportable { * Export the model to a directory on a distributed file system in PMML format */ @Experimental + @Since("1.4.0") def toPMML(sc: SparkContext, path: String): Unit = { val pmml = toPMML() sc.parallelize(Array(pmml), 1).saveAsTextFile(path) @@ -67,6 +70,7 @@ trait PMMLExportable { * Export the model to the OutputStream in PMML format */ @Experimental + @Since("1.4.0") def toPMML(outputStream: OutputStream): Unit = { toPMML(new StreamResult(outputStream)) } @@ -76,6 +80,7 @@ trait PMMLExportable { * Export the model to a String in PMML format */ @Experimental + @Since("1.4.0") def toPMML(): String = { val writer = new StringWriter toPMML(new StreamResult(writer)) http://git-wip-us.apache.org/repos/asf/spark/blob/00ae4be9/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala index be335a1..dffe6e7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/DataValidators.scala @@ -17,16 +17,17 @@ package org.apache.spark.mllib.util -import org.apache.spark.annotation.DeveloperApi import org.apache.spark.Logging -import org.apache.spark.rdd.RDD +import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.rdd.RDD /** * :: DeveloperApi :: * A collection of methods used to validate data before applying ML algorithms. */ @DeveloperApi +@Since("0.8.0") object DataValidators extends Logging { /** @@ -34,6 +35,7 @@ object DataValidators extends Logging {
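As a usage note, any model mixing in this trait picks up all four annotated overloads above. A minimal sketch with k-means, whose model implements `PMMLExportable`; the path and data are illustrative, and `sc` is assumed to be an existing SparkContext:

```scala
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

val points = sc.parallelize(Seq(
  Vectors.dense(1.0, 1.0), Vectors.dense(9.0, 9.0)))
val model = KMeans.train(points, 2, 10) // k = 2, maxIterations = 10

model.toPMML("/tmp/kmeans.xml") // write PMML to a local file (illustrative path)
println(model.toPMML())         // or render the PMML document as a String
```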
spark git commit: [SPARK-10233] [MLLIB] update since version in mllib.evaluation
Repository: spark Updated Branches: refs/heads/branch-1.5 5cf266fde - af98e51f2 [SPARK-10233] [MLLIB] update since version in mllib.evaluation Same as #8421 but for `mllib.evaluation`. cc avulanov Author: Xiangrui Meng m...@databricks.com Closes #8423 from mengxr/SPARK-10233. (cherry picked from commit 8668ead2e7097b9591069599fbfccf67c53db659) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af98e51f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af98e51f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af98e51f Branch: refs/heads/branch-1.5 Commit: af98e51f273d95e0fc19da1eca32a5f87a8c5576 Parents: 5cf266f Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 18:17:54 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 25 18:18:27 2015 -0700 -- .../mllib/evaluation/BinaryClassificationMetrics.scala | 8 .../spark/mllib/evaluation/MulticlassMetrics.scala | 11 ++- .../spark/mllib/evaluation/MultilabelMetrics.scala | 12 +++- .../spark/mllib/evaluation/RegressionMetrics.scala | 3 ++- 4 files changed, 27 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/af98e51f/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index 76ae847..508fe53 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -42,11 +42,11 @@ import org.apache.spark.sql.DataFrame *be smaller as a result, meaning there may be an extra sample at *partition boundaries. */ -@Since("1.3.0") +@Since("1.0.0") @Experimental -class BinaryClassificationMetrics( -val scoreAndLabels: RDD[(Double, Double)], -val numBins: Int) extends Logging { +class BinaryClassificationMetrics @Since("1.3.0") ( +@Since("1.3.0") val scoreAndLabels: RDD[(Double, Double)], +@Since("1.3.0") val numBins: Int) extends Logging { require(numBins >= 0, "numBins must be nonnegative") http://git-wip-us.apache.org/repos/asf/spark/blob/af98e51f/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 02e89d9..00e8376 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.DataFrame */ @Since("1.1.0") @Experimental -class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { +class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { /** * An auxiliary constructor taking a DataFrame.
@@ -140,6 +140,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns precision */ + @Since("1.1.0") lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount /** @@ -148,23 +149,27 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * because sum of all false positives is equal to sum * of all false negatives) */ + @Since("1.1.0") lazy val recall: Double = precision /** * Returns f-measure * (equals to precision and recall because precision equals recall) */ + @Since("1.1.0") lazy val fMeasure: Double = precision /** * Returns weighted true positive rate * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedTruePositiveRate: Double = weightedRecall /** * Returns weighted false positive rate */ + @Since("1.1.0") lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case (category, count) => falsePositiveRate(category) * count.toDouble / labelCount }.sum @@ -173,6 +178,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns weighted averaged recall * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedRecall: Double = labelCountByClass.map { case (category, count) => recall(category) * count.toDouble / labelCount }.sum @@ -180,6 +186,7 @@ class
spark git commit: [SPARK-10233] [MLLIB] update since version in mllib.evaluation
Repository: spark Updated Branches: refs/heads/master 125205cdb - 8668ead2e [SPARK-10233] [MLLIB] update since version in mllib.evaluation Same as #8421 but for `mllib.evaluation`. cc avulanov Author: Xiangrui Meng m...@databricks.com Closes #8423 from mengxr/SPARK-10233. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8668ead2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8668ead2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8668ead2 Branch: refs/heads/master Commit: 8668ead2e7097b9591069599fbfccf67c53db659 Parents: 125205c Author: Xiangrui Meng m...@databricks.com Authored: Tue Aug 25 18:17:54 2015 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Tue Aug 25 18:17:54 2015 -0700 -- .../mllib/evaluation/BinaryClassificationMetrics.scala | 8 .../spark/mllib/evaluation/MulticlassMetrics.scala | 11 ++- .../spark/mllib/evaluation/MultilabelMetrics.scala | 12 +++- .../spark/mllib/evaluation/RegressionMetrics.scala | 3 ++- 4 files changed, 27 insertions(+), 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8668ead2/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala index 76ae847..508fe53 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala @@ -42,11 +42,11 @@ import org.apache.spark.sql.DataFrame *be smaller as a result, meaning there may be an extra sample at *partition boundaries. */ -@Since("1.3.0") +@Since("1.0.0") @Experimental -class BinaryClassificationMetrics( -val scoreAndLabels: RDD[(Double, Double)], -val numBins: Int) extends Logging { +class BinaryClassificationMetrics @Since("1.3.0") ( +@Since("1.3.0") val scoreAndLabels: RDD[(Double, Double)], +@Since("1.3.0") val numBins: Int) extends Logging { require(numBins >= 0, "numBins must be nonnegative") http://git-wip-us.apache.org/repos/asf/spark/blob/8668ead2/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala index 02e89d9..00e8376 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/evaluation/MulticlassMetrics.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.DataFrame */ @Since("1.1.0") @Experimental -class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { +class MulticlassMetrics @Since("1.1.0") (predictionAndLabels: RDD[(Double, Double)]) { /** * An auxiliary constructor taking a DataFrame.
@@ -140,6 +140,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns precision */ + @Since("1.1.0") lazy val precision: Double = tpByClass.values.sum.toDouble / labelCount /** @@ -148,23 +149,27 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * because sum of all false positives is equal to sum * of all false negatives) */ + @Since("1.1.0") lazy val recall: Double = precision /** * Returns f-measure * (equals to precision and recall because precision equals recall) */ + @Since("1.1.0") lazy val fMeasure: Double = precision /** * Returns weighted true positive rate * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedTruePositiveRate: Double = weightedRecall /** * Returns weighted false positive rate */ + @Since("1.1.0") lazy val weightedFalsePositiveRate: Double = labelCountByClass.map { case (category, count) => falsePositiveRate(category) * count.toDouble / labelCount }.sum @@ -173,6 +178,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { * Returns weighted averaged recall * (equals to precision, recall and f-measure) */ + @Since("1.1.0") lazy val weightedRecall: Double = labelCountByClass.map { case (category, count) => recall(category) * count.toDouble / labelCount }.sum @@ -180,6 +186,7 @@ class MulticlassMetrics(predictionAndLabels: RDD[(Double, Double)]) { /** * Returns weighted averaged precision */ + @Since("1.1.0")
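For context, a short sketch of the class these annotations document; the prediction/label pairs are hand-built for illustration (a real job would produce them from a model), and `sc` is assumed to be an existing SparkContext:

```scala
import org.apache.spark.mllib.evaluation.MulticlassMetrics

// (prediction, label) pairs, hand-built for illustration.
val predictionAndLabels = sc.parallelize(Seq(
  (0.0, 0.0), (1.0, 1.0), (1.0, 0.0), (0.0, 1.0), (1.0, 1.0)))

val metrics = new MulticlassMetrics(predictionAndLabels)
println(s"precision = ${metrics.precision}")
println(s"weighted recall = ${metrics.weightedRecall}")
```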