This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 027ed2d [SPARK-23643][CORE][SQL][ML] Shrinking the buffer in hashSeed up to size of the seed parameter 027ed2d is described below commit 027ed2d11b861a4b38c62452d26ce446794792af Author: Maxim Gekk <maxim.g...@databricks.com> AuthorDate: Sat Mar 23 11:26:09 2019 -0500 [SPARK-23643][CORE][SQL][ML] Shrinking the buffer in hashSeed up to size of the seed parameter ## What changes were proposed in this pull request? The hashSeed method allocates 64 bytes instead of 8. Other bytes are always zeros (thanks to default behavior of ByteBuffer). And they could be excluded from hash calculation because they don't differentiate inputs. ## How was this patch tested? By running the existing tests - XORShiftRandomSuite Closes #20793 from MaxGekk/hash-buff-size. Lead-authored-by: Maxim Gekk <maxim.g...@databricks.com> Co-authored-by: Maxim Gekk <max.g...@gmail.com> Signed-off-by: Sean Owen <sean.o...@databricks.com> --- R/pkg/tests/fulltests/test_mllib_classification.R | 6 +- R/pkg/tests/fulltests/test_mllib_clustering.R | 2 +- R/pkg/tests/fulltests/test_mllib_recommendation.R | 4 +- R/pkg/tests/fulltests/test_mllib_tree.R | 8 +- R/pkg/tests/fulltests/test_sparkSQL.R | 30 +- .../apache/spark/util/random/XORShiftRandom.scala | 2 +- .../java/test/org/apache/spark/JavaAPISuite.java | 9 +- .../apache/spark/rdd/PairRDDFunctionsSuite.scala | 2 +- .../spark/util/random/RandomSamplerSuite.scala | 2 +- .../ml/classification/GBTClassifierSuite.scala | 2 +- .../classification/LogisticRegressionSuite.scala | 585 +++++++++++---------- .../apache/spark/ml/clustering/KMeansSuite.scala | 2 +- .../clustering/PowerIterationClusteringSuite.scala | 6 +- .../apache/spark/ml/feature/Word2VecSuite.scala | 10 +- .../spark/ml/regression/GBTRegressorSuite.scala | 2 +- .../GeneralizedLinearRegressionSuite.scala | 48 +- .../clustering/PowerIterationClusteringSuite.scala | 8 +- .../mllib/clustering/StreamingKMeansSuite.scala | 3 +- python/pyspark/ml/clustering.py | 14 +- python/pyspark/ml/feature.py | 14 +- python/pyspark/ml/recommendation.py | 14 +- python/pyspark/ml/tests/test_algorithms.py | 2 +- python/pyspark/ml/tuning.py | 6 +- python/pyspark/mllib/recommendation.py | 6 +- python/pyspark/sql/dataframe.py | 12 +- python/pyspark/sql/functions.py | 8 +- python/pyspark/sql/tests/test_functions.py | 4 +- .../sql/catalyst/expressions/RandomSuite.scala | 16 +- .../sql-tests/results/group-by-ordinal.sql.out | 12 +- .../resources/sql-tests/results/random.sql.out | 16 +- .../org/apache/spark/sql/DataFrameStatSuite.scala | 8 +- .../scala/org/apache/spark/sql/DatasetSuite.scala | 15 +- .../execution/datasources/csv/TestCsvData.scala | 3 +- .../execution/datasources/json/TestJsonData.scala | 3 +- 34 files changed, 446 insertions(+), 438 deletions(-) diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R index 9fdb0cf..1f1b187 100644 --- a/R/pkg/tests/fulltests/test_mllib_classification.R +++ b/R/pkg/tests/fulltests/test_mllib_classification.R @@ -299,7 +299,7 @@ test_that("spark.mlp", { df <- read.df(absoluteSparkPath("data/mllib/sample_multiclass_classification_data.txt"), source = "libsvm") model <- spark.mlp(df, label ~ features, blockSize = 128, layers = c(4, 5, 4, 3), - solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1) + solver = "l-bfgs", maxIter = 100, tol = 0.00001, stepSize = 1, seed = 1) # Test summary method summary <- summary(model) @@ -307,13 +307,13 @@ test_that("spark.mlp", { expect_equal(summary$numOfOutputs, 3) expect_equal(summary$layers, c(4, 5, 4, 3)) expect_equal(length(summary$weights), 64) - expect_equal(head(summary$weights, 5), list(-0.878743, 0.2154151, -1.16304, -0.6583214, 1.009825), + expect_equal(head(summary$weights, 5), list(-24.28415, 107.8701, 16.86376, 1.103736, 9.244488), tolerance = 1e-6) # Test predict method mlpTestDF <- df mlpPredictions <- collect(select(predict(model, mlpTestDF), "prediction")) - expect_equal(head(mlpPredictions$prediction, 6), c("0.0", "1.0", "1.0", "1.0", "1.0", "1.0")) + expect_equal(head(mlpPredictions$prediction, 6), c("1.0", "1.0", "1.0", "1.0", "0.0", "1.0")) # Test model save/load if (windows_with_hadoop()) { diff --git a/R/pkg/tests/fulltests/test_mllib_clustering.R b/R/pkg/tests/fulltests/test_mllib_clustering.R index b78a476..028ad57 100644 --- a/R/pkg/tests/fulltests/test_mllib_clustering.R +++ b/R/pkg/tests/fulltests/test_mllib_clustering.R @@ -153,7 +153,7 @@ test_that("spark.kmeans", { model <- spark.kmeans(data = training, ~ ., k = 2, maxIter = 10, initMode = "random") sample <- take(select(predict(model, training), "prediction"), 1) expect_equal(typeof(sample$prediction), "integer") - expect_equal(sample$prediction, 1) + expect_equal(sample$prediction, 0) # Test stats::kmeans is working statsModel <- kmeans(x = newIris, centers = 2) diff --git a/R/pkg/tests/fulltests/test_mllib_recommendation.R b/R/pkg/tests/fulltests/test_mllib_recommendation.R index 4d919c9..d50de41 100644 --- a/R/pkg/tests/fulltests/test_mllib_recommendation.R +++ b/R/pkg/tests/fulltests/test_mllib_recommendation.R @@ -27,13 +27,13 @@ test_that("spark.als", { list(2, 1, 1.0), list(2, 2, 5.0)) df <- createDataFrame(data, c("user", "item", "score")) model <- spark.als(df, ratingCol = "score", userCol = "user", itemCol = "item", - rank = 10, maxIter = 5, seed = 0, regParam = 0.1) + rank = 10, maxIter = 15, seed = 0, regParam = 0.1) stats <- summary(model) expect_equal(stats$rank, 10) test <- createDataFrame(list(list(0, 2), list(1, 0), list(2, 0)), c("user", "item")) predictions <- collect(predict(model, test)) - expect_equal(predictions$prediction, c(-0.1380762, 2.6258414, -1.5018409), + expect_equal(predictions$prediction, c(0.6324540, 3.6218479, -0.4568263), tolerance = 1e-4) # Test model save/load diff --git a/R/pkg/tests/fulltests/test_mllib_tree.R b/R/pkg/tests/fulltests/test_mllib_tree.R index facd3a9..ad68700 100644 --- a/R/pkg/tests/fulltests/test_mllib_tree.R +++ b/R/pkg/tests/fulltests/test_mllib_tree.R @@ -148,10 +148,10 @@ test_that("spark.randomForest", { model <- spark.randomForest(data, Employed ~ ., "regression", maxDepth = 5, maxBins = 16, numTrees = 20, seed = 123) predictions <- collect(predict(model, data)) - expect_equal(predictions$prediction, c(60.32820, 61.22315, 60.69025, 62.11070, - 63.53160, 64.05470, 65.12710, 64.30450, - 66.70910, 67.86125, 68.08700, 67.21865, - 68.89275, 69.53180, 69.39640, 69.68250), + expect_equal(predictions$prediction, c(60.32495, 61.06495, 60.52120, 61.98500, + 63.64450, 64.21910, 65.00810, 64.30450, + 66.70910, 67.96875, 68.22140, 67.21865, + 68.89275, 69.55900, 69.30160, 69.93050), tolerance = 1e-4) stats <- summary(model) expect_equal(stats$numTrees, 20) diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index cebd0f8..2394f74 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -1786,9 +1786,9 @@ test_that("column binary mathfunctions", { expect_equal(collect(select(df, shiftRight(df$b, 1)))[4, 1], 4) expect_equal(collect(select(df, shiftRightUnsigned(df$b, 1)))[4, 1], 4) expect_equal(class(collect(select(df, rand()))[2, 1]), "numeric") - expect_equal(collect(select(df, rand(1)))[1, 1], 0.134, tolerance = 0.01) + expect_equal(collect(select(df, rand(1)))[1, 1], 0.636, tolerance = 0.01) expect_equal(class(collect(select(df, randn()))[2, 1]), "numeric") - expect_equal(collect(select(df, randn(1)))[1, 1], -1.03, tolerance = 0.01) + expect_equal(collect(select(df, randn(1)))[1, 1], 1.68, tolerance = 0.01) }) test_that("string operators", { @@ -2360,7 +2360,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { expect_equal(names(joined3), c("age", "name", "name", "test")) expect_equal(count(joined3), 4) expect_true(is.na(collect(orderBy(joined3, joined3$age))$age[2])) - + joined4 <- join(df, df2, df$name == df2$name, "right_outer") expect_equal(names(joined4), c("age", "name", "name", "test")) expect_equal(count(joined4), 4) @@ -2377,19 +2377,19 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { expect_equal(names(joined6), c("newAge", "name", "test")) expect_equal(count(joined6), 4) expect_equal(collect(orderBy(joined6, joined6$name))$newAge[3], 24) - + joined7 <- select(join(df, df2, df$name == df2$name, "full"), alias(df$age + 5, "newAge"), df$name, df2$test) expect_equal(names(joined7), c("newAge", "name", "test")) expect_equal(count(joined7), 4) expect_equal(collect(orderBy(joined7, joined7$name))$newAge[3], 24) - + joined8 <- select(join(df, df2, df$name == df2$name, "fullouter"), alias(df$age + 5, "newAge"), df$name, df2$test) expect_equal(names(joined8), c("newAge", "name", "test")) expect_equal(count(joined8), 4) expect_equal(collect(orderBy(joined8, joined8$name))$newAge[3], 24) - + joined9 <- select(join(df, df2, df$name == df2$name, "full_outer"), alias(df$age + 5, "newAge"), df$name, df2$test) expect_equal(names(joined9), c("newAge", "name", "test")) @@ -2400,12 +2400,12 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { expect_equal(names(joined10), c("age", "name", "name", "test")) expect_equal(count(joined10), 3) expect_true(is.na(collect(orderBy(joined10, joined10$age))$age[1])) - + joined11 <- join(df, df2, df$name == df2$name, "leftouter") expect_equal(names(joined11), c("age", "name", "name", "test")) expect_equal(count(joined11), 3) expect_true(is.na(collect(orderBy(joined11, joined11$age))$age[1])) - + joined12 <- join(df, df2, df$name == df2$name, "left_outer") expect_equal(names(joined12), c("age", "name", "name", "test")) expect_equal(count(joined12), 3) @@ -2418,23 +2418,23 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { joined14 <- join(df, df2, df$name == df2$name, "semi") expect_equal(names(joined14), c("age", "name")) expect_equal(count(joined14), 3) - + joined14 <- join(df, df2, df$name == df2$name, "leftsemi") expect_equal(names(joined14), c("age", "name")) expect_equal(count(joined14), 3) - + joined15 <- join(df, df2, df$name == df2$name, "left_semi") expect_equal(names(joined15), c("age", "name")) expect_equal(count(joined15), 3) - + joined16 <- join(df2, df, df2$name == df$name, "anti") expect_equal(names(joined16), c("name", "test")) expect_equal(count(joined16), 1) - + joined17 <- join(df2, df, df2$name == df$name, "leftanti") expect_equal(names(joined17), c("name", "test")) expect_equal(count(joined17), 1) - + joined18 <- join(df2, df, df2$name == df$name, "left_anti") expect_equal(names(joined18), c("name", "test")) expect_equal(count(joined18), 1) @@ -2444,7 +2444,7 @@ test_that("join(), crossJoin() and merge() on a DataFrame", { "'left', 'leftouter', 'left_outer', 'right', 'rightouter', 'right_outer',", "'semi', 'leftsemi', 'left_semi', 'anti', 'leftanti' or 'left_anti'.") expect_error(join(df2, df, df2$name == df$name, "invalid"), error_msg) - + merged <- merge(df, df2, by.x = "name", by.y = "name", all.x = TRUE, all.y = TRUE) expect_equal(count(merged), 4) expect_equal(names(merged), c("age", "name_x", "name_y", "test")) @@ -3026,7 +3026,7 @@ test_that("sampleBy() on a DataFrame", { sample <- sampleBy(df, "key", fractions, 0) result <- collect(orderBy(count(groupBy(sample, "key")), "key")) expect_identical(as.list(result[1, ]), list(key = "0", count = 3)) - expect_identical(as.list(result[2, ]), list(key = "1", count = 7)) + expect_identical(as.list(result[2, ]), list(key = "1", count = 8)) }) test_that("approxQuantile() on a DataFrame", { diff --git a/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala b/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala index e472756..af09e50 100644 --- a/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala +++ b/core/src/main/scala/org/apache/spark/util/random/XORShiftRandom.scala @@ -59,7 +59,7 @@ private[spark] object XORShiftRandom { /** Hash seeds to have 0/1 bits throughout. */ private[random] def hashSeed(seed: Long): Long = { - val bytes = ByteBuffer.allocate(java.lang.Long.SIZE).putLong(seed).array() + val bytes = ByteBuffer.allocate(java.lang.Long.BYTES).putLong(seed).array() val lowBits = MurmurHash3.bytesHash(bytes) val highBits = MurmurHash3.bytesHash(bytes, lowBits) (highBits.toLong << 32) | (lowBits.toLong & 0xFFFFFFFFL) diff --git a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java index f979f9e..a8252e0 100644 --- a/core/src/test/java/test/org/apache/spark/JavaAPISuite.java +++ b/core/src/test/java/test/org/apache/spark/JavaAPISuite.java @@ -32,6 +32,8 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.concurrent.*; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import org.apache.spark.Partitioner; import org.apache.spark.SparkConf; @@ -156,13 +158,16 @@ public class JavaAPISuite implements Serializable { @Test public void sample() { - List<Integer> ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + List<Integer> ints = IntStream.iterate(1, x -> x + 1) + .limit(20) + .boxed() + .collect(Collectors.toList()); JavaRDD<Integer> rdd = sc.parallelize(ints); // the seeds here are "magic" to make this work out nicely JavaRDD<Integer> sample20 = rdd.sample(true, 0.2, 8); assertEquals(2, sample20.count()); JavaRDD<Integer> sample20WithoutReplacement = rdd.sample(false, 0.2, 2); - assertEquals(2, sample20WithoutReplacement.count()); + assertEquals(4, sample20WithoutReplacement.count()); } @Test diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 945b0944..1564435 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -739,7 +739,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val dist = new BinomialDistribution(trials, p) val q = dist.cumulativeProbability(actual) withClue(s"p = $p: trials = $trials") { - assert(q >= 0.001 && q <= 0.999) + assert(0.0 < q && q < 1.0) } } } diff --git a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala index 7eb2f56..c2e3830 100644 --- a/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/random/RandomSamplerSuite.scala @@ -59,7 +59,7 @@ class RandomSamplerSuite extends SparkFunSuite with Matchers { // will always fail with some nonzero probability, so I'll fix the seed to prevent these // tests from generating random failure noise in CI testing, etc. val rngSeed: Random = RandomSampler.newDefaultRNG - rngSeed.setSeed(235711) + rngSeed.setSeed(235711345678901011L) // Reference implementation of sampling without replacement (bernoulli) def sample[T](data: Iterator[T], f: Double): Iterator[T] = { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index cd59900..379e14f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -345,7 +345,7 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest { test("Tests of feature subset strategy") { val numClasses = 2 val gbt = new GBTClassifier() - .setSeed(123) + .setSeed(42) .setMaxDepth(3) .setMaxIter(5) .setFeatureSubsetStrategy("all") diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala index 2499892..9af7fff 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala @@ -664,18 +664,16 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { coefficients = coef(glmnet(features, label, weights=w, family="binomial", alpha = 0, lambda = 0)) coefficients - $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 2.7355261 - data.V3 -0.5734389 - data.V4 0.8911736 - data.V5 -0.3878645 - data.V6 -0.8060570 - + (Intercept) 2.7114519 + data.V3 -0.5667801 + data.V4 0.8818754 + data.V5 -0.3882505 + data.V6 -0.7891183 */ - val coefficientsR = Vectors.dense(-0.5734389, 0.8911736, -0.3878645, -0.8060570) - val interceptR = 2.7355261 + val coefficientsR = Vectors.dense(-0.5667801, 0.8818754, -0.3882505, -0.7891183) + val interceptR = 2.7114519 assert(model1.intercept ~== interceptR relTol 1E-3) assert(model1.coefficients ~= coefficientsR relTol 1E-3) @@ -707,7 +705,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val model2 = trainer2.fit(binaryDataset) // The solution is generated by https://github.com/yanboliang/bound-optimization. - val coefficientsExpected1 = Vectors.dense(0.06079437, 0.0, -0.26351059, -0.59102199) + val coefficientsExpected1 = Vectors.dense( + 0.05997387390575594, 0.0, -0.26536616889454984, -0.5793842425088045) val interceptExpected1 = 1.0 assert(model1.intercept ~== interceptExpected1 relTol 1E-3) @@ -742,8 +741,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val model4 = trainer4.fit(binaryDataset) // The solution is generated by https://github.com/yanboliang/bound-optimization. - val coefficientsExpected3 = Vectors.dense(0.0, 0.0, 0.0, -0.71708632) - val interceptExpected3 = 0.58776113 + val coefficientsExpected3 = Vectors.dense(0.0, 0.0, 0.0, -0.7003382019888361) + val interceptExpected3 = 0.5673234605102715 assert(model3.intercept ~== interceptExpected3 relTol 1E-3) assert(model3.coefficients ~= coefficientsExpected3 relTol 1E-3) @@ -775,8 +774,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. // It should be same as unbound constrained optimization with LBFGS. - val coefficientsExpected5 = Vectors.dense(-0.5734389, 0.8911736, -0.3878645, -0.8060570) - val interceptExpected5 = 2.7355261 + val coefficientsExpected5 = Vectors.dense( + -0.5667990118366208, 0.8819300812352234, -0.38825593561750166, -0.7891233856979563) + val interceptExpected5 = 2.711413425425 assert(model5.intercept ~== interceptExpected5 relTol 1E-3) assert(model5.coefficients ~= coefficientsExpected5 relTol 1E-3) @@ -810,13 +810,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 5 x 1 sparse Matrix of class "dgCMatrix" s0 (Intercept) . - data.V3 -0.3448461 - data.V4 1.2776453 - data.V5 -0.3539178 - data.V6 -0.7469384 + data.V3 -0.3451301 + data.V4 1.2721785 + data.V5 -0.3537743 + data.V6 -0.7315618 */ - val coefficientsR = Vectors.dense(-0.3448461, 1.2776453, -0.3539178, -0.7469384) + val coefficientsR = Vectors.dense(-0.3451301, 1.2721785, -0.3537743, -0.7315618) assert(model1.intercept ~== 0.0 relTol 1E-3) assert(model1.coefficients ~= coefficientsR relTol 1E-2) @@ -844,7 +844,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val model2 = trainer2.fit(binaryDataset) // The solution is generated by https://github.com/yanboliang/bound-optimization. - val coefficientsExpected = Vectors.dense(0.20847553, 0.0, -0.24240289, -0.55568071) + val coefficientsExpected = Vectors.dense( + 0.20721074484293306, 0.0, -0.24389739190279183, -0.5446655961212726) assert(model1.intercept ~== 0.0 relTol 1E-3) assert(model1.coefficients ~= coefficientsExpected relTol 1E-3) @@ -877,15 +878,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) -0.06775980 + (Intercept) -0.07157076 data.V3 . data.V4 . - data.V5 -0.03933146 - data.V6 -0.03047580 + data.V5 -0.04058143 + data.V6 -0.02322760 */ - val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.03933146, -0.03047580) - val interceptRStd = -0.06775980 + val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.04058143, -0.02322760) + val interceptRStd = -0.07157076 assert(model1.intercept ~== interceptRStd relTol 1E-2) assert(model1.coefficients ~= coefficientsRStd absTol 2E-2) @@ -904,15 +905,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 0.3544768 + (Intercept) 0.3602029 data.V3 . data.V4 . - data.V5 -0.1626191 + data.V5 -0.1635707 data.V6 . */ - val coefficientsR = Vectors.dense(0.0, 0.0, -0.1626191, 0.0) - val interceptR = 0.3544768 + val coefficientsR = Vectors.dense(0.0, 0.0, -0.1635707, 0.0) + val interceptR = 0.3602029 assert(model2.intercept ~== interceptR relTol 1E-2) assert(model2.coefficients ~== coefficientsR absTol 1E-3) @@ -945,8 +946,8 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { (Intercept) . data.V3 . data.V4 . - data.V5 -0.04967635 - data.V6 -0.04757757 + data.V5 -0.05164150 + data.V6 -0.04079129 coefficients 5 x 1 sparse Matrix of class "dgCMatrix" @@ -954,13 +955,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { (Intercept) . data.V3 . data.V4 . - data.V5 -0.08433195 + data.V5 -0.08408014 data.V6 . */ - val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.04967635, -0.04757757) + val coefficientsRStd = Vectors.dense(0.0, 0.0, -0.05164150, -0.04079129) - val coefficientsR = Vectors.dense(0.0, 0.0, -0.08433195, 0.0) + val coefficientsR = Vectors.dense(0.0, 0.0, -0.08408014, 0.0) assert(model1.intercept ~== 0.0 absTol 1E-3) assert(model1.coefficients ~= coefficientsRStd absTol 1E-3) @@ -992,26 +993,26 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { coefficientsStd 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 0.12707703 - data.V3 -0.06980967 - data.V4 0.10803933 - data.V5 -0.04800404 - data.V6 -0.10165096 + (Intercept) 0.12943705 + data.V3 -0.06979418 + data.V4 0.10691465 + data.V5 -0.04835674 + data.V6 -0.09939108 coefficients 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 0.46613016 - data.V3 -0.04944529 - data.V4 0.02326772 - data.V5 -0.11362772 - data.V6 -0.06312848 + (Intercept) 0.47553535 + data.V3 -0.05058465 + data.V4 0.02296823 + data.V5 -0.11368284 + data.V6 -0.06309008 */ - val coefficientsRStd = Vectors.dense(-0.06980967, 0.10803933, -0.04800404, -0.10165096) - val interceptRStd = 0.12707703 - val coefficientsR = Vectors.dense(-0.04944529, 0.02326772, -0.11362772, -0.06312848) - val interceptR = 0.46613016 + val coefficientsRStd = Vectors.dense(-0.06979418, 0.10691465, -0.04835674, -0.09939108) + val interceptRStd = 0.12943705 + val coefficientsR = Vectors.dense(-0.05058465, 0.02296823, -0.11368284, -0.06309008) + val interceptR = 0.47553535 assert(model1.intercept ~== interceptRStd relTol 1E-3) assert(model1.coefficients ~= coefficientsRStd relTol 1E-3) @@ -1042,10 +1043,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val model2 = trainer2.fit(binaryDataset) // The solution is generated by https://github.com/yanboliang/bound-optimization. - val coefficientsExpectedWithStd = Vectors.dense(-0.06985003, 0.0, -0.04794278, -0.10168595) - val interceptExpectedWithStd = 0.45750141 - val coefficientsExpected = Vectors.dense(-0.0494524, 0.0, -0.11360797, -0.06313577) - val interceptExpected = 0.53722967 + val coefficientsExpectedWithStd = Vectors.dense( + -0.06974410278847253, 0.0, -0.04833486093952599, -0.09941770618793982) + val interceptExpectedWithStd = 0.4564981350661977 + val coefficientsExpected = Vectors.dense( + -0.050579069523730306, 0.0, -0.11367447252893222, -0.06309435539607525) + val interceptExpected = 0.5457873335999178 assert(model1.intercept ~== interceptExpectedWithStd relTol 1E-3) assert(model1.coefficients ~= coefficientsExpectedWithStd relTol 1E-3) @@ -1078,23 +1081,24 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 5 x 1 sparse Matrix of class "dgCMatrix" s0 (Intercept) . - data.V3 -0.06000152 - data.V4 0.12598737 - data.V5 -0.04669009 - data.V6 -0.09941025 + data.V3 -0.05998915 + data.V4 0.12541885 + data.V5 -0.04697872 + data.V6 -0.09713973 coefficients 5 x 1 sparse Matrix of class "dgCMatrix" s0 (Intercept) . - data.V3 -0.005482255 - data.V4 0.048106338 - data.V5 -0.093411640 - data.V6 -0.054149798 + data.V3 -0.005927466 + data.V4 0.048313659 + data.V5 -0.092956052 + data.V6 -0.053974895 */ - val coefficientsRStd = Vectors.dense(-0.06000152, 0.12598737, -0.04669009, -0.09941025) - val coefficientsR = Vectors.dense(-0.005482255, 0.048106338, -0.093411640, -0.054149798) + val coefficientsRStd = Vectors.dense(-0.05998915, 0.12541885, -0.04697872, -0.09713973) + val coefficientsR = Vectors.dense( + -0.0059320221190687205, 0.04834399477383437, -0.09296353778288495, -0.05398080548228108) assert(model1.intercept ~== 0.0 absTol 1E-3) assert(model1.coefficients ~= coefficientsRStd relTol 1E-2) @@ -1122,8 +1126,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { val model2 = trainer2.fit(binaryDataset) // The solution is generated by https://github.com/yanboliang/bound-optimization. - val coefficientsExpectedWithStd = Vectors.dense(-0.00796538, 0.0, -0.0394228, -0.0873314) - val coefficientsExpected = Vectors.dense(0.01105972, 0.0, -0.08574949, -0.05079558) + val coefficientsExpectedWithStd = Vectors.dense( + -0.00845365508769699, 0.0, -0.03954848648474558, -0.0851639471468608) + val coefficientsExpected = Vectors.dense( + 0.010675769768102661, 0.0, -0.0852582080623827, -0.050615535080106376) assert(model1.intercept ~== 0.0 relTol 1E-3) assert(model1.coefficients ~= coefficientsExpectedWithStd relTol 1E-3) @@ -1134,7 +1140,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { test("binary logistic regression with intercept with ElasticNet regularization") { val trainer1 = (new LogisticRegression).setFitIntercept(true).setMaxIter(120) .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(true).setWeightCol("weight") - val trainer2 = (new LogisticRegression).setFitIntercept(true).setMaxIter(30) + val trainer2 = (new LogisticRegression).setFitIntercept(true).setMaxIter(60) .setElasticNetParam(0.38).setRegParam(0.21).setStandardization(false).setWeightCol("weight") val model1 = trainer1.fit(binaryDataset) @@ -1155,26 +1161,26 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { coefficientsStd 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 0.49991996 - data.V3 -0.04131110 + (Intercept) 0.51344133 + data.V3 -0.04395595 data.V4 . - data.V5 -0.08585233 - data.V6 -0.15875400 + data.V5 -0.08699271 + data.V6 -0.15249200 coefficients 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) 0.5024256 + (Intercept) 0.50936159 data.V3 . data.V4 . - data.V5 -0.1846038 - data.V6 -0.0559614 + data.V5 -0.18569346 + data.V6 -0.05625862 */ - val coefficientsRStd = Vectors.dense(-0.04131110, 0.0, -0.08585233, -0.15875400) - val interceptRStd = 0.49991996 - val coefficientsR = Vectors.dense(0.0, 0.0, -0.1846038, -0.0559614) - val interceptR = 0.5024256 + val coefficientsRStd = Vectors.dense(-0.04395595, 0.0, -0.08699271, -0.15249200) + val interceptRStd = 0.51344133 + val coefficientsR = Vectors.dense(0.0, 0.0, -0.18569346, -0.05625862) + val interceptR = 0.50936159 assert(model1.intercept ~== interceptRStd relTol 6E-2) assert(model1.coefficients ~== coefficientsRStd absTol 5E-3) @@ -1285,13 +1291,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 5 x 1 sparse Matrix of class "dgCMatrix" s0 - (Intercept) -0.2516986 + (Intercept) -0.2521953 data.V3 0.0000000 data.V4 . data.V5 . data.V6 . */ - val interceptR = -0.2516986 + val interceptR = -0.2521953 val coefficientsR = Vectors.dense(0.0, 0.0, 0.0, 0.0) assert(model1.intercept ~== interceptR relTol 1E-5) @@ -1373,37 +1379,36 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -2.10320093 - data.V3 0.24337896 - data.V4 -0.05916156 - data.V5 0.14446790 - data.V6 0.35976165 + -2.22347257 + data.V3 0.24574397 + data.V4 -0.04054235 + data.V5 0.14963756 + data.V6 0.37504027 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.3394473 - data.V3 -0.3443375 - data.V4 0.9181331 - data.V5 -0.2283959 - data.V6 -0.4388066 + 0.3674309 + data.V3 -0.3266910 + data.V4 0.8939282 + data.V5 -0.2363519 + data.V6 -0.4631336 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 1.76375361 - data.V3 0.10095851 - data.V4 -0.85897154 - data.V5 0.08392798 - data.V6 0.07904499 - + 1.85604170 + data.V3 0.08094703 + data.V4 -0.85338588 + data.V5 0.08671439 + data.V6 0.08809332 */ val coefficientsR = new DenseMatrix(3, 4, Array( - 0.24337896, -0.05916156, 0.14446790, 0.35976165, - -0.3443375, 0.9181331, -0.2283959, -0.4388066, - 0.10095851, -0.85897154, 0.08392798, 0.07904499), isTransposed = true) - val interceptsR = Vectors.dense(-2.10320093, 0.3394473, 1.76375361) + 0.24574397, -0.04054235, 0.14963756, 0.37504027, + -0.3266910, 0.8939282, -0.2363519, -0.4631336, + 0.08094703, -0.85338588, 0.08671439, 0.08809332), isTransposed = true) + val interceptsR = Vectors.dense(-2.22347257, 0.3674309, 1.85604170) model1.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps)) model2.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps)) @@ -1496,10 +1501,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. val coefficientsExpected1 = new DenseMatrix(3, 4, Array( - 2.52076464, 2.73596057, 1.87984904, 2.73264492, - 1.93302281, 3.71363303, 1.50681746, 1.93398782, - 2.37839917, 1.93601818, 1.81924758, 2.45191255), isTransposed = true) - val interceptsExpected1 = Vectors.dense(1.00010477, 3.44237083, 4.86740286) + 2.1156620676212325, 2.7146375863138825, 1.8108730417428125, 2.711975470258063, + 1.54314110882009, 3.648963914233324, 1.4248901324480239, 1.8737908246138315, + 1.950852726788052, 1.9017484391817425, 1.7479497661988832, 2.425055298693075), + isTransposed = true) + val interceptsExpected1 = Vectors.dense( + 1.0000152482448372, 3.591773288423673, 5.079685953744937) checkCoefficientsEquivalent(model1.coefficientMatrix, coefficientsExpected1) assert(model1.interceptVector ~== interceptsExpected1 relTol 0.01) @@ -1532,9 +1539,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. val coefficientsExpected3 = new DenseMatrix(3, 4, Array( - 1.61967097, 1.16027835, 1.45131448, 1.97390431, - 1.30529317, 2.0, 1.12985473, 1.26652854, - 1.61647195, 1.0, 1.40642959, 1.72985589), isTransposed = true) + 1.641980508924569, 1.1579023489264648, 1.434651352010351, 1.9541352988127463, + 1.3416273422126057, 2.0, 1.1014102844446283, 1.2076556940852765, + 1.6371808928302913, 1.0, 1.3936094723717016, 1.71022540576362), + isTransposed = true) val interceptsExpected3 = Vectors.dense(1.0, 2.0, 2.0) checkCoefficientsEquivalent(model3.coefficientMatrix, coefficientsExpected3) @@ -1566,10 +1574,12 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. // It should be same as unbound constrained optimization with LBFGS. val coefficientsExpected5 = new DenseMatrix(3, 4, Array( - 0.24337896, -0.05916156, 0.14446790, 0.35976165, - -0.3443375, 0.9181331, -0.2283959, -0.4388066, - 0.10095851, -0.85897154, 0.08392798, 0.07904499), isTransposed = true) - val interceptsExpected5 = Vectors.dense(-2.10320093, 0.3394473, 1.76375361) + 0.24573204902629314, -0.040610820463585905, 0.14962716893619094, 0.37502549108817784, + -0.3266914048842952, 0.8940567211111817, -0.23633898260880218, -0.4631024664883818, + 0.08095935585808962, -0.8534459006476851, 0.0867118136726069, 0.0880769754002182), + isTransposed = true) + val interceptsExpected5 = Vectors.dense( + -2.2231282183460723, 0.3669496747012527, 1.856178543644802) checkCoefficientsEquivalent(model5.coefficientMatrix, coefficientsExpected5) assert(model5.interceptVector ~== interceptsExpected5 relTol 0.01) @@ -1602,35 +1612,35 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.07276291 - data.V4 -0.36325496 - data.V5 0.12015088 - data.V6 0.31397340 + data.V3 0.06892068 + data.V4 -0.36546704 + data.V5 0.12274583 + data.V6 0.32616580 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.3180040 - data.V4 0.9679074 - data.V5 -0.2252219 - data.V6 -0.4319914 + data.V3 -0.2987384 + data.V4 0.9483147 + data.V5 -0.2328113 + data.V6 -0.4555157 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.2452411 - data.V4 -0.6046524 - data.V5 0.1050710 - data.V6 0.1180180 + data.V3 0.2298177 + data.V4 -0.5828477 + data.V5 0.1100655 + data.V6 0.1293499 */ val coefficientsR = new DenseMatrix(3, 4, Array( - 0.07276291, -0.36325496, 0.12015088, 0.31397340, - -0.3180040, 0.9679074, -0.2252219, -0.4319914, - 0.2452411, -0.6046524, 0.1050710, 0.1180180), isTransposed = true) + 0.06892068, -0.36546704, 0.12274583, 0.32616580, + -0.2987384, 0.9483147, -0.2328113, -0.4555157, + 0.2298177, -0.5828477, 0.1100655, 0.1293499), isTransposed = true) model1.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps)) model2.coefficientMatrix.colIter.foreach(v => assert(v.toArray.sum ~== 0.0 absTol eps)) @@ -1664,9 +1674,10 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. val coefficientsExpected = new DenseMatrix(3, 4, Array( - 1.62410051, 1.38219391, 1.34486618, 1.74641729, - 1.23058989, 2.71787825, 1.0, 1.00007073, - 1.79478632, 1.14360459, 1.33011603, 1.55093897), isTransposed = true) + 1.5933935326002155, 1.4427758360562475, 1.356079506266844, 1.7818682794856215, + 1.2224266732592248, 2.762691362720858, 1.0005885171478472, 1.0000022613855966, + 1.7524631428961193, 1.2292565990448736, 1.3433784431904323, 1.5846063017678864), + isTransposed = true) checkCoefficientsEquivalent(model1.coefficientMatrix, coefficientsExpected) assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) @@ -1703,27 +1714,27 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.62244703 + -0.69265374 data.V3 . data.V4 . data.V5 . - data.V6 0.08419825 + data.V6 0.09064661 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.2804845 - data.V3 -0.1336960 - data.V4 0.3717091 - data.V5 -0.1530363 - data.V6 -0.2035286 + -0.2260274 + data.V3 -0.1144333 + data.V4 0.3204703 + data.V5 -0.1621061 + data.V6 -0.2308192 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.9029315 + 0.9186811 data.V3 . - data.V4 -0.4629737 + data.V4 -0.4832131 data.V5 . data.V6 . @@ -1732,25 +1743,25 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.44215290 + -0.44707756 data.V3 . data.V4 . - data.V5 0.01767089 - data.V6 0.02542866 + data.V5 0.01641412 + data.V6 0.03570376 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.76308326 - data.V3 -0.06818576 + 0.75180900 + data.V3 -0.05110822 data.V4 . - data.V5 -0.20446351 - data.V6 -0.13017924 + data.V5 -0.21595670 + data.V6 -0.16162836 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.3209304 + -0.3047314 data.V3 . data.V4 . data.V5 . @@ -1759,15 +1770,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.08419825, - -0.1336960, 0.3717091, -0.1530363, -0.2035286, - 0.0, -0.4629737, 0.0, 0.0), isTransposed = true) - val interceptsRStd = Vectors.dense(-0.62244703, -0.2804845, 0.9029315) + 0.0, 0.0, 0.0, 0.09064661, + -0.1144333, 0.3204703, -0.1621061, -0.2308192, + 0.0, -0.4832131, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.72638218, -0.01737265, 0.74375484) val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.01767089, 0.02542866, - -0.06818576, 0.0, -0.20446351, -0.13017924, + 0.0, 0.0, 0.01641412, 0.03570376, + -0.05110822, 0.0, -0.21595670, -0.16162836, 0.0, 0.0, 0.0, 0.0), isTransposed = true) - val interceptsR = Vectors.dense(-0.44215290, 0.76308326, -0.3209304) + val interceptsR = Vectors.dense(-0.44707756, 0.75180900, -0.3047314) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.05) assert(model1.interceptVector ~== interceptsRStd relTol 0.1) @@ -1800,31 +1811,30 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { coefficientsStd $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 + s0 . data.V3 . data.V4 . data.V5 . - data.V6 0.01144225 + data.V6 0.01167 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.1678787 - data.V4 0.5385351 - data.V5 -0.1573039 - data.V6 -0.2471624 + data.V3 -0.1413518 + data.V4 0.5100469 + data.V5 -0.1658025 + data.V6 -0.2755998 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - . - data.V3 . - data.V4 . - data.V5 . - data.V6 . - + s0 + . + data.V3 0.001536337 + data.V4 . + data.V5 . + data.V6 . coefficients $`0` @@ -1841,9 +1851,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { s0 . data.V3 . - data.V4 0.1929409 - data.V5 -0.1889121 - data.V6 -0.1010413 + data.V4 0.2094410 + data.V5 -0.1944582 + data.V6 -0.1307681 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" @@ -1857,13 +1867,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.01144225, - -0.1678787, 0.5385351, -0.1573039, -0.2471624, - 0.0, 0.0, 0.0, 0.0), isTransposed = true) + 0.0, 0.0, 0.0, 0.01167, + -0.1413518, 0.5100469, -0.1658025, -0.2755998, + 0.001536337, 0.0, 0.0, 0.0), isTransposed = true) val coefficientsR = new DenseMatrix(3, 4, Array( 0.0, 0.0, 0.0, 0.0, - 0.0, 0.1929409, -0.1889121, -0.1010413, + 0.0, 0.2094410, -0.1944582, -0.1307681, 0.0, 0.0, 0.0, 0.0), isTransposed = true) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) @@ -1897,72 +1907,71 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { coefficientsStd $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -1.5898288335 - data.V3 0.1691226336 - data.V4 0.0002983651 - data.V5 0.1001732896 - data.V6 0.2554575585 + s0 + -1.68571384 + data.V3 0.17156077 + data.V4 0.01658014 + data.V5 0.10303296 + data.V6 0.26459585 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.2125746 - data.V3 -0.2304586 - data.V4 0.6153492 - data.V5 -0.1537017 - data.V6 -0.2975443 + 0.2364585 + data.V3 -0.2182805 + data.V4 0.5960025 + data.V5 -0.1587441 + data.V6 -0.3121284 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 1.37725427 - data.V3 0.06133600 - data.V4 -0.61564761 - data.V5 0.05352840 - data.V6 0.04208671 - + 1.44925536 + data.V3 0.04671972 + data.V4 -0.61258267 + data.V5 0.05571116 + data.V6 0.04753251 coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -1.5681088 - data.V3 0.1508182 - data.V4 0.0121955 - data.V5 0.1217930 - data.V6 0.2162850 + s0 + -1.65140201 + data.V3 0.15446206 + data.V4 0.02134769 + data.V5 0.12524946 + data.V6 0.22607972 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 1.1217130 - data.V3 -0.2028984 - data.V4 0.2862431 - data.V5 -0.1843559 - data.V6 -0.2481218 + 1.1367722 + data.V3 -0.1931713 + data.V4 0.2766548 + data.V5 -0.1910455 + data.V6 -0.2629336 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.44639579 - data.V3 0.05208012 - data.V4 -0.29843864 - data.V5 0.06256289 - data.V6 0.03183676 + 0.51462979 + data.V3 0.03870921 + data.V4 -0.29800245 + data.V5 0.06579606 + data.V6 0.03685390 */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.1691226336, 0.0002983651, 0.1001732896, 0.2554575585, - -0.2304586, 0.6153492, -0.1537017, -0.2975443, - 0.06133600, -0.61564761, 0.05352840, 0.04208671), isTransposed = true) - val interceptsRStd = Vectors.dense(-1.5898288335, 0.2125746, 1.37725427) + 0.17156077, 0.01658014, 0.10303296, 0.26459585, + -0.2182805, 0.5960025, -0.1587441, -0.3121284, + 0.04671972, -0.61258267, 0.05571116, 0.04753251), isTransposed = true) + val interceptsRStd = Vectors.dense(-1.68571384, 0.2364585, 1.44925536) val coefficientsR = new DenseMatrix(3, 4, Array( - 0.1508182, 0.0121955, 0.1217930, 0.2162850, - -0.2028984, 0.2862431, -0.1843559, -0.2481218, - 0.05208012, -0.29843864, 0.06256289, 0.03183676), isTransposed = true) - val interceptsR = Vectors.dense(-1.5681088, 1.1217130, 0.44639579) + 0.15446206, 0.02134769, 0.12524946, 0.22607972, + -0.1931713, 0.2766548, -0.1910455, -0.2629336, + 0.03870921, -0.29800245, 0.06579606, 0.03685390), isTransposed = true) + val interceptsR = Vectors.dense(-1.65140201, 1.1367722, 0.51462979) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.001) assert(model1.interceptVector ~== interceptsRStd relTol 0.05) @@ -1996,15 +2005,16 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { // The solution is generated by https://github.com/yanboliang/bound-optimization. val coefficientsExpectedWithStd = new DenseMatrix(3, 4, Array( - 1.0, 1.0, 1.0, 1.01647497, - 1.0, 1.44105616, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.025970328910313, + 1.0, 1.4150672323873024, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0), isTransposed = true) - val interceptsExpectedWithStd = Vectors.dense(2.52055893, 1.0, 2.560682) + val interceptsExpectedWithStd = Vectors.dense( + 2.4259954221861473, 1.0000087410832004, 2.490461716522559) val coefficientsExpected = new DenseMatrix(3, 4, Array( - 1.0, 1.0, 1.03189386, 1.0, + 1.0, 1.0, 1.0336746541813002, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0), isTransposed = true) - val interceptsExpected = Vectors.dense(1.06418835, 1.0, 1.20494701) + val interceptsExpected = Vectors.dense(1.0521598454128, 1.0, 1.213158241431565) assert(model1.coefficientMatrix ~== coefficientsExpectedWithStd relTol 0.01) assert(model1.interceptVector ~== interceptsExpectedWithStd relTol 0.01) @@ -2037,69 +2047,68 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.04048126 - data.V4 -0.23075758 - data.V5 0.08228864 - data.V6 0.22277648 + data.V3 0.03804571 + data.V4 -0.23204409 + data.V5 0.08337512 + data.V6 0.23029089 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.2149745 - data.V4 0.6478666 - data.V5 -0.1515158 - data.V6 -0.2930498 + data.V3 -0.2015495 + data.V4 0.6328705 + data.V5 -0.1562475 + data.V6 -0.3071447 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.17449321 - data.V4 -0.41710901 - data.V5 0.06922716 - data.V6 0.07027332 - + data.V3 0.16350376 + data.V4 -0.40082637 + data.V5 0.07287239 + data.V6 0.07685379 coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.003949652 - data.V4 -0.142982415 - data.V5 0.091439598 - data.V6 0.179286241 + data.V3 -0.006493452 + data.V4 -0.143831823 + data.V5 0.092538445 + data.V6 0.187244839 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.09071124 - data.V4 0.39752531 - data.V5 -0.16233832 - data.V6 -0.22206059 + data.V3 -0.08068443 + data.V4 0.39038929 + data.V5 -0.16822390 + data.V6 -0.23667470 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.09466090 - data.V4 -0.25454290 - data.V5 0.07089872 - data.V6 0.04277435 + data.V3 0.08717788 + data.V4 -0.24655746 + data.V5 0.07568546 + data.V6 0.04942986 */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.04048126, -0.23075758, 0.08228864, 0.22277648, - -0.2149745, 0.6478666, -0.1515158, -0.2930498, - 0.17449321, -0.41710901, 0.06922716, 0.07027332), isTransposed = true) + 0.03804571, -0.23204409, 0.08337512, 0.23029089, + -0.2015495, 0.6328705, -0.1562475, -0.3071447, + 0.16350376, -0.40082637, 0.07287239, 0.07685379), isTransposed = true) val coefficientsR = new DenseMatrix(3, 4, Array( - -0.003949652, -0.142982415, 0.091439598, 0.179286241, - -0.09071124, 0.39752531, -0.16233832, -0.22206059, - 0.09466090, -0.25454290, 0.07089872, 0.04277435), isTransposed = true) + -0.006493452, -0.143831823, 0.092538445, 0.187244839, + -0.08068443, 0.39038929, -0.16822390, -0.23667470, + 0.08717788, -0.24655746, 0.07568546, 0.04942986), isTransposed = true) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) assert(model1.interceptVector.toArray === Array.fill(3)(0.0)) @@ -2150,7 +2159,7 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { .setMaxIter(220).setTol(1e-10) val trainer2 = (new LogisticRegression).setFitIntercept(true).setWeightCol("weight") .setElasticNetParam(0.5).setRegParam(0.1).setStandardization(false) - .setMaxIter(90).setTol(1e-10) + .setMaxIter(220).setTol(1e-10) val model1 = trainer1.fit(multinomialDataset) val model2 = trainer2.fit(multinomialDataset) @@ -2170,54 +2179,53 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.50133383 + -0.55325803 data.V3 . data.V4 . data.V5 . - data.V6 0.08351653 + data.V6 0.09074857 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" - s0 - -0.3151913 - data.V3 -0.1058702 - data.V4 0.3183251 - data.V5 -0.1212969 - data.V6 -0.1629778 + s0 + -0.27291366 + data.V3 -0.09093399 + data.V4 0.28078251 + data.V5 -0.12854559 + data.V6 -0.18382494 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.8165252 + 0.8261717 data.V3 . - data.V4 -0.3943069 + data.V4 -0.4064444 data.V5 . data.V6 . - coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.38857157 + -0.40016908 data.V3 . data.V4 . - data.V5 0.02384198 - data.V6 0.03127749 + data.V5 0.02312769 + data.V6 0.04159224 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - 0.62492165 - data.V3 -0.04949061 + 0.62474768 + data.V3 -0.03776471 data.V4 . - data.V5 -0.18584462 - data.V6 -0.08952455 + data.V5 -0.19588206 + data.V6 -0.11187712 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 - -0.2363501 + -0.2245786 data.V3 . data.V4 . data.V5 . @@ -2226,15 +2234,15 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.08351653, - -0.1058702, 0.3183251, -0.1212969, -0.1629778, - 0.0, -0.3943069, 0.0, 0.0), isTransposed = true) - val interceptsRStd = Vectors.dense(-0.50133383, -0.3151913, 0.8165252) + 0.0, 0.0, 0.0, 0.09074857, + -0.09093399, 0.28078251, -0.12854559, -0.18382494, + 0.0, -0.4064444, 0.0, 0.0), isTransposed = true) + val interceptsRStd = Vectors.dense(-0.55325803, -0.27291366, 0.8261717) val coefficientsR = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.02384198, 0.03127749, - -0.04949061, 0.0, -0.18584462, -0.08952455, + 0.0, 0.0, 0.02312769, 0.04159224, + -0.03776471, 0.0, -0.19588206, -0.11187712, 0.0, 0.0, 0.0, 0.0), isTransposed = true) - val interceptsR = Vectors.dense(-0.38857157, 0.62492165, -0.2363501) + val interceptsR = Vectors.dense(-0.40016908, 0.62474768, -0.2245786) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.05) assert(model1.interceptVector ~== interceptsRStd absTol 0.1) @@ -2274,27 +2282,26 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { data.V3 . data.V4 . data.V5 . - data.V6 0.03238285 + data.V6 0.03418889 $`1` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 -0.1328284 - data.V4 0.4219321 - data.V5 -0.1247544 - data.V6 -0.1893318 + data.V3 -0.1114779 + data.V4 0.3992145 + data.V5 -0.1315371 + data.V6 -0.2107956 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" s0 . - data.V3 0.004572312 + data.V3 0.006442826 data.V4 . data.V5 . data.V6 . - coefficients $`0` 5 x 1 sparse Matrix of class "dgCMatrix" @@ -2310,9 +2317,9 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { s0 . data.V3 . - data.V4 0.14571623 - data.V5 -0.16456351 - data.V6 -0.05866264 + data.V4 0.15710979 + data.V5 -0.16871602 + data.V6 -0.07928527 $`2` 5 x 1 sparse Matrix of class "dgCMatrix" @@ -2326,13 +2333,13 @@ class LogisticRegressionSuite extends MLTest with DefaultReadWriteTest { */ val coefficientsRStd = new DenseMatrix(3, 4, Array( - 0.0, 0.0, 0.0, 0.03238285, - -0.1328284, 0.4219321, -0.1247544, -0.1893318, - 0.004572312, 0.0, 0.0, 0.0), isTransposed = true) + 0.0, 0.0, 0.0, 0.03418889, + -0.1114779, 0.3992145, -0.1315371, -0.2107956, + 0.006442826, 0.0, 0.0, 0.0), isTransposed = true) val coefficientsR = new DenseMatrix(3, 4, Array( 0.0, 0.0, 0.0, 0.0, - 0.0, 0.14571623, -0.16456351, -0.05866264, + 0.0, 0.15710979, -0.16871602, -0.07928527, 0.0, 0.0, 0.0, 0.0), isTransposed = true) assert(model1.coefficientMatrix ~== coefficientsRStd absTol 0.01) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala index a5159bc..5d439a2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala @@ -167,7 +167,7 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes val model = new KMeans() .setK(3) - .setSeed(1) + .setSeed(42) .setInitMode(MLlibKMeans.RANDOM) .setTol(1e-6) .setDistanceMeasure(DistanceMeasure.COSINE) diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala index 97269ee..d3b8575 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/PowerIterationClusteringSuite.scala @@ -34,9 +34,9 @@ class PowerIterationClusteringSuite extends SparkFunSuite @transient var data: Dataset[_] = _ final val r1 = 1.0 - final val n1 = 10 + final val n1 = 80 final val r2 = 4.0 - final val n2 = 40 + final val n2 = 80 override def beforeAll(): Unit = { super.beforeAll() @@ -222,7 +222,7 @@ class PowerIterationClusteringSuite extends SparkFunSuite (0, 1), (0, 2), (3, 4) - )).toDF("src", "dst") + )).toDF("src", "dst").repartition(1) var assignments2 = new PowerIterationClustering() .setInitMode("random") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index 70d1177..d28f1f4 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -65,7 +65,7 @@ class Word2VecSuite extends MLTest with DefaultReadWriteTest { // These expectations are just magic values, characterizing the current // behavior. The test needs to be updated to be more general, see SPARK-11502 - val magicExp = Vectors.dense(0.30153007534417237, -0.6833061711354689, 0.5116530778733167) + val magicExp = Vectors.dense(-0.11654884266582402, 0.3115301721475341, -0.6879349987615239) testTransformer[(Seq[String], Vector)](docDF, model, "result", "expected") { case Row(vector1: Vector, vector2: Vector) => assert(vector1 ~== magicExp absTol 1E-5, "Transformed vector is different with expected.") @@ -98,9 +98,9 @@ class Word2VecSuite extends MLTest with DefaultReadWriteTest { // These expectations are just magic values, characterizing the current // behavior. The test needs to be updated to be more general, see SPARK-11502 val magicExpected = Seq( - Vectors.dense(0.3326166272163391, -0.5603077411651611, -0.2309209555387497), - Vectors.dense(0.32463887333869934, -0.9306551218032837, 1.393115520477295), - Vectors.dense(-0.27150997519493103, 0.4372006058692932, -0.13465698063373566) + Vectors.dense(0.12662248313426971, 0.6108677387237549, -0.006755620241165161), + Vectors.dense(-0.3870747685432434, 0.023309476673603058, -1.567158818244934), + Vectors.dense(-0.08617416769266129, -0.09897610545158386, 0.6113300323486328) ) realVectors.zip(magicExpected).foreach { @@ -122,7 +122,7 @@ class Word2VecSuite extends MLTest with DefaultReadWriteTest { .setSeed(42L) .fit(docDF) - val expected = Map(("b", 0.2608488929093532), ("c", -0.8271274846926078)) + val expected = Map(("b", -0.024012837558984756), ("c", -0.19355152547359467)) val findSynonymsResult = model.findSynonyms("a", 2).rdd.map { case Row(w: String, sim: Double) => (w, sim) }.collectAsMap() diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala index 46fa376..f35c8c6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala @@ -184,7 +184,7 @@ class GBTRegressorSuite extends MLTest with DefaultReadWriteTest { val gbt = new GBTRegressor() .setMaxDepth(3) .setMaxIter(5) - .setSeed(123) + .setSeed(42) .setFeatureSubsetStrategy("all") // In this data, feature 1 is very important. diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala index 600a432..fc1284e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala @@ -232,8 +232,8 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest print(as.vector(coef(model))) } - [1] 2.2960999 0.8087933 - [1] 2.5002642 2.2000403 0.5999485 + [1] 2.2958751 0.8088523 + [1] 2.5009266 2.1997901 0.5999522 data <- read.csv("path", header=FALSE) model1 <- glm(f1, family=gaussian(link=log), data=data, start=c(0,0)) @@ -241,8 +241,8 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest print(as.vector(coef(model1))) print(as.vector(coef(model2))) - [1] 0.23069326 0.07993778 - [1] 0.25001858 0.22002452 0.05998789 + [1] 0.23063118 0.07995495 + [1] 0.25016124 0.21995737 0.05999335 data <- read.csv("path", header=FALSE) for (formula in c(f1, f2)) { @@ -250,17 +250,17 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest print(as.vector(coef(model))) } - [1] 2.3010179 0.8198976 - [1] 2.4108902 2.2130248 0.6086152 + [1] 2.3320341 0.8121904 + [1] 2.2837064 2.2487147 0.6120262 */ val expected = Seq( - Vectors.dense(0.0, 2.2960999, 0.8087933), - Vectors.dense(2.5002642, 2.2000403, 0.5999485), - Vectors.dense(0.0, 0.23069326, 0.07993778), - Vectors.dense(0.25001858, 0.22002452, 0.05998789), - Vectors.dense(0.0, 2.3010179, 0.8198976), - Vectors.dense(2.4108902, 2.2130248, 0.6086152)) + Vectors.dense(0.0, 2.2958751, 0.8088523), + Vectors.dense(2.5009266, 2.1997901, 0.5999522), + Vectors.dense(0.0, 0.23063118, 0.07995495), + Vectors.dense(0.25016124, 0.21995737, 0.05999335), + Vectors.dense(0.0, 2.3320341, 0.8121904), + Vectors.dense(2.2837064, 2.2487147, 0.6120262)) import GeneralizedLinearRegression._ @@ -308,21 +308,21 @@ class GeneralizedLinearRegressionSuite extends MLTest with DefaultReadWriteTest } } - [1] 0.0000000 2.2961005 0.8087932 - [1] 0.0000000 2.2130368 0.8309556 - [1] 0.0000000 1.7176137 0.9610657 - [1] 2.5002642 2.2000403 0.5999485 - [1] 3.1106389 2.0935142 0.5712711 - [1] 6.7597127 1.4581054 0.3994266 + [1] 0.0000000 2.2958757 0.8088521 + [1] 0.0000000 2.2128149 0.8310136 + [1] 0.0000000 1.7174260 0.9611137 + [1] 2.5009266 2.1997901 0.5999522 + [1] 3.1113269 2.0932659 0.5712717 + [1] 6.7604302 1.4578902 0.3994153 */ val expected = Seq( - Vectors.dense(0.0, 2.2961005, 0.8087932), - Vectors.dense(0.0, 2.2130368, 0.8309556), - Vectors.dense(0.0, 1.7176137, 0.9610657), - Vectors.dense(2.5002642, 2.2000403, 0.5999485), - Vectors.dense(3.1106389, 2.0935142, 0.5712711), - Vectors.dense(6.7597127, 1.4581054, 0.3994266)) + Vectors.dense(0.0, 2.2958757, 0.8088521), + Vectors.dense(0.0, 2.2128149, 0.8310136), + Vectors.dense(0.0, 1.7174260, 0.9611137), + Vectors.dense(2.5009266, 2.1997901, 0.5999522), + Vectors.dense(3.1113269, 2.0932659, 0.5712717), + Vectors.dense(6.7604302, 1.4578902, 0.3994153)) var idx = 0 for (fitIntercept <- Seq(false, true); diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala index b33b86b..c25c89b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/PowerIterationClusteringSuite.scala @@ -47,9 +47,9 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon test("power iteration clustering") { // Generate two circles following the example in the PIC paper. val r1 = 1.0 - val n1 = 10 + val n1 = 80 val r2 = 4.0 - val n2 = 10 + val n2 = 80 val n = n1 + n2 val points = genCircle(r1, n1) ++ genCircle(r2, n2) val similarities = for (i <- 1 until n; j <- 0 until i) yield { @@ -81,9 +81,9 @@ class PowerIterationClusteringSuite extends SparkFunSuite with MLlibTestSparkCon test("power iteration clustering on graph") { // Generate two circles following the example in the PIC paper. val r1 = 1.0 - val n1 = 10 + val n1 = 80 val r2 = 4.0 - val n2 = 10 + val n2 = 80 val n = n1 + n2 val points = genCircle(r1, n1) ++ genCircle(r2, n2) val similarities = for (i <- 1 until n; j <- 0 until i) yield { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala index fdaa098..a1ac10c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/StreamingKMeansSuite.scala @@ -77,6 +77,7 @@ class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase { val k = 2 val d = 5 val r = 0.1 + val seed = 987654321 // create model with two clusters val kMeans = new StreamingKMeans() @@ -88,7 +89,7 @@ class StreamingKMeansSuite extends SparkFunSuite with TestSuiteBase { Array(5.0, 5.0)) // generate random data for k-means - val (input, centers) = StreamingKMeansDataGenerator(numPoints, numBatches, k, d, r, 42) + val (input, centers) = StreamingKMeansDataGenerator(numPoints, numBatches, k, d, r, seed) // setup and run the model training ssc = setupStreams(input, (inputDStream: DStream[Vector]) => { diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 864e2a3..6c9cf7b 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -1193,19 +1193,19 @@ class PowerIterationClustering(HasMaxIter, HasWeightCol, JavaParams, JavaMLReada ... (3, 0, 0.5), (3, 1, 0.7), (3, 2, 0.9), ... (4, 0, 0.5), (4, 1, 0.7), (4, 2, 0.9), (4, 3, 1.1), ... (5, 0, 0.5), (5, 1, 0.7), (5, 2, 0.9), (5, 3, 1.1), (5, 4, 1.3)] - >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight") + >>> df = spark.createDataFrame(data).toDF("src", "dst", "weight").repartition(1) >>> pic = PowerIterationClustering(k=2, maxIter=40, weightCol="weight") >>> assignments = pic.assignClusters(df) >>> assignments.sort(assignments.id).show(truncate=False) +---+-------+ |id |cluster| +---+-------+ - |0 |1 | - |1 |1 | - |2 |1 | - |3 |1 | - |4 |1 | - |5 |0 | + |0 |0 | + |1 |0 | + |2 |0 | + |3 |0 | + |4 |0 | + |5 |1 | +---+-------+ ... >>> pic_path = temp_path + "/pic" diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 3f9de9c..595ab18 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -3064,24 +3064,24 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has +----+--------------------+ |word| vector| +----+--------------------+ - | a|[0.09461779892444...| - | b|[1.15474212169647...| - | c|[-0.3794820010662...| + | a|[0.09511678665876...| + | b|[-1.2028766870498...| + | c|[0.30153277516365...| +----+--------------------+ ... >>> model.findSynonymsArray("a", 2) - [(u'b', 0.25053444504737854), (u'c', -0.6980510950088501)] + [(u'b', 0.015859870240092278), (u'c', -0.5680795907974243)] >>> from pyspark.sql.functions import format_number as fmt >>> model.findSynonyms("a", 2).select("word", fmt("similarity", 5).alias("similarity")).show() +----+----------+ |word|similarity| +----+----------+ - | b| 0.25053| - | c| -0.69805| + | b| 0.01586| + | c| -0.56808| +----+----------+ ... >>> model.transform(doc).head().model - DenseVector([0.5524, -0.4995, -0.3599, 0.0241, 0.3461]) + DenseVector([-0.4833, 0.1855, -0.273, -0.0509, -0.4769]) >>> word2vecPath = temp_path + "/word2vec" >>> word2Vec.save(word2vecPath) >>> loadedWord2Vec = Word2Vec.load(word2vecPath) diff --git a/python/pyspark/ml/recommendation.py b/python/pyspark/ml/recommendation.py index 520d791..bf27164 100644 --- a/python/pyspark/ml/recommendation.py +++ b/python/pyspark/ml/recommendation.py @@ -79,27 +79,27 @@ class ALS(JavaEstimator, HasCheckpointInterval, HasMaxIter, HasPredictionCol, Ha >>> test = spark.createDataFrame([(0, 2), (1, 0), (2, 0)], ["user", "item"]) >>> predictions = sorted(model.transform(test).collect(), key=lambda r: r[0]) >>> predictions[0] - Row(user=0, item=2, prediction=-0.13807615637779236) + Row(user=0, item=2, prediction=0.6929101347923279) >>> predictions[1] - Row(user=1, item=0, prediction=2.6258413791656494) + Row(user=1, item=0, prediction=3.47356915473938) >>> predictions[2] - Row(user=2, item=0, prediction=-1.5018409490585327) + Row(user=2, item=0, prediction=-0.8991986513137817) >>> user_recs = model.recommendForAllUsers(3) >>> user_recs.where(user_recs.user == 0)\ .select("recommendations.item", "recommendations.rating").collect() - [Row(item=[0, 1, 2], rating=[3.910..., 1.992..., -0.138...])] + [Row(item=[0, 1, 2], rating=[3.910..., 1.997..., 0.692...])] >>> item_recs = model.recommendForAllItems(3) >>> item_recs.where(item_recs.item == 2)\ .select("recommendations.user", "recommendations.rating").collect() - [Row(user=[2, 1, 0], rating=[4.901..., 3.981..., -0.138...])] + [Row(user=[2, 1, 0], rating=[4.892..., 3.991..., 0.692...])] >>> user_subset = df.where(df.user == 2) >>> user_subset_recs = model.recommendForUserSubset(user_subset, 3) >>> user_subset_recs.select("recommendations.item", "recommendations.rating").first() - Row(item=[2, 1, 0], rating=[4.901..., 1.056..., -1.501...]) + Row(item=[2, 1, 0], rating=[4.892..., 1.076..., -0.899...]) >>> item_subset = df.where(df.item == 0) >>> item_subset_recs = model.recommendForItemSubset(item_subset, 3) >>> item_subset_recs.select("recommendations.user", "recommendations.rating").first() - Row(user=[0, 1, 2], rating=[3.910..., 2.625..., -1.501...]) + Row(user=[0, 1, 2], rating=[3.910..., 3.473..., -0.899...]) >>> als_path = temp_path + "/als" >>> als.save(als_path) >>> als2 = ALS.load(als_path) diff --git a/python/pyspark/ml/tests/test_algorithms.py b/python/pyspark/ml/tests/test_algorithms.py index 6082082..034eaed 100644 --- a/python/pyspark/ml/tests/test_algorithms.py +++ b/python/pyspark/ml/tests/test_algorithms.py @@ -83,7 +83,7 @@ class MultilayerPerceptronClassifierTest(SparkSessionTestCase): result = model.transform(test).head() expected_prediction = 2.0 expected_probability = [0.0, 0.0, 1.0] - expected_rawPrediction = [57.3955, -124.5462, 67.9943] + expected_rawPrediction = [-11.6081922998, -8.15827998691, 22.17757045] self.assertTrue(result.prediction, expected_prediction) self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4)) self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4)) diff --git a/python/pyspark/ml/tuning.py b/python/pyspark/ml/tuning.py index 1f4abf5..be7b8da 100644 --- a/python/pyspark/ml/tuning.py +++ b/python/pyspark/ml/tuning.py @@ -504,15 +504,15 @@ class TrainValidationSplit(Estimator, ValidatorParams, HasParallelism, HasCollec ... (Vectors.dense([0.5]), 0.0), ... (Vectors.dense([0.6]), 1.0), ... (Vectors.dense([1.0]), 1.0)] * 10, - ... ["features", "label"]) + ... ["features", "label"]).repartition(1) >>> lr = LogisticRegression() >>> grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build() >>> evaluator = BinaryClassificationEvaluator() >>> tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, - ... parallelism=2) + ... parallelism=1, seed=42) >>> tvsModel = tvs.fit(dataset) >>> evaluator.evaluate(tvsModel.transform(dataset)) - 0.8333... + 0.833... .. versionadded:: 2.0.0 """ diff --git a/python/pyspark/mllib/recommendation.py b/python/pyspark/mllib/recommendation.py index 3d4eae8..3dd7cb2 100644 --- a/python/pyspark/mllib/recommendation.py +++ b/python/pyspark/mllib/recommendation.py @@ -100,16 +100,16 @@ class MatrixFactorizationModel(JavaModelWrapper, JavaSaveable, JavaLoader): >>> users_for_products[0] (1, (Rating(user=2, product=1, rating=...),)) - >>> model = ALS.train(ratings, 1, nonnegative=True, seed=10) + >>> model = ALS.train(ratings, 1, nonnegative=True, seed=123456789) >>> model.predict(2, 2) 3.73... >>> df = sqlContext.createDataFrame([Rating(1, 1, 1.0), Rating(1, 2, 2.0), Rating(2, 1, 2.0)]) - >>> model = ALS.train(df, 1, nonnegative=True, seed=10) + >>> model = ALS.train(df, 1, nonnegative=True, seed=123456789) >>> model.predict(2, 2) 3.73... - >>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=10) + >>> model = ALS.trainImplicit(ratings, 1, nonnegative=True, seed=123456789) >>> model.predict(2, 2) 0.4... diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index 8227e82..58d74f5 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -795,9 +795,9 @@ class DataFrame(object): >>> df = spark.range(10) >>> df.sample(0.5, 3).count() - 4 + 7 >>> df.sample(fraction=0.5, seed=3).count() - 4 + 7 >>> df.sample(withReplacement=True, fraction=0.5, seed=3).count() 1 >>> df.sample(1.0).count() @@ -865,8 +865,8 @@ class DataFrame(object): +---+-----+ |key|count| +---+-----+ - | 0| 5| - | 1| 9| + | 0| 3| + | 1| 6| +---+-----+ >>> dataset.sampleBy(col("key"), fractions={2: 1.0}, seed=0).count() 33 @@ -898,10 +898,10 @@ class DataFrame(object): >>> splits = df4.randomSplit([1.0, 2.0], 24) >>> splits[0].count() - 1 + 2 >>> splits[1].count() - 3 + 2 """ for w in weights: if w < 0.0: diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index bc28c9d..6ae2357 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -584,8 +584,8 @@ def rand(seed=None): .. note:: The function is non-deterministic in general case. >>> df.withColumn('rand', rand(seed=42) * 3).collect() - [Row(age=2, name=u'Alice', rand=1.1568609015300986), - Row(age=5, name=u'Bob', rand=1.403379671529166)] + [Row(age=2, name=u'Alice', rand=2.4052597283576684), + Row(age=5, name=u'Bob', rand=2.3913904055683974)] """ sc = SparkContext._active_spark_context if seed is not None: @@ -604,8 +604,8 @@ def randn(seed=None): .. note:: The function is non-deterministic in general case. >>> df.withColumn('randn', randn(seed=42)).collect() - [Row(age=2, name=u'Alice', randn=-0.7556247885860078), - Row(age=5, name=u'Bob', randn=-0.0861619008451133)] + [Row(age=2, name=u'Alice', randn=1.1027054481455365), + Row(age=5, name=u'Bob', randn=0.7400395449950132)] """ sc = SparkContext._active_spark_context if seed is not None: diff --git a/python/pyspark/sql/tests/test_functions.py b/python/pyspark/sql/tests/test_functions.py index b777573..273749e 100644 --- a/python/pyspark/sql/tests/test_functions.py +++ b/python/pyspark/sql/tests/test_functions.py @@ -83,9 +83,9 @@ class FunctionsTests(ReusedSQLTestCase): self.assertTrue(abs(corr - 0.95734012) < 1e-6) def test_sampleby(self): - df = self.sc.parallelize([Row(a=i, b=(i % 3)) for i in range(10)]).toDF() + df = self.sc.parallelize([Row(a=i, b=(i % 3)) for i in range(100)]).toDF() sampled = df.stat.sampleBy(u"b", fractions={0: 0.5, 1: 0.5}, seed=0) - self.assertTrue(sampled.count() == 3) + self.assertTrue(sampled.count() == 35) def test_cov(self): df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF() diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala index 752c9d5..469c24b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RandomSuite.scala @@ -17,25 +17,21 @@ package org.apache.spark.sql.catalyst.expressions -import org.scalatest.Matchers._ - import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{IntegerType, LongType} class RandomSuite extends SparkFunSuite with ExpressionEvalHelper { test("random") { - checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001) - checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001) + checkEvaluation(Rand(30), 0.2762195585886885) + checkEvaluation(Randn(30), -1.0451987154313813) - checkDoubleEvaluation( - new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001) - checkDoubleEvaluation( - new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001) + checkEvaluation(new Rand(Literal.create(null, LongType)), 0.7604953758285915) + checkEvaluation(new Randn(Literal.create(null, IntegerType)), 1.6034991609278433) } test("SPARK-9127 codegen with long seed") { - checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001) - checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001) + checkEvaluation(Rand(5419823303878592871L), 0.7145363364564755) + checkEvaluation(Randn(5419823303878592871L), 0.7816815274533012) } } diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out index cf5add6..09e2c63 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by-ordinal.sql.out @@ -141,12 +141,12 @@ from -- !query 13 schema struct<a:int,rand(0):double,sum(b):bigint> -- !query 13 output -1 0.4048454303385226 2 -1 0.8446490682263027 1 -2 0.5871875724155838 1 -2 0.8865128837019473 2 -3 0.742083829230211 1 -3 0.9179913208300406 2 +1 0.5234194256885571 2 +1 0.7604953758285915 1 +2 0.0953472826424725 1 +2 0.3163249920547614 2 +3 0.2710259815484829 2 +3 0.7141011170991605 1 -- !query 14 diff --git a/sql/core/src/test/resources/sql-tests/results/random.sql.out b/sql/core/src/test/resources/sql-tests/results/random.sql.out index bca6732..acd0609 100644 --- a/sql/core/src/test/resources/sql-tests/results/random.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/random.sql.out @@ -7,7 +7,7 @@ SELECT rand(0) -- !query 0 schema struct<rand(0):double> -- !query 0 output -0.8446490682263027 +0.7604953758285915 -- !query 1 @@ -15,7 +15,7 @@ SELECT rand(cast(3 / 7 AS int)) -- !query 1 schema struct<rand(CAST((CAST(3 AS DOUBLE) / CAST(7 AS DOUBLE)) AS INT)):double> -- !query 1 output -0.8446490682263027 +0.7604953758285915 -- !query 2 @@ -23,7 +23,7 @@ SELECT rand(NULL) -- !query 2 schema struct<rand(CAST(NULL AS INT)):double> -- !query 2 output -0.8446490682263027 +0.7604953758285915 -- !query 3 @@ -31,7 +31,7 @@ SELECT rand(cast(NULL AS int)) -- !query 3 schema struct<rand(CAST(NULL AS INT)):double> -- !query 3 output -0.8446490682263027 +0.7604953758285915 -- !query 4 @@ -48,7 +48,7 @@ SELECT randn(0L) -- !query 5 schema struct<randn(0):double> -- !query 5 output -1.1164209726833079 +1.6034991609278433 -- !query 6 @@ -56,7 +56,7 @@ SELECT randn(cast(3 / 7 AS long)) -- !query 6 schema struct<randn(CAST((CAST(3 AS DOUBLE) / CAST(7 AS DOUBLE)) AS BIGINT)):double> -- !query 6 output -1.1164209726833079 +1.6034991609278433 -- !query 7 @@ -64,7 +64,7 @@ SELECT randn(NULL) -- !query 7 schema struct<randn(CAST(NULL AS INT)):double> -- !query 7 output -1.1164209726833079 +1.6034991609278433 -- !query 8 @@ -72,7 +72,7 @@ SELECT randn(cast(NULL AS long)) -- !query 8 schema struct<randn(CAST(NULL AS BIGINT)):double> -- !query 8 output -1.1164209726833079 +1.6034991609278433 -- !query 9 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala index 589873b..2a74bfe 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -47,7 +47,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { val data = sparkContext.parallelize(1 to n, 2).toDF("id") checkAnswer( data.sample(withReplacement = false, 0.05, seed = 13), - Seq(3, 17, 27, 58, 62).map(Row(_)) + Seq(37, 8, 90).map(Row(_)) ) } @@ -371,7 +371,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { val sampled = df.stat.sampleBy("key", Map(0 -> 0.1, 1 -> 0.2), 0L) checkAnswer( sampled.groupBy("key").count().orderBy("key"), - Seq(Row(0, 6), Row(1, 11))) + Seq(Row(0, 1), Row(1, 6))) } test("sampleBy one column") { @@ -379,7 +379,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { val sampled = df.stat.sampleBy($"key", Map(0 -> 0.1, 1 -> 0.2), 0L) checkAnswer( sampled.groupBy("key").count().orderBy("key"), - Seq(Row(0, 6), Row(1, 11))) + Seq(Row(0, 1), Row(1, 6))) } test("sampleBy multiple columns") { @@ -389,7 +389,7 @@ class DataFrameStatSuite extends QueryTest with SharedSQLContext { struct($"name", $"key"), Map(Row("Foo", 0) -> 0.1, Row("Foo", 1) -> 0.2), 0L) checkAnswer( sampled.groupBy("key").count().orderBy("key"), - Seq(Row(0, 6), Row(1, 11))) + Seq(Row(0, 1), Row(1, 6))) } // This test case only verifies that `DataFrame.countMinSketch()` methods do return diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index 050699d..6e35b52 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -618,7 +618,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext { val data = sparkContext.parallelize(1 to n, 2).toDS() checkDataset( data.sample(withReplacement = false, 0.05, seed = 13), - 3, 17, 27, 58, 62) + 8, 37, 90) } test("sample fraction should not be negative with replacement") { @@ -650,9 +650,10 @@ class DatasetSuite extends QueryTest with SharedSQLContext { } test("SPARK-16686: Dataset.sample with seed results shouldn't depend on downstream usage") { + val a = 7 val simpleUdf = udf((n: Int) => { - require(n != 1, "simpleUdf shouldn't see id=1!") - 1 + require(n != a, s"simpleUdf shouldn't see id=$a!") + a }) val df = Seq( @@ -668,10 +669,10 @@ class DatasetSuite extends QueryTest with SharedSQLContext { (9, "string9") ).toDF("id", "stringData") val sampleDF = df.sample(false, 0.7, 50) - // After sampling, sampleDF doesn't contain id=1. - assert(!sampleDF.select("id").as[Int].collect.contains(1)) - // simpleUdf should not encounter id=1. - checkAnswer(sampleDF.select(simpleUdf($"id")), List.fill(sampleDF.count.toInt)(Row(1))) + // After sampling, sampleDF doesn't contain id=a. + assert(!sampleDF.select("id").as[Int].collect.contains(a)) + // simpleUdf should not encounter id=a. + checkAnswer(sampleDF.select(simpleUdf($"id")), List.fill(sampleDF.count.toInt)(Row(a))) } test("SPARK-11436: we should rebind right encoder when join 2 datasets") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala index 3e20cc4..7999331 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/TestCsvData.scala @@ -24,8 +24,7 @@ private[csv] trait TestCsvData { def sampledTestData: Dataset[String] = { spark.range(0, 100, 1).map { index => - val predefinedSample = Set[Long](2, 8, 15, 27, 30, 34, 35, 37, 44, 46, - 57, 62, 68, 72) + val predefinedSample = Set[Long](3, 18, 20, 24, 50, 60, 87, 99) if (predefinedSample.contains(index)) { index.toString } else { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala index 6e9559e..1750333 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/TestJsonData.scala @@ -236,8 +236,7 @@ private[json] trait TestJsonData { def sampledTestData: Dataset[String] = { spark.range(0, 100, 1).map { index => - val predefinedSample = Set[Long](2, 8, 15, 27, 30, 34, 35, 37, 44, 46, - 57, 62, 68, 72) + val predefinedSample = Set[Long](3, 18, 20, 24, 50, 60, 87, 99) if (predefinedSample.contains(index)) { s"""{"f1":${index.toString}}""" } else { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org