This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new d81df56 [SPARK-31223][ML] Set seed in np.random to regenerate test data d81df56 is described below commit d81df56f2dbd9757c87101fa32c28cf0cd96f278 Author: Huaxin Gao <huax...@us.ibm.com> AuthorDate: Thu Mar 26 13:53:31 2020 +0800 [SPARK-31223][ML] Set seed in np.random to regenerate test data ### What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-31223 set seed in np.random when generating test data...... ### Why are the changes needed? so the same set of test data can be regenerated later. ### Does this PR introduce any user-facing change? No ### How was this patch tested? existing tests Closes #27994 from huaxingao/spark-31223. Authored-by: Huaxin Gao <huax...@us.ibm.com> Signed-off-by: zhengruifeng <ruife...@foxmail.com> --- .../spark/ml/feature/FValueSelectorSuite.scala | 180 +++++++++++--------- .../org/apache/spark/ml/stat/ANOVATestSuite.scala | 175 ++++++++++---------- .../org/apache/spark/ml/stat/FValueTestSuite.scala | 183 +++++++++++---------- 3 files changed, 282 insertions(+), 256 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/FValueSelectorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/FValueSelectorSuite.scala index 8bf0856..b54d6d3 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/FValueSelectorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/FValueSelectorSuite.scala @@ -34,88 +34,112 @@ class FValueSelectorSuite extends MLTest with DefaultReadWriteTest { // scalastyle:off /* - FValue REGRESSION - X (features) = - [[1.67318514e-01, 1.78398028e-01, 4.36846538e-01, 5.24003164e-01, 1.80915415e-01, 1.98030859e-01], - [3.71836586e-01, 6.13453963e-01, 7.15269190e-01, 9.33623792e-03, 5.36095674e-01, 2.74223333e-01], - [3.68988949e-01, 5.34104018e-01, 5.24858744e-01, 6.86815853e-01, 3.26534757e-01, 6.92699400e-01], - [4.87748505e-02, 3.07080315e-01, 7.82955385e-01, 
6.90167375e-01, 6.44077919e-01, 4.23739024e-01], - [6.50153455e-01, 8.32746110e-01, 6.88029140e-03, 1.27859556e-01, 6.80223767e-01, 6.25825675e-01], - - [9.47343271e-01, 2.13193978e-01, 3.71342472e-01, 8.21291956e-01, 4.38195693e-01, 5.76569439e-01], - [9.96499254e-01, 8.45833297e-01, 6.56086922e-02, 5.90029174e-01, 1.68954572e-01, 7.19792823e-02], - [1.85926914e-01, 9.60329804e-01, 3.13487406e-01, 9.59549928e-01, 6.89093311e-01, 6.94999427e-01], - [9.40164576e-01, 2.69042714e-02, 5.39491321e-01, 5.74068666e-01, 1.10935343e-01, 2.17519760e-01], - [2.97951848e-02, 1.06592106e-01, 5.74931856e-01, 8.80801522e-01, 8.60445070e-01, 9.22757966e-01], - - [9.80970473e-01, 3.05909353e-01, 4.96401766e-01, 2.44342697e-01, 6.90559227e-01, 5.64858704e-01], - [1.55939260e-01, 2.18626853e-01, 5.01834270e-01, 1.86694987e-01, 9.15411148e-01, 6.40527848e-01], - [3.16107608e-01, 9.25906358e-01, 5.47327167e-01, 4.83712979e-01, 8.42305220e-01, 7.58488462e-01], - [4.14393503e-01, 1.30817883e-01, 5.62034942e-01, 1.05150633e-01, 5.35632795e-01, 9.47594074e-04], - [5.26233981e-01, 7.63781419e-02, 3.19188240e-01, 5.16528633e-02, 5.28416724e-01, 6.47050470e-03], - - [2.73404764e-01, 7.17070744e-01, 3.12889595e-01, 8.39271965e-01, 9.67650889e-01, 8.50098873e-01], - [4.63289495e-01, 3.57055416e-02, 5.43528596e-01, 4.44840919e-01, 9.36845855e-02, 7.81595037e-01], - [3.21784993e-01, 3.15622454e-01, 7.58870408e-01, 5.18198558e-01, 2.28151905e-01, 4.42460325e-01], - [3.72428352e-01, 1.44447969e-01, 8.40274188e-01, 5.86308041e-01, 6.09893953e-01, 3.97006473e-01], - [3.12776786e-01, 9.33630195e-01, 2.29328749e-01, 4.32807208e-01, 1.51703470e-02, 1.51589320e-01]] - - y (labels) = - [0.33997803, 0.71456716, 0.58676766, 0.52894227, 0.53158463, - 0.55515181, 0.67008744, 0.5966537 , 0.56255674, 0.33904133, - 0.66485577, 0.38514965, 0.73885841, 0.45766267, 0.34801557, - 0.52529452, 0.42503336, 0.60221968, 0.58964479, 0.58194949] - - Note that y = X @ w, where w = [0.3, 0.4, 0.5, 0. , 0. , 0. 
] - - Sklearn results: - F values per feature: [2.76445780e+00, 1.05267800e+01, 4.43399092e-02, 2.04580501e-02, - 3.13208557e-02, 1.35248025e-03] - p values per feature: [0.11369388, 0.0044996 , 0.83558782, 0.88785417, 0.86150261, 0.97106833] + Use the following sklearn data in this test + + >>> from sklearn.feature_selection import f_regression + >>> import numpy as np + >>> np.random.seed(777) + >>> X = np.random.rand(20, 6) + >>> w = np.array([0.3, 0.4, 0.5, 0, 0, 0]) + >>> y = X @ w + >>> X + array([[0.19151945, 0.62210877, 0.43772774, 0.78535858, 0.77997581, + 0.27259261], + [0.27646426, 0.80187218, 0.95813935, 0.87593263, 0.35781727, + 0.50099513], + [0.68346294, 0.71270203, 0.37025075, 0.56119619, 0.50308317, + 0.01376845], + [0.77282662, 0.88264119, 0.36488598, 0.61539618, 0.07538124, + 0.36882401], + [0.9331401 , 0.65137814, 0.39720258, 0.78873014, 0.31683612, + 0.56809865], + [0.86912739, 0.43617342, 0.80214764, 0.14376682, 0.70426097, + 0.70458131], + [0.21879211, 0.92486763, 0.44214076, 0.90931596, 0.05980922, + 0.18428708], + [0.04735528, 0.67488094, 0.59462478, 0.53331016, 0.04332406, + 0.56143308], + [0.32966845, 0.50296683, 0.11189432, 0.60719371, 0.56594464, + 0.00676406], + [0.61744171, 0.91212289, 0.79052413, 0.99208147, 0.95880176, + 0.79196414], + [0.28525096, 0.62491671, 0.4780938 , 0.19567518, 0.38231745, + 0.05387369], + [0.45164841, 0.98200474, 0.1239427 , 0.1193809 , 0.73852306, + 0.58730363], + [0.47163253, 0.10712682, 0.22921857, 0.89996519, 0.41675354, + 0.53585166], + [0.00620852, 0.30064171, 0.43689317, 0.612149 , 0.91819808, + 0.62573667], + [0.70599757, 0.14983372, 0.74606341, 0.83100699, 0.63372577, + 0.43830988], + [0.15257277, 0.56840962, 0.52822428, 0.95142876, 0.48035918, + 0.50255956], + [0.53687819, 0.81920207, 0.05711564, 0.66942174, 0.76711663, + 0.70811536], + [0.79686718, 0.55776083, 0.96583653, 0.1471569 , 0.029647 , + 0.59389349], + [0.1140657 , 0.95080985, 0.32570741, 0.19361869, 0.45781165, + 0.92040257], + 
[0.87906916, 0.25261576, 0.34800879, 0.18258873, 0.90179605, + 0.70652816]]) + >>> y + array([0.52516321, 0.88275782, 0.67524507, 0.76734745, 0.73909458, + 0.83628141, 0.65665506, 0.58147135, 0.35603443, 0.94534373, + 0.57458887, 0.59026777, 0.29894977, 0.34056582, 0.64476446, + 0.53724782, 0.5173021 , 0.94508275, 0.57739736, 0.53877145]) + >>> f_regression(X, y) + (array([5.58025504, 3.98311705, 20.59605518, 0.07993376, 1.25127646, + 0.7676937 ]), + array([2.96302196e-02, 6.13173918e-02, 2.54580618e-04, 7.80612726e-01, + 2.78015517e-01, 3.92474567e-01])) */ // scalastyle:on val data = Seq( - (0.33997803, Vectors.dense(1.67318514e-01, 1.78398028e-01, 4.36846538e-01, - 5.24003164e-01, 1.80915415e-01, 1.98030859e-01), Vectors.dense(1.78398028e-01)), - (0.71456716, Vectors.dense(3.71836586e-01, 6.13453963e-01, 7.15269190e-01, - 9.33623792e-03, 5.36095674e-01, 2.74223333e-01), Vectors.dense(6.13453963e-01)), - (0.58676766, Vectors.dense(3.68988949e-01, 5.34104018e-01, 5.24858744e-01, - 6.86815853e-01, 3.26534757e-01, 6.92699400e-01), Vectors.dense(5.34104018e-01)), - (0.52894227, Vectors.dense(4.87748505e-02, 3.07080315e-01, 7.82955385e-01, - 6.90167375e-01, 6.44077919e-01, 4.23739024e-01), Vectors.dense(3.07080315e-01)), - (0.53158463, Vectors.dense(6.50153455e-01, 8.32746110e-01, 6.88029140e-03, - 1.27859556e-01, 6.80223767e-01, 6.25825675e-01), Vectors.dense(8.32746110e-01)), - (0.55515181, Vectors.dense(9.47343271e-01, 2.13193978e-01, 3.71342472e-01, - 8.21291956e-01, 4.38195693e-01, 5.76569439e-01), Vectors.dense(2.13193978e-01)), - (0.67008744, Vectors.dense(9.96499254e-01, 8.45833297e-01, 6.56086922e-02, - 5.90029174e-01, 1.68954572e-01, 7.19792823e-02), Vectors.dense(8.45833297e-01)), - (0.5966537, Vectors.dense(1.85926914e-01, 9.60329804e-01, 3.13487406e-01, - 9.59549928e-01, 6.89093311e-01, 6.94999427e-01), Vectors.dense(9.60329804e-01)), - (0.56255674, Vectors.dense(9.40164576e-01, 2.69042714e-02, 5.39491321e-01, - 5.74068666e-01, 1.10935343e-01, 
2.17519760e-01), Vectors.dense(2.69042714e-02)), - (0.33904133, Vectors.dense(2.97951848e-02, 1.06592106e-01, 5.74931856e-01, - 8.80801522e-01, 8.60445070e-01, 9.22757966e-01), Vectors.dense(1.06592106e-01)), - (0.66485577, Vectors.dense(9.80970473e-01, 3.05909353e-01, 4.96401766e-01, - 2.44342697e-01, 6.90559227e-01, 5.64858704e-01), Vectors.dense(3.05909353e-01)), - (0.38514965, Vectors.dense(1.55939260e-01, 2.18626853e-01, 5.01834270e-01, - 1.86694987e-01, 9.15411148e-01, 6.40527848e-01), Vectors.dense(2.18626853e-01)), - (0.73885841, Vectors.dense(3.16107608e-01, 9.25906358e-01, 5.47327167e-01, - 4.83712979e-01, 8.42305220e-01, 7.58488462e-01), Vectors.dense(9.25906358e-01)), - (0.45766267, Vectors.dense(4.14393503e-01, 1.30817883e-01, 5.62034942e-01, - 1.05150633e-01, 5.35632795e-01, 9.47594074e-04), Vectors.dense(1.30817883e-01)), - (0.34801557, Vectors.dense(5.26233981e-01, 7.63781419e-02, 3.19188240e-01, - 5.16528633e-02, 5.28416724e-01, 6.47050470e-03), Vectors.dense(7.63781419e-02)), - (0.52529452, Vectors.dense(2.73404764e-01, 7.17070744e-01, 3.12889595e-01, - 8.39271965e-01, 9.67650889e-01, 8.50098873e-01), Vectors.dense(7.17070744e-01)), - (0.42503336, Vectors.dense(4.63289495e-01, 3.57055416e-02, 5.43528596e-01, - 4.44840919e-01, 9.36845855e-02, 7.81595037e-01), Vectors.dense(3.57055416e-02)), - (0.60221968, Vectors.dense(3.21784993e-01, 3.15622454e-01, 7.58870408e-01, - 5.18198558e-01, 2.28151905e-01, 4.42460325e-01), Vectors.dense(3.15622454e-01)), - (0.58964479, Vectors.dense(3.72428352e-01, 1.44447969e-01, 8.40274188e-01, - 5.86308041e-01, 6.09893953e-01, 3.97006473e-01), Vectors.dense(1.44447969e-01)), - (0.58194949, Vectors.dense(3.12776786e-01, 9.33630195e-01, 2.29328749e-01, - 4.32807208e-01, 1.51703470e-02, 1.51589320e-01), Vectors.dense(9.33630195e-01))) + (0.52516321, Vectors.dense(0.19151945, 0.62210877, 0.43772774, 0.78535858, 0.77997581, + 0.27259261), Vectors.dense(0.43772774)), + (0.88275782, Vectors.dense(0.27646426, 0.80187218, 
0.95813935, 0.87593263, 0.35781727, + 0.50099513), Vectors.dense(0.95813935)), + (0.67524507, Vectors.dense(0.68346294, 0.71270203, 0.37025075, 0.56119619, 0.50308317, + 0.01376845), Vectors.dense(0.37025075)), + (0.76734745, Vectors.dense(0.77282662, 0.88264119, 0.36488598, 0.61539618, 0.07538124, + 0.36882401), Vectors.dense(0.36488598)), + (0.73909458, Vectors.dense(0.9331401, 0.65137814, 0.39720258, 0.78873014, 0.31683612, + 0.56809865), Vectors.dense(0.39720258)), + + (0.83628141, Vectors.dense(0.86912739, 0.43617342, 0.80214764, 0.14376682, 0.70426097, + 0.70458131), Vectors.dense(0.80214764)), + (0.65665506, Vectors.dense(0.21879211, 0.92486763, 0.44214076, 0.90931596, 0.05980922, + 0.18428708), Vectors.dense(0.44214076)), + (0.58147135, Vectors.dense(0.04735528, 0.67488094, 0.59462478, 0.53331016, 0.04332406, + 0.56143308), Vectors.dense(0.59462478)), + (0.35603443, Vectors.dense(0.32966845, 0.50296683, 0.11189432, 0.60719371, 0.56594464, + 0.00676406), Vectors.dense(0.11189432)), + (0.94534373, Vectors.dense(0.61744171, 0.91212289, 0.79052413, 0.99208147, 0.95880176, + 0.79196414), Vectors.dense(0.79052413)), + + (0.57458887, Vectors.dense(0.28525096, 0.62491671, 0.4780938, 0.19567518, 0.38231745, + 0.05387369), Vectors.dense(0.4780938)), + (0.59026777, Vectors.dense(0.45164841, 0.98200474, 0.1239427, 0.1193809, 0.73852306, + 0.58730363), Vectors.dense(0.1239427)), + (0.29894977, Vectors.dense(0.47163253, 0.10712682, 0.22921857, 0.89996519, 0.41675354, + 0.53585166), Vectors.dense(0.22921857)), + (0.34056582, Vectors.dense(0.00620852, 0.30064171, 0.43689317, 0.612149, 0.91819808, + 0.62573667), Vectors.dense(0.43689317)), + (0.64476446, Vectors.dense(0.70599757, 0.14983372, 0.74606341, 0.83100699, 0.63372577, + 0.43830988), Vectors.dense(0.74606341)), + + (0.53724782, Vectors.dense(0.15257277, 0.56840962, 0.52822428, 0.95142876, 0.48035918, + 0.50255956), Vectors.dense(0.52822428)), + (0.5173021, Vectors.dense(0.53687819, 0.81920207, 0.05711564, 
0.66942174, 0.76711663, + 0.70811536), Vectors.dense(0.05711564)), + (0.94508275, Vectors.dense(0.79686718, 0.55776083, 0.96583653, 0.1471569, 0.029647, + 0.59389349), Vectors.dense(0.96583653)), + (0.57739736, Vectors.dense(0.1140657, 0.95080985, 0.32570741, 0.19361869, 0.45781165, + 0.92040257), Vectors.dense(0.32570741)), + (0.53877145, Vectors.dense(0.87906916, 0.25261576, 0.34800879, 0.18258873, 0.90179605, + 0.70652816), Vectors.dense(0.34800879))) dataset = spark.createDataFrame(data).toDF("label", "features", "topFeature") } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/ANOVATestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/ANOVATestSuite.scala index c1b8cfc..2b17dbc 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/ANOVATestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/ANOVATestSuite.scala @@ -36,99 +36,100 @@ class ANOVATestSuite >>> from sklearn.feature_selection import f_classif >>> import numpy as np + >>> np.random.seed(888) >>> X = np.random.rand(20, 6) >>> X - array([[1.77744923e-01, 4.46602900e-01, 7.62487415e-01, 5.81095198e-01, - 9.64228053e-01, 2.31090547e-01], - [8.87725538e-01, 7.39137344e-01, 5.76073881e-01, 7.38266678e-01, - 2.28188254e-01, 4.10480664e-01], - [1.26764947e-01, 9.53144673e-01, 2.50788458e-01, 3.17260250e-01, - 8.76189581e-01, 2.56902552e-01], - [3.74475030e-01, 9.23768847e-02, 6.19433427e-01, 4.10280293e-02, - 7.59799273e-04, 3.87053403e-01], - [8.91109208e-01, 5.21208840e-01, 7.83435405e-02, 8.37642752e-01, - 5.26969965e-02, 3.07387671e-02], - [7.94006669e-01, 8.55618651e-02, 9.25974068e-01, 4.07848340e-01, - 9.47526767e-01, 2.10230156e-01], - [6.38404274e-01, 1.71793581e-01, 9.82515893e-01, 3.34181329e-01, - 9.96800651e-02, 4.48531617e-01], - [1.42029531e-01, 2.53590255e-02, 4.00032820e-01, 3.75843553e-01, - 9.73971121e-01, 3.32346317e-01], - [9.94329513e-01, 5.61156964e-01, 5.96579626e-01, 1.92286208e-01, - 9.71888097e-01, 2.48574337e-01], - [9.71838692e-01, 
8.50908993e-02, 6.15917459e-01, 1.61320964e-01, - 2.43025079e-01, 7.78200314e-01], - [7.76735907e-01, 6.10769335e-01, 1.58097504e-01, 2.95018676e-01, - 3.94466695e-01, 7.71700212e-01], - [8.44787012e-01, 4.76682368e-01, 9.43624130e-01, 2.20926735e-01, - 8.43054317e-02, 8.51276967e-01], - [3.38797773e-01, 6.78991156e-01, 7.90036698e-01, 4.40145825e-01, - 8.33432294e-01, 9.91810731e-01], - [1.73295200e-01, 6.83267374e-01, 8.88625086e-01, 6.89072609e-01, - 8.35407299e-01, 9.70359856e-01], - [7.74552650e-01, 5.70846800e-01, 5.39894150e-01, 5.92696042e-01, - 5.72618852e-01, 7.00850299e-01], - [7.84482121e-01, 7.80094912e-01, 4.03710589e-02, 4.97916309e-01, - 2.55871739e-01, 5.27961039e-01], - [1.45590738e-01, 2.43124833e-01, 1.69582546e-01, 6.16891208e-01, - 2.96795519e-01, 9.19985890e-01], - [6.89274903e-01, 7.13295249e-01, 1.65640967e-01, 5.74821962e-01, - 9.11149662e-01, 1.09691820e-01], - [3.33334361e-01, 3.40817958e-01, 6.73779642e-01, 6.01719487e-01, - 3.97932741e-01, 6.39527734e-01], - [2.33981601e-01, 1.41349421e-01, 8.13246213e-01, 9.09664223e-01, - 2.36111304e-01, 9.00214578e-01]]) + array([[0.85956061, 0.1645695 , 0.48347596, 0.92102727, 0.42855644, + 0.05746009], + [0.92500743, 0.65760154, 0.13295284, 0.53344893, 0.8994776 , + 0.24836496], + [0.03017182, 0.07244715, 0.87416449, 0.55843035, 0.91604736, + 0.63346045], + [0.28325261, 0.36536881, 0.09223386, 0.37251258, 0.34742278, + 0.70517077], + [0.64850904, 0.04090877, 0.21173176, 0.00148992, 0.13897166, + 0.21182539], + [0.02609493, 0.44608735, 0.23910531, 0.95449222, 0.90763182, + 0.8624905 ], + [0.09158744, 0.97745235, 0.41150139, 0.45830467, 0.52590925, + 0.29441554], + [0.97211594, 0.1814442 , 0.30340642, 0.17445413, 0.52756958, + 0.02069296], + [0.06354593, 0.63527231, 0.49620335, 0.0141264 , 0.62722219, + 0.63497507], + [0.10814149, 0.8296426 , 0.51775217, 0.57068344, 0.54633305, + 0.12714921], + [0.72731796, 0.94010124, 0.45007811, 0.87650674, 0.53735565, + 0.49568415], + [0.41827208, 0.85100628, 
0.38685271, 0.60689503, 0.21784097, + 0.91294433], + [0.65843656, 0.5880859 , 0.18862706, 0.856398 , 0.18029327, + 0.94851926], + [0.3841634 , 0.25138793, 0.96746644, 0.77048045, 0.44685196, + 0.19813854], + [0.65982267, 0.23024125, 0.13598434, 0.60144265, 0.57848927, + 0.85623564], + [0.35764189, 0.47623815, 0.5459232 , 0.79508298, 0.14462443, + 0.01802919], + [0.38532153, 0.90614554, 0.86629571, 0.13988735, 0.32062385, + 0.00179492], + [0.2142368 , 0.28306022, 0.59481646, 0.42567028, 0.52207663, + 0.78082401], + [0.20788283, 0.76861782, 0.59595468, 0.62103642, 0.17781246, + 0.77655345], + [0.1751708 , 0.4547537 , 0.46187865, 0.79781199, 0.05104487, + 0.42406092]]) >>> y = np.array([3, 2, 1, 5, 4, 4, 5, 4, 2, 1, 1, 2, 3, 4, 5, 1, 5, 3, 1, 1]) >>> y array([3, 2, 1, 5, 4, 4, 5, 4, 2, 1, 1, 2, 3, 4, 5, 1, 5, 3, 1, 1]) >>> f_classif(X, y) - (array([1.22822174, 1.10202387, 0.3884059 , 0.41225633, 2.43153435, - 0.50553798]), array([0.34047275, 0.39149917, 0.81363895, 0.79712747, 0.09303083, - 0.73241872])) + (array([0.64110932, 1.98689258, 0.55499714, 1.40340562, 0.30881722, + 0.3848595 ]), array([0.64137831, 0.14830724, 0.69858474, 0.28038169, 0.86759161, + 0.81608606])) */ // scalastyle:on val data = Seq( - LabeledPoint(3, Vectors.dense(1.77744923e-01, 4.46602900e-01, 7.62487415e-01, 5.81095198e-01, - 9.64228053e-01, 2.31090547e-01)), - LabeledPoint(2, Vectors.dense(8.87725538e-01, 7.39137344e-01, 5.76073881e-01, 7.38266678e-01, - 2.28188254e-01, 4.10480664e-01)), - LabeledPoint(1, Vectors.dense(1.26764947e-01, 9.53144673e-01, 2.50788458e-01, 3.17260250e-01, - 8.76189581e-01, 2.56902552e-01)), - LabeledPoint(5, Vectors.dense(3.74475030e-01, 9.23768847e-02, 6.19433427e-01, 4.10280293e-02, - 7.59799273e-04, 3.87053403e-01)), - LabeledPoint(4, Vectors.dense(8.91109208e-01, 5.21208840e-01, 7.83435405e-02, 8.37642752e-01, - 5.26969965e-02, 3.07387671e-02)), - LabeledPoint(4, Vectors.dense(7.94006669e-01, 8.55618651e-02, 9.25974068e-01, 4.07848340e-01, - 9.47526767e-01, 
2.10230156e-01)), - LabeledPoint(5, Vectors.dense(6.38404274e-01, 1.71793581e-01, 9.82515893e-01, 3.34181329e-01, - 9.96800651e-02, 4.48531617e-01)), - LabeledPoint(4, Vectors.dense(1.42029531e-01, 2.53590255e-02, 4.00032820e-01, 3.75843553e-01, - 9.73971121e-01, 3.32346317e-01)), - LabeledPoint(2, Vectors.dense(9.94329513e-01, 5.61156964e-01, 5.96579626e-01, 1.92286208e-01, - 9.71888097e-01, 2.48574337e-01)), - LabeledPoint(1, Vectors.dense(9.71838692e-01, 8.50908993e-02, 6.15917459e-01, 1.61320964e-01, - 2.43025079e-01, 7.78200314e-01)), - LabeledPoint(1, Vectors.dense(7.76735907e-01, 6.10769335e-01, 1.58097504e-01, 2.95018676e-01, - 3.94466695e-01, 7.71700212e-01)), - LabeledPoint(2, Vectors.dense(8.44787012e-01, 4.76682368e-01, 9.43624130e-01, 2.20926735e-01, - 8.43054317e-02, 8.51276967e-01)), - LabeledPoint(3, Vectors.dense(3.38797773e-01, 6.78991156e-01, 7.90036698e-01, 4.40145825e-01, - 8.33432294e-01, 9.91810731e-01)), - LabeledPoint(4, Vectors.dense(1.73295200e-01, 6.83267374e-01, 8.88625086e-01, 6.89072609e-01, - 8.35407299e-01, 9.70359856e-01)), - LabeledPoint(5, Vectors.dense(7.74552650e-01, 5.70846800e-01, 5.39894150e-01, 5.92696042e-01, - 5.72618852e-01, 7.00850299e-01)), - LabeledPoint(1, Vectors.dense(7.84482121e-01, 7.80094912e-01, 4.03710589e-02, 4.97916309e-01, - 2.55871739e-01, 5.27961039e-01)), - LabeledPoint(5, Vectors.dense(1.45590738e-01, 2.43124833e-01, 1.69582546e-01, 6.16891208e-01, - 2.96795519e-01, 9.19985890e-01)), - LabeledPoint(3, Vectors.dense(6.89274903e-01, 7.13295249e-01, 1.65640967e-01, 5.74821962e-01, - 9.11149662e-01, 1.09691820e-01)), - LabeledPoint(1, Vectors.dense(3.33334361e-01, 3.40817958e-01, 6.73779642e-01, 6.01719487e-01, - 3.97932741e-01, 6.39527734e-01)), - LabeledPoint(1, Vectors.dense(2.33981601e-01, 1.41349421e-01, 8.13246213e-01, 9.09664223e-01, - 2.36111304e-01, 9.00214578e-01))) + LabeledPoint(3, Vectors.dense(0.85956061, 0.1645695, 0.48347596, 0.92102727, 0.42855644, + 0.05746009)), + LabeledPoint(2, 
Vectors.dense(0.92500743, 0.65760154, 0.13295284, 0.53344893, 0.8994776, + 0.24836496)), + LabeledPoint(1, Vectors.dense(0.03017182, 0.07244715, 0.87416449, 0.55843035, 0.91604736, + 0.63346045)), + LabeledPoint(5, Vectors.dense(0.28325261, 0.36536881, 0.09223386, 0.37251258, 0.34742278, + 0.70517077)), + LabeledPoint(4, Vectors.dense(0.64850904, 0.04090877, 0.21173176, 0.00148992, 0.13897166, + 0.21182539)), + LabeledPoint(4, Vectors.dense(0.02609493, 0.44608735, 0.23910531, 0.95449222, 0.90763182, + 0.8624905)), + LabeledPoint(5, Vectors.dense(0.09158744, 0.97745235, 0.41150139, 0.45830467, 0.52590925, + 0.29441554)), + LabeledPoint(4, Vectors.dense(0.97211594, 0.1814442, 0.30340642, 0.17445413, 0.52756958, + 0.02069296)), + LabeledPoint(2, Vectors.dense(0.06354593, 0.63527231, 0.49620335, 0.0141264, 0.62722219, + 0.63497507)), + LabeledPoint(1, Vectors.dense(0.10814149, 0.8296426, 0.51775217, 0.57068344, 0.54633305, + 0.12714921)), + LabeledPoint(1, Vectors.dense(0.72731796, 0.94010124, 0.45007811, 0.87650674, 0.53735565, + 0.49568415)), + LabeledPoint(2, Vectors.dense(0.41827208, 0.85100628, 0.38685271, 0.60689503, 0.21784097, + 0.91294433)), + LabeledPoint(3, Vectors.dense(0.65843656, 0.5880859, 0.18862706, 0.856398, 0.18029327, + 0.94851926)), + LabeledPoint(4, Vectors.dense(0.3841634, 0.25138793, 0.96746644, 0.77048045, 0.44685196, + 0.19813854)), + LabeledPoint(5, Vectors.dense(0.65982267, 0.23024125, 0.13598434, 0.60144265, 0.57848927, + 0.85623564)), + LabeledPoint(1, Vectors.dense(0.35764189, 0.47623815, 0.5459232, 0.79508298, 0.14462443, + 0.01802919)), + LabeledPoint(5, Vectors.dense(0.38532153, 0.90614554, 0.86629571, 0.13988735, 0.32062385, + 0.00179492)), + LabeledPoint(3, Vectors.dense(0.2142368, 0.28306022, 0.59481646, 0.42567028, 0.52207663, + 0.78082401)), + LabeledPoint(1, Vectors.dense(0.20788283, 0.76861782, 0.59595468, 0.62103642, 0.17781246, + 0.77655345)), + LabeledPoint(1, Vectors.dense(0.1751708, 0.4547537, 0.46187865, 0.79781199, 
0.05104487, + 0.42406092))) for (numParts <- List(2, 4, 6, 8)) { val df = spark.createDataFrame(sc.parallelize(data, numParts)) @@ -136,10 +137,10 @@ class ANOVATestSuite val (pValues: Vector, fValues: Vector) = anovaResult.select("pValues", "fValues") .as[(Vector, Vector)].head() - assert(pValues ~== Vectors.dense(0.34047275, 0.39149917, 0.81363895, 0.79712747, 0.09303083, - 0.73241872) relTol 1e-6) - assert(fValues ~== Vectors.dense(1.22822174, 1.10202387, 0.3884059, 0.41225633, 2.43153435, - 0.50553798) relTol 1e-6) + assert(pValues ~== Vectors.dense(0.64137831, 0.14830724, 0.69858474, 0.28038169, 0.86759161, + 0.81608606) relTol 1e-6) + assert(fValues ~== Vectors.dense(0.64110932, 1.98689258, 0.55499714, 1.40340562, 0.30881722, + 0.3848595) relTol 1e-6) } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/stat/FValueTestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/stat/FValueTestSuite.scala index 540c4fb..37195d2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/stat/FValueTestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/stat/FValueTestSuite.scala @@ -36,103 +36,104 @@ class FValueTestSuite >>> from sklearn.feature_selection import f_regression >>> import numpy as np + >>> np.random.seed(777) >>> X = np.random.rand(20, 6) >>> w = np.array([0.3, 0.4, 0.5, 0, 0, 0]) >>> y = X @ w >>> X - array([[1.67318514e-01, 1.78398028e-01, 4.36846538e-01, 5.24003164e-01, - 1.80915415e-01, 1.98030859e-01], - [3.71836586e-01, 6.13453963e-01, 7.15269190e-01, 9.33623792e-03, - 5.36095674e-01, 2.74223333e-01], - [3.68988949e-01, 5.34104018e-01, 5.24858744e-01, 6.86815853e-01, - 3.26534757e-01, 6.92699400e-01], - [4.87748505e-02, 3.07080315e-01, 7.82955385e-01, 6.90167375e-01, - 6.44077919e-01, 4.23739024e-01], - [6.50153455e-01, 8.32746110e-01, 6.88029140e-03, 1.27859556e-01, - 6.80223767e-01, 6.25825675e-01], - [9.47343271e-01, 2.13193978e-01, 3.71342472e-01, 8.21291956e-01, - 4.38195693e-01, 5.76569439e-01], - [9.96499254e-01, 8.45833297e-01, 
6.56086922e-02, 5.90029174e-01, - 1.68954572e-01, 7.19792823e-02], - [1.85926914e-01, 9.60329804e-01, 3.13487406e-01, 9.59549928e-01, - 6.89093311e-01, 6.94999427e-01], - [9.40164576e-01, 2.69042714e-02, 5.39491321e-01, 5.74068666e-01, - 1.10935343e-01, 2.17519760e-01], - [2.97951848e-02, 1.06592106e-01, 5.74931856e-01, 8.80801522e-01, - 8.60445070e-01, 9.22757966e-01], - [9.80970473e-01, 3.05909353e-01, 4.96401766e-01, 2.44342697e-01, - 6.90559227e-01, 5.64858704e-01], - [1.55939260e-01, 2.18626853e-01, 5.01834270e-01, 1.86694987e-01, - 9.15411148e-01, 6.40527848e-01], - [3.16107608e-01, 9.25906358e-01, 5.47327167e-01, 4.83712979e-01, - 8.42305220e-01, 7.58488462e-01], - [4.14393503e-01, 1.30817883e-01, 5.62034942e-01, 1.05150633e-01, - 5.35632795e-01, 9.47594074e-04], - [5.26233981e-01, 7.63781419e-02, 3.19188240e-01, 5.16528633e-02, - 5.28416724e-01, 6.47050470e-03], - [2.73404764e-01, 7.17070744e-01, 3.12889595e-01, 8.39271965e-01, - 9.67650889e-01, 8.50098873e-01], - [4.63289495e-01, 3.57055416e-02, 5.43528596e-01, 4.44840919e-01, - 9.36845855e-02, 7.81595037e-01], - [3.21784993e-01, 3.15622454e-01, 7.58870408e-01, 5.18198558e-01, - 2.28151905e-01, 4.42460325e-01], - [3.72428352e-01, 1.44447969e-01, 8.40274188e-01, 5.86308041e-01, - 6.09893953e-01, 3.97006473e-01], - [3.12776786e-01, 9.33630195e-01, 2.29328749e-01, 4.32807208e-01, - 1.51703470e-02, 1.51589320e-01]]) + array([[0.15266373, 0.30235661, 0.06203641, 0.45986034, 0.83525338, + 0.92699705], + [0.72698898, 0.76849622, 0.26920507, 0.64402929, 0.09337326, + 0.07968589], + [0.58961375, 0.34334054, 0.98887615, 0.62647321, 0.68177928, + 0.55225681], + [0.26886006, 0.37325939, 0.2229281 , 0.1864426 , 0.39064809, + 0.19316241], + [0.61091093, 0.88280845, 0.62233882, 0.25311894, 0.17993031, + 0.81640447], + [0.22537162, 0.51685714, 0.51849582, 0.60037494, 0.53262048, + 0.01331005], + [0.52409726, 0.89588471, 0.76990129, 0.1228517 , 0.29587269, + 0.61202358], + [0.72613812, 0.46349747, 0.76911037, 0.19163103, 
0.55786672, + 0.55077816], + [0.47222549, 0.79188496, 0.11524968, 0.6813039 , 0.36233361, + 0.34420889], + [0.44951875, 0.02694226, 0.41524769, 0.9222317 , 0.09120557, + 0.31512178], + [0.52802224, 0.32806203, 0.44891554, 0.01633442, 0.0970269 , + 0.69258857], + [0.83594341, 0.42432199, 0.8487743 , 0.54679121, 0.35410346, + 0.72724968], + [0.09385168, 0.8928588 , 0.33625828, 0.89183268, 0.296849 , + 0.30164829], + [0.80624061, 0.83760997, 0.63428133, 0.3113273 , 0.02944858, + 0.39977732], + [0.51817346, 0.00738845, 0.77494778, 0.8544712 , 0.13153282, + 0.28767364], + [0.32658881, 0.90655956, 0.99955954, 0.77088429, 0.04284752, + 0.96525111], + [0.97521246, 0.2025168 , 0.67985305, 0.46534506, 0.92001748, + 0.72820735], + [0.24585653, 0.01953996, 0.70598881, 0.77448287, 0.4729746 , + 0.80146736], + [0.17539792, 0.72016934, 0.3678759 , 0.53209295, 0.29719397, + 0.37429151], + [0.72810013, 0.39850784, 0.1058295 , 0.39858265, 0.52196395, + 0.1060125 ]]) >>> y - array([0.33997803, 0.71456716, 0.58676766, 0.52894227, 0.53158463, - 0.55515181, 0.67008744, 0.5966537 , 0.56255674, 0.33904133, - 0.66485577, 0.38514965, 0.73885841, 0.45766267, 0.34801557, - 0.52529452, 0.42503336, 0.60221968, 0.58964479, 0.58194949]) + array([0.19775997, 0.66009772, 0.80865842, 0.34142582, 0.84756607, + 0.53360225, 0.90053371, 0.78779561, 0.51604647, 0.35325637, + 0.51408926, 0.84489897, 0.55342816, 0.89405683, 0.54588131, + 0.96038024, 0.71349698, 0.43456735, 0.52462506, 0.43074793]) >>> f_regression(X, y) - (array([2.76445780e+00, 1.05267800e+01, 4.43399092e-02, 2.04580501e-02, - 3.13208557e-02, 1.35248025e-03]), array([0.11369388, 0.0044996 , 0.83558782, 0.88785417, 0.86150261, - 0.97106833])) + (array([ 6.86260598, 7.23175589, 24.11424725, 0.6605354 , 1.26266286, + 1.82421406]), array([1.73658700e-02, 1.49916659e-02, 1.12697153e-04, 4.26990301e-01, + 2.75911201e-01, 1.93549275e-01])) */ // scalastyle:on val data = Seq( - LabeledPoint(0.33997803, Vectors.dense(1.67318514e-01, 
1.78398028e-01, 4.36846538e-01, - 5.24003164e-01, 1.80915415e-01, 1.98030859e-01)), - LabeledPoint(0.71456716, Vectors.dense(3.71836586e-01, 6.13453963e-01, 7.15269190e-01, - 9.33623792e-03, 5.36095674e-01, 2.74223333e-01)), - LabeledPoint(0.58676766, Vectors.dense(3.68988949e-01, 5.34104018e-01, 5.24858744e-01, - 6.86815853e-01, 3.26534757e-01, 6.92699400e-01)), - LabeledPoint(0.52894227, Vectors.dense(4.87748505e-02, 3.07080315e-01, 7.82955385e-01, - 6.90167375e-01, 6.44077919e-01, 4.23739024e-01)), - LabeledPoint(0.53158463, Vectors.dense(6.50153455e-01, 8.32746110e-01, 6.88029140e-03, - 1.27859556e-01, 6.80223767e-01, 6.25825675e-01)), - LabeledPoint(0.55515181, Vectors.dense(9.47343271e-01, 2.13193978e-01, 3.71342472e-01, - 8.21291956e-01, 4.38195693e-01, 5.76569439e-01)), - LabeledPoint(0.67008744, Vectors.dense(9.96499254e-01, 8.45833297e-01, 6.56086922e-02, - 5.90029174e-01, 1.68954572e-01, 7.19792823e-02)), - LabeledPoint(0.5966537, Vectors.dense(1.85926914e-01, 9.60329804e-01, 3.13487406e-01, - 9.59549928e-01, 6.89093311e-01, 6.94999427e-01)), - LabeledPoint(0.56255674, Vectors.dense(9.40164576e-01, 2.69042714e-02, 5.39491321e-01, - 5.74068666e-01, 1.10935343e-01, 2.17519760e-01)), - LabeledPoint(0.33904133, Vectors.dense(2.97951848e-02, 1.06592106e-01, 5.74931856e-01, - 8.80801522e-01, 8.60445070e-01, 9.22757966e-01)), - LabeledPoint(0.66485577, Vectors.dense(9.80970473e-01, 3.05909353e-01, 4.96401766e-01, - 2.44342697e-01, 6.90559227e-01, 5.64858704e-01)), - LabeledPoint(0.38514965, Vectors.dense(1.55939260e-01, 2.18626853e-01, 5.01834270e-01, - 1.86694987e-01, 9.15411148e-01, 6.40527848e-01)), - LabeledPoint(0.73885841, Vectors.dense(3.16107608e-01, 9.25906358e-01, 5.47327167e-01, - 4.83712979e-01, 8.42305220e-01, 7.58488462e-01)), - LabeledPoint(0.45766267, Vectors.dense(4.14393503e-01, 1.30817883e-01, 5.62034942e-01, - 1.05150633e-01, 5.35632795e-01, 9.47594074e-04)), - LabeledPoint(0.34801557, Vectors.dense(5.26233981e-01, 7.63781419e-02, 
3.19188240e-01, - 5.16528633e-02, 5.28416724e-01, 6.47050470e-03)), - LabeledPoint(0.52529452, Vectors.dense(2.73404764e-01, 7.17070744e-01, 3.12889595e-01, - 8.39271965e-01, 9.67650889e-01, 8.50098873e-01)), - LabeledPoint(0.42503336, Vectors.dense(4.63289495e-01, 3.57055416e-02, 5.43528596e-01, - 4.44840919e-01, 9.36845855e-02, 7.81595037e-01)), - LabeledPoint(0.60221968, Vectors.dense(3.21784993e-01, 3.15622454e-01, 7.58870408e-01, - 5.18198558e-01, 2.28151905e-01, 4.42460325e-01)), - LabeledPoint(0.58964479, Vectors.dense(3.72428352e-01, 1.44447969e-01, 8.40274188e-01, - 5.86308041e-01, 6.09893953e-01, 3.97006473e-01)), - LabeledPoint(0.58194949, Vectors.dense(3.12776786e-01, 9.33630195e-01, 2.29328749e-01, - 4.32807208e-01, 1.51703470e-02, 1.51589320e-01))) + LabeledPoint(0.19775997, Vectors.dense(0.15266373, 0.30235661, 0.06203641, 0.45986034, + 0.83525338, 0.92699705)), + LabeledPoint(0.66009772, Vectors.dense(0.72698898, 0.76849622, 0.26920507, 0.64402929, + 0.09337326, 0.07968589)), + LabeledPoint(0.80865842, Vectors.dense(0.58961375, 0.34334054, 0.98887615, 0.62647321, + 0.68177928, 0.55225681)), + LabeledPoint(0.34142582, Vectors.dense(0.26886006, 0.37325939, 0.2229281, 0.1864426, + 0.39064809, 0.19316241)), + LabeledPoint(0.84756607, Vectors.dense(0.61091093, 0.88280845, 0.62233882, 0.25311894, + 0.17993031, 0.81640447)), + LabeledPoint(0.53360225, Vectors.dense(0.22537162, 0.51685714, 0.51849582, 0.60037494, + 0.53262048, 0.01331005)), + LabeledPoint(0.90053371, Vectors.dense(0.52409726, 0.89588471, 0.76990129, 0.1228517, + 0.29587269, 0.61202358)), + LabeledPoint(0.78779561, Vectors.dense(0.72613812, 0.46349747, 0.76911037, 0.19163103, + 0.55786672, 0.55077816)), + LabeledPoint(0.51604647, Vectors.dense(0.47222549, 0.79188496, 0.11524968, 0.6813039, + 0.36233361, 0.34420889)), + LabeledPoint(0.35325637, Vectors.dense(0.44951875, 0.02694226, 0.41524769, 0.9222317, + 0.09120557, 0.31512178)), + LabeledPoint(0.51408926, Vectors.dense(0.52802224, 
0.32806203, 0.44891554, 0.01633442, + 0.0970269, 0.69258857)), + LabeledPoint(0.84489897, Vectors.dense(0.83594341, 0.42432199, 0.8487743, 0.54679121, + 0.35410346, 0.72724968)), + LabeledPoint(0.55342816, Vectors.dense(0.09385168, 0.8928588, 0.33625828, 0.89183268, + 0.296849, 0.30164829)), + LabeledPoint(0.89405683, Vectors.dense(0.80624061, 0.83760997, 0.63428133, 0.3113273, + 0.02944858, 0.39977732)), + LabeledPoint(0.54588131, Vectors.dense(0.51817346, 0.00738845, 0.77494778, 0.8544712, + 0.13153282, 0.28767364)), + LabeledPoint(0.96038024, Vectors.dense(0.32658881, 0.90655956, 0.99955954, 0.77088429, + 0.04284752, 0.96525111)), + LabeledPoint(0.71349698, Vectors.dense(0.97521246, 0.2025168, 0.67985305, 0.46534506, + 0.92001748, 0.72820735)), + LabeledPoint(0.43456735, Vectors.dense(0.24585653, 0.01953996, 0.70598881, 0.77448287, + 0.4729746, 0.80146736)), + LabeledPoint(0.52462506, Vectors.dense(0.17539792, 0.72016934, 0.3678759, 0.53209295, + 0.29719397, 0.37429151)), + LabeledPoint(0.43074793, Vectors.dense(0.72810013, 0.39850784, 0.1058295, 0.39858265, + 0.52196395, 0.1060125))) for (numParts <- List(2, 4, 6, 8)) { val df = spark.createDataFrame(sc.parallelize(data, numParts)) @@ -140,10 +141,10 @@ class FValueTestSuite val (pValues: Vector, fValues: Vector) = fRegression.select("pValues", "fValues") .as[(Vector, Vector)].head() - assert(pValues ~== Vectors.dense(0.11369388, 0.0044996, 0.83558782, 0.88785417, 0.86150261, - 0.97106833) relTol 1e-6) - assert(fValues ~== Vectors.dense(2.76445780e+00, 1.05267800e+01, 4.43399092e-02, - 2.04580501e-02, 3.13208557e-02, 1.35248025e-03) relTol 1e-6) + assert(pValues ~== Vectors.dense(1.73658700e-02, 1.49916659e-02, 1.12697153e-04, + 4.26990301e-01, 2.75911201e-01, 1.93549275e-01) relTol 1e-6) + assert(fValues ~== Vectors.dense(6.86260598, 7.23175589, 24.11424725, 0.6605354, 1.26266286, + 1.82421406) relTol 1e-6) } } --------------------------------------------------------------------- To unsubscribe, e-mail: 
commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org