This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 9f1bf4e [SPARK-33398] Fix loading tree models prior to Spark 3.0 9f1bf4e is described below commit 9f1bf4e47c4978be7d55cfadb7da6b7863942bc8 Author: Ruifeng Zheng <ruife...@foxmail.com> AuthorDate: Sun Jan 3 11:52:46 2021 -0600 [SPARK-33398] Fix loading tree models prior to Spark 3.0 ### What changes were proposed in this pull request? In https://github.com/apache/spark/pull/21632/files#diff-0fdae8a6782091746ed20ea43f77b639f9c6a5f072dd2f600fcf9a7b37db4f47, a new field `rawCount` was added to `NodeData`, which means that a tree model trained in 2.4 cannot be loaded in 3.0/3.1/master; the `rawCount` field is only used in training and is not used in `transform`/`predict`/`featureImportance`. So this change sets it to -1L when loading a model saved by a Spark version earlier than 3.0. ### Why are the changes needed? To support loading old tree models in 3.0/3.1/master. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added test suites. Closes #30889 from zhengruifeng/fix_tree_load. 
Authored-by: Ruifeng Zheng <ruife...@foxmail.com> Signed-off-by: Sean Owen <sro...@gmail.com> (cherry picked from commit 6b7527e381591bcd51be205853aea3e349893139) Signed-off-by: Sean Owen <sro...@gmail.com> --- .../org/apache/spark/ml/tree/treeModels.scala | 48 ++++++++++++++------- .../ml-models/dtc-2.4.7/data/._SUCCESS.crc | Bin 0 -> 8 bytes ...-406c-894c-ca4eac67c690-c000.snappy.parquet.crc | Bin 0 -> 36 bytes .../resources/ml-models/dtc-2.4.7/data/_SUCCESS | 0 ...c890-406c-894c-ca4eac67c690-c000.snappy.parquet | Bin 0 -> 3242 bytes .../ml-models/dtc-2.4.7/metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../ml-models/dtc-2.4.7/metadata/.part-00000.crc | Bin 0 -> 16 bytes .../ml-models/dtc-2.4.7/metadata/_SUCCESS | 0 .../ml-models/dtc-2.4.7/metadata/part-00000 | 1 + .../ml-models/dtr-2.4.7/data/._SUCCESS.crc | Bin 0 -> 8 bytes ...-4b3d-84af-d861adcb9ca8-c000.snappy.parquet.crc | Bin 0 -> 36 bytes .../resources/ml-models/dtr-2.4.7/data/_SUCCESS | 0 ...a437-4b3d-84af-d861adcb9ca8-c000.snappy.parquet | Bin 0 -> 3264 bytes .../ml-models/dtr-2.4.7/metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../ml-models/dtr-2.4.7/metadata/.part-00000.crc | Bin 0 -> 12 bytes .../ml-models/dtr-2.4.7/metadata/_SUCCESS | 0 .../ml-models/dtr-2.4.7/metadata/part-00000 | 1 + .../ml-models/gbtc-2.4.7/data/._SUCCESS.crc | Bin 0 -> 8 bytes ...-41c7-91c0-6da8cc01fb43-c000.snappy.parquet.crc | Bin 0 -> 44 bytes .../resources/ml-models/gbtc-2.4.7/data/_SUCCESS | 0 ...c861-41c7-91c0-6da8cc01fb43-c000.snappy.parquet | Bin 0 -> 4542 bytes .../ml-models/gbtc-2.4.7/metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../ml-models/gbtc-2.4.7/metadata/.part-00000.crc | Bin 0 -> 16 bytes .../ml-models/gbtc-2.4.7/metadata/_SUCCESS | 0 .../ml-models/gbtc-2.4.7/metadata/part-00000 | 1 + .../gbtc-2.4.7/treesMetadata/._SUCCESS.crc | Bin 0 -> 8 bytes ...-4a90-813c-ddc394101e21-c000.snappy.parquet.crc | Bin 0 -> 36 bytes .../ml-models/gbtc-2.4.7/treesMetadata/_SUCCESS | 0 ...31e3-4a90-813c-ddc394101e21-c000.snappy.parquet | Bin 
0 -> 3075 bytes .../ml-models/gbtr-2.4.7/data/._SUCCESS.crc | Bin 0 -> 8 bytes ...-4511-9aab-639288bfae6d-c000.snappy.parquet.crc | Bin 0 -> 40 bytes .../resources/ml-models/gbtr-2.4.7/data/_SUCCESS | 0 ...d346-4511-9aab-639288bfae6d-c000.snappy.parquet | Bin 0 -> 3740 bytes .../ml-models/gbtr-2.4.7/metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../ml-models/gbtr-2.4.7/metadata/.part-00000.crc | Bin 0 -> 16 bytes .../ml-models/gbtr-2.4.7/metadata/_SUCCESS | 0 .../ml-models/gbtr-2.4.7/metadata/part-00000 | 1 + .../gbtr-2.4.7/treesMetadata/._SUCCESS.crc | Bin 0 -> 8 bytes ...-4fd8-ad9c-4be239c2215a-c000.snappy.parquet.crc | Bin 0 -> 32 bytes .../ml-models/gbtr-2.4.7/treesMetadata/_SUCCESS | 0 ...87fe-4fd8-ad9c-4be239c2215a-c000.snappy.parquet | Bin 0 -> 3038 bytes .../ml-models/rfc-2.4.7/data/._SUCCESS.crc | Bin 0 -> 8 bytes ...-4485-b112-25b4b11c9009-c000.snappy.parquet.crc | Bin 0 -> 40 bytes .../resources/ml-models/rfc-2.4.7/data/_SUCCESS | 0 ...91f8-4485-b112-25b4b11c9009-c000.snappy.parquet | Bin 0 -> 3836 bytes .../ml-models/rfc-2.4.7/metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../ml-models/rfc-2.4.7/metadata/.part-00000.crc | Bin 0 -> 16 bytes .../ml-models/rfc-2.4.7/metadata/_SUCCESS | 0 .../ml-models/rfc-2.4.7/metadata/part-00000 | 1 + .../rfc-2.4.7/treesMetadata/._SUCCESS.crc | Bin 0 -> 8 bytes ...-4c4e-a823-70c7afdcbdc5-c000.snappy.parquet.crc | Bin 0 -> 36 bytes .../ml-models/rfc-2.4.7/treesMetadata/_SUCCESS | 0 ...b666-4c4e-a823-70c7afdcbdc5-c000.snappy.parquet | Bin 0 -> 3391 bytes .../ml-models/rfr-2.4.7/data/._SUCCESS.crc | Bin 0 -> 8 bytes ...-40fc-b681-981caaeca996-c000.snappy.parquet.crc | Bin 0 -> 40 bytes .../resources/ml-models/rfr-2.4.7/data/_SUCCESS | 0 ...6edb-40fc-b681-981caaeca996-c000.snappy.parquet | Bin 0 -> 3797 bytes .../ml-models/rfr-2.4.7/metadata/._SUCCESS.crc | Bin 0 -> 8 bytes .../ml-models/rfr-2.4.7/metadata/.part-00000.crc | Bin 0 -> 16 bytes .../ml-models/rfr-2.4.7/metadata/_SUCCESS | 0 .../ml-models/rfr-2.4.7/metadata/part-00000 | 
1 + .../rfr-2.4.7/treesMetadata/._SUCCESS.crc | Bin 0 -> 8 bytes ...-447a-9b86-d95edaabcde8-c000.snappy.parquet.crc | Bin 0 -> 32 bytes .../ml-models/rfr-2.4.7/treesMetadata/_SUCCESS | 0 ...d349-447a-9b86-d95edaabcde8-c000.snappy.parquet | Bin 0 -> 3055 bytes .../DecisionTreeClassifierSuite.scala | 12 ++++++ .../ml/classification/GBTClassifierSuite.scala | 14 ++++++ .../MultilayerPerceptronClassifierSuite.scala | 2 +- .../RandomForestClassifierSuite.scala | 16 ++++++- .../apache/spark/ml/feature/HashingTFSuite.scala | 2 +- .../spark/ml/feature/StringIndexerSuite.scala | 2 +- .../ml/regression/DecisionTreeRegressorSuite.scala | 16 ++++++- .../spark/ml/regression/GBTRegressorSuite.scala | 12 ++++++ .../ml/regression/RandomForestRegressorSuite.scala | 12 ++++++ 74 files changed, 122 insertions(+), 20 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala index 162641f..67b9166 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeModels.scala @@ -31,8 +31,10 @@ import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter} import org.apache.spark.ml.util.DefaultParamsReader.Metadata import org.apache.spark.mllib.tree.impurity.ImpurityCalculator import org.apache.spark.mllib.tree.model.{DecisionTreeModel => OldDecisionTreeModel} -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions.{col, lit, struct} +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.VersionUtils import org.apache.spark.util.collection.OpenHashMap /** @@ -401,8 +403,13 @@ private[ml] object DecisionTreeModelReadWrite { } val dataPath = new Path(path, "data").toString - val data = sparkSession.read.parquet(dataPath).as[NodeData] - buildTreeFromNodes(data.collect(), impurityType) + 
var df = sparkSession.read.parquet(dataPath) + val (major, minor) = VersionUtils.majorMinorVersion(metadata.sparkVersion) + if (major.toInt < 3) { + df = df.withColumn("rawCount", lit(-1L)) + } + + buildTreeFromNodes(df.as[NodeData].collect(), impurityType) } /** @@ -497,25 +504,36 @@ private[ml] object EnsembleModelReadWrite { } val treesMetadataPath = new Path(path, "treesMetadata").toString - val treesMetadataRDD: RDD[(Int, (Metadata, Double))] = sql.read.parquet(treesMetadataPath) - .select("treeID", "metadata", "weights").as[(Int, String, Double)].rdd.map { - case (treeID: Int, json: String, weights: Double) => + val treesMetadataRDD = sql.read.parquet(treesMetadataPath) + .select("treeID", "metadata", "weights") + .as[(Int, String, Double)].rdd + .map { case (treeID: Int, json: String, weights: Double) => treeID -> ((DefaultParamsReader.parseMetadata(json, treeClassName), weights)) - } + } val treesMetadataWeights = treesMetadataRDD.sortByKey().values.collect() val treesMetadata = treesMetadataWeights.map(_._1) val treesWeights = treesMetadataWeights.map(_._2) val dataPath = new Path(path, "data").toString - val nodeData: Dataset[EnsembleNodeData] = - sql.read.parquet(dataPath).as[EnsembleNodeData] - val rootNodesRDD: RDD[(Int, Node)] = - nodeData.rdd.map(d => (d.treeID, d.nodeData)).groupByKey().map { - case (treeID: Int, nodeData: Iterable[NodeData]) => - treeID -> DecisionTreeModelReadWrite.buildTreeFromNodes(nodeData.toArray, impurityType) + var df = sql.read.parquet(dataPath) + val (major, minor) = VersionUtils.majorMinorVersion(metadata.sparkVersion) + if (major.toInt < 3) { + val newNodeDataCol = df.schema("nodeData").dataType match { + case StructType(fields) => + val cols = fields.map(f => col(s"nodeData.${f.name}")) :+ lit(-1L).as("rawCount") + struct(cols: _*) + } + df = df.withColumn("nodeData", newNodeDataCol) + } + + val rootNodesRDD = df.as[EnsembleNodeData].rdd + .map(d => (d.treeID, d.nodeData)) + .groupByKey() + .map { case (treeID: Int, 
nodeData: Iterable[NodeData]) => + treeID -> DecisionTreeModelReadWrite.buildTreeFromNodes(nodeData.toArray, impurityType) } - val rootNodes: Array[Node] = rootNodesRDD.sortByKey().values.collect() + val rootNodes = rootNodesRDD.sortByKey().values.collect() (metadata, treesMetadata.zip(rootNodes), treesWeights) } diff --git a/mllib/src/test/resources/ml-models/dtc-2.4.7/data/._SUCCESS.crc b/mllib/src/test/resources/ml-models/dtc-2.4.7/data/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/dtc-2.4.7/data/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/dtc-2.4.7/data/.part-00000-bd7ae42f-c890-406c-894c-ca4eac67c690-c000.snappy.parquet.crc b/mllib/src/test/resources/ml-models/dtc-2.4.7/data/.part-00000-bd7ae42f-c890-406c-894c-ca4eac67c690-c000.snappy.parquet.crc new file mode 100644 index 0000000..3ac562a Binary files /dev/null and b/mllib/src/test/resources/ml-models/dtc-2.4.7/data/.part-00000-bd7ae42f-c890-406c-894c-ca4eac67c690-c000.snappy.parquet.crc differ diff --git a/mllib/src/test/resources/ml-models/dtc-2.4.7/data/_SUCCESS b/mllib/src/test/resources/ml-models/dtc-2.4.7/data/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/dtc-2.4.7/data/part-00000-bd7ae42f-c890-406c-894c-ca4eac67c690-c000.snappy.parquet b/mllib/src/test/resources/ml-models/dtc-2.4.7/data/part-00000-bd7ae42f-c890-406c-894c-ca4eac67c690-c000.snappy.parquet new file mode 100644 index 0000000..09c38d6 Binary files /dev/null and b/mllib/src/test/resources/ml-models/dtc-2.4.7/data/part-00000-bd7ae42f-c890-406c-894c-ca4eac67c690-c000.snappy.parquet differ diff --git a/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/._SUCCESS.crc b/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/._SUCCESS.crc differ diff 
--git a/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/.part-00000.crc b/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/.part-00000.crc new file mode 100644 index 0000000..22b1eb8 Binary files /dev/null and b/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/.part-00000.crc differ diff --git a/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/_SUCCESS b/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/part-00000 b/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/part-00000 new file mode 100644 index 0000000..ef92265 --- /dev/null +++ b/mllib/src/test/resources/ml-models/dtc-2.4.7/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.classification.DecisionTreeClassificationModel","timestamp":1608687929358,"sparkVersion":"2.4.7","uid":"dtc_bc7ad285bb73","paramMap":{},"defaultParamMap":{"impurity":"gini","maxDepth":5,"labelCol":"label","maxMemoryInMB":256,"featuresCol":"features","predictionCol":"prediction","minInfoGain":0.0,"seed":159147643,"rawPredictionCol":"rawPrediction","minInstancesPerNode":1,"cacheNodeIds":false,"probabilityCol":"probability","maxBins":32,"checkpointInterval":10} [...] 
diff --git a/mllib/src/test/resources/ml-models/dtr-2.4.7/data/._SUCCESS.crc b/mllib/src/test/resources/ml-models/dtr-2.4.7/data/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/dtr-2.4.7/data/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/dtr-2.4.7/data/.part-00000-39b027f0-a437-4b3d-84af-d861adcb9ca8-c000.snappy.parquet.crc b/mllib/src/test/resources/ml-models/dtr-2.4.7/data/.part-00000-39b027f0-a437-4b3d-84af-d861adcb9ca8-c000.snappy.parquet.crc new file mode 100644 index 0000000..f6465e2 Binary files /dev/null and b/mllib/src/test/resources/ml-models/dtr-2.4.7/data/.part-00000-39b027f0-a437-4b3d-84af-d861adcb9ca8-c000.snappy.parquet.crc differ diff --git a/mllib/src/test/resources/ml-models/dtr-2.4.7/data/_SUCCESS b/mllib/src/test/resources/ml-models/dtr-2.4.7/data/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/dtr-2.4.7/data/part-00000-39b027f0-a437-4b3d-84af-d861adcb9ca8-c000.snappy.parquet b/mllib/src/test/resources/ml-models/dtr-2.4.7/data/part-00000-39b027f0-a437-4b3d-84af-d861adcb9ca8-c000.snappy.parquet new file mode 100644 index 0000000..2904f84 Binary files /dev/null and b/mllib/src/test/resources/ml-models/dtr-2.4.7/data/part-00000-39b027f0-a437-4b3d-84af-d861adcb9ca8-c000.snappy.parquet differ diff --git a/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/._SUCCESS.crc b/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/.part-00000.crc b/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/.part-00000.crc new file mode 100644 index 0000000..bbad108 Binary files /dev/null and b/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/.part-00000.crc differ diff --git 
a/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/_SUCCESS b/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/part-00000 b/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/part-00000 new file mode 100644 index 0000000..2895223 --- /dev/null +++ b/mllib/src/test/resources/ml-models/dtr-2.4.7/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.regression.DecisionTreeRegressionModel","timestamp":1608687932847,"sparkVersion":"2.4.7","uid":"dtr_c16a90fcdaf8","paramMap":{},"defaultParamMap":{"labelCol":"label","checkpointInterval":10,"minInfoGain":0.0,"maxMemoryInMB":256,"minInstancesPerNode":1,"maxBins":32,"seed":926680331,"cacheNodeIds":false,"maxDepth":5,"predictionCol":"prediction","featuresCol":"features","impurity":"variance"},"numFeatures":692} diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/._SUCCESS.crc b/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/.part-00000-dacbde64-c861-41c7-91c0-6da8cc01fb43-c000.snappy.parquet.crc b/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/.part-00000-dacbde64-c861-41c7-91c0-6da8cc01fb43-c000.snappy.parquet.crc new file mode 100644 index 0000000..13fc4ed Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/.part-00000-dacbde64-c861-41c7-91c0-6da8cc01fb43-c000.snappy.parquet.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/_SUCCESS b/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/part-00000-dacbde64-c861-41c7-91c0-6da8cc01fb43-c000.snappy.parquet 
b/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/part-00000-dacbde64-c861-41c7-91c0-6da8cc01fb43-c000.snappy.parquet new file mode 100644 index 0000000..5682038 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtc-2.4.7/data/part-00000-dacbde64-c861-41c7-91c0-6da8cc01fb43-c000.snappy.parquet differ diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/._SUCCESS.crc b/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/.part-00000.crc b/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/.part-00000.crc new file mode 100644 index 0000000..a810dd9 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/.part-00000.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/_SUCCESS b/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/part-00000 b/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/part-00000 new file mode 100644 index 0000000..675fea2 --- /dev/null +++ b/mllib/src/test/resources/ml-models/gbtc-2.4.7/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.classification.GBTClassificationModel","timestamp":1608687932103,"sparkVersion":"2.4.7","uid":"gbtc_81db008b4f25","paramMap":{"maxIter":2},"defaultParamMap":{"seed":-1287390502,"maxMemoryInMB":256,"stepSize":0.1,"validationTol":0.01,"maxBins":32,"checkpointInterval":10,"predictionCol":"prediction","lossType":"logistic","rawPredictionCol":"rawPrediction","featuresCol":"features","cacheNodeIds":false,"maxIter":20,"featureSubsetStrategy":"all","impurity":"gini" [...] 
diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/._SUCCESS.crc b/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/.part-00000-81137d9f-31e3-4a90-813c-ddc394101e21-c000.snappy.parquet.crc b/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/.part-00000-81137d9f-31e3-4a90-813c-ddc394101e21-c000.snappy.parquet.crc new file mode 100644 index 0000000..101c207 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/.part-00000-81137d9f-31e3-4a90-813c-ddc394101e21-c000.snappy.parquet.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/_SUCCESS b/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/part-00000-81137d9f-31e3-4a90-813c-ddc394101e21-c000.snappy.parquet b/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/part-00000-81137d9f-31e3-4a90-813c-ddc394101e21-c000.snappy.parquet new file mode 100644 index 0000000..e232de3 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtc-2.4.7/treesMetadata/part-00000-81137d9f-31e3-4a90-813c-ddc394101e21-c000.snappy.parquet differ diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/._SUCCESS.crc b/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/.part-00000-3b5433ff-d346-4511-9aab-639288bfae6d-c000.snappy.parquet.crc 
b/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/.part-00000-3b5433ff-d346-4511-9aab-639288bfae6d-c000.snappy.parquet.crc new file mode 100644 index 0000000..c35b81f Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/.part-00000-3b5433ff-d346-4511-9aab-639288bfae6d-c000.snappy.parquet.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/_SUCCESS b/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/part-00000-3b5433ff-d346-4511-9aab-639288bfae6d-c000.snappy.parquet b/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/part-00000-3b5433ff-d346-4511-9aab-639288bfae6d-c000.snappy.parquet new file mode 100644 index 0000000..ba26a44 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtr-2.4.7/data/part-00000-3b5433ff-d346-4511-9aab-639288bfae6d-c000.snappy.parquet differ diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/._SUCCESS.crc b/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/.part-00000.crc b/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/.part-00000.crc new file mode 100644 index 0000000..7dc6e14 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/.part-00000.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/_SUCCESS b/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/part-00000 b/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/part-00000 new file mode 100644 index 0000000..a9a712e --- /dev/null +++ 
b/mllib/src/test/resources/ml-models/gbtr-2.4.7/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.regression.GBTRegressionModel","timestamp":1608687942434,"sparkVersion":"2.4.7","uid":"gbtr_0a74cb2536ff","paramMap":{"maxIter":2},"defaultParamMap":{"impurity":"variance","maxMemoryInMB":256,"maxDepth":5,"subsamplingRate":1.0,"validationTol":0.01,"labelCol":"label","maxIter":20,"checkpointInterval":10,"minInfoGain":0.0,"predictionCol":"prediction","stepSize":0.1,"cacheNodeIds":false,"lossType":"squared","seed":-131597770,"featureSubsetStrategy":"all","featu [...] diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/._SUCCESS.crc b/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/.part-00000-6b9124f5-87fe-4fd8-ad9c-4be239c2215a-c000.snappy.parquet.crc b/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/.part-00000-6b9124f5-87fe-4fd8-ad9c-4be239c2215a-c000.snappy.parquet.crc new file mode 100644 index 0000000..b681b9f Binary files /dev/null and b/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/.part-00000-6b9124f5-87fe-4fd8-ad9c-4be239c2215a-c000.snappy.parquet.crc differ diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/_SUCCESS b/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/part-00000-6b9124f5-87fe-4fd8-ad9c-4be239c2215a-c000.snappy.parquet b/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/part-00000-6b9124f5-87fe-4fd8-ad9c-4be239c2215a-c000.snappy.parquet new file mode 100644 index 0000000..9a7e77a Binary files /dev/null and 
b/mllib/src/test/resources/ml-models/gbtr-2.4.7/treesMetadata/part-00000-6b9124f5-87fe-4fd8-ad9c-4be239c2215a-c000.snappy.parquet differ diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/data/._SUCCESS.crc b/mllib/src/test/resources/ml-models/rfc-2.4.7/data/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfc-2.4.7/data/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/data/.part-00000-e41a7b98-91f8-4485-b112-25b4b11c9009-c000.snappy.parquet.crc b/mllib/src/test/resources/ml-models/rfc-2.4.7/data/.part-00000-e41a7b98-91f8-4485-b112-25b4b11c9009-c000.snappy.parquet.crc new file mode 100644 index 0000000..5bb3a22 Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfc-2.4.7/data/.part-00000-e41a7b98-91f8-4485-b112-25b4b11c9009-c000.snappy.parquet.crc differ diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/data/_SUCCESS b/mllib/src/test/resources/ml-models/rfc-2.4.7/data/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/data/part-00000-e41a7b98-91f8-4485-b112-25b4b11c9009-c000.snappy.parquet b/mllib/src/test/resources/ml-models/rfc-2.4.7/data/part-00000-e41a7b98-91f8-4485-b112-25b4b11c9009-c000.snappy.parquet new file mode 100644 index 0000000..d9ec35a Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfc-2.4.7/data/part-00000-e41a7b98-91f8-4485-b112-25b4b11c9009-c000.snappy.parquet differ diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/._SUCCESS.crc b/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/.part-00000.crc b/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/.part-00000.crc new file mode 100644 index 
0000000..58bda6d Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/.part-00000.crc differ diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/_SUCCESS b/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/part-00000 b/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/part-00000 new file mode 100644 index 0000000..07748b0 --- /dev/null +++ b/mllib/src/test/resources/ml-models/rfc-2.4.7/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.classification.RandomForestClassificationModel","timestamp":1608687930713,"sparkVersion":"2.4.7","uid":"rfc_db1adb353f1e","paramMap":{"numTrees":2},"defaultParamMap":{"impurity":"gini","predictionCol":"prediction","numTrees":20,"maxDepth":5,"featureSubsetStrategy":"auto","subsamplingRate":1.0,"featuresCol":"features","checkpointInterval":10,"rawPredictionCol":"rawPrediction","cacheNodeIds":false,"labelCol":"label","seed":207336481,"probabilityCol":"probabili [...] 
diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/._SUCCESS.crc b/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/.part-00000-21082d24-b666-4c4e-a823-70c7afdcbdc5-c000.snappy.parquet.crc b/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/.part-00000-21082d24-b666-4c4e-a823-70c7afdcbdc5-c000.snappy.parquet.crc new file mode 100644 index 0000000..729c5bb Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/.part-00000-21082d24-b666-4c4e-a823-70c7afdcbdc5-c000.snappy.parquet.crc differ diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/_SUCCESS b/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/part-00000-21082d24-b666-4c4e-a823-70c7afdcbdc5-c000.snappy.parquet b/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/part-00000-21082d24-b666-4c4e-a823-70c7afdcbdc5-c000.snappy.parquet new file mode 100644 index 0000000..6108821 Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfc-2.4.7/treesMetadata/part-00000-21082d24-b666-4c4e-a823-70c7afdcbdc5-c000.snappy.parquet differ diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/data/._SUCCESS.crc b/mllib/src/test/resources/ml-models/rfr-2.4.7/data/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfr-2.4.7/data/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/data/.part-00000-4a69607d-6edb-40fc-b681-981caaeca996-c000.snappy.parquet.crc 
b/mllib/src/test/resources/ml-models/rfr-2.4.7/data/.part-00000-4a69607d-6edb-40fc-b681-981caaeca996-c000.snappy.parquet.crc new file mode 100644 index 0000000..52cf21f Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfr-2.4.7/data/.part-00000-4a69607d-6edb-40fc-b681-981caaeca996-c000.snappy.parquet.crc differ diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/data/_SUCCESS b/mllib/src/test/resources/ml-models/rfr-2.4.7/data/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/data/part-00000-4a69607d-6edb-40fc-b681-981caaeca996-c000.snappy.parquet b/mllib/src/test/resources/ml-models/rfr-2.4.7/data/part-00000-4a69607d-6edb-40fc-b681-981caaeca996-c000.snappy.parquet new file mode 100644 index 0000000..75a3f04 Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfr-2.4.7/data/part-00000-4a69607d-6edb-40fc-b681-981caaeca996-c000.snappy.parquet differ diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/._SUCCESS.crc b/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/.part-00000.crc b/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/.part-00000.crc new file mode 100644 index 0000000..1a72b8e Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/.part-00000.crc differ diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/_SUCCESS b/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/part-00000 b/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/part-00000 new file mode 100644 index 0000000..cccbb8f --- /dev/null +++ 
b/mllib/src/test/resources/ml-models/rfr-2.4.7/metadata/part-00000 @@ -0,0 +1 @@ +{"class":"org.apache.spark.ml.regression.RandomForestRegressionModel","timestamp":1608687933536,"sparkVersion":"2.4.7","uid":"rfr_d946d96b7ff0","paramMap":{"numTrees":2},"defaultParamMap":{"numTrees":20,"featureSubsetStrategy":"auto","maxDepth":5,"minInstancesPerNode":1,"labelCol":"label","cacheNodeIds":false,"checkpointInterval":10,"featuresCol":"features","maxMemoryInMB":256,"predictionCol":"prediction","minInfoGain":0.0,"subsamplingRate":1.0,"impurity":"variance","seed":235498149,"max [...] diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/._SUCCESS.crc b/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/._SUCCESS.crc new file mode 100644 index 0000000..3b7b044 Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/._SUCCESS.crc differ diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/.part-00000-dfe4db51-d349-447a-9b86-d95edaabcde8-c000.snappy.parquet.crc b/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/.part-00000-dfe4db51-d349-447a-9b86-d95edaabcde8-c000.snappy.parquet.crc new file mode 100644 index 0000000..8081f88 Binary files /dev/null and b/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/.part-00000-dfe4db51-d349-447a-9b86-d95edaabcde8-c000.snappy.parquet.crc differ diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/_SUCCESS b/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/_SUCCESS new file mode 100644 index 0000000..e69de29 diff --git a/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/part-00000-dfe4db51-d349-447a-9b86-d95edaabcde8-c000.snappy.parquet b/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/part-00000-dfe4db51-d349-447a-9b86-d95edaabcde8-c000.snappy.parquet new file mode 100644 index 0000000..093c346 Binary files /dev/null and 
b/mllib/src/test/resources/ml-models/rfr-2.4.7/treesMetadata/part-00000-dfe4db51-d349-447a-9b86-d95edaabcde8-c000.snappy.parquet differ diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala index d1ade85..13efdf1 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala @@ -446,6 +446,18 @@ class DecisionTreeClassifierSuite extends MLTest with DefaultReadWriteTest { testDefaultReadWrite(model) } + + test("SPARK-33398: Load DecisionTreeClassificationModel prior to Spark 3.0") { + val path = testFile("ml-models/dtc-2.4.7") + val model = DecisionTreeClassificationModel.load(path) + assert(model.numClasses === 2) + assert(model.numFeatures === 692) + assert(model.numNodes === 5) + + val metadata = spark.read.json(s"$path/metadata") + val sparkVersionStr = metadata.select("sparkVersion").first().getString(0) + assert(sparkVersionStr === "2.4.7") + } } private[ml] object DecisionTreeClassifierSuite extends SparkFunSuite { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala index a2208ed..37e695e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/GBTClassifierSuite.scala @@ -545,6 +545,20 @@ class GBTClassifierSuite extends MLTest with DefaultReadWriteTest { testEstimatorAndModelReadWrite(gbt, continuousData, allParamSettings, allParamSettings, checkModelData) } + + test("SPARK-33398: Load GBTClassificationModel prior to Spark 3.0") { + val path = testFile("ml-models/gbtc-2.4.7") + val model = GBTClassificationModel.load(path) + assert(model.numClasses === 2) + 
assert(model.numFeatures === 692) + assert(model.getNumTrees === 2) + assert(model.totalNumNodes === 22) + assert(model.trees.map(_.numNodes) === Array(5, 17)) + + val metadata = spark.read.json(s"$path/metadata") + val sparkVersionStr = metadata.select("sparkVersion").first().getString(0) + assert(sparkVersionStr === "2.4.7") + } } private object GBTClassifierSuite extends SparkFunSuite { diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala index 902af71..7cf4e65 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifierSuite.scala @@ -240,6 +240,6 @@ class MultilayerPerceptronClassifierSuite extends MLTest with DefaultReadWriteTe val metadata = spark.read.json(s"$mlpPath/metadata") val sparkVersionStr = metadata.select("sparkVersion").first().getString(0) - assert(sparkVersionStr == "2.4.4") + assert(sparkVersionStr === "2.4.4") } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala index e30e93a..28b245f 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/RandomForestClassifierSuite.scala @@ -25,10 +25,10 @@ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.tree._ import org.apache.spark.ml.tree.impl.TreeTests import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} +import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint} import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => 
OldRandomForest} import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo} -import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} @@ -319,6 +319,20 @@ class RandomForestClassifierSuite extends MLTest with DefaultReadWriteTest { testEstimatorAndModelReadWrite(rf, continuousData, allParamSettings, allParamSettings, checkModelData) } + + test("SPARK-33398: Load RandomForestClassificationModel prior to Spark 3.0") { + val path = testFile("ml-models/rfc-2.4.7") + val model = RandomForestClassificationModel.load(path) + assert(model.numClasses === 2) + assert(model.numFeatures === 692) + assert(model.getNumTrees === 2) + assert(model.totalNumNodes === 10) + assert(model.trees.map(_.numNodes) === Array(3, 7)) + + val metadata = spark.read.json(s"$path/metadata") + val sparkVersionStr = metadata.select("sparkVersion").first().getString(0) + assert(sparkVersionStr === "2.4.7") + } } private object RandomForestClassifierSuite extends SparkFunSuite { diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala index 8fd192f..861bf1e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala @@ -99,7 +99,7 @@ class HashingTFSuite extends MLTest with DefaultReadWriteTest { val metadata = spark.read.json(s"$hashingTFPath/metadata") val sparkVersionStr = metadata.select("sparkVersion").first().getString(0) - assert(sparkVersionStr == "2.4.4") + assert(sparkVersionStr === "2.4.4") intercept[IllegalArgumentException] { loadedHashingTF.save(hashingTFPath) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala index 9481408..c8247b9 100644 --- 
a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala @@ -483,6 +483,6 @@ class StringIndexerSuite extends MLTest with DefaultReadWriteTest { val metadata = spark.read.json(s"$modelPath/metadata") val sparkVersionStr = metadata.select("sparkVersion").first().getString(0) - assert(sparkVersionStr == "2.4.4") + assert(sparkVersionStr === "2.4.4") } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala index 49ebcb3..9cb03454 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/DecisionTreeRegressorSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.ml.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.LabeledPoint -import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.tree.impl.TreeTests import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ @@ -236,6 +236,20 @@ class DecisionTreeRegressorSuite extends MLTest with DefaultReadWriteTest { TreeTests.allParamSettings ++ Map("maxDepth" -> 0), TreeTests.allParamSettings ++ Map("maxDepth" -> 0), checkModelData) } + + test("SPARK-33398: Load DecisionTreeRegressionModel prior to Spark 3.0") { + val path = testFile("ml-models/dtr-2.4.7") + val model = DecisionTreeRegressionModel.load(path) + assert(model.numFeatures === 692) + assert(model.numNodes === 5) + assert(model.featureImportances ~== + Vectors.sparse(692, Array(100, 434), + Array(0.03987240829346093, 0.960127591706539)) absTol 1e-4) + + val metadata = spark.read.json(s"$path/metadata") + val sparkVersionStr = metadata.select("sparkVersion").first().getString(0) + 
assert(sparkVersionStr === "2.4.7") + } } private[ml] object DecisionTreeRegressorSuite extends SparkFunSuite { diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala index 04b0d4b..7d84df6 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/GBTRegressorSuite.scala @@ -370,6 +370,18 @@ class GBTRegressorSuite extends MLTest with DefaultReadWriteTest { testEstimatorAndModelReadWrite(gbt, continuousData, allParamSettings, allParamSettings, checkModelData) } + + test("SPARK-33398: Load GBTRegressionModel prior to Spark 3.0") { + val path = testFile("ml-models/gbtr-2.4.7") + val model = GBTRegressionModel.load(path) + assert(model.numFeatures === 692) + assert(model.totalNumNodes === 6) + assert(model.trees.map(_.numNodes) === Array(5, 1)) + + val metadata = spark.read.json(s"$path/metadata") + val sparkVersionStr = metadata.select("sparkVersion").first().getString(0) + assert(sparkVersionStr === "2.4.7") + } } private object GBTRegressorSuite extends SparkFunSuite { diff --git a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala index 31dc6d3..4df8366 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/regression/RandomForestRegressorSuite.scala @@ -221,6 +221,18 @@ class RandomForestRegressorSuite extends MLTest with DefaultReadWriteTest{ testEstimatorAndModelReadWrite(rf, continuousData, allParamSettings, allParamSettings, checkModelData) } + + test("SPARK-33398: Load RandomForestRegressionModel prior to Spark 3.0") { + val path = testFile("ml-models/rfr-2.4.7") + val model = RandomForestRegressionModel.load(path) + assert(model.numFeatures === 692) + 
assert(model.totalNumNodes === 8) + assert(model.trees.map(_.numNodes) === Array(5, 3)) + + val metadata = spark.read.json(s"$path/metadata") + val sparkVersionStr = metadata.select("sparkVersion").first().getString(0) + assert(sparkVersionStr === "2.4.7") + } } private object RandomForestRegressorSuite extends SparkFunSuite { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org