[jira] [Updated] (SPARK-19449) Inconsistent results between ml package RandomForestClassificationModel and mllib package RandomForestModel
[ https://issues.apache.org/jira/browse/SPARK-19449?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Aseem Bansal updated SPARK-19449:
---------------------------------
    Description: 

I worked on some code to convert an ml package RandomForestClassificationModel into an mllib package RandomForestModel. We needed this because we have to make predictions on the order of milliseconds. I found that the results are inconsistent even though the underlying DecisionTreeModels are exactly the same, so the behavior of the two implementations is inconsistent, which should not be the case.

The code below reproduces the issue. It can be run as a plain Java app as long as the Spark dependencies are set up properly.

{noformat}
import org.apache.spark.ml.Transformer;
import org.apache.spark.ml.classification.*;
import org.apache.spark.ml.linalg.*;
import org.apache.spark.ml.regression.RandomForestRegressionModel;
import org.apache.spark.mllib.linalg.DenseVector;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.tree.configuration.Algo;
import org.apache.spark.mllib.tree.model.DecisionTreeModel;
import org.apache.spark.mllib.tree.model.RandomForestModel;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Enumeration;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

// Common interface so converted mllib models and original ml models
// can be scored the same way on collected test rows.
abstract class Predictor {
    abstract double predict(Vector vector);
}

public class MainConvertModels {

    public static final int seed = 42;

    public static void main(String[] args) {
        int numRows = 1000;
        int numFeatures = 3;
        int numClasses = 2;
        double trainFraction = 0.8;
        double testFraction = 0.2;

        SparkSession spark = SparkSession.builder()
                .appName("conversion app")
                .master("local")
                .getOrCreate();

        //Dataset<Row> data = getData(spark, "libsvm", "/opt/spark2/data/mllib/sample_libsvm_data.txt");
        Dataset<Row> data = getDummyData(spark, numRows, numFeatures, numClasses);
        Dataset<Row>[] splits = data.randomSplit(new double[]{trainFraction, testFraction}, seed);
        Dataset<Row> trainingData = splits[0];
        Dataset<Row> testData = splits[1];
        testData.cache();

        List<Double> labels = getLabels(testData);
        List<Vector> features = getFeatures(testData);

        DecisionTreeClassifier classifier1 = new DecisionTreeClassifier();
        DecisionTreeClassificationModel model1 = classifier1.fit(trainingData);
        final DecisionTreeModel convertedModel1 = convertDecisionTreeModel(model1, Algo.Classification());

        RandomForestClassifier classifier = new RandomForestClassifier();
        RandomForestClassificationModel model2 = classifier.fit(trainingData);
        final RandomForestModel convertedModel2 = convertRandomForestModel(model2);

        LogisticRegression lr = new LogisticRegression();
        LogisticRegressionModel model3 = lr.fit(trainingData);
        final org.apache.spark.mllib.classification.LogisticRegressionModel convertedModel3 =
                convertLogisticRegressionModel(model3);

        // Compare each original ml model against its converted mllib counterpart.
        System.out.println(
                "** DecisionTreeClassifier\n" +
                "** Original **" + getInfo(model1, testData) + "\n" +
                "** New **" + getInfo(new Predictor() {
                    double predict(Vector vector) { return convertedModel1.predict(vector); }
                }, labels, features) + "\n" +
                "\n" +
                "** RandomForestClassifier\n" +
                "** Original **" + getInfo(model2, testData) + "\n" +
                "** New **" + getInfo(new Predictor() {
                    double predict(Vector vector) { return convertedModel2.predict(vector); }
                }, labels, features) + "\n" +
                "\n" +
                "** LogisticRegression\n" +
                "** Original **" + getInfo(model3, testData) + "\n" +
                "** New **" + getInfo(new Predictor() {
                    double predict(Vector vector) { return convertedModel3.predict(vector); }
                }, labels, features) + "\n" +
                "");
    }

    static Dataset<Row> getData(SparkSession spark, String format, String location) {
        return spark.read()
                .format(format)
                .load(location);
    }

    // Random binary labels paired with random dense feature vectors.
    static Dataset<Row> getDummyData(SparkSession spark, int numberRows, int numberFeatures, int labelUpperBound) {
        StructType schema = new StructType(new StructField[]{
                new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
                new StructField("features", new VectorUDT(), false, Metadata.empty())
        });
        double[][] vectors = prepareData(numberRows, numberFeatures);
        Random random = new Random(seed);
        List<Row> dataTest = new ArrayList<>();
        for (double[] vector : vectors) {
            double label = (double) random.nextInt(2);
            dataTest.add(RowFactory.create(label, Vectors.dense(vector)));
        }
        return spark.createDataFrame(dataTest, schema);
    }

    static double[][] prepareData(int numRows, int numFeatures) {
        Random random = new Random(seed);
        double[][] result = new double[numRows][numFeatures];
        for (int row = 0; row < numRows; row++) {
            for (int feature = 0; feature < numFeatures; feature++) {
                result[row][feature] = random.nextDouble();
            }
        }
        return result;
    }

    static S
{noformat}
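The archived message is cut off at that final {{static S}} — the remaining helpers (getLabels, getFeatures, the getInfo overloads, and the convert* conversion methods) are missing. The convert* methods are not reconstructed here, since there is no obvious public API for converting ml tree models to mllib ones. Below is a minimal sketch of the data-access and scoring helpers; only the names and call signatures come from the code above, the bodies are guesses, assuming getLabels/getFeatures collect the test columns to the driver and getInfo reports plain accuracy:

{noformat}
// Hypothetical reconstructions of the truncated helpers; the bodies are
// assumptions, not the original implementation.
static List<Double> getLabels(Dataset<Row> data) {
    List<Double> labels = new ArrayList<>();
    for (Row row : data.select("label").collectAsList()) {
        labels.add(row.getDouble(0));
    }
    return labels;
}

static List<Vector> getFeatures(Dataset<Row> data) {
    List<Vector> features = new ArrayList<>();
    for (Row row : data.select("features").collectAsList()) {
        // the DataFrame holds ml vectors; Predictor expects mllib vectors
        org.apache.spark.ml.linalg.Vector v = (org.apache.spark.ml.linalg.Vector) row.get(0);
        features.add(new DenseVector(v.toArray()));
    }
    return features;
}

// Accuracy of an original ml model via its own transform().
static String getInfo(Transformer model, Dataset<Row> testData) {
    Dataset<Row> predictions = model.transform(testData);
    long correct = predictions.filter("prediction = label").count();
    return " accuracy=" + ((double) correct / predictions.count());
}

// Accuracy of a converted mllib model on the collected test rows.
static String getInfo(Predictor predictor, List<Double> labels, List<Vector> features) {
    int correct = 0;
    for (int i = 0; i < labels.size(); i++) {
        if (predictor.predict(features.get(i)) == labels.get(i)) {
            correct++;
        }
    }
    return " accuracy=" + ((double) correct / labels.size());
}
{noformat}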