spark git commit: [SPARK-11382] Replace example code in mllib-decision-tree.md using include_example

meng Tue, 10 Nov 2015 10:06:57 -0800

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 39c1ebcd3 -> 6c1409e0b



[SPARK-11382] Replace example code in mllib-decision-tree.md using 
include_example

https://issues.apache.org/jira/browse/SPARK-11382

B.T.W. I fix an error in naive_bayes_example.py.

Author: Xusen Yin <yinxu...@gmail.com>

Closes #9596 from yinxusen/SPARK-11382.

(cherry picked from commit a81f47ff7498e7063c855ccf75bba81ab101b43e)
Signed-off-by: Xiangrui Meng <m...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6c1409e0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6c1409e0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6c1409e0

Branch: refs/heads/branch-1.6
Commit: 6c1409e0b57cb0202c434a9f6c2593e5ae241e70
Parents: 39c1ebc
Author: Xusen Yin <yinxu...@gmail.com>
Authored: Tue Nov 10 10:05:53 2015 -0800
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Tue Nov 10 10:06:02 2015 -0800

----------------------------------------------------------------------
 docs/mllib-decision-tree.md                     | 253 +------------------
 .../JavaDecisionTreeClassificationExample.java  |  91 +++++++
 .../JavaDecisionTreeRegressionExample.java      |  96 +++++++
 .../decision_tree_classification_example.py     |  55 ++++
 .../mllib/decision_tree_regression_example.py   |  56 ++++
 .../main/python/mllib/naive_bayes_example.py    |   1 +
 .../DecisionTreeClassificationExample.scala     |  67 +++++
 .../mllib/DecisionTreeRegressionExample.scala   |  66 +++++
 8 files changed, 438 insertions(+), 247 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/6c1409e0/docs/mllib-decision-tree.md
----------------------------------------------------------------------
diff --git a/docs/mllib-decision-tree.md b/docs/mllib-decision-tree.md
index b5b454b..77ce34e 100644
--- a/docs/mllib-decision-tree.md
+++ b/docs/mllib-decision-tree.md
@@ -194,137 +194,19 @@ maximum tree depth of 5. The test error is calculated to 
measure the algorithm a
 <div data-lang="scala" markdown="1">
 Refer to the [`DecisionTree` Scala 
docs](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) and 
[`DecisionTreeModel` Scala 
docs](api/scala/index.html#org.apache.spark.mllib.tree.model.DecisionTreeModel) 
for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.tree.DecisionTree
-import org.apache.spark.mllib.tree.model.DecisionTreeModel
-import org.apache.spark.mllib.util.MLUtils
-
-// Load and parse the data file.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-// Split the data into training and test sets (30% held out for testing)
-val splits = data.randomSplit(Array(0.7, 0.3))
-val (trainingData, testData) = (splits(0), splits(1))
-
-// Train a DecisionTree model.
-//  Empty categoricalFeaturesInfo indicates all features are continuous.
-val numClasses = 2
-val categoricalFeaturesInfo = Map[Int, Int]()
-val impurity = "gini"
-val maxDepth = 5
-val maxBins = 32
-
-val model = DecisionTree.trainClassifier(trainingData, numClasses, 
categoricalFeaturesInfo,
-  impurity, maxDepth, maxBins)
-
-// Evaluate model on test instances and compute test error
-val labelAndPreds = testData.map { point =>
-  val prediction = model.predict(point.features)
-  (point.label, prediction)
-}
-val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / 
testData.count()
-println("Test Error = " + testErr)
-println("Learned classification tree model:\n" + model.toDebugString)
-
-// Save and load model
-model.save(sc, "myModelPath")
-val sameModel = DecisionTreeModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example 
scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
 Refer to the [`DecisionTree` Java 
docs](api/java/org/apache/spark/mllib/tree/DecisionTree.html) and 
[`DecisionTreeModel` Java 
docs](api/java/org/apache/spark/mllib/tree/model/DecisionTreeModel.html) for 
details on the API.
 
-{% highlight java %}
-import java.util.HashMap;
-import scala.Tuple2;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.tree.DecisionTree;
-import org.apache.spark.mllib.tree.model.DecisionTreeModel;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.SparkConf;
-
-SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTree");
-JavaSparkContext sc = new JavaSparkContext(sparkConf);
-
-// Load and parse the data file.
-String datapath = "data/mllib/sample_libsvm_data.txt";
-JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), 
datapath).toJavaRDD();
-// Split the data into training and test sets (30% held out for testing)
-JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
-JavaRDD<LabeledPoint> trainingData = splits[0];
-JavaRDD<LabeledPoint> testData = splits[1];
-
-// Set parameters.
-//  Empty categoricalFeaturesInfo indicates all features are continuous.
-Integer numClasses = 2;
-Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, 
Integer>();
-String impurity = "gini";
-Integer maxDepth = 5;
-Integer maxBins = 32;
-
-// Train a DecisionTree model for classification.
-final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData, 
numClasses,
-  categoricalFeaturesInfo, impurity, maxDepth, maxBins);
-
-// Evaluate model on test instances and compute test error
-JavaPairRDD<Double, Double> predictionAndLabel =
-  testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
-    @Override
-    public Tuple2<Double, Double> call(LabeledPoint p) {
-      return new Tuple2<Double, Double>(model.predict(p.features()), 
p.label());
-    }
-  });
-Double testErr =
-  1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, 
Boolean>() {
-    @Override
-    public Boolean call(Tuple2<Double, Double> pl) {
-      return !pl._1().equals(pl._2());
-    }
-  }).count() / testData.count();
-System.out.println("Test Error: " + testErr);
-System.out.println("Learned classification tree model:\n" + 
model.toDebugString());
-
-// Save and load model
-model.save(sc.sc(), "myModelPath");
-DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), "myModelPath");
-{% endhighlight %}
+{% include_example 
java/org/apache/spark/examples/mllib/JavaDecisionTreeClassificationExample.java 
%}
 </div>
 
 <div data-lang="python" markdown="1">
 Refer to the [`DecisionTree` Python 
docs](api/python/pyspark.mllib.html#pyspark.mllib.tree.DecisionTree) and 
[`DecisionTreeModel` Python 
docs](api/python/pyspark.mllib.html#pyspark.mllib.tree.DecisionTreeModel) for 
more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
-from pyspark.mllib.util import MLUtils
-
-# Load and parse the data file into an RDD of LabeledPoint.
-data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
-# Split the data into training and test sets (30% held out for testing)
-(trainingData, testData) = data.randomSplit([0.7, 0.3])
-
-# Train a DecisionTree model.
-#  Empty categoricalFeaturesInfo indicates all features are continuous.
-model = DecisionTree.trainClassifier(trainingData, numClasses=2, 
categoricalFeaturesInfo={},
-                                     impurity='gini', maxDepth=5, maxBins=32)
-
-# Evaluate model on test instances and compute test error
-predictions = model.predict(testData.map(lambda x: x.features))
-labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
-testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / 
float(testData.count())
-print('Test Error = ' + str(testErr))
-print('Learned classification tree model:')
-print(model.toDebugString())
-
-# Save and load model
-model.save(sc, "myModelPath")
-sameModel = DecisionTreeModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example python/mllib/decision_tree_classification_example.py %}
 </div>
 
 </div>
@@ -343,142 +225,19 @@ depth of 5. The Mean Squared Error (MSE) is computed at 
the end to evaluate
 <div data-lang="scala" markdown="1">
 Refer to the [`DecisionTree` Scala 
docs](api/scala/index.html#org.apache.spark.mllib.tree.DecisionTree) and 
[`DecisionTreeModel` Scala 
docs](api/scala/index.html#org.apache.spark.mllib.tree.model.DecisionTreeModel) 
for details on the API.
 
-{% highlight scala %}
-import org.apache.spark.mllib.tree.DecisionTree
-import org.apache.spark.mllib.tree.model.DecisionTreeModel
-import org.apache.spark.mllib.util.MLUtils
-
-// Load and parse the data file.
-val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
-// Split the data into training and test sets (30% held out for testing)
-val splits = data.randomSplit(Array(0.7, 0.3))
-val (trainingData, testData) = (splits(0), splits(1))
-
-// Train a DecisionTree model.
-//  Empty categoricalFeaturesInfo indicates all features are continuous.
-val categoricalFeaturesInfo = Map[Int, Int]()
-val impurity = "variance"
-val maxDepth = 5
-val maxBins = 32
-
-val model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo, 
impurity,
-  maxDepth, maxBins)
-
-// Evaluate model on test instances and compute test error
-val labelsAndPredictions = testData.map { point =>
-  val prediction = model.predict(point.features)
-  (point.label, prediction)
-}
-val testMSE = labelsAndPredictions.map{ case(v, p) => math.pow((v - p), 
2)}.mean()
-println("Test Mean Squared Error = " + testMSE)
-println("Learned regression tree model:\n" + model.toDebugString)
-
-// Save and load model
-model.save(sc, "myModelPath")
-val sameModel = DecisionTreeModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example 
scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala %}
 </div>
 
 <div data-lang="java" markdown="1">
 Refer to the [`DecisionTree` Java 
docs](api/java/org/apache/spark/mllib/tree/DecisionTree.html) and 
[`DecisionTreeModel` Java 
docs](api/java/org/apache/spark/mllib/tree/model/DecisionTreeModel.html) for 
details on the API.
 
-{% highlight java %}
-import java.util.HashMap;
-import scala.Tuple2;
-import org.apache.spark.api.java.function.Function2;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.mllib.regression.LabeledPoint;
-import org.apache.spark.mllib.tree.DecisionTree;
-import org.apache.spark.mllib.tree.model.DecisionTreeModel;
-import org.apache.spark.mllib.util.MLUtils;
-import org.apache.spark.SparkConf;
-
-SparkConf sparkConf = new SparkConf().setAppName("JavaDecisionTree");
-JavaSparkContext sc = new JavaSparkContext(sparkConf);
-
-// Load and parse the data file.
-String datapath = "data/mllib/sample_libsvm_data.txt";
-JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc.sc(), 
datapath).toJavaRDD();
-// Split the data into training and test sets (30% held out for testing)
-JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
-JavaRDD<LabeledPoint> trainingData = splits[0];
-JavaRDD<LabeledPoint> testData = splits[1];
-
-// Set parameters.
-//  Empty categoricalFeaturesInfo indicates all features are continuous.
-Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, 
Integer>();
-String impurity = "variance";
-Integer maxDepth = 5;
-Integer maxBins = 32;
-
-// Train a DecisionTree model.
-final DecisionTreeModel model = DecisionTree.trainRegressor(trainingData,
-  categoricalFeaturesInfo, impurity, maxDepth, maxBins);
-
-// Evaluate model on test instances and compute test error
-JavaPairRDD<Double, Double> predictionAndLabel =
-  testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
-    @Override
-    public Tuple2<Double, Double> call(LabeledPoint p) {
-      return new Tuple2<Double, Double>(model.predict(p.features()), 
p.label());
-    }
-  });
-Double testMSE =
-  predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
-    @Override
-    public Double call(Tuple2<Double, Double> pl) {
-      Double diff = pl._1() - pl._2();
-      return diff * diff;
-    }
-  }).reduce(new Function2<Double, Double, Double>() {
-    @Override
-    public Double call(Double a, Double b) {
-      return a + b;
-    }
-  }) / testData.count();
-System.out.println("Test Mean Squared Error: " + testMSE);
-System.out.println("Learned regression tree model:\n" + model.toDebugString());
-
-// Save and load model
-model.save(sc.sc(), "myModelPath");
-DecisionTreeModel sameModel = DecisionTreeModel.load(sc.sc(), "myModelPath");
-{% endhighlight %}
+{% include_example 
java/org/apache/spark/examples/mllib/JavaDecisionTreeRegressionExample.java %}
 </div>
 
 <div data-lang="python" markdown="1">
 Refer to the [`DecisionTree` Python 
docs](api/python/pyspark.mllib.html#pyspark.mllib.tree.DecisionTree) and 
[`DecisionTreeModel` Python 
docs](api/python/pyspark.mllib.html#pyspark.mllib.tree.DecisionTreeModel) for 
more details on the API.
 
-{% highlight python %}
-from pyspark.mllib.regression import LabeledPoint
-from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
-from pyspark.mllib.util import MLUtils
-
-# Load and parse the data file into an RDD of LabeledPoint.
-data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
-# Split the data into training and test sets (30% held out for testing)
-(trainingData, testData) = data.randomSplit([0.7, 0.3])
-
-# Train a DecisionTree model.
-#  Empty categoricalFeaturesInfo indicates all features are continuous.
-model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
-                                    impurity='variance', maxDepth=5, 
maxBins=32)
-
-# Evaluate model on test instances and compute test error
-predictions = model.predict(testData.map(lambda x: x.features))
-labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
-testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / 
float(testData.count())
-print('Test Mean Squared Error = ' + str(testMSE))
-print('Learned regression tree model:')
-print(model.toDebugString())
-
-# Save and load model
-model.save(sc, "myModelPath")
-sameModel = DecisionTreeModel.load(sc, "myModelPath")
-{% endhighlight %}
+{% include_example python/mllib/decision_tree_regression_example.py %}
 </div>
 
 </div>

http://git-wip-us.apache.org/repos/asf/spark/blob/6c1409e0/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeClassificationExample.java
----------------------------------------------------------------------
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeClassificationExample.java
 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeClassificationExample.java
new file mode 100644
index 0000000..5839b0c
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeClassificationExample.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+// $example on$
+import java.util.HashMap;
+import java.util.Map;
+
+import scala.Tuple2;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.tree.DecisionTree;
+import org.apache.spark.mllib.tree.model.DecisionTreeModel;
+import org.apache.spark.mllib.util.MLUtils;
+// $example off$
+
+class JavaDecisionTreeClassificationExample {
+
+  public static void main(String[] args) {
+
+    // $example on$
+    SparkConf sparkConf = new 
SparkConf().setAppName("JavaDecisionTreeClassificationExample");
+    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+
+    // Load and parse the data file.
+    String datapath = "data/mllib/sample_libsvm_data.txt";
+    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), 
datapath).toJavaRDD();
+    // Split the data into training and test sets (30% held out for testing)
+    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
+    JavaRDD<LabeledPoint> trainingData = splits[0];
+    JavaRDD<LabeledPoint> testData = splits[1];
+
+    // Set parameters.
+    //  Empty categoricalFeaturesInfo indicates all features are continuous.
+    Integer numClasses = 2;
+    Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, 
Integer>();
+    String impurity = "gini";
+    Integer maxDepth = 5;
+    Integer maxBins = 32;
+
+    // Train a DecisionTree model for classification.
+    final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData, 
numClasses,
+      categoricalFeaturesInfo, impurity, maxDepth, maxBins);
+
+    // Evaluate model on test instances and compute test error
+    JavaPairRDD<Double, Double> predictionAndLabel =
+      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
+        @Override
+        public Tuple2<Double, Double> call(LabeledPoint p) {
+          return new Tuple2<Double, Double>(model.predict(p.features()), 
p.label());
+        }
+      });
+    Double testErr =
+      1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, 
Boolean>() {
+        @Override
+        public Boolean call(Tuple2<Double, Double> pl) {
+          return !pl._1().equals(pl._2());
+        }
+      }).count() / testData.count();
+
+    System.out.println("Test Error: " + testErr);
+    System.out.println("Learned classification tree model:\n" + 
model.toDebugString());
+
+    // Save and load model
+    model.save(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
+    DecisionTreeModel sameModel = DecisionTreeModel
+      .load(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
+    // $example off$
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/6c1409e0/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeRegressionExample.java
----------------------------------------------------------------------
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeRegressionExample.java
 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeRegressionExample.java
new file mode 100644
index 0000000..ccde578
--- /dev/null
+++ 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaDecisionTreeRegressionExample.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib;
+
+// $example on$
+import java.util.HashMap;
+import java.util.Map;
+
+import scala.Tuple2;
+
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.Function;
+import org.apache.spark.api.java.function.Function2;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.mllib.regression.LabeledPoint;
+import org.apache.spark.mllib.tree.DecisionTree;
+import org.apache.spark.mllib.tree.model.DecisionTreeModel;
+import org.apache.spark.mllib.util.MLUtils;
+// $example off$
+
+class JavaDecisionTreeRegressionExample {
+
+  public static void main(String[] args) {
+
+    // $example on$
+    SparkConf sparkConf = new 
SparkConf().setAppName("JavaDecisionTreeRegressionExample");
+    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+
+    // Load and parse the data file.
+    String datapath = "data/mllib/sample_libsvm_data.txt";
+    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), 
datapath).toJavaRDD();
+    // Split the data into training and test sets (30% held out for testing)
+    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
+    JavaRDD<LabeledPoint> trainingData = splits[0];
+    JavaRDD<LabeledPoint> testData = splits[1];
+
+    // Set parameters.
+    // Empty categoricalFeaturesInfo indicates all features are continuous.
+    Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<Integer, 
Integer>();
+    String impurity = "variance";
+    Integer maxDepth = 5;
+    Integer maxBins = 32;
+
+    // Train a DecisionTree model.
+    final DecisionTreeModel model = DecisionTree.trainRegressor(trainingData,
+      categoricalFeaturesInfo, impurity, maxDepth, maxBins);
+
+    // Evaluate model on test instances and compute test error
+    JavaPairRDD<Double, Double> predictionAndLabel =
+      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
+      @Override
+      public Tuple2<Double, Double> call(LabeledPoint p) {
+        return new Tuple2<Double, Double>(model.predict(p.features()), 
p.label());
+      }
+    });
+    Double testMSE =
+      predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
+        @Override
+        public Double call(Tuple2<Double, Double> pl) {
+          Double diff = pl._1() - pl._2();
+          return diff * diff;
+        }
+      }).reduce(new Function2<Double, Double, Double>() {
+        @Override
+        public Double call(Double a, Double b) {
+          return a + b;
+        }
+      }) / data.count();
+    System.out.println("Test Mean Squared Error: " + testMSE);
+    System.out.println("Learned regression tree model:\n" + 
model.toDebugString());
+
+    // Save and load model
+    model.save(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
+    DecisionTreeModel sameModel = DecisionTreeModel
+      .load(jsc.sc(), "target/tmp/myDecisionTreeRegressionModel");
+    // $example off$
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/6c1409e0/examples/src/main/python/mllib/decision_tree_classification_example.py
----------------------------------------------------------------------
diff --git 
a/examples/src/main/python/mllib/decision_tree_classification_example.py 
b/examples/src/main/python/mllib/decision_tree_classification_example.py
new file mode 100644
index 0000000..1b52976
--- /dev/null
+++ b/examples/src/main/python/mllib/decision_tree_classification_example.py
@@ -0,0 +1,55 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Decision Tree Classification Example.
+"""
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
+from pyspark.mllib.util import MLUtils
+# $example off$
+
+if __name__ == "__main__":
+
+    sc = SparkContext(appName="PythonDecisionTreeClassificationExample")
+
+    # $example on$
+    # Load and parse the data file into an RDD of LabeledPoint.
+    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
+    # Split the data into training and test sets (30% held out for testing)
+    (trainingData, testData) = data.randomSplit([0.7, 0.3])
+
+    # Train a DecisionTree model.
+    #  Empty categoricalFeaturesInfo indicates all features are continuous.
+    model = DecisionTree.trainClassifier(trainingData, numClasses=2, 
categoricalFeaturesInfo={},
+                                         impurity='gini', maxDepth=5, 
maxBins=32)
+
+    # Evaluate model on test instances and compute test error
+    predictions = model.predict(testData.map(lambda x: x.features))
+    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
+    testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / 
float(testData.count())
+    print('Test Error = ' + str(testErr))
+    print('Learned classification tree model:')
+    print(model.toDebugString())
+
+    # Save and load model
+    model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
+    sameModel = DecisionTreeModel.load(sc, 
"target/tmp/myDecisionTreeClassificationModel")
+    # $example off$

http://git-wip-us.apache.org/repos/asf/spark/blob/6c1409e0/examples/src/main/python/mllib/decision_tree_regression_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/mllib/decision_tree_regression_example.py 
b/examples/src/main/python/mllib/decision_tree_regression_example.py
new file mode 100644
index 0000000..cf518ea
--- /dev/null
+++ b/examples/src/main/python/mllib/decision_tree_regression_example.py
@@ -0,0 +1,56 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Decision Tree Regression Example.
+"""
+from __future__ import print_function
+
+from pyspark import SparkContext
+# $example on$
+from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
+from pyspark.mllib.util import MLUtils
+# $example off$
+
+if __name__ == "__main__":
+
+    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")
+
+    # $example on$
+    # Load and parse the data file into an RDD of LabeledPoint.
+    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
+    # Split the data into training and test sets (30% held out for testing)
+    (trainingData, testData) = data.randomSplit([0.7, 0.3])
+
+    # Train a DecisionTree model.
+    #  Empty categoricalFeaturesInfo indicates all features are continuous.
+    model = DecisionTree.trainRegressor(trainingData, 
categoricalFeaturesInfo={},
+                                        impurity='variance', maxDepth=5, 
maxBins=32)
+
+    # Evaluate model on test instances and compute test error
+    predictions = model.predict(testData.map(lambda x: x.features))
+    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
+    testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() 
/\
+        float(testData.count())
+    print('Test Mean Squared Error = ' + str(testMSE))
+    print('Learned regression tree model:')
+    print(model.toDebugString())
+
+    # Save and load model
+    model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
+    sameModel = DecisionTreeModel.load(sc, 
"target/tmp/myDecisionTreeRegressionModel")
+    # $example off$

http://git-wip-us.apache.org/repos/asf/spark/blob/6c1409e0/examples/src/main/python/mllib/naive_bayes_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/mllib/naive_bayes_example.py 
b/examples/src/main/python/mllib/naive_bayes_example.py
index a2e7dac..f5e120c 100644
--- a/examples/src/main/python/mllib/naive_bayes_example.py
+++ b/examples/src/main/python/mllib/naive_bayes_example.py
@@ -20,6 +20,7 @@ NaiveBayes Example.
 """
 from __future__ import print_function
 
+from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
 from pyspark.mllib.linalg import Vectors

http://git-wip-us.apache.org/repos/asf/spark/blob/6c1409e0/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala
----------------------------------------------------------------------
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala
new file mode 100644
index 0000000..d427bba
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeClassificationExample.scala
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+// $example on$
+import org.apache.spark.mllib.tree.DecisionTree
+import org.apache.spark.mllib.tree.model.DecisionTreeModel
+import org.apache.spark.mllib.util.MLUtils
+// $example off$
+import org.apache.spark.{SparkConf, SparkContext}
+
+object DecisionTreeClassificationExample {
+
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("DecisionTreeClassificationExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    // Load and parse the data file.
+    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+    // Split the data into training and test sets (30% held out for testing)
+    val splits = data.randomSplit(Array(0.7, 0.3))
+    val (trainingData, testData) = (splits(0), splits(1))
+
+    // Train a DecisionTree model.
+    //  Empty categoricalFeaturesInfo indicates all features are continuous.
+    val numClasses = 2
+    val categoricalFeaturesInfo = Map[Int, Int]()
+    val impurity = "gini"
+    val maxDepth = 5
+    val maxBins = 32
+
+    val model = DecisionTree.trainClassifier(trainingData, numClasses, 
categoricalFeaturesInfo,
+      impurity, maxDepth, maxBins)
+
+    // Evaluate model on test instances and compute test error
+    val labelAndPreds = testData.map { point =>
+      val prediction = model.predict(point.features)
+      (point.label, prediction)
+    }
+    val testErr = labelAndPreds.filter(r => r._1 != r._2).count().toDouble / 
testData.count()
+    println("Test Error = " + testErr)
+    println("Learned classification tree model:\n" + model.toDebugString)
+
+    // Save and load model
+    model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
+    val sameModel = DecisionTreeModel.load(sc, 
"target/tmp/myDecisionTreeClassificationModel")
+    // $example off$
+  }
+}
+// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/6c1409e0/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala
----------------------------------------------------------------------
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala
new file mode 100644
index 0000000..fb05e7d
--- /dev/null
+++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRegressionExample.scala
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.mllib
+
+// $example on$
+import org.apache.spark.mllib.tree.DecisionTree
+import org.apache.spark.mllib.tree.model.DecisionTreeModel
+import org.apache.spark.mllib.util.MLUtils
+// $example off$
+import org.apache.spark.{SparkConf, SparkContext}
+
+object DecisionTreeRegressionExample {
+
+  def main(args: Array[String]): Unit = {
+    val conf = new SparkConf().setAppName("DecisionTreeRegressionExample")
+    val sc = new SparkContext(conf)
+
+    // $example on$
+    // Load and parse the data file.
+    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
+    // Split the data into training and test sets (30% held out for testing)
+    val splits = data.randomSplit(Array(0.7, 0.3))
+    val (trainingData, testData) = (splits(0), splits(1))
+
+    // Train a DecisionTree model.
+    //  Empty categoricalFeaturesInfo indicates all features are continuous.
+    val categoricalFeaturesInfo = Map[Int, Int]()
+    val impurity = "variance"
+    val maxDepth = 5
+    val maxBins = 32
+
+    val model = DecisionTree.trainRegressor(trainingData, 
categoricalFeaturesInfo, impurity,
+      maxDepth, maxBins)
+
+    // Evaluate model on test instances and compute test error
+    val labelsAndPredictions = testData.map { point =>
+      val prediction = model.predict(point.features)
+      (point.label, prediction)
+    }
+    val testMSE = labelsAndPredictions.map{ case (v, p) => math.pow(v - p, 2) 
}.mean()
+    println("Test Mean Squared Error = " + testMSE)
+    println("Learned regression tree model:\n" + model.toDebugString)
+
+    // Save and load model
+    model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
+    val sameModel = DecisionTreeModel.load(sc, 
"target/tmp/myDecisionTreeRegressionModel")
+    // $example off$
+  }
+}
+// scalastyle:on println


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11382] Replace example code in mllib-decision-tree.md using include_example

Reply via email to