Repository: spark
Updated Branches:
  refs/heads/master fb0a8a8dd -> dcdda1978


[SPARK-14300][DOCS][MLLIB] Scala MLlib examples code merge and clean up

## What changes were proposed in this pull request?

https://issues.apache.org/jira/browse/SPARK-14300

Duplicate code was found in scala/examples/mllib; the following files are all deleted in this PR:

- DenseGaussianMixture.scala
- StreamingLinearRegression.scala

## delete reasons:

#### delete: mllib/DenseGaussianMixture.scala

- duplicate of mllib/GaussianMixtureExample

#### delete: mllib/StreamingLinearRegression.scala

- duplicate of mllib/StreamingLinearRegressionExample

When merging and cleaning up this code, be sure not to disturb the existing
`$example on$` / `$example off$` blocks.

## How was this patch tested?

Tested manually with `SKIP_API=1 jekyll` to make sure the documentation still builds correctly.

Author: Xin Ren <iamsh...@126.com>

Closes #12195 from keypointt/SPARK-14300.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dcdda197
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dcdda197
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dcdda197

Branch: refs/heads/master
Commit: dcdda19785a272969fb1e3ec18382403aaad6c91
Parents: fb0a8a8
Author: Xin Ren <iamsh...@126.com>
Authored: Wed Oct 26 13:33:23 2016 -0700
Committer: Joseph K. Bradley <jos...@databricks.com>
Committed: Wed Oct 26 13:33:23 2016 -0700

----------------------------------------------------------------------
 .../examples/mllib/DenseGaussianMixture.scala   | 75 --------------------
 .../mllib/StreamingLinearRegression.scala       | 73 -------------------
 .../StreamingLinearRegressionExample.scala      | 19 +++++
 3 files changed, 19 insertions(+), 148 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/dcdda197/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
----------------------------------------------------------------------
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
deleted file mode 100644
index 90b817b..0000000
--- 
a/examples/src/main/scala/org/apache/spark/examples/mllib/DenseGaussianMixture.scala
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// scalastyle:off println
-package org.apache.spark.examples.mllib
-
-import org.apache.spark.{SparkConf, SparkContext}
-import org.apache.spark.mllib.clustering.GaussianMixture
-import org.apache.spark.mllib.linalg.Vectors
-
-/**
- * An example Gaussian Mixture Model EM app. Run with
- * {{{
- * ./bin/run-example mllib.DenseGaussianMixture <input> <k> <convergenceTol>
- * }}}
- * If you use it as a template to create your own app, please use 
`spark-submit` to submit your app.
- */
-object DenseGaussianMixture {
-  def main(args: Array[String]): Unit = {
-    if (args.length < 3) {
-      println("usage: DenseGmmEM <input file> <k> <convergenceTol> 
[maxIterations]")
-    } else {
-      val maxIterations = if (args.length > 3) args(3).toInt else 100
-      run(args(0), args(1).toInt, args(2).toDouble, maxIterations)
-    }
-  }
-
-  private def run(inputFile: String, k: Int, convergenceTol: Double, 
maxIterations: Int) {
-    val conf = new SparkConf().setAppName("Gaussian Mixture Model EM example")
-    val ctx = new SparkContext(conf)
-
-    val data = ctx.textFile(inputFile).map { line =>
-      Vectors.dense(line.trim.split(' ').map(_.toDouble))
-    }.cache()
-
-    val clusters = new GaussianMixture()
-      .setK(k)
-      .setConvergenceTol(convergenceTol)
-      .setMaxIterations(maxIterations)
-      .run(data)
-
-    for (i <- 0 until clusters.k) {
-      println("weight=%f\nmu=%s\nsigma=\n%s\n" format
-        (clusters.weights(i), clusters.gaussians(i).mu, 
clusters.gaussians(i).sigma))
-    }
-
-    println("The membership value of each vector to all mixture components 
(first <= 100):")
-    val membership = clusters.predictSoft(data)
-    membership.take(100).foreach { x =>
-      print(" " + x.mkString(","))
-    }
-    println()
-    println("Cluster labels (first <= 100):")
-    val clusterLabels = clusters.predict(data)
-    clusterLabels.take(100).foreach { x =>
-      print(" " + x)
-    }
-    println()
-  }
-}
-// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/dcdda197/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala
----------------------------------------------------------------------
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala
deleted file mode 100644
index e559296..0000000
--- 
a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegression.scala
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// scalastyle:off println
-package org.apache.spark.examples.mllib
-
-import org.apache.spark.SparkConf
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.{LabeledPoint, 
StreamingLinearRegressionWithSGD}
-import org.apache.spark.streaming.{Seconds, StreamingContext}
-
-/**
- * Train a linear regression model on one stream of data and make predictions
- * on another stream, where the data streams arrive as text files
- * into two different directories.
- *
- * The rows of the text files must be labeled data points in the form
- * `(y,[x1,x2,x3,...,xn])`
- * Where n is the number of features. n must be the same for train and test.
- *
- * Usage: StreamingLinearRegression <trainingDir> <testDir> <batchDuration> 
<numFeatures>
- *
- * To run on your local machine using the two directories `trainingDir` and 
`testDir`,
- * with updates every 5 seconds, and 2 features per data point, call:
- *    $ bin/run-example mllib.StreamingLinearRegression trainingDir testDir 5 2
- *
- * As you add text files to `trainingDir` the model will continuously update.
- * Anytime you add text files to `testDir`, you'll see predictions from the 
current model.
- *
- */
-object StreamingLinearRegression {
-
-  def main(args: Array[String]) {
-
-    if (args.length != 4) {
-      System.err.println(
-        "Usage: StreamingLinearRegression <trainingDir> <testDir> 
<batchDuration> <numFeatures>")
-      System.exit(1)
-    }
-
-    val conf = new 
SparkConf().setMaster("local").setAppName("StreamingLinearRegression")
-    val ssc = new StreamingContext(conf, Seconds(args(2).toLong))
-
-    val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse)
-    val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse)
-
-    val model = new StreamingLinearRegressionWithSGD()
-      .setInitialWeights(Vectors.zeros(args(3).toInt))
-
-    model.trainOn(trainingData)
-    model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print()
-
-    ssc.start()
-    ssc.awaitTermination()
-
-  }
-
-}
-// scalastyle:on println

http://git-wip-us.apache.org/repos/asf/spark/blob/dcdda197/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala
----------------------------------------------------------------------
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala
index 0a1cd2d..2ba1a62 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala
@@ -26,6 +26,25 @@ import 
org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD
 // $example off$
 import org.apache.spark.streaming._
 
+/**
+ * Train a linear regression model on one stream of data and make predictions
+ * on another stream, where the data streams arrive as text files
+ * into two different directories.
+ *
+ * The rows of the text files must be labeled data points in the form
+ * `(y,[x1,x2,x3,...,xn])`
+ * Where n is the number of features. n must be the same for train and test.
+ *
+ * Usage: StreamingLinearRegressionExample <trainingDir> <testDir>
+ *
+ * To run on your local machine using the two directories `trainingDir` and 
`testDir`,
+ * with updates every 5 seconds, and 2 features per data point, call:
+ *    $ bin/run-example mllib.StreamingLinearRegressionExample trainingDir 
testDir
+ *
+ * As you add text files to `trainingDir` the model will continuously update.
+ * Anytime you add text files to `testDir`, you'll see predictions from the 
current model.
+ *
+ */
 object StreamingLinearRegressionExample {
 
   def main(args: Array[String]): Unit = {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to