incubator-systemml git commit: [SYSTEMML-578] [SYSTEMML-580] Adding ScalaTest for MLPipeline wrappers

niketanpansare Thu, 21 Apr 2016 14:59:54 -0700

Repository: incubator-systemml
Updated Branches:
  refs/heads/mlpipeline_test [created] c186e7513



[SYSTEMML-578] [SYSTEMML-580] Adding ScalaTest for MLPipeline wrappers

1. Added ScalaAutomatedTestBase to make sure that ScalaTest runs in a
Windows environment.
2. Added test("test logistic regression with mlpipeline")

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c186e751
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c186e751
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c186e751

Branch: refs/heads/mlpipeline_test
Commit: c186e7513c67a0d5d4234f907ff0ef17c187ddee
Parents: 8a6b385
Author: Niketan Pansare <npan...@us.ibm.com>
Authored: Thu Apr 21 14:55:35 2016 -0700
Committer: Niketan Pansare <npan...@us.ibm.com>
Committed: Thu Apr 21 14:55:35 2016 -0700

----------------------------------------------------------------------
 .../sysml/api/ml/LogisticRegressionSuite.scala  | 52 ++++++++++++++++++++
 .../sysml/api/ml/ScalaAutomatedTestBase.scala   | 36 ++++++++++++++
 2 files changed, 88 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c186e751/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
----------------------------------------------------------------------
diff --git 
a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala 
b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
index 8d5d095..ca2549a 100644
--- a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
+++ b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
@@ -24,9 +24,21 @@ import org.scalatest.Matchers
 import org.apache.spark.Logging
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.ml.Pipeline
+import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
+import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
+import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
+import org.apache.spark.mllib.linalg.Vector
+import scala.reflect.runtime.universe._
+
+case class LabeledDocument[T:TypeTag](id: Long, text: String, label: Double)
+case class Document[T:TypeTag](id: Long, text: String)
 
 class LogisticRegressionSuite extends FunSuite with WrapperSparkContext with 
Matchers with Logging {
 
+  // Note: This is required by every test to ensure that it runs successfully 
on windows laptop !!!
+  val loadConfig = ScalaAutomatedTestBase
+  
   test("run logistic regression with default") {
     //Make sure system ml home set when run wrapper
     val newsqlContext = new org.apache.spark.sql.SQLContext(sc);
@@ -48,4 +60,44 @@ class LogisticRegressionSuite extends FunSuite with 
WrapperSparkContext with Mat
     lrmodel.getIcpt shouldBe lr.getIcpt
     lrmodel.getMaxInnerIter shouldBe lr.getMaxInnerIter
   }
+  
+  test("test logistic regression with mlpipeline"){
+    //Make sure system ml home set when run wrapper
+    val newsqlContext = new org.apache.spark.sql.SQLContext(sc);
+    import newsqlContext.implicits._
+    val training = sc.parallelize(Seq(
+            LabeledDocument(0L, "a b c d e spark", 1.0),
+            LabeledDocument(1L, "b d", 2.0),
+            LabeledDocument(2L, "spark f g h", 1.0),
+            LabeledDocument(3L, "hadoop mapreduce", 2.0),
+            LabeledDocument(4L, "b spark who", 1.0),
+            LabeledDocument(5L, "g d a y", 2.0),
+            LabeledDocument(6L, "spark fly", 1.0),
+            LabeledDocument(7L, "was mapreduce", 2.0),
+            LabeledDocument(8L, "e spark program", 1.0),
+            LabeledDocument(9L, "a e c l", 2.0),
+            LabeledDocument(10L, "spark compile", 1.0),
+            LabeledDocument(11L, "hadoop software", 2.0)))
+
+    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
+    val hashingTF = new 
HashingTF().setNumFeatures(1000).setInputCol(tokenizer.getOutputCol).setOutputCol("features")
+    val lr = new LogisticRegression("log",sc)
+    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
+    val crossval = new 
CrossValidator().setEstimator(pipeline).setEvaluator(new 
BinaryClassificationEvaluator)
+    val paramGrid = new ParamGridBuilder().addGrid(hashingTF.numFeatures, 
Array(10, 100, 1000)).addGrid(lr.regParam, Array(0.1, 0.01)).build()
+    crossval.setEstimatorParamMaps(paramGrid)
+    crossval.setNumFolds(2)
+    val lrmodel = crossval.fit(training.toDF)
+    val test = sc.parallelize(Seq(
+      Document(12L, "spark i j k"),
+      Document(13L, "l m n"),
+      Document(14L, "mapreduce spark"),
+      Document(15L, "apache hadoop")))
+    
+    lrmodel.transform(test.toDF).show
+    
+    lr.getIcpt shouldBe 0
+//    lrmodel.getIcpt shouldBe lr.getIcpt
+//    lrmodel.getMaxInnerIter shouldBe lr.getMaxInnerIter
+  }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c186e751/src/test/scala/org/apache/sysml/api/ml/ScalaAutomatedTestBase.scala
----------------------------------------------------------------------
diff --git 
a/src/test/scala/org/apache/sysml/api/ml/ScalaAutomatedTestBase.scala 
b/src/test/scala/org/apache/sysml/api/ml/ScalaAutomatedTestBase.scala
new file mode 100644
index 0000000..652f05e
--- /dev/null
+++ b/src/test/scala/org/apache/sysml/api/ml/ScalaAutomatedTestBase.scala
@@ -0,0 +1,36 @@
+package org.apache.sysml.api.ml
+
+import java.io.File
+
+object ScalaAutomatedTestBase {
+  // *** HACK ALERT *** HACK ALERT *** HACK ALERT ***
+       // Hadoop 2.4.1 doesn't work on Windows unless winutils.exe is 
available 
+       // under $HADOOP_HOME/bin and hadoop.dll is available in the Java 
library
+       // path. The following static initializer sets up JVM variables so that 
+       // Hadoop can find these native binaries, assuming that any Hadoop code
+       // loads after this class and that the JVM's current working directory
+       // is the root of this project. 
+       val osname = System.getProperty("os.name").toLowerCase();
+       if (osname.contains("win")) {
+               System.err.printf("AutomatedTestBase has detected a Windows OS 
and is overriding\n"
+                               + "hadoop.home.dir and java.library.path.\n");
+               val cwd = System.getProperty("user.dir");
+
+               System.setProperty("hadoop.home.dir", cwd + File.separator
+                               + "\\src\\test\\config\\hadoop_bin_windows");
+               System.setProperty("java.library.path", cwd + File.separator
+                               + 
"\\src\\test\\config\\hadoop_bin_windows\\bin");
+               
+
+           // Need to muck around with the classloader to get it to use the new
+               // value of java.library.path.
+               val sysPathsField = 
classOf[ClassLoader].getDeclaredField("sys_paths");
+               sysPathsField.setAccessible(true);
+         sysPathsField.set(null, null);
+               // IBM Java throws an exception here, so don't print the stack 
trace.
+       }
+
+       // This ensures that MLPipeline wrappers get appropriate paths to the 
scripts
+       ScriptsUtils.setSystemmlHome(System.getProperty("user.dir") + 
File.separator + "scripts")
+       // *** END HACK ***
+}
\ No newline at end of file

incubator-systemml git commit: [SYSTEMML-578] [SYSTEMML-580] Adding ScalaTest for MLPipeline wrappers

Reply via email to