Repository: incubator-systemml Updated Branches: refs/heads/mlpipeline_test [created] c186e7513
[SYSTEMML-578] [SYSTEMML-580] Adding ScalaTest for MLPipeline wrappers 1. Adding ScalaAutomatedTestBase to make sure that ScalaTest run on Windows environment. 2. Added test("test logistic regression with mlpipeline") Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c186e751 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c186e751 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c186e751 Branch: refs/heads/mlpipeline_test Commit: c186e7513c67a0d5d4234f907ff0ef17c187ddee Parents: 8a6b385 Author: Niketan Pansare <npan...@us.ibm.com> Authored: Thu Apr 21 14:55:35 2016 -0700 Committer: Niketan Pansare <npan...@us.ibm.com> Committed: Thu Apr 21 14:55:35 2016 -0700 ---------------------------------------------------------------------- .../sysml/api/ml/LogisticRegressionSuite.scala | 52 ++++++++++++++++++++ .../sysml/api/ml/ScalaAutomatedTestBase.scala | 36 ++++++++++++++ 2 files changed, 88 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c186e751/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala ---------------------------------------------------------------------- diff --git a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala index 8d5d095..ca2549a 100644 --- a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala +++ b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala @@ -24,9 +24,21 @@ import org.scalatest.Matchers import org.apache.spark.Logging import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator +import org.apache.spark.ml.feature.{HashingTF, 
Tokenizer} +import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} +import org.apache.spark.mllib.linalg.Vector +import scala.reflect.runtime.universe._ + +case class LabeledDocument[T:TypeTag](id: Long, text: String, label: Double) +case class Document[T:TypeTag](id: Long, text: String) class LogisticRegressionSuite extends FunSuite with WrapperSparkContext with Matchers with Logging { + // Note: This is required by every test to ensure that it runs successfully on windows laptop !!! + val loadConfig = ScalaAutomatedTestBase + test("run logistic regression with default") { //Make sure system ml home set when run wrapper val newsqlContext = new org.apache.spark.sql.SQLContext(sc); @@ -48,4 +60,44 @@ class LogisticRegressionSuite extends FunSuite with WrapperSparkContext with Mat lrmodel.getIcpt shouldBe lr.getIcpt lrmodel.getMaxInnerIter shouldBe lr.getMaxInnerIter } + + test("test logistic regression with mlpipeline"){ + //Make sure system ml home set when run wrapper + val newsqlContext = new org.apache.spark.sql.SQLContext(sc); + import newsqlContext.implicits._ + val training = sc.parallelize(Seq( + LabeledDocument(0L, "a b c d e spark", 1.0), + LabeledDocument(1L, "b d", 2.0), + LabeledDocument(2L, "spark f g h", 1.0), + LabeledDocument(3L, "hadoop mapreduce", 2.0), + LabeledDocument(4L, "b spark who", 1.0), + LabeledDocument(5L, "g d a y", 2.0), + LabeledDocument(6L, "spark fly", 1.0), + LabeledDocument(7L, "was mapreduce", 2.0), + LabeledDocument(8L, "e spark program", 1.0), + LabeledDocument(9L, "a e c l", 2.0), + LabeledDocument(10L, "spark compile", 1.0), + LabeledDocument(11L, "hadoop software", 2.0))) + + val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words") + val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol(tokenizer.getOutputCol).setOutputCol("features") + val lr = new LogisticRegression("log",sc) + val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr)) + val crossval = new 
CrossValidator().setEstimator(pipeline).setEvaluator(new BinaryClassificationEvaluator) + val paramGrid = new ParamGridBuilder().addGrid(hashingTF.numFeatures, Array(10, 100, 1000)).addGrid(lr.regParam, Array(0.1, 0.01)).build() + crossval.setEstimatorParamMaps(paramGrid) + crossval.setNumFolds(2) + val lrmodel = crossval.fit(training.toDF) + val test = sc.parallelize(Seq( + Document(12L, "spark i j k"), + Document(13L, "l m n"), + Document(14L, "mapreduce spark"), + Document(15L, "apache hadoop"))) + + lrmodel.transform(test.toDF).show + + lr.getIcpt shouldBe 0 +// lrmodel.getIcpt shouldBe lr.getIcpt +// lrmodel.getMaxInnerIter shouldBe lr.getMaxInnerIter + } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c186e751/src/test/scala/org/apache/sysml/api/ml/ScalaAutomatedTestBase.scala ---------------------------------------------------------------------- diff --git a/src/test/scala/org/apache/sysml/api/ml/ScalaAutomatedTestBase.scala b/src/test/scala/org/apache/sysml/api/ml/ScalaAutomatedTestBase.scala new file mode 100644 index 0000000..652f05e --- /dev/null +++ b/src/test/scala/org/apache/sysml/api/ml/ScalaAutomatedTestBase.scala @@ -0,0 +1,36 @@ +package org.apache.sysml.api.ml + +import java.io.File + +object ScalaAutomatedTestBase { + // *** HACK ALERT *** HACK ALERT *** HACK ALERT *** + // Hadoop 2.4.1 doesn't work on Windows unless winutils.exe is available + // under $HADOOP_HOME/bin and hadoop.dll is available in the Java library + // path. The following static initializer sets up JVM variables so that + // Hadoop can find these native binaries, assuming that any Hadoop code + // loads after this class and that the JVM's current working directory + // is the root of this project. 
+  // Lower-cased OS name reported by the JVM; only used to decide whether the
+  // Windows-specific Hadoop setup below has to run.
+  val osname = System.getProperty("os.name").toLowerCase();
+  // Test for the "windows" prefix rather than contains("win"): the substring
+  // check would also accept unrelated names such as "darwin".
+  if (osname.startsWith("windows")) {
+    // Plain print: the message is fixed text, not a format string.
+    System.err.print("AutomatedTestBase has detected a Windows OS and is overriding\n" +
+      "hadoop.home.dir and java.library.path.\n");
+    val cwd = System.getProperty("user.dir");
+
+    // File.separator already supplies the separator after cwd, so the relative
+    // part must not begin with one of its own (that yields a doubled separator).
+    val hadoopHome = cwd + File.separator + "src\\test\\config\\hadoop_bin_windows";
+    System.setProperty("hadoop.home.dir", hadoopHome);
+    System.setProperty("java.library.path", hadoopHome + "\\bin");
+
+    // Need to muck around with the classloader to get it to use the new
+    // value of java.library.path.
+    try {
+      val sysPathsField = classOf[ClassLoader].getDeclaredField("sys_paths");
+      sysPathsField.setAccessible(true);
+      sysPathsField.set(null, null);
+    } catch {
+      // IBM Java throws an exception here, so don't print the stack trace.
+      // Swallowing it keeps this object's initializer (and therefore every
+      // suite that references it) from failing on JVMs without this field.
+      case scala.util.control.NonFatal(_) => ()
+    }
+  }
+
+  // This ensures that MLPipeline wrappers get appropriate paths to the scripts
+  ScriptsUtils.setSystemmlHome(System.getProperty("user.dir") + File.separator + "scripts")
+  // *** END HACK ***
+}
\ No newline at end of file