incubator-systemml git commit: [SYSTEMML-578] [SYSTEMML-580] Adding ScalaTest for MLPipeline wrappers

2016-04-21 Thread niketanpansare
Repository: incubator-systemml
Updated Branches:
  refs/heads/mlpipeline_test [created] c186e7513


[SYSTEMML-578] [SYSTEMML-580] Adding ScalaTest for MLPipeline wrappers

1. Added ScalaAutomatedTestBase to ensure that the ScalaTests also run in a
Windows environment (a hedged sketch of its likely shape follows below).
2. Added test("test logistic regression with mlpipeline")
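
The archive does not include the body of ScalaAutomatedTestBase.scala, so the
following is a hedged sketch only: a plausible shape is a Scala object whose
one-time initializer prepares the environment, and which each suite forces to
load via "val loadConfig = ScalaAutomatedTestBase" (see the diff below). The
winutils path is a hypothetical example, not the project's value.

    import java.io.File

    object ScalaAutomatedTestBase {
      // On Windows, Spark/Hadoop require hadoop.home.dir to point at a
      // winutils installation; set it once if the JVM does not have it yet.
      // The path is illustrative (assumption), not taken from the commit.
      if (System.getProperty("os.name").toLowerCase.contains("windows")
          && System.getProperty("hadoop.home.dir") == null) {
        System.setProperty("hadoop.home.dir", new File("C:\\hadoop").getAbsolutePath)
      }
    }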

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c186e751
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c186e751
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c186e751

Branch: refs/heads/mlpipeline_test
Commit: c186e7513c67a0d5d4234f907ff0ef17c187ddee
Parents: 8a6b385
Author: Niketan Pansare 
Authored: Thu Apr 21 14:55:35 2016 -0700
Committer: Niketan Pansare 
Committed: Thu Apr 21 14:55:35 2016 -0700

--
 .../sysml/api/ml/LogisticRegressionSuite.scala  | 52 
 .../sysml/api/ml/ScalaAutomatedTestBase.scala   | 36 ++
 2 files changed, 88 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c186e751/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
--
diff --git a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
index 8d5d095..ca2549a 100644
--- a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
+++ b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
@@ -24,9 +24,21 @@ import org.scalatest.Matchers
 import org.apache.spark.Logging
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.ml.Pipeline
+import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
+import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
+import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
+import org.apache.spark.mllib.linalg.Vector
+import scala.reflect.runtime.universe._
+
+case class LabeledDocument[T:TypeTag](id: Long, text: String, label: Double)
+case class Document[T:TypeTag](id: Long, text: String)
 
class LogisticRegressionSuite extends FunSuite with WrapperSparkContext with Matchers with Logging {
 
+  // Note: This is required by every test to ensure that it runs successfully on Windows.
+  val loadConfig = ScalaAutomatedTestBase
+  
   test("run logistic regression with default") {
//Make sure SYSTEMML_HOME is set when running the wrapper
 val newsqlContext = new org.apache.spark.sql.SQLContext(sc);
@@ -48,4 +60,44 @@ class LogisticRegressionSuite extends FunSuite with WrapperSparkContext with Mat
 lrmodel.getIcpt shouldBe lr.getIcpt
 lrmodel.getMaxInnerIter shouldBe lr.getMaxInnerIter
   }
+  
+  test("test logistic regression with mlpipeline"){
+//Make sure SYSTEMML_HOME is set when running the wrapper
+val newsqlContext = new org.apache.spark.sql.SQLContext(sc);
+import newsqlContext.implicits._
+val training = sc.parallelize(Seq(
+LabeledDocument(0L, "a b c d e spark", 1.0),
+LabeledDocument(1L, "b d", 2.0),
+LabeledDocument(2L, "spark f g h", 1.0),
+LabeledDocument(3L, "hadoop mapreduce", 2.0),
+LabeledDocument(4L, "b spark who", 1.0),
+LabeledDocument(5L, "g d a y", 2.0),
+LabeledDocument(6L, "spark fly", 1.0),
+LabeledDocument(7L, "was mapreduce", 2.0),
+LabeledDocument(8L, "e spark program", 1.0),
+LabeledDocument(9L, "a e c l", 2.0),
+LabeledDocument(10L, "spark compile", 1.0),
+LabeledDocument(11L, "hadoop software", 2.0)))
+
+val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
+val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol(tokenizer.getOutputCol).setOutputCol("features")
+val lr = new LogisticRegression("log",sc)
+val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
+val crossval = new CrossValidator().setEstimator(pipeline).setEvaluator(new BinaryClassificationEvaluator)
+val paramGrid = new ParamGridBuilder().addGrid(hashingTF.numFeatures, Array(10, 100, 1000)).addGrid(lr.regParam, Array(0.1, 0.01)).build()
+crossval.setEstimatorParamMaps(paramGrid)
+crossval.setNumFolds(2)
+val lrmodel = crossval.fit(training.toDF)
+val test = sc.parallelize(Seq(
+  Document(12L, "spark i j k"),
+  Document(13L, "l m n"),
+  Document(14L, "mapreduce spark"),
+  Document(15L, "apache hadoop")))
+
+lrmodel.transform(test.toDF).show
+
+lr.getIcpt shouldBe 0
+//lrmodel.getIcpt shouldBe lr.getIcpt
+//

incubator-systemml git commit: [SYSTEMML-578] Refactor ml api package to set Scala version as default

2016-04-21 Thread deron
Repository: incubator-systemml
Updated Branches:
  refs/heads/master bc2231982 -> 8a6b3856c


[SYSTEMML-578] Refactor ml api package to set Scala version as default

Closes #116.
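
After this refactor the Scala wrappers live in org.apache.sysml.api.ml and the
Java originals move to org.apache.sysml.api.javaml. A minimal hedged usage
sketch of the now-default Scala API, assuming a SparkContext named sc and a
training DataFrame trainingDF (both assumptions, as in the suite above):

    import org.apache.sysml.api.ml.LogisticRegression

    val lr = new LogisticRegression("log", sc) // uid and SparkContext, per the suite
    val model = lr.fit(trainingDF)             // standard ML Pipeline estimator contract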


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/8a6b3856
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/8a6b3856
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/8a6b3856

Branch: refs/heads/master
Commit: 8a6b3856ccbb65ea974e51636a60c078805c258f
Parents: bc22319
Author: Tommy YU 
Authored: Thu Apr 21 14:31:30 2016 -0700
Committer: Deron Eriksson 
Committed: Thu Apr 21 14:31:30 2016 -0700

--
 .../sysml/api/javaml/LogisticRegression.java| 473 ++
 .../api/javaml/LogisticRegressionModel.java | 179 +++
 .../apache/sysml/api/ml/LogisticRegression.java | 474 ---
 .../sysml/api/ml/LogisticRegressionModel.java   | 179 ---
 .../sysml/api/ml/LogisticRegression.scala   | 204 
 .../org/apache/sysml/api/ml/ScriptsUtils.scala  |  75 +++
 .../sysml/api/ml/scala/LogisticRegression.scala | 169 ---
 .../sysml/api/ml/scala/ScriptsUtils.scala   |  62 ---
 .../sysml/api/ml/LogisticRegressionSuite.scala  |  51 ++
 .../sysml/api/ml/WrapperSparkContext.scala  |  54 +++
 .../api/ml/scala/LogisticRegressionSuite.scala  |  50 --
 .../api/ml/scala/WrapperSparkContext.scala  |  55 ---
 12 files changed, 1036 insertions(+), 989 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8a6b3856/src/main/java/org/apache/sysml/api/javaml/LogisticRegression.java
--
diff --git a/src/main/java/org/apache/sysml/api/javaml/LogisticRegression.java b/src/main/java/org/apache/sysml/api/javaml/LogisticRegression.java
new file mode 100644
index 000..dbcc118
--- /dev/null
+++ b/src/main/java/org/apache/sysml/api/javaml/LogisticRegression.java
@@ -0,0 +1,473 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.api.javaml;
+
+import java.io.File;
+import java.util.HashMap;
+
+import org.apache.spark.SparkContext;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.classification.LogisticRegressionParams;
+import org.apache.spark.ml.classification.ProbabilisticClassifier;
+import org.apache.spark.ml.param.BooleanParam;
+import org.apache.spark.ml.param.DoubleParam;
+import org.apache.spark.ml.param.IntParam;
+import org.apache.spark.ml.param.StringArrayParam;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.SQLContext;
+import org.apache.sysml.api.MLContext;
+import org.apache.sysml.api.MLOutput;
+import org.apache.sysml.api.javaml.LogisticRegressionModel;
+import org.apache.sysml.api.ml.functions.ConvertSingleColumnToString;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt;
+import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
+
+/**
+ * 
+ * This class shows how SystemML can be integrated into an MLPipeline. Note that it
+ * has not been optimized for performance and is implemented as a proof of concept.
+ * An optimized pipeline can be constructed by using DML's 'parfor' construct.
+ * 
+ * TODO: 
+ * - Please note that this class expects 1-based labels. To run the example below,
+ *   set the environment variable 'SYSTEMML_HOME', create a folder 'algorithms',
+ *   and place at least two scripts in that folder: 'MultiLogReg.dml' and 'GLM-predict.dml'.
+ * - It is not yet optimized for performance. 
+ * - It also needs to be extended to surface all the parameters of MultiLogReg.dml.
+ * 
+ * Example usage:
+ * 
+ * // Code to 
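
(The usage example in this javadoc is cut off in the archive. As a hedged
stand-in, not the elided original, the wiring mirrors the Scala suite earlier
in this digest; the SYSTEMML_HOME location and the sc/trainingDF names are
assumptions, following the setup the javadoc itself calls out:)

    // shell, before launching (illustrative path):
    //   export SYSTEMML_HOME=/opt/systemml   # must contain algorithms/MultiLogReg.dml, GLM-predict.dml
    import org.apache.spark.ml.Pipeline
    import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
    import org.apache.sysml.api.ml.LogisticRegression

    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("features")
    val lr = new LogisticRegression("log", sc)  // expects 1-based labels
    val model = new Pipeline().setStages(Array(tokenizer, hashingTF, lr)).fit(trainingDF)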

[2/2] incubator-systemml git commit: [SYSTEMML-641] Performance core block matrix mult (cache/task config)

2016-04-21 Thread mboehm7
[SYSTEMML-641] Performance core block matrix mult (cache/task config)

This patch makes two configuration changes to our core block matrix
multiplications (mostly applicable to the dense case):
(1) Cache-conscious selection of the dense-dense "skinny rhs matrix" case
(sized for a common L2 cache), and
(2) Balanced task size configuration (better cache blocking, the number of
tasks a multiple of the degree of parallelism k, and a balanced distribution
of iterations over tasks).
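
For intuition, a hedged Scala transcription of the two heuristics (names mirror
the Java diff below; this is an illustration, not the shipped implementation):

    // Balanced task sizes: every task gets len/k iterations and the first
    // len%k tasks get one extra, so sizes differ by at most one and no
    // empty tasks are created.
    def balancedBlockSizes(len: Int, k: Int): Seq[Int] = {
      val base = len / k
      val rest = len % k
      (0 until k).map(i => base + (if (i < rest) 1 else 0)).filter(_ > 0)
    }

    // Cache-conscious guard: a dense rhs of rlen x clen doubles (8 bytes each)
    // must fit a common 256KB L2 cache before the skinny-rhs case is selected.
    def rhsFitsL2(rlen: Int, clen: Int): Boolean =
      8L * rlen * clen < 256 * 1024

    // balancedBlockSizes(10, 4) == Vector(3, 3, 2, 2)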

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/bc223198
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/bc223198
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/bc223198

Branch: refs/heads/master
Commit: bc22319820f8e8f00770bad054475cc797781bed
Parents: 72e2166
Author: Matthias Boehm 
Authored: Thu Apr 21 00:37:42 2016 -0700
Committer: Matthias Boehm 
Committed: Thu Apr 21 10:34:05 2016 -0700

--
 .../runtime/matrix/data/LibMatrixMult.java  | 29 
 .../sysml/runtime/util/UtilFunctions.java   |  6 
 2 files changed, 30 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bc223198/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
--
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 488e95a..3142b2c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -184,10 +184,10 @@ public class LibMatrixMult
 		try {
 			ExecutorService pool = Executors.newFixedThreadPool( k );
 			ArrayList<MatrixMultTask> tasks = new ArrayList<MatrixMultTask>();
-			int nk = pm2 ? k : Math.max(Math.min(8*k,ru/8), k);
-			int blklen = (int)(Math.ceil((double)ru/nk));
-			for( int i=0; i<nk & i*blklen<ru; i++ )
-				tasks.add(new MatrixMultTask(m1, m2, ret, tRight, pm2, i*blklen, Math.min((i+1)*blklen, ru)));
+			int nk = pm2 ? k : UtilFunctions.roundToNext(Math.min(8*k, ru/8), k);
+			ArrayList<Integer> blklens = getBalancedBlockSizes(ru, nk);
+			for( int i=0, rl=0; i<blklens.size(); rl+=blklens.get(i), i++ )
+				tasks.add(new MatrixMultTask(m1, m2, ret, tRight, pm2, rl, rl+blklens.get(i)));
 			List<Future<Object>> taskret = pool.invokeAll(tasks);
 			pool.shutdown();
@@ -3945,7 +3945,8 @@ public class LibMatrixMult
 	{
 		//transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output 
 		return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse 
-			&& m1.rlen>m2.clen && m2.rlen > 64 && m2.clen > 1 && m2.clen < 64);
+			&& m1.rlen > m2.clen && m2.rlen > 64 && m2.clen > 1 && m2.clen < 64
+			&& 8*m2.rlen*m2.clen < 256*1024 ); //rhs fits in L2 cache
 	}

/**
@@ -4079,6 +4080,24 @@ public class LibMatrixMult

}

+	/**
+	 * Computes balanced block sizes for a total number of iterations len,
+	 * given a targeted number of tasks k.
+	 * 
+	 * @param len
+	 * @param k
+	 * @return
+	 */
+	private static ArrayList<Integer> getBalancedBlockSizes(int len, int k) {
+		ArrayList<Integer> ret = new ArrayList<Integer>();
+		int base = len / k;
+		int rest = len % k;
+		for( int i=0; i<k; i++ ) {
+			int val = base + ((i < rest) ? 1 : 0);
+			if( val > 0 )
+				ret.add(val);
+		}
+		return ret; 
+	}
+	
+   
/
// Task Implementations for Multi-Threaded Operations  //
/

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bc223198/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
--
diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
index 21078c4..241b178 100644
--- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
@@ -254,6 +254,12 @@ public class UtilFunctions
else
return ((Integer)obj).intValue();
}
+   
+	public static int roundToNext(int val, int factor) {
+		//round up to next non-zero multiple of factor
+		int pval = Math.max(val, factor);
+		return ((pval + factor-1) / factor) * factor;
+	}
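
A quick Scala transcription to check roundToNext's semantics (illustrative
only; the values below are worked examples, not from the commit):

    def roundToNext(value: Int, factor: Int): Int = {
      val pval = math.max(value, factor)        // "non-zero": never below factor
      ((pval + factor - 1) / factor) * factor   // classic ceiling-division round-up
    }
    // roundToNext(13, 4) == 16; roundToNext(8, 4) == 8; roundToNext(0, 4) == 4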

[1/2] incubator-systemml git commit: [SYSTEMML-583] Simplify jmlc transform meta read util (w/o spec), tests

2016-04-21 Thread mboehm7
Repository: incubator-systemml
Updated Branches:
  refs/heads/master 08a3f1c7e -> bc2231982


[SYSTEMML-583] Simplify jmlc transform meta read util (w/o spec), tests
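
From the caller's side, the simplification means transform meta data can be
read without a transform spec. A hedged Scala sketch against the new overload
in the diff below (the HDFS path is illustrative, not from the commit):

    import org.apache.sysml.api.jmlc.Connection

    val conn = new Connection()
    // new overload: no spec, default ',' separator for 'column.names'
    val meta = conn.readTransformMetaDataFromFile("hdfs:/user/ml/tfmtd_example")
    conn.close()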

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/72e21663
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/72e21663
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/72e21663

Branch: refs/heads/master
Commit: 72e216637840eb17090725064c3208ade4ab6160
Parents: 08a3f1c
Author: Matthias Boehm 
Authored: Wed Apr 20 18:57:43 2016 -0700
Committer: Matthias Boehm 
Committed: Wed Apr 20 18:58:14 2016 -0700

--
 .../org/apache/sysml/api/jmlc/Connection.java   | 145 +--
 .../functions/jmlc/FrameReadMetaTest.java   |  18 ++-
 .../functions/jmlc/tfmtd_example/coltypes.csv   |   2 +-
 3 files changed, 119 insertions(+), 46 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/72e21663/src/main/java/org/apache/sysml/api/jmlc/Connection.java
--
diff --git a/src/main/java/org/apache/sysml/api/jmlc/Connection.java b/src/main/java/org/apache/sysml/api/jmlc/Connection.java
index 58aa0fc..984268a 100644
--- a/src/main/java/org/apache/sysml/api/jmlc/Connection.java
+++ b/src/main/java/org/apache/sysml/api/jmlc/Connection.java
@@ -406,6 +406,24 @@ public class Connection
return ret;
}

+	
+	////////////////////////////////////////////////
+	// Read transform meta data
+	////////////////////////////////////////////////
+	
+	/**
+	 * Reads transform meta data from an HDFS file path and converts it into an in-memory
+	 * FrameBlock object. The column names in the meta data file 'column.names' are processed
+	 * with the default separator ','.
+	 * 
+	 * @param metapath  hdfs file path to meta data directory
+	 * @return FrameBlock object representing transform metadata
+	 * @throws IOException
+	 */
+	public FrameBlock readTransformMetaDataFromFile(String metapath) throws IOException {
+		return readTransformMetaDataFromFile(null, metapath, TfUtils.TXMTD_SEP);
+	}
+   
 	/**
 	 * Reads transform meta data from an HDFS file path and converts it into an in-memory
 	 * FrameBlock object. The column names in the meta data file 'column.names' are processed
 	 * with the default separator ','.
@@ -433,6 +451,8 @@ public class Connection
 	public FrameBlock readTransformMetaDataFromFile(String spec, String metapath, String colDelim) 
 		throws IOException 
 	{
+		//NOTE: this implementation assumes column alignment of colnames and coltypes
+		
//read column types (for sanity check column names)
 		String coltypesStr = MapReduceTool.readStringFromHDFSFile(metapath+File.separator+TfUtils.TXMTD_COLTYPES);
 		List<String> coltypes = Arrays.asList(IOUtilFunctions.split(coltypesStr.trim(), TfUtils.TXMTD_SEP));
@@ -460,9 +480,25 @@ public class Connection
 				LOG.warn("Recode map for column '"+colName+"' does not exist.");
}
}
+
+   //get list of recode ids
+		List<Integer> recodeIDs = parseRecodeColIDs(spec, coltypes);

//create frame block from in-memory strings
-		return convertToTransformMetaDataFrame(spec, rows, colnames, meta);
+		return convertToTransformMetaDataFrame(rows, recodeIDs, colnames, meta);
+   }
+   
+	/**
+	 * Reads transform meta data from the class path and converts it into an in-memory
+	 * FrameBlock object. The column names in the meta data file 'column.names' are processed
+	 * with the default separator ','.
+	 * 
+	 * @param metapath  resource path to meta data directory
+	 * @return FrameBlock object representing transform metadata
+	 * @throws IOException
+	 */
+	public FrameBlock readTransformMetaDataFromPath(String metapath) throws IOException {
+		return readTransformMetaDataFromPath(null, metapath, TfUtils.TXMTD_SEP);
+	}

/**
@@ -492,6 +528,8 @@ public class Connection
 	public FrameBlock readTransformMetaDataFromPath(String spec, String metapath, String colDelim) 
 		throws IOException 
 	{
+		//NOTE: this implementation assumes column alignment of colnames and coltypes
+		
//read column types (for sanity check column names)
String coltypesStr =