incubator-systemml git commit: [SYSTEMML-578] [SYSTEMML-580] Adding ScalaTest for MLPipeline wrappers
Repository: incubator-systemml
Updated Branches:
  refs/heads/mlpipeline_test [created] c186e7513

[SYSTEMML-578] [SYSTEMML-580] Adding ScalaTest for MLPipeline wrappers

1. Added ScalaAutomatedTestBase to make sure that the ScalaTests run on Windows environments.
2. Added test("test logistic regression with mlpipeline").

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/c186e751
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/c186e751
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/c186e751

Branch: refs/heads/mlpipeline_test
Commit: c186e7513c67a0d5d4234f907ff0ef17c187ddee
Parents: 8a6b385
Author: Niketan Pansare
Authored: Thu Apr 21 14:55:35 2016 -0700
Committer: Niketan Pansare
Committed: Thu Apr 21 14:55:35 2016 -0700

----------------------------------------------------------------------
 .../sysml/api/ml/LogisticRegressionSuite.scala | 52
 .../sysml/api/ml/ScalaAutomatedTestBase.scala  | 36 ++
 2 files changed, 88 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/c186e751/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
----------------------------------------------------------------------
diff --git a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
index 8d5d095..ca2549a 100644
--- a/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
+++ b/src/test/scala/org/apache/sysml/api/ml/LogisticRegressionSuite.scala
@@ -24,9 +24,21 @@ import org.scalatest.Matchers
 import org.apache.spark.Logging
 import org.apache.spark.mllib.linalg.Vectors
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.ml.Pipeline
+import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
+import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
+import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
+import org.apache.spark.mllib.linalg.Vector
+import scala.reflect.runtime.universe._
+
+case class LabeledDocument[T:TypeTag](id: Long, text: String, label: Double)
+case class Document[T:TypeTag](id: Long, text: String)
 
 class LogisticRegressionSuite extends FunSuite with WrapperSparkContext with Matchers with Logging {
 
+  // Note: This is required by every test to ensure that it runs successfully on a Windows laptop
+  val loadConfig = ScalaAutomatedTestBase
+
   test("run logistic regression with default") {
     //Make sure the SystemML home directory is set before running the wrapper
     val newsqlContext = new org.apache.spark.sql.SQLContext(sc);
@@ -48,4 +60,44 @@ class LogisticRegressionSuite extends FunSuite with WrapperSparkContext with Mat
     lrmodel.getIcpt shouldBe lr.getIcpt
     lrmodel.getMaxInnerIter shouldBe lr.getMaxInnerIter
   }
+
+  test("test logistic regression with mlpipeline") {
+    //Make sure the SystemML home directory is set before running the wrapper
+    val newsqlContext = new org.apache.spark.sql.SQLContext(sc);
+    import newsqlContext.implicits._
+    val training = sc.parallelize(Seq(
+      LabeledDocument(0L, "a b c d e spark", 1.0),
+      LabeledDocument(1L, "b d", 2.0),
+      LabeledDocument(2L, "spark f g h", 1.0),
+      LabeledDocument(3L, "hadoop mapreduce", 2.0),
+      LabeledDocument(4L, "b spark who", 1.0),
+      LabeledDocument(5L, "g d a y", 2.0),
+      LabeledDocument(6L, "spark fly", 1.0),
+      LabeledDocument(7L, "was mapreduce", 2.0),
+      LabeledDocument(8L, "e spark program", 1.0),
+      LabeledDocument(9L, "a e c l", 2.0),
+      LabeledDocument(10L, "spark compile", 1.0),
+      LabeledDocument(11L, "hadoop software", 2.0)))
+
+    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
+    val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol(tokenizer.getOutputCol).setOutputCol("features")
+    val lr = new LogisticRegression("log", sc)
+    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
+    val crossval = new CrossValidator().setEstimator(pipeline).setEvaluator(new BinaryClassificationEvaluator)
+    val paramGrid = new ParamGridBuilder().addGrid(hashingTF.numFeatures, Array(10, 100, 1000)).addGrid(lr.regParam, Array(0.1, 0.01)).build()
+    crossval.setEstimatorParamMaps(paramGrid)
+    crossval.setNumFolds(2)
+    val lrmodel = crossval.fit(training.toDF)
+    val test = sc.parallelize(Seq(
+      Document(12L, "spark i j k"),
+      Document(13L, "l m n"),
+      Document(14L, "mapreduce spark"),
+      Document(15L, "apache hadoop")))
+
+    lrmodel.transform(test.toDF).show
+
+    lr.getIcpt shouldBe 0
+    //lrmodel.getIcpt shouldBe lr.getIcpt
+    //
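The body of ScalaAutomatedTestBase itself is cut off in this message, so here is a rough sketch of what such a base typically contains (assumptions: the file mainly makes Hadoop usable on Windows by pointing hadoop.home.dir at a directory containing bin\winutils.exe; the path below is illustrative, not from the diff). Referencing the object via `val loadConfig = ScalaAutomatedTestBase` forces this one-time initializer to run before any test in the suite starts a SparkContext:

import java.io.File

object ScalaAutomatedTestBase {
  // One-time setup: on Windows, Hadoop requires hadoop.home.dir to point at a
  // directory containing bin\winutils.exe, otherwise SparkContext creation fails.
  private val osName = System.getProperty("os.name").toLowerCase
  if (osName.startsWith("windows") && System.getProperty("hadoop.home.dir") == null) {
    // illustrative path; a project would ship its own winutils binaries here
    val hadoopHome = new File("src\\test\\config\\hadoop_bin_windows").getAbsolutePath
    System.setProperty("hadoop.home.dir", hadoopHome)
  }
}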
incubator-systemml git commit: [SYSTEMML-578] Refactor ml api package to set Scala version as default
Repository: incubator-systemml
Updated Branches:
  refs/heads/master bc2231982 -> 8a6b3856c

[SYSTEMML-578] Refactor ml api package to set Scala version as default

Closes #116.

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/8a6b3856
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/8a6b3856
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/8a6b3856

Branch: refs/heads/master
Commit: 8a6b3856ccbb65ea974e51636a60c078805c258f
Parents: bc22319
Author: Tommy YU
Authored: Thu Apr 21 14:31:30 2016 -0700
Committer: Deron Eriksson
Committed: Thu Apr 21 14:31:30 2016 -0700

----------------------------------------------------------------------
 .../sysml/api/javaml/LogisticRegression.java    | 473 ++
 .../api/javaml/LogisticRegressionModel.java     | 179 +++
 .../apache/sysml/api/ml/LogisticRegression.java | 474 ---
 .../sysml/api/ml/LogisticRegressionModel.java   | 179 ---
 .../sysml/api/ml/LogisticRegression.scala       | 204
 .../org/apache/sysml/api/ml/ScriptsUtils.scala  |  75 +++
 .../sysml/api/ml/scala/LogisticRegression.scala | 169 ---
 .../sysml/api/ml/scala/ScriptsUtils.scala       |  62 ---
 .../sysml/api/ml/LogisticRegressionSuite.scala  |  51 ++
 .../sysml/api/ml/WrapperSparkContext.scala      |  54 +++
 .../api/ml/scala/LogisticRegressionSuite.scala  |  50 --
 .../api/ml/scala/WrapperSparkContext.scala      |  55 ---
 12 files changed, 1036 insertions(+), 989 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/8a6b3856/src/main/java/org/apache/sysml/api/javaml/LogisticRegression.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/javaml/LogisticRegression.java b/src/main/java/org/apache/sysml/api/javaml/LogisticRegression.java
new file mode 100644
index 0000000..dbcc118
--- /dev/null
+++ b/src/main/java/org/apache/sysml/api/javaml/LogisticRegression.java
@@ -0,0 +1,473 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.api.javaml;
+
+import java.io.File;
+import java.util.HashMap;
+
+import org.apache.spark.SparkContext;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.classification.LogisticRegressionParams;
+import org.apache.spark.ml.classification.ProbabilisticClassifier;
+import org.apache.spark.ml.param.BooleanParam;
+import org.apache.spark.ml.param.DoubleParam;
+import org.apache.spark.ml.param.IntParam;
+import org.apache.spark.ml.param.StringArrayParam;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.SQLContext;
+import org.apache.sysml.api.MLContext;
+import org.apache.sysml.api.MLOutput;
+import org.apache.sysml.api.javaml.LogisticRegressionModel;
+import org.apache.sysml.api.ml.functions.ConvertSingleColumnToString;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt;
+import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
+
+/**
+ *
+ * This class shows how SystemML can be integrated into MLPipeline. Note that it has not been optimized for performance
+ * and is implemented as a proof of concept. An optimized pipeline can be constructed by using DML's 'parfor' construct.
+ *
+ * TODO:
+ * - Please note that this class expects 1-based labels. To run the example below,
+ *   please set the environment variable 'SYSTEMML_HOME', create the folder 'algorithms',
+ *   and place at least two scripts in that folder: 'MultiLogReg.dml' and 'GLM-predict.dml'.
+ * - It is not yet optimized for performance.
+ * - Also, it needs to be extended to surface all the parameters of MultiLogReg.dml.
+ *
+ * Example usage:
+ *
+ * // Code to
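Since the example code in the Javadoc above is truncated ("// Code to"), here is a hedged sketch of the equivalent usage after this refactoring, based on the ScalaTest in the previous commit: the Scala wrapper is now the default at org.apache.sysml.api.ml, while the Java variant lives under org.apache.sysml.api.javaml. `sc` is the SparkContext and `trainingDF` an assumed DataFrame with 'text' and 'label' columns:

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.sysml.api.ml.LogisticRegression

// Build an MLPipeline around the SystemML wrapper (expects 1-based labels)
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol("words").setOutputCol("features")
val lr = new LogisticRegression("log", sc)
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))
val model = pipeline.fit(trainingDF)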
[2/2] incubator-systemml git commit: [SYSTEMML-641] Performance core block matrix mult (cache/task config)
[SYSTEMML-641] Performance core block matrix mult (cache/task config)

This patch makes two configuration changes to our core block matrix
multiplications (mostly applicable for dense):

(1) Cache-conscious selection of the dense-dense "skinny rhs matrix" case
    (for common L2 cache sizes).
(2) Balanced task size configuration (better cache-blocking, number of tasks
    as multiples of the degree of parallelism k, balanced distribution of
    iterations over tasks).

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/bc223198
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/bc223198
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/bc223198

Branch: refs/heads/master
Commit: bc22319820f8e8f00770bad054475cc797781bed
Parents: 72e2166
Author: Matthias Boehm
Authored: Thu Apr 21 00:37:42 2016 -0700
Committer: Matthias Boehm
Committed: Thu Apr 21 10:34:05 2016 -0700

----------------------------------------------------------------------
 .../runtime/matrix/data/LibMatrixMult.java | 29
 .../sysml/runtime/util/UtilFunctions.java  |  6
 2 files changed, 30 insertions(+), 5 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bc223198/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
index 488e95a..3142b2c 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixMult.java
@@ -184,10 +184,10 @@ public class LibMatrixMult
 		try {
 			ExecutorService pool = Executors.newFixedThreadPool( k );
 			ArrayList<MatrixMultTask> tasks = new ArrayList<MatrixMultTask>();
-			int nk = pm2 ? k : Math.max(Math.min(8*k,ru/8), k);
-			int blklen = (int)(Math.ceil((double)ru/nk));
-			for( int i=0; i<nk & i*blklen<ru; i++ )
+			int nk = pm2 ? k : UtilFunctions.roundToNext(Math.min(8*k,ru/8), k);
+			ArrayList<Integer> blklens = getBalancedBlockSizes(ru, nk);
+			for( int i=0, rl=0; i<blklens.size(); rl+=blklens.get(i), i++ )
 			List<Future<Object>> taskret = pool.invokeAll(tasks);
 			pool.shutdown();
@@ -3945,7 +3945,8 @@ public class LibMatrixMult
 	{
 		//transpose if dense-dense, skinny rhs matrix (not vector), and memory guarded by output
 		return (LOW_LEVEL_OPTIMIZATION && !m1.sparse && !m2.sparse
-			&& m1.rlen>m2.clen && m2.rlen > 64 && m2.clen > 1 && m2.clen < 64);
+			&& m1.rlen > m2.clen && m2.rlen > 64 && m2.clen > 1 && m2.clen < 64
+			&& 8*m2.rlen*m2.clen < 256*1024 ); //rhs fits in L2 cache
 	}
 
 	/**
@@ -4079,6 +4080,24 @@ public class LibMatrixMult
 	}
 
+	/**
+	 *
+	 * @param len
+	 * @param k
+	 * @return
+	 */
+	private static ArrayList<Integer> getBalancedBlockSizes(int len, int k) {
+		ArrayList<Integer> ret = new ArrayList<Integer>();
+		int base = len / k;
+		int rest = len % k;
+		for( int i=0; i<k; i++ ) {
+			int val = base + ((i < rest) ? 1 : 0);
+			if( val > 0 )
+				ret.add(val);
+		}
+		return ret;
+	}
+
 	////////////////////////////////////////////
 	// Task Implementations for Multi-Threaded Operations
 	////////////////////////////////////////////

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bc223198/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
index 21078c4..241b178 100644
--- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
@@ -254,6 +254,12 @@ public class UtilFunctions
 		else
 			return ((Integer)obj).intValue();
 	}
+
+	public static int roundToNext(int val, int factor) {
+		//round up to next non-zero multiple of factor
+		int pval = Math.max(val, factor);
+		return ((pval + factor-1) / factor) * factor;
+	}
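A minimal Scala sketch of the two changes above (illustrative mirror of the patched Java helpers, not the patch itself): the L2 guard admits the transpose path only when the rhs matrix, at 8 bytes per double, fits in a 256KB L2 cache, and the block sizes split len row iterations into nk blocks whose sizes differ by at most one instead of fixed-size blocks with a small remainder block at the end:

// round up to the next non-zero multiple of factor (mirrors UtilFunctions.roundToNext)
def roundToNext(value: Int, factor: Int): Int = {
  val pval = math.max(value, factor)
  ((pval + factor - 1) / factor) * factor
}

// split len iterations into k blocks whose sizes differ by at most one
// (mirrors getBalancedBlockSizes): len=10, k=4 yields Vector(3, 3, 2, 2)
def balancedBlockSizes(len: Int, k: Int): Seq[Int] = {
  val base = len / k
  val rest = len % k
  (0 until k).map(i => base + (if (i < rest) 1 else 0)).filter(_ > 0)
}

// rhs fits in L2: 8 bytes per double * rows * cols < 256KB
def rhsFitsInL2(m2rlen: Int, m2clen: Int): Boolean =
  8L * m2rlen * m2clen < 256 * 1024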
[1/2] incubator-systemml git commit: [SYSTEMML-583] Simplify jmlc transform meta read util (w/o spec), tests
Repository: incubator-systemml
Updated Branches:
  refs/heads/master 08a3f1c7e -> bc2231982

[SYSTEMML-583] Simplify jmlc transform meta read util (w/o spec), tests

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/72e21663
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/72e21663
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/72e21663

Branch: refs/heads/master
Commit: 72e216637840eb17090725064c3208ade4ab6160
Parents: 08a3f1c
Author: Matthias Boehm
Authored: Wed Apr 20 18:57:43 2016 -0700
Committer: Matthias Boehm
Committed: Wed Apr 20 18:58:14 2016 -0700

----------------------------------------------------------------------
 .../org/apache/sysml/api/jmlc/Connection.java  | 145 +--
 .../functions/jmlc/FrameReadMetaTest.java      |  18 ++-
 .../functions/jmlc/tfmtd_example/coltypes.csv  |   2 +-
 3 files changed, 119 insertions(+), 46 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/72e21663/src/main/java/org/apache/sysml/api/jmlc/Connection.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/api/jmlc/Connection.java b/src/main/java/org/apache/sysml/api/jmlc/Connection.java
index 58aa0fc..984268a 100644
--- a/src/main/java/org/apache/sysml/api/jmlc/Connection.java
+++ b/src/main/java/org/apache/sysml/api/jmlc/Connection.java
@@ -406,6 +406,24 @@ public class Connection
 		return ret;
 	}
+
+
+	// Read transform meta data
+
+
+	/**
+	 * Reads transform meta data from an HDFS file path and converts it into an in-memory
+	 * FrameBlock object. The column names in the meta data file 'column.names' are processed
+	 * with the default separator ','.
+	 *
+	 * @param metapath hdfs file path to meta data directory
+	 * @return FrameBlock object representing transform metadata
+	 * @throws IOException
+	 */
+	public FrameBlock readTransformMetaDataFromFile(String metapath) throws IOException {
+		return readTransformMetaDataFromFile(null, metapath, TfUtils.TXMTD_SEP);
+	}
+
 	/**
 	 * Reads transform meta data from an HDFS file path and converts it into an in-memory
 	 * FrameBlock object. The column names in the meta data file 'column.names' are processed
@@ -433,6 +451,8 @@ public class Connection
 	public FrameBlock readTransformMetaDataFromFile(String spec, String metapath, String colDelim)
 		throws IOException
 	{
+		//NOTE: this implementation assumes column alignment of colnames and coltypes
+
 		//read column types (for sanity check column names)
 		String coltypesStr = MapReduceTool.readStringFromHDFSFile(metapath+File.separator+TfUtils.TXMTD_COLTYPES);
 		List<String> coltypes = Arrays.asList(IOUtilFunctions.split(coltypesStr.trim(), TfUtils.TXMTD_SEP));
@@ -460,9 +480,25 @@ public class Connection
 				LOG.warn("Recode map for column '"+colName+"' does not exist.");
 			}
 		}
+
+		//get list of recode ids
+		List<Integer> recodeIDs = parseRecodeColIDs(spec, coltypes);
 
 		//create frame block from in-memory strings
-		return convertToTransformMetaDataFrame(spec, rows, colnames, meta);
+		return convertToTransformMetaDataFrame(rows, recodeIDs, colnames, meta);
+	}
+
+	/**
+	 * Reads transform meta data from the class path and converts it into an in-memory
+	 * FrameBlock object. The column names in the meta data file 'column.names' are processed
+* +* @param metapath resource path to meta data directory +* @return FrameBlock object representing transform metadata +* @throws IOException +*/ + public FrameBlock readTransformMetaDataFromPath(String metapath) throws IOException { + return readTransformMetaDataFromPath(null, metapath, TfUtils.TXMTD_SEP); } /** @@ -492,6 +528,8 @@ public class Connection public FrameBlock readTransformMetaDataFromPath(String spec, String metapath, String colDelim) throws IOException { + //NOTE: this implementation assumes column alignment of colnames and coltypes + //read column types (for sanity check column names) String coltypesStr =