This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 48e6f079fd [SYSTEMDS-3847] Fix non-functional perftest benchmarking suite
48e6f079fd is described below

commit 48e6f079fd93e794cbb8cddacae4b7407ee49ac8
Author: Matthias Boehm <[email protected]>
AuthorDate: Sat Apr 5 18:23:51 2025 +0200

    [SYSTEMDS-3847] Fix non-functional perftest benchmarking suite
    
    0) hard-coded server names / properties
    1) windows line endings
    2) memory configurations
    3) datagen scripts
    4) missing l2svm script
---
 scripts/builtin/msvmPredict.dml                    |   2 +-
 scripts/datagen/genRandData4ALS.dml                |  47 -----
 scripts/datagen/genRandData4ChisquaredTest.dml     |  87 --------
 scripts/datagen/genRandData4DecisionTree.sh        |  58 -----
 scripts/datagen/genRandData4DescriptiveStats.dml   | 149 -------------
 scripts/datagen/genRandData4FTest.dml              |  95 ---------
 scripts/datagen/genRandData4Kmeans.dml             | 120 -----------
 scripts/datagen/genRandData4LinearReg_LTstats.dml  | 233 ---------------------
 scripts/datagen/genRandData4LinearRegression.dml   |  61 ------
 scripts/datagen/genRandData4LogReg_LTstats.dml     | 233 ---------------------
 scripts/datagen/genRandData4LogisticRegression.dml |  72 -------
 scripts/datagen/genRandData4MultiClassSVM.dml      |  68 ------
 scripts/datagen/genRandData4Multinomial.dml        |  66 ------
 scripts/datagen/genRandData4NMF.dml                | 129 ------------
 scripts/datagen/genRandData4NMFBlockwise.dml       | 138 ------------
 scripts/datagen/genRandData4PCA.dml                |  61 ------
 scripts/datagen/genRandData4StratStats.dml         | 155 --------------
 scripts/datagen/genRandData4SurvAnalysis.dml       | 133 ------------
 scripts/datagen/genRandData4Transform.dml          |  96 ---------
 scripts/datagen/genRandData4Univariate.dml         |  61 ------
 scripts/datagen/obsolete/genCorrelatedData.dml     |  46 ----
 .../datagen/obsolete/genLinearRegressionData.dml   |  71 -------
 scripts/perftest/README.md                         |  59 ------
 scripts/perftest/datagen/genALSData.sh             |  68 ------
 scripts/perftest/datagen/genBinomialData.sh        |  78 -------
 scripts/perftest/datagen/genClusteringData.sh      |  68 ------
 .../datagen/genDescriptiveStatisticsData.sh        |  60 ------
 .../perftest/datagen/genDimensionReductionData.sh  |  61 ------
 scripts/perftest/datagen/genIOData.sh              |  72 -------
 scripts/perftest/datagen/genL2SVMData.sh           |  38 ----
 scripts/perftest/datagen/genMultinomialData.sh     |  78 -------
 scripts/perftest/datagen/genStratStatisticsData.sh |  61 ------
 .../log4j.properties}                              |  30 +--
 scripts/perftest/runAll.sh                         |  89 ++------
 scripts/perftest/runL2SVM.sh                       |   3 +-
 scripts/perftest/scripts/l2-svm-predict.dml        |  54 +----
 .../scripts/l2-svm.dml}                            |  34 +--
 scripts/perftest/sparkDML2.sh                      |  16 ++
 .../codegenalg/parttwo/AlgorithmDatagen.java       |   4 +-
 .../functions/misc/UnivariateStatsBasicTest.java   |   2 +-
 40 files changed, 72 insertions(+), 2984 deletions(-)

diff --git a/scripts/builtin/msvmPredict.dml b/scripts/builtin/msvmPredict.dml
index f869d7dc05..ed0271b14f 100644
--- a/scripts/builtin/msvmPredict.dml
+++ b/scripts/builtin/msvmPredict.dml
@@ -37,7 +37,7 @@
 m_msvmPredict = function(Matrix[Double] X, Matrix[Double] W)
   return(Matrix[Double] YRaw, Matrix[Double] Y)
 {
-  # Robustness for datasets with missing values 
+  # Robustness for datasets with missing values
   numNaNs = sum(isNaN(X))
   if( numNaNs > 0 ) {
     print("msvm: matrix X contains "+numNaNs+" missing values, replacing with 0.")
diff --git a/scripts/datagen/genRandData4ALS.dml b/scripts/datagen/genRandData4ALS.dml
deleted file mode 100644
index f6c3562862..0000000000
--- a/scripts/datagen/genRandData4ALS.dml
+++ /dev/null
@@ -1,47 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-Xfile = $X; # input matrix X of size m x n
-Ufile = ifdef($U, " "); # original row factor of size m x r
-Vfile = ifdef($V, " "); # original col factor of size r x n
-m = $rows; # no. of rows of X
-n = $cols; # no. of cols of X
-r = $rank; # rank of factorization
-nnz = $nnz; # no. of nonzeros in X
-sigma = ifdef ($sigma, 0.01); # variance of Gaussian noise
-fmt = ifdef ($fmt, "binary"); # output format
-
-# generate original factors by sampling from a normal(0,1.0) distribution
-U = rand(rows = m, cols = r, pdf = "normal", seed = 123);
-V = rand(rows = n, cols = r, pdf = "normal", seed = 456);
-
-I = floor(rand(rows = nnz, cols = 1, min = 1, max = m + 0.999999999));
-J = floor(rand(rows = nnz, cols = 1, min = 1, max = n + 0.999999999));
-X = rand(rows = nnz, cols = 1, pdf = "normal") * sqrt(sigma);
-N = table(I, J, X);
-X = (N != 0) * (U %*% t(V)) + N;
-write(X, Xfile, format = fmt);
-if( Ufile != " " )
-  write(U, Ufile, format = fmt);
-if( Vfile != " " ) {
-  V = t(V);
-  write(V, Vfile, format = fmt);
-}
diff --git a/scripts/datagen/genRandData4ChisquaredTest.dml b/scripts/datagen/genRandData4ChisquaredTest.dml
deleted file mode 100644
index 8f2b945e01..0000000000
--- a/scripts/datagen/genRandData4ChisquaredTest.dml
+++ /dev/null
@@ -1,87 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates a two column matrix of categorical
-# variables
-# used to test systemds's chi-squared bivariate stat
-# computation
-
-# $1 is number of samples to generate
-# $2 is number of categories for 1st categorical variable
-# $3 is number of categories for 2nd categorical variable
-# $4 is the file to write out the chi-squared statistic to
-# $5 is the file to write out the generated data to
-
-numSamples = $1
-numCategories1 = $2
-numCategories2 = $3
-
-o = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=1.0, pdf="uniform", seed=0)
-o = o / sum(o)
-
-probs1 = rowSums(o)
-probs1 = probs1 / sum(probs1)
-probs2 = colSums(o)
-probs2 = probs2 / sum(probs2)
-e = probs1 %*% probs2
-
-chisquared = sum((o-e)^2/e)
-write(chisquared, $4, format="binary")
-
-oCDF = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=0.0, pdf="uniform", seed=0)
-for(i in 1:numCategories1){
-       for(j in 1:numCategories2){
-               if(i==1 & j==1){
-                       oCDF[i,j] = o[1,1]
-               }
-               if(i != 1 & j == 1){
-                       oCDF[i,j] = oCDF[i-1,numCategories2] + o[i,j]
-               }
-               if(j > 1){
-                       oCDF[i,j] = oCDF[i,j-1] + o[i,j]
-               }
-       }
-}
-
-one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform", seed=0)
-data = Rand(rows=numSamples, cols=2, min=0.0, max=0.0, pdf="uniform", seed=0)
-parfor(s in 1:numSamples){
-       r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0)
-       r = as.scalar(r_mat)
-
-       cat1 = -1
-       cat2 = -1
-       continue = 1
-       for(i in 1:numCategories1){
-               for(j in 1:numCategories2){
-                       cdf = as.scalar(oCDF[i,j])
-                       if(continue == 1 & r <= cdf){
-                               cat1 = i
-                               cat2 = j
-                               continue = 0
-                       }
-               }
-       }
-       
-       data[s,1] = cat1*one
-       data[s,2] = cat2*one
-}
-write(data, $5, format="binary")
diff --git a/scripts/datagen/genRandData4DecisionTree.sh b/scripts/datagen/genRandData4DecisionTree.sh
deleted file mode 100644
index 6564d518f1..0000000000
--- a/scripts/datagen/genRandData4DecisionTree.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-if [ "$1" == "" -o "$2" == "" ]; then echo "Usage: $0 <hdfsDataDir> <MR | 
SPARK | ECHO>   e.g. $0 perftest SPARK" ; exit 1 ; fi
-# if [ "$2" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$2" == 
"MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi
-
-BASE=$1/trees
-
-FORMAT="csv" 
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-PATH_LOCAL=/tmp/datagen
-PATH_HDFS=$BASE
-
-#### part 1: generating class labels and categorical features  
-${CMD} -f ../datagen/genRandData4DecisionTree1.dml $DASH-nvargs 
XCat=$BASE/XCat Y=$BASE/Y num_records=1000 num_cat=100 num_class=10 
num_distinct=100 sp=$DENSE_SP
-
-#### part 2: generating spec.json on HDFS
-NUM_FEATURES=100
-
-echo "{ \"ids\": true 
-       ,\"recode\": [1 " > $PATH_LOCAL/spec.json
-for i in $(seq 2 $NUM_FEATURES); do
-       echo " , "$i >> $PATH_LOCAL/spec.json
-done
-echo " ] , \"dummycode\": [ 1" >> $PATH_LOCAL/spec.json
-for i in $(seq 2 $NUM_FEATURES); do
-       echo " , "$i >> $PATH_LOCAL/spec.json
-done
-echo "] }" >> $PATH_LOCAL/spec.json
-
-hadoop fs -rm $PATH_HDFS/spec.json
-hadoop fs -copyFromLocal $PATH_LOCAL/spec.json $PATH_HDFS/spec.json  
-
-#### part 3: generating scale feature and transforming categorical features, 
finally combaning scale and categorical features
-${CMD} -f ../datagen/genRandData4DecisionTree2.dml $DASH-nvargs 
tPath=$BASE/metadata tSpec=$BASE/spec.json XCat=$BASE/XCat X=$BASE/X 
num_records=1000 num_scale=100 sp=$DENSE_SP fmt=$FORMAT
-
-
diff --git a/scripts/datagen/genRandData4DescriptiveStats.dml b/scripts/datagen/genRandData4DescriptiveStats.dml
deleted file mode 100644
index 6f96162074..0000000000
--- a/scripts/datagen/genRandData4DescriptiveStats.dml
+++ /dev/null
@@ -1,149 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
-------------------------------------------------
-  Parameters                                      
-------------------------------------------------
-$R          = #rows
-$C          = #columns
-$NC         = number of categorical attributes
-$MAXDOMAIN  = maximum domain size
-$DATA       = output file path on HDFS
-$SETSIZE    = Size of one bivariate set
-$LABELSETSIZE= Size of second bivariate set with labels
-$TYPES      = output attribute types
-$TYPES1     = Attribute types for Set1
-$TYPES2     = Attribute types for Set2
-$INDEX1     = Indices for Set1
-$INDEX2     = Indices for Set2
-$FMT        = output format
-------------------------------------------------
-hadoop jar SystemDS.jar -f genData4Stats.dml -nvargs R=1000000 C=1000 NC=50 
MAXDOMAIN=1100 DATA=stats/data TYPES=stats/types SETSIZE=15 LABELSETSIZE=10 
TYPES1=... Types2=... INDEX1=.. INDEX2=..FMT=csv
-------------------------------------------------
-*/
-
-
-FMT = ifdef($FMT,"binary"); # default output format
-
-# number of categorical attributes.. numC <= C
-numC = $NC;
-numO = as.integer(numC/2);
-numNominal = numC - numO;
-print("Categorical Mix = (" + numC + "," + numO + "," + numNominal +")");
-
-# maximum domain size among all categorical attributes
-maxDomainSize = $MAXDOMAIN;
-
-# Divide $C attributes according to the following logic:
-#
-#   1     numO       numC               C
-#   |-------|---------|-----------------|
-#      ord    nominal    scale
-#
-# numC+1-$C: scale
-# 1-numC/2: ordinal
-# (numC/2+1)-numC: nominal
-
-types = matrix(1, rows=1, cols=$C);
-ocutoff = numO;
-types[1,1:ocutoff] = matrix(1,rows=1,cols=ocutoff)*3;
-types[1, ocutoff+1:numC] = matrix(1,rows=1,cols=(numC-ocutoff))*2;
-
-# Generate data
-A = rand(rows=$R, cols=$C, sparsity=1);
-B = matrix(0,rows=nrow(A), cols=ncol(A));
-parfor (i in 1:numC) {
-    Ai = A[,i];
-
-    tmp = round(rand(rows=1,cols=1, min=1, max=maxDomainSize));
-    domain = as.scalar(tmp[1,1]);
-
-    # for some attributes, choose the maxDomainSize
-    tmp = rand(rows=1,cols=1);
-    if (as.scalar(tmp[1,1]) < 0.5) {
-        domain = maxDomainSize;
-    }
-
-    B[,i] = round(1+(domain-1)*Ai);
-}
-B[ ,(numC+1):ncol(A)] = A[, (numC+1):ncol(A)];
- 
-
-write(B, $DATA, format=FMT);
-write(types, $TYPES, format=FMT);
-
-# ----- Generator for Bivariate ---------
-
-settypes1 = matrix(1, rows=1, cols=$SETSIZE);
-index1   = matrix(0, rows=1, cols=$SETSIZE);
-
-catSetSize = as.integer($SETSIZE/2);
-ocutoff = as.integer(catSetSize/2);
-print("Set Mix = (" + $SETSIZE + "," + catSetSize + "," + ocutoff + ")" );
-settypes1[1, 1:ocutoff] = matrix(1,rows=1,cols=ocutoff)*3;
-settypes1[1, ocutoff+1:catSetSize] = 
matrix(1,rows=1,cols=(catSetSize-ocutoff))*2;
-
-# select ordinal indices
-tmp = rand(rows=1, cols=ocutoff);
-index1[1, 1:ocutoff] = round(1 + (numO-1)*tmp);
-
-# select nominal indices
-nominalSetSize = catSetSize-ocutoff;
-tmp = rand(rows=1, cols=nominalSetSize);
-index1[1, ocutoff+1:catSetSize] = round(numO+1 + (numC-numO-1)*tmp);
-
-# select scale attributes
-scaleSetSize = $SETSIZE-catSetSize;
-tmp = rand(rows=1, cols=scaleSetSize);
-index1[1, catSetSize+1:$SETSIZE] = round(numC+1 + ($C-numC-1)*tmp);
-
-
-# --- select types and indices for LABELSET
-settypes2 = matrix(2, rows=1, cols=$LABELSETSIZE);
-index2   = matrix(0, rows=1, cols=$LABELSETSIZE);
-if($LABELSETSIZE > 1) {
-    settypes2[1,1] = 1;
-    r = as.scalar(rand(rows=1,cols=1));
-    index2[1,1] = round(numC+1 + ($C-numC-1)*r)
-}
-else {
-    r = as.scalar(rand(rows=1,cols=1));
-    index2[1,1] = round( numO+1 + (numC-numO-1)*r )
-}
-
-for(i in 2:as.integer($LABELSETSIZE/2)) {
-    settypes2[1,i] = 3;
-    r = as.scalar(rand(rows=1,cols=1));
-    index2[1,i] = round( 1 + (numO-1)*r )
-}
-
-for(i in as.integer($LABELSETSIZE/2)+1:$LABELSETSIZE) {
-    settypes2[1,i] = 2;
-    r = as.scalar(rand(rows=1,cols=1));
-    index2[1,i] = round( numO+1 + (numC-numO-1)*r )
-}
-
-write(settypes1, $TYPES1, format=FMT);
-write(settypes2, $TYPES2, format=FMT);
-write(index1, $INDEX1, format=FMT);
-write(index2, $INDEX2, format=FMT);
-
diff --git a/scripts/datagen/genRandData4FTest.dml b/scripts/datagen/genRandData4FTest.dml
deleted file mode 100644
index 9f0e1d6c68..0000000000
--- a/scripts/datagen/genRandData4FTest.dml
+++ /dev/null
@@ -1,95 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data for F-test
-#
-# $1 is number of groups (some of 
-#              which may share a gaussian)
-# $2 is number of actual groups 
-# $3 is number of points
-# $4 is mean of the gaussian means
-# $5 is mean of the gaussian std. deviations
-# $6 is file to store computed f-statistic
-# $7 is file to store generated data
-
-numGroups = $1
-numActualGroups = $2
-numSamples = $3
-meanOfMeans = $4
-meanOfStddevs = $5
-
-cntProbs = Rand(rows=numGroups, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0)
-cntProbs = cntProbs/sum(cntProbs)
-cntArr = round(cntProbs * numSamples)
-last_cnt = cntArr[numGroups,1]
-cntArr[numGroups,1] = numSamples - (sum(cntArr) - last_cnt)
-
-permut = Rand(rows=numActualGroups, cols=numGroups, min=0.0, max=0.0, pdf="uniform")
-ones = Rand(rows=numActualGroups, cols=1, min=1.0, max=1.0, pdf="uniform")
-permut[,1:numActualGroups] = diag(ones)
-
-one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform")
-copy_start_index = numActualGroups+1
-parfor(i in copy_start_index:numGroups){
-       r = Rand(rows=1, cols=1, min=1.0, max=numActualGroups, pdf="uniform", seed=0)
-       j = as.scalar(round(r))
-       permut[j,i] = one
-}
-
-means_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0)
-abs_means = means_std + meanOfMeans
-means = t(t(abs_means) %*% permut)
-
-stddevs_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0)
-abs_stddevs = stddevs_std + meanOfStddevs
-stddevs = t(t(abs_stddevs) %*% permut)
-
-overall_mean = sum(means*cntArr)/numSamples
-
-explained_variance = sum(cntArr * (means - overall_mean)^2) / (numGroups-1.0)
-unexplained_variance = sum(cntArr * stddevs^2) / (numSamples - numGroups)
-f = explained_variance / unexplained_variance
-write(f, $6, format="binary")
-
-cntCDFs = cntProbs
-for(i in 2:numGroups){
-       cntCDFs[i,1] = cntCDFs[i-1,1] + cntProbs[i,1]
-}
-
-data = Rand(rows=numSamples, cols=1, min=0.0, max=0.0, pdf="uniform")
-parfor(i in 1:numSamples){
-       r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0)
-       r1 = as.scalar(r_mat)
-
-       g = -1
-       continue = 1
-       for(k in 1:numGroups){
-               cdf = as.scalar(cntCDFs[k,1])
-               if(continue==1 & r1<=cdf){
-                       g = k
-                       continue=0
-               }       
-       }
-       
-       point = Rand(rows=1, cols=1, pdf="normal", seed=0)
-       data[i,1] = point*stddevs[g,1] + means[g,1]
-}
-write(data, $7, format="binary")
diff --git a/scripts/datagen/genRandData4Kmeans.dml b/scripts/datagen/genRandData4Kmeans.dml
deleted file mode 100644
index 3098650b26..0000000000
--- a/scripts/datagen/genRandData4Kmeans.dml
+++ /dev/null
@@ -1,120 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# Generates random Gaussian-mixture data to test k-Means clustering algorithms
-#
-# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME  TYPE   DEFAULT  MEANING
-# ----------------------------------------------------------------------------
-# nr    Int     ---     Number of records
-# nf    Int     ---     Number of features
-# nc    Int     ---     Number of clusters
-# dc    Double  ---     St.dev. of cluster "centroid" features from zero mean
-# dr    Double  ---     St.dev. of the 1-st feature in a record within cluster
-# fbf   Double  ---     Feature bias factor: Stdev(last) / Stdev(1-st) feature
-# cbf   Double  ---     Cluster bias factor: Prob[1-st clus] / Prob[k-th clus]
-# X     String  ---     Location to write matrix X with generated data records
-# C     String  ---     Location to write cluster "centroids" (Gaussian means)
-# Y     String  ---     Location to write assignment of records to cluster ids
-# YbyC  String  ---     Location to write rec-cluster assigns by min-dist to C
-# ----------------------------------------------------------------------------
-#
-# Example:
-# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100
-#     nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx 
YbyC=YbyC.mtx
-
-print ("BEGIN K-MEANS GENERATOR SCRIPT");
-
-num_records   = $nr;
-num_features  = $nf;
-num_centroids = $nc;
-dist_per_feature_centroids = $dc;
-dist_per_feature_first_record = $dr;
-feature_bias_factor = $fbf;
-cluster_bias_factor = $cbf;
-
-fileX    = ifdef ($X, "X");
-fileC    = ifdef ($C, "C");
-fileY    = ifdef ($Y, "Y");
-fileYbyC = ifdef ($YbyC, "YbyC");
-fmt      = ifdef ($fmt, "text");
-
-print ("Generating cluster distribution (mixture) centroids...");
-
-C = Rand (rows = num_centroids, cols = num_features, pdf = "normal");
-C = C * dist_per_feature_centroids;
-
-print ("Generating record-to-cluster assignments...");
-
-# Y is a multinomial in {1, ..., num_centroids} with 1 being more likely
-# than "num_centroids" by the factor of "cluster_bias_factor"
-
-rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = 
"uniform");
-if (cluster_bias_factor == 1.0) {
-    Y = round (0.5 + rnd * num_centroids);
-} else {
-    rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / 
(num_centroids - 1)));
-    Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log 
(cluster_bias_factor));
-}
-
-print ("Generating within-cluster random shifts...");
-
-X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal");
-feature_factors = dist_per_feature_first_record * 
-    exp ((seq (1, num_features) - 1) / (num_features - 1) * log 
(feature_bias_factor));
-X_shift = X_shift %*% diag (feature_factors);
-
-print ("Generating records by shifting from centroids..."); 
-
-Y_bitmap_raw = table (seq (1, num_records), Y);
-Y_bitmap = matrix (0, rows = num_records, cols = num_centroids);
-Y_bitmap [, 1 : ncol (Y_bitmap_raw)] = Y_bitmap_raw;
-X = Y_bitmap %*% C + X_shift;
-
-print ("Computing record-to-cluster assignments by minimum centroid 
distance...");
-
-D = t(t(-2 * (X %*% t(C))) + rowSums (C ^ 2));
-P = (D <= rowMins (D));
-aggr_P = t(cumsum (t(P)));
-Y_by_C = rowSums (aggr_P == 0) + 1;
-
-print ("Computing useful statistics...");
-
-sumXsq = sum (X ^ 2);
-default_wcss  = sumXsq - sum (colSums (X) ^ 2) / num_records;
-attained_wcss = sumXsq + sum (rowMins (D));
-
-print ("Default (single-cluster) WCSS = " + default_wcss);
-print (num_centroids + "-cluster WCSS attained by the mixture centroids = " + 
attained_wcss);
-
-print ("Writing out the resulting dataset...");
-
-write (X, fileX, format = fmt);
-write (C, fileC, format = fmt);
-write (Y, fileY, format = fmt);
-write (Y_by_C, fileYbyC, format = fmt);
-
-print ("Please run the scoring script to compare " + fileY + " with " + 
fileYbyC); 
-
-print ("DONE: K-MEANS GENERATOR SCRIPT");
-
diff --git a/scripts/datagen/genRandData4LinearReg_LTstats.dml b/scripts/datagen/genRandData4LinearReg_LTstats.dml
deleted file mode 100644
index 9bb1ca189e..0000000000
--- a/scripts/datagen/genRandData4LinearReg_LTstats.dml
+++ /dev/null
@@ -1,233 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# generates random data to test bi- and multinomial logistic regression
-
-# $N  = number of training samples
-# $Nt = number of test samples (or 0 if none)
-# $nf = number of features (independent variables)
-# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
-# $Xmin  = minimum feature value
-# $Xmax  = maximum feature value
-# $spars = controls sparsity in the generated data
-# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
-# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
-# $stdLT = requested standard deviation for the linear terms
-# $iceptmin = intercept, minimum value (0.0 disables intercept)
-# $iceptmax = intercept, maximum value (0.0 disables intercept)
-# $B  = location to store generated regression parameters
-# $X  = location to store generated training data
-# $Y  = location to store generated training category labels
-# $Xt = location to store generated test data
-# $Yt = location to store generated test category labels
-# $fmt = format of the output
-#
-# Example:
-# hadoop jar SystemDS.jar -f genRandData4LinearReg_LTstats.dml -nvargs
-#     N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 
avgLTmax=5.0 stdLT=1.25
-#     iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 
Yt=./Yt123 fmt=binary
-
-numTrainingSamples = $N;
-numTestSamples = $Nt;
-numFeatures = $nf;
-numCategories = $nc;
-minIntercept = $iceptmin;
-maxIntercept = $iceptmax;
-minXentry = $Xmin;
-maxXentry = $Xmax;
-minAvgLT = $avgLTmin;
-maxAvgLT = $avgLTmax;
-sparsityLevel = $spars;
-stdevLT = $stdLT;
-fileB  = ifdef ($B,  "B");
-fileX  = ifdef ($X,  "X");
-fileY  = ifdef ($Y,  "Y");
-fileXt = ifdef ($Xt, "Xt");
-fileYt = ifdef ($Yt, "Yt");
-fmt = ifdef ($fmt, "mm");
-
-numSamples = numTrainingSamples + numTestSamples;
-
-isBinomialPMOne = FALSE;
-if (numCategories == 1) {
-    numCategories = 2;
-    isBinomialPMOne = TRUE;
-}
-do_we_output_intercept = 1;
-if (minIntercept == 0 & maxIntercept == 0) {
-    do_we_output_intercept = 0;
-}
-
-X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = 
maxXentry, pdf = "uniform", sparsity = sparsityLevel);
-
-meanLT  = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = 
maxAvgLT, pdf = "uniform");
-sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
-b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, 
max = maxIntercept, pdf = "uniform");
-
-meanLT_minus_intercept = meanLT - b_intercept;
-[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
-
-ones = matrix (1.0, rows = numSamples, cols = 1);
-LT = X %*% B + ones %*% b_intercept;
-actual_meanLT  = colSums (LT) / numSamples;
-actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
-
-for (i in 1:(numCategories - 1)) {
-    if (as.scalar (new_sigmaLT [1, i]) == as.scalar (sigmaLT [1, i])) {
-        print ("Category " + i + ":  Intercept = " + as.scalar (b_intercept 
[1, i])); 
-    } else {
-        print ("Category " + i + ":  Intercept = " + as.scalar (b_intercept 
[1, i]) + ",  st.dev.(LT) relaxed from " + as.scalar (sigmaLT [1, i])); 
-    }
-    print ("    Wanted LT mean = " + as.scalar (meanLT [1, i])        + ",  
st.dev. = " + as.scalar (new_sigmaLT [1, i]));
-    print ("    Actual LT mean = " + as.scalar (actual_meanLT [1, i]) + ",  
st.dev. = " + as.scalar (actual_sigmaLT [1, i]));
-}
-
-
-/*
-ones = matrix (1.0, rows = 1, cols = numCategories - 1);
-Prob = exp (LT);
-Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
-Prob = t(cumsum (t(Prob)));
-
-r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed 
= 0);
-R = r %*% ones;
-Y = 1 + rowSums (Prob < R);
-if (isBinomialPMOne) {
-    Y = 3 - 2 * Y;
-}
-*/
-
-/* USE FOR LINEAR REGRESSION */
-
-r = Rand (rows = numSamples, cols = 1, pdf = "normal");
-Y = LT [, 1] + r;
-
-
-if (do_we_output_intercept == 1) {
-    new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
-    new_B [1:nrow(B), 1:ncol(B)] = B;
-    new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
-    write (new_B, fileB, format=fmt);
-} else {
-    write (B, fileB, format=fmt);
-}
-
-if (numTestSamples > 0) {
-    X_train = X [1:numTrainingSamples,];
-    Y_train = Y [1:numTrainingSamples,];
-    X_test  = X [(numTrainingSamples+1):numSamples,];
-    Y_test  = Y [(numTrainingSamples+1):numSamples,];
-    write (X_train, fileX,  format=fmt);
-    write (Y_train, fileY,  format=fmt);
-    write (X_test,  fileXt, format=fmt);
-    write (Y_test,  fileYt, format=fmt);
-} else {
-    write (X, fileX, format=fmt);
-    write (Y, fileY, format=fmt);
-}
-
-
-
-
-
-
-# Generates weight vectors to ensure the desired statistics for Linear Terms = 
X %*% W
-# To be used for data generation in the testing of GLM, Logistic Regression, 
etc.
-# INPUT:  meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] 
are
-#         the desired mean and standard deviation for X %*% W[, i]
-# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
-#         new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully 
enforced,
-#         new_sigmaLT[1, i]  > sigmaLT[1, i] if we had to relax this 
constraint.
-generateWeights = 
-    function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
-    return   (Matrix[double] W, Matrix[double] new_sigmaLT)
-{
-    num_w = ncol (meanLT);  # Number of output weight vectors
-    dim_w = ncol (X);       # Number of features / dimensions in a weight 
vector
-    w_X = t(colSums(X));    # "Prohibited" weight shift direction that changes 
meanLT
-                            # (all orthogonal shift directions do not affect 
meanLT)
-
-    # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
-
-    w_1 = straightenX (X);
-    r_1 = (X %*% w_1) - 1.0;
-    norm_r_1_sq = sum (r_1 ^ 2);
-    
-    # For each W[, i] generate uniformly random directions to shift away from 
"w_1"
-    
-    DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
-    DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to 
w_X
-    XDW = X %*% DW;
-    
-    # Determine how far to shift in the chosen directions to satisfy the 
constraints
-    # Use the positive root of the quadratic equation; relax sigmaLT where 
needed
-    
-    a_qe = colSums (XDW ^ 2);
-    b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
-    c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
-
-    is_sigmaLT_OK = (c_qe <= 0);
-    new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) 
* sqrt (norm_r_1_sq / nrow(X));
-    c_qe = is_sigmaLT_OK * c_qe;
-    x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
-    
-    # Scale and shift "w_1" in the "DW" directions to produce the result:
-    
-    ones = matrix (1.0, rows = dim_w, cols = 1);
-    W = w_1 %*% meanLT + DW * (ones %*% x_qe);
-}
-
-# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
-# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
-# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
-straightenX =
-    function (Matrix[double] X)
-    return   (Matrix[double] w)
-{
-    w_X = t(colSums(X));
-    lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
-    eps = 0.000000001 * nrow(X);
-
-    # BEGIN LEAST SQUARES
-    
-    r_LS = - w_X;
-    z_LS = matrix (0.0, rows = ncol(X), cols = 1);
-    p_LS = - r_LS;
-    norm_r2_LS = sum (r_LS ^ 2);
-    i_LS = 0;
-    while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
-    {
-        temp_LS = X %*% p_LS;
-        q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
-        alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
-        z_LS = z_LS + alpha_LS * p_LS;
-        old_norm_r2_LS = norm_r2_LS;
-        r_LS = r_LS + alpha_LS * q_LS;
-        norm_r2_LS = sum (r_LS ^ 2);
-        p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
-        i_LS = i_LS + 1;
-    }
-    
-    # END LEAST SQUARES
-    
-    w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
-}
diff --git a/scripts/datagen/genRandData4LinearRegression.dml b/scripts/datagen/genRandData4LinearRegression.dml
deleted file mode 100644
index ebce4f30d1..0000000000
--- a/scripts/datagen/genRandData4LinearRegression.dml
+++ /dev/null
@@ -1,61 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates data to test linear regression
-
-# $1 is number of samples
-# $2 is number of features (independent variables)
-# $3 is maximum feature value (absolute value)
-# $4 is maximum weight (absolute value)
-# $5 is location to store generated weights
-# $6 is location to store generated data
-# $7 is location to store generated labels
-# $8 is 0/1. 0 suppresses noise, 1 will add noise to Y
-# $9 is b, 0 disables intercept
-# $10 controls sparsity in the generated data
-# $11 output format
-
-numSamples = $1
-numFeatures = $2
-maxFeatureValue = $3
-maxWeight = $4
-addNoise = $8
-b = $9
-fmt = $11
-
-X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", 
seed=0, sparsity=$10)
-w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0)
-X = X * maxFeatureValue
-w = w * maxWeight
-Y = X %*% w
-
-if( b != 0 ) {
-       b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform")
-       w =  rbind(w, t(b_mat))
-       Y = Y + b
-}
-
-noise = Rand(rows=numSamples, cols=1, pdf="normal", seed=0)
-Y = Y + addNoise*noise
-
-write(w, $5, format=fmt)
-write(X, $6, format=fmt)
-write(Y, $7, format=fmt)
diff --git a/scripts/datagen/genRandData4LogReg_LTstats.dml 
b/scripts/datagen/genRandData4LogReg_LTstats.dml
deleted file mode 100644
index f95342f708..0000000000
--- a/scripts/datagen/genRandData4LogReg_LTstats.dml
+++ /dev/null
@@ -1,233 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# generates random data to test bi- and multinomial logistic regression
-
-# $N  = number of training samples
-# $Nt = number of test samples (or 0 if none)
-# $nf = number of features (independent variables)
-# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
-# $Xmin  = minimum feature value
-# $Xmax  = maximum feature value
-# $spars = controls sparsity in the generated data
-# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
-# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
-# $stdLT = requested standard deviation for the linear terms
-# $iceptmin = intercept, minimum value (0.0 disables intercept)
-# $iceptmax = intercept, maximum value (0.0 disables intercept)
-# $B  = location to store generated regression parameters
-# $X  = location to store generated training data
-# $Y  = location to store generated training category labels
-# $Xt = location to store generated test data
-# $Yt = location to store generated test category labels
-#
-# Example:
-# hadoop jar SystemDS.jar -f genRandData4LogReg_LTstats.dml -nvargs
-#     N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 
avgLTmax=5.0 stdLT=1.25
-#     iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 
Yt=./Yt123
-
-numTrainingSamples = $N;
-numTestSamples = $Nt;
-numFeatures = $nf;
-numCategories = $nc;
-minIntercept = $iceptmin;
-maxIntercept = $iceptmax;
-minXentry = $Xmin;
-maxXentry = $Xmax;
-minAvgLT = $avgLTmin;
-maxAvgLT = $avgLTmax;
-sparsityLevel = $spars;
-stdevLT = $stdLT;
-fileB  = ifdef ($B,  "B");
-fileX  = ifdef ($X,  "X");
-fileY  = ifdef ($Y,  "Y");
-fileXt = ifdef ($Xt, "Xt");
-fileYt = ifdef ($Yt, "Yt");
-
-
-numSamples = numTrainingSamples + numTestSamples;
-
-isBinomialPMOne = FALSE;
-if (numCategories == 1) {
-    numCategories = 2;
-    isBinomialPMOne = TRUE;
-}
-do_we_output_intercept = 1;
-if (minIntercept == 0 & maxIntercept == 0) {
-    do_we_output_intercept = 0;
-}
-
-X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = 
maxXentry, pdf = "uniform", sparsity = sparsityLevel);
-
-meanLT  = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = 
maxAvgLT, pdf = "uniform");
-sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
-b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, 
max = maxIntercept, pdf = "uniform");
-
-meanLT_minus_intercept = meanLT - b_intercept;
-[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
-
-ones = matrix (1.0, rows = numSamples, cols = 1);
-LT = X %*% B + ones %*% b_intercept;
-actual_meanLT  = colSums (LT) / numSamples;
-actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
-
-for (i in 1:(numCategories - 1)) {
-    if (as.scalar (new_sigmaLT [1, i]) == as.scalar (sigmaLT [1, i])) {
-        print ("Category " + i + ":  Intercept = " + as.scalar (b_intercept 
[1, i])); 
-    } else {
-        print ("Category " + i + ":  Intercept = " + as.scalar (b_intercept 
[1, i]) + ",  st.dev.(LT) relaxed from " + as.scalar (sigmaLT [1, i])); 
-    }
-    print ("    Wanted LT mean = " + as.scalar (meanLT [1, i])        + ",  
st.dev. = " + as.scalar (new_sigmaLT [1, i]));
-    print ("    Actual LT mean = " + as.scalar (actual_meanLT [1, i]) + ",  
st.dev. = " + as.scalar (actual_sigmaLT [1, i]));
-}
-
-
-ones = matrix (1.0, rows = 1, cols = numCategories - 1);
-Prob = exp (LT);
-Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
-Prob = t(cumsum (t(Prob)));
-
-r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed 
= 0);
-R = r %*% ones;
-Y = 1 + rowSums (Prob < R);
-if (isBinomialPMOne) {
-    Y = 3 - 2 * Y;
-}
-
-
-/* USE FOR LINEAR REGRESSION
-
-r = Rand (rows = numSamples, cols = 1, pdf = "normal");
-Y = LT [, 1] + r;
-
-*/
-
-
-if (do_we_output_intercept == 1) {
-    new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
-    new_B [1:nrow(B), 1:ncol(B)] = B;
-    new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
-    write (new_B, fileB, format="mm");
-} else {
-    write (B, fileB, format="mm");
-}
-
-if (numTestSamples > 0) {
-    X_train = X [1:numTrainingSamples,];
-    Y_train = Y [1:numTrainingSamples,];
-    X_test  = X [(numTrainingSamples+1):numSamples,];
-    Y_test  = Y [(numTrainingSamples+1):numSamples,];
-    write (X_train, fileX,  format="mm");
-    write (Y_train, fileY,  format="mm");
-    write (X_test,  fileXt, format="mm");
-    write (Y_test,  fileYt, format="mm");
-} else {
-    write (X, fileX, format="mm");
-    write (Y, fileY, format="mm");
-}
-
-
-
-
-
-
-# Generates weight vectors to ensure the desired statistics for Linear Terms = 
X %*% W
-# To be used for data generation in the testing of GLM, Logistic Regression, 
etc.
-# INPUT:  meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] 
are
-#         the desired mean and standard deviation for X %*% W[, i]
-# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
-#         new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully 
enforced,
-#         new_sigmaLT[1, i]  > sigmaLT[1, i] if we had to relax this 
constraint.
-generateWeights = 
-    function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
-    return   (Matrix[double] W, Matrix[double] new_sigmaLT)
-{
-    num_w = ncol (meanLT);  # Number of output weight vectors
-    dim_w = ncol (X);       # Number of features / dimensions in a weight 
vector
-    w_X = t(colSums(X));    # "Prohibited" weight shift direction that changes 
meanLT
-                            # (all orthogonal shift directions do not affect 
meanLT)
-
-    # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
-
-    w_1 = straightenX (X);
-    r_1 = (X %*% w_1) - 1.0;
-    norm_r_1_sq = sum (r_1 ^ 2);
-    
-    # For each W[, i] generate uniformly random directions to shift away from 
"w_1"
-    
-    DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
-    DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to 
w_X
-    XDW = X %*% DW;
-    
-    # Determine how far to shift in the chosen directions to satisfy the 
constraints
-    # Use the positive root of the quadratic equation; relax sigmaLT where 
needed
-    
-    a_qe = colSums (XDW ^ 2);
-    b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
-    c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
-
-    is_sigmaLT_OK = (c_qe <= 0);
-    new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) 
* sqrt (norm_r_1_sq / nrow(X));
-    c_qe = is_sigmaLT_OK * c_qe;
-    x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
-    
-    # Scale and shift "w_1" in the "DW" directions to produce the result:
-    
-    ones = matrix (1.0, rows = dim_w, cols = 1);
-    W = w_1 %*% meanLT + DW * (ones %*% x_qe);
-}
-
-# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
-# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
-# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
-straightenX =
-    function (Matrix[double] X)
-    return   (Matrix[double] w)
-{
-    w_X = t(colSums(X));
-    lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
-    eps = 0.000000001 * nrow(X);
-
-    # BEGIN LEAST SQUARES
-    
-    r_LS = - w_X;
-    z_LS = matrix (0.0, rows = ncol(X), cols = 1);
-    p_LS = - r_LS;
-    norm_r2_LS = sum (r_LS ^ 2);
-    i_LS = 0;
-    while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
-    {
-        temp_LS = X %*% p_LS;
-        q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
-        alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
-        z_LS = z_LS + alpha_LS * p_LS;
-        old_norm_r2_LS = norm_r2_LS;
-        r_LS = r_LS + alpha_LS * q_LS;
-        norm_r2_LS = sum (r_LS ^ 2);
-        p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
-        i_LS = i_LS + 1;
-    }
-    
-    # END LEAST SQUARES
-    
-    w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
-}
diff --git a/scripts/datagen/genRandData4LogisticRegression.dml 
b/scripts/datagen/genRandData4LogisticRegression.dml
deleted file mode 100644
index f0850938ad..0000000000
--- a/scripts/datagen/genRandData4LogisticRegression.dml
+++ /dev/null
@@ -1,72 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data to test linear logistic regression
-
-# $1 is number of samples
-# $2 is number of features (independent variables)
-# $3 is maximum feature value (absolute value)
-# $4 is maximum weight (absolute value)
-# $5 is location to store generated weights
-# $6 is location to store generated data
-# $7 is location to store generated labels
-# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
-# $9 is b, 0 disables intercept
-# $10 controls sparsity in the generated data
-# $11 output format
-# $12 transform labels. if 0 then -1/1; otherwise 1/2
-
-numSamples = $1
-numFeatures = $2
-maxFeatureValue = $3
-maxWeight = $4
-addNoise = $8
-b = $9
-
-X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", 
seed=0, sparsity=$10)
-X = X * maxFeatureValue 
-
-w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0)
-w = w * maxWeight
-
-ot = X %*% w
-if( b != 0) {
-       b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform")
-       w =  rbind(w, t(b_mat))
-       ot = ot + b
-}
-
-prob = 1 / (1 + exp(-ot))
-if( addNoise == 1 ){
-       r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
-} 
-else {
-       print("this data generator generates the same dataset for both noise=0 
and noise=1")
-       r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
-}
-
-Y = 1 - 2 * (prob < r)
-if( $12 == 1 )
-  Y = (Y + 3) / 2
-
-write(w, $5, format=$11)
-write(X, $6, format=$11)
-write(Y, $7, format=$11)
diff --git a/scripts/datagen/genRandData4MultiClassSVM.dml 
b/scripts/datagen/genRandData4MultiClassSVM.dml
deleted file mode 100644
index 011b4dab18..0000000000
--- a/scripts/datagen/genRandData4MultiClassSVM.dml
+++ /dev/null
@@ -1,68 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data to test linear logistic regression
-
-# $1 is number of samples
-# $2 is number of features (independent variables)
-# $3 is maximum feature value (absolute value)
-# $4 is maximum weight (absolute value)
-# $5 is location to store generated weights
-# $6 is location to store generated data
-# $7 is location to store generated labels
-# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
-# $9 is b, 0 disables intercept
-# $10 controls sparsity in the generated data
-
-numSamples = $1
-numFeatures = $2
-maxFeatureValue = $3
-maxWeight = $4
-addNoise = $8
-b = $9
-
-X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", 
seed=0, sparsity=$10)
-X = X * maxFeatureValue 
-
-w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0)
-w = w * maxWeight
-
-ot = X%*%w
-if(b!=0) {
-       b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform")
-       w =  t(cbind(t(w), b_mat))
-       ot = ot + b
-}
-
-prob = 1/(1+exp(-ot))
-if(addNoise == 1){
-       r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
-}else{
-       print("this data generator generates the same dataset for both noise=0 
and noise=1")
-       r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
-       #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform")
-}
-Y = 1 - 2 * (prob < r)
-Y = (Y+3)/2
-
-write(w, $5, format="binary")
-write(X, $6, format="binary")
-write(Y, $7, format="binary")
diff --git a/scripts/datagen/genRandData4Multinomial.dml 
b/scripts/datagen/genRandData4Multinomial.dml
deleted file mode 100644
index 93666758b5..0000000000
--- a/scripts/datagen/genRandData4Multinomial.dml
+++ /dev/null
@@ -1,66 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-num_records = $1;
-num_features = $2;
-
-p = $3; #sparsity
-num_categories = $4; #num classes
-is_intercept = $5==1; 
-
-stdevLT = 1.0;
-beta_range = 3.0 * stdevLT / sqrt (num_features * p);
-
-if (is_intercept) {
-    intercept = Rand (rows = 1, cols = num_categories - 1, min = -1.0, max = 
1.0);
-}
-
-X = Rand( rows = num_records, 
-          cols = num_features, 
-          min = 1, 
-          max = 5, 
-          pdf = "uniform", 
-          sparsity = p );
-
-B = Rand (rows = num_features, 
-          cols = num_categories - 1, 
-          min = -1.0, 
-          max = 1.0, 
-          pdf = "uniform", 
-          sparsity = 1.0) * beta_range;
-
-LT = X %*% B;
-if (is_intercept) {
-    LT = LT + matrix (1, rows = num_records, cols = 1) %*% intercept;
-}
-
-Prob = exp (LT);
-Prob = Prob / (1.0 + rowSums(Prob));
-Prob = t(cumsum (t(Prob)));
-
-r = Rand (rows = num_records, cols = 1, min = 0, max = 1, pdf = "uniform");
-Y = 1 + rowSums (Prob < r);
-
-# ensure all classes are represented
-Y[(num_records-num_categories+1):num_records,1] = seq(1,num_categories);
-
-write(X, $6, format=$8)
-write(Y, $7, format=$8);
\ No newline at end of file
diff --git a/scripts/datagen/genRandData4NMF.dml 
b/scripts/datagen/genRandData4NMF.dml
deleted file mode 100644
index a82ac4e0f1..0000000000
--- a/scripts/datagen/genRandData4NMF.dml
+++ /dev/null
@@ -1,129 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data for non-negative
-# matrix factorization
-#
-# follows lda's generative model
-# see Blei, Ng & Jordan, JMLR'03 paper
-# titled Latent Dirichlet Allocation
-#
-# $1 is number of samples
-# $2 is number of features
-# $3 is number of latent factors
-# $4 is number of features per sample
-#       (may overlap). use this to vary
-#       sparsity.      
-# $5 is file to store sample mixtures
-# $6 is file to store factors
-# $7 is file to store generated data
-
-numDocuments = $1
-numFeatures = $2
-numTopics = $3
-numWordsPerDoc = $4
-
-docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, 
pdf="uniform", seed=0, sparsity=0.75)
-denomsTM = rowSums(docTopicMixtures)
-zerosInDenomsTM = denomsTM == 0
-denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
-parfor(i in 1:numTopics){
-       docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
-}
-write(docTopicMixtures, $5, format="binary")
-for(j in 2:numTopics){
-       docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
-}
-
-topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, 
pdf="uniform", seed=0, sparsity=0.75)
-parfor(i in 1:numTopics){
-       topicDist = topicDistributions[i,]
-       
-       denom2 = sum(topicDist)
-       if(denom2 == 0){
-               denom2 = denom2 + 0.1
-       }
-       
-       topicDistributions[i,] = topicDist / denom2
-}
-write(topicDistributions, $6, format="binary")
-for(j in 2:numFeatures){
-       topicDistributions[,j] = topicDistributions[,j-1] + 
topicDistributions[,j]
-}
-
-data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
-
-parfor(i in 1:numDocuments){
-       docTopic = docTopicMixtures[i,]
-       
-    ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform");
-  
-       r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", 
seed=0)
-       r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", 
seed=0)
-       
-       for(j in 1:numWordsPerDoc){
-               rz = as.scalar(r_z[j,1])
-               continue = 1
-               
-               z = -1
-               #this is a workaround
-               #z=1    
-               
-               for(k1 in 1:numTopics){
-                       prob = as.scalar(docTopic[1,k1])
-                       if(continue==1 & rz <= prob){
-                               z=k1
-                               continue=0
-                       }
-               }
-               
-               if(z==-1){
-                       print("z is unassigned: " + z)
-                       z = numTopics
-               }
-               
-               rw = as.scalar(r_w[j,1])
-               continue = 1
-               
-               w = -1
-               #this is a workaround
-               #w = 1
-               
-               for(k2 in 1:numFeatures){
-                       prob = as.scalar(topicDistributions[z,k2])
-                       if(continue == 1 & rw <= prob){
-                               w = k2
-                               continue = 0
-                       }
-               }
-               
-               if(w==-1){
-                       print("w is unassigned: " + w)
-                       w = numFeatures
-               }
-               
-               ldata[1,w] = ldata[1,w] + 1
-       }
-  
-    data[i,] = ldata;
-}
-
-write(data, $7, format="binary")
diff --git a/scripts/datagen/genRandData4NMFBlockwise.dml 
b/scripts/datagen/genRandData4NMFBlockwise.dml
deleted file mode 100644
index 0ad548ead2..0000000000
--- a/scripts/datagen/genRandData4NMFBlockwise.dml
+++ /dev/null
@@ -1,138 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data for non-negative
-# matrix factorization
-#
-# follows lda's generative model
-# see Blei, Ng & Jordan, JMLR'03 paper
-# titled Latent Dirichlet Allocation
-#
-# $1 is number of samples
-# $2 is number of features
-# $3 is number of latent factors
-# $4 is number of features per sample
-#       (may overlap). use this to vary
-#       sparsity.      
-# $5 is file to store sample mixtures
-# $6 is file to store factors
-# $7 is file to store generated data
-#
-# $8 is the blocksize, i.e., number of rows per block
-#    (should be set such that $8x$2 fits in mem budget)
-
-numDocuments = $1
-numFeatures = $2
-numTopics = $3
-numWordsPerDoc = $4
-blocksize = $8
-
-docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, 
pdf="uniform", seed=0, sparsity=0.75)
-denomsTM = rowSums(docTopicMixtures)
-zerosInDenomsTM = (denomsTM == 0)
-denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
-parfor(i in 1:numTopics){
-       docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
-}
-write(docTopicMixtures, $5, format="binary")
-for(j in 2:numTopics){
-       docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
-}
-
-topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, 
pdf="uniform", seed=0, sparsity=0.75)
-parfor(i in 1:numTopics){
-       topicDist = topicDistributions[i,]
-       
-       denom2 = sum(topicDist)
-       if(denom2 == 0){
-               denom2 = denom2 + 0.1
-       }
-       
-       topicDistributions[i,] = topicDist / denom2
-}
-write(topicDistributions, $6, format="binary")
-for(j in 2:numFeatures){
-       topicDistributions[,j] = topicDistributions[,j-1] + 
topicDistributions[,j]
-}
-
-data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
-
-#outer-loop for blockwise computation
-for( k in seq(1,numDocuments,blocksize) )  
-{
-  len = min(blocksize,numDocuments-k); #block length
-  data = data0[k:(k+len),];            #obtain block
-  
-  parfor(i in 1:len){
-       docTopic = docTopicMixtures[i,]
-       
-       r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", 
seed=0)
-       r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", 
seed=0)
-       
-       for(j in 1:numWordsPerDoc){
-               rz = as.scalar(r_z[j,1])
-               continue = 1
-               
-               z = -1
-               #this is a workaround
-               #z=1    
-               
-               for(k1 in 1:numTopics){
-                       prob = as.scalar(docTopic[1,k1])
-                       if(continue==1 & rz <= prob){
-                               z=k1
-                               continue=0
-                       }
-               }
-               
-               if(z==-1){
-                       print("z is unassigned: " + z)
-                       z = numTopics
-               }
-               
-               rw = as.scalar(r_w[j,1])
-               continue = 1
-               
-               w = -1
-               #this is a workaround
-               #w = 1
-               
-               for(k2 in 1:numFeatures){
-                       prob = as.scalar(topicDistributions[z,k2])
-                       if(continue == 1 & rw <= prob){
-                               w = k2
-                               continue = 0
-                       }
-               }
-               
-               if(w==-1){
-                       print("w is unassigned: " + w)
-                       w = numFeatures
-               }
-               
-               data[i,w] = data[i,w] + 1
-       }
-  }
-  
-  data0[k:(k+len),] = data; # write block back
-}
-
-write(data0, $7, format="binary")
diff --git a/scripts/datagen/genRandData4PCA.dml 
b/scripts/datagen/genRandData4PCA.dml
deleted file mode 100644
index 413d5c458e..0000000000
--- a/scripts/datagen/genRandData4PCA.dml
+++ /dev/null
@@ -1,61 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# Synthetic data generator for PCA
-# 3 hidden dimensions (V1, V2, V3)
-# generates only "dense" data
-#
-# INPUT PARAMETERS:
-# 
--------------------------------------------------------------------------------------------
-# NAME   TYPE   DEFAULT  MEANING
-# 
--------------------------------------------------------------------------------------------
-# R      Int     10000   Number of rows
-# C      Int     1000    Number of categorical attributes
-# OUT    String  ---     Location (on HDFS) to store the generated dataset
-# FMT    String  "csv"   Matrix output format, usually "text", "csv" or 
"binary"
-# 
--------------------------------------------------------------------------------------------
-#
-# Example:
-# hadoop jar SystemDS.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000 
OUT=/user/biuser/pcaData.mtx FMT=csv
-
-R   = ifdef ($R, 10000)
-C   = ifdef ($C, 1000)
-FMT = ifdef ($FMT, "csv");
-
-# Modified version of the procedure from Zou et.al., "Sparse Principal 
Component Analysis", 2006.
-
-# V1 ~ N(0,290); V2~N(0,300); V3 = -0.3V1+0.925V2 + e, e ~ N(0,1)
-V1 = 0 + 290*rand(rows=R, cols=1, pdf="normal");
-V2 = 0 + 300*rand(rows=R, cols=1, pdf="normal");
-V3 = -0.3*V1 + 0.925*V2 + rand(rows=R, cols=1, pdf="normal");
-
-C1 = ceil(C/2.5);
-C2 = ceil(C/2.5);
-C3 = C - C1 - C2;
-
-M = matrix(0, rows=R, cols=C)
-
-M[,1:C1]       = rand(rows=R, cols=C1, pdf="normal") + V1;
-M[,C1+1:C1+C2] = rand(rows=R, cols=C2, pdf="normal") + V2;
-M[,C1+C2+1:C]  = rand(rows=R, cols=C3, pdf="normal") + V3;
-
-write(M, $OUT, format=FMT);
diff --git a/scripts/datagen/genRandData4StratStats.dml 
b/scripts/datagen/genRandData4StratStats.dml
deleted file mode 100644
index 6a4c07f734..0000000000
--- a/scripts/datagen/genRandData4StratStats.dml
+++ /dev/null
@@ -1,155 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# THIS SCRIPT GENERATES SYNTHETIC DATA FOR STRATSTATS (STRATIFIED STATISTICS) 
TESTING
-#
-# INPUT PARAMETERS:
-# 
--------------------------------------------------------------------------------------------
-# NAME   TYPE   DEFAULT  MEANING
-# 
--------------------------------------------------------------------------------------------
-# nr     Int    100000   Number of records in the generated dataset
-# nf     Int      10     Number of features in the X and the Y parts of the 
generated dataset
-# smin   Int     10000   Minimum stratum value, a positive integer
-# smax   Int     20000   Maximum stratum value, a positive integer
-# prs    Double  100.0   How many times more likely to have minimum vs. 
maximum stratum value
-# pxnan  Double    0.05  Probability of a NaN replacing a value in X
-# pynan  Double    0.05  Probability of a NaN replacing a value in Y
-# psnan  Double    0.05  Probability of a NaN replacing a value in the stratum 
column
-# 
--------------------------------------------------------------------------------------------
-# mxmin  Double   31.0   Baseline (mean) value for the first feature in X
-# mxmax  Double   40.0   Baseline (mean) value for the last feature in X
-# mymin  Double   11.0   Baseline (mean) value for the first feature in Y 
(before adding X)
-# mymax  Double   20.0   Baseline (mean) value for the last feature in Y 
(before adding X)
-# bmin   Double    3.0   "Beta" multiplied by X before adding to Y, for the 
first feature
-# bmax   Double    3.0   "Beta" multiplied by X before adding to Y, for the 
last feature
-# 
--------------------------------------------------------------------------------------------
-# sxbmin Double    3.0   Standard deviation for the first feature in X, 
stratum dependent
-# sxbmax Double    3.0   Standard deviation for the last feature in X, stratum 
dependent
-# sxwmin Double    4.0   Standard deviation for the first feature in X, 
residual
-# sxwmax Double    4.0   Standard deviation for the last feature in X, residual
-# sybmin Double sqrt(28) Standard deviation for the first feature in Y, 
stratum dependent
-# sybmax Double sqrt(28) Standard deviation for the last feature in Y, stratum 
dependent
-# sywmin Double    6.0   Standard deviation for the first feature in Y, 
residual
-# sywmax Double    6.0   Standard deviation for the last feature in Y, residual
-# 
--------------------------------------------------------------------------------------------
-# D      String  "Data"  Location (on HDFS) to store the generated dataset
-# Xcid   String  "Xcid"  Location (on HDFS) to store the column indices of X 
features
-# Ycid   String  "Ycid"  Location (on HDFS) to store the column indices of Y 
features
-# A      String  "Aux"   Location (on HDFS) to store the auxiliary parameter 
values, if any
-# fmt    String  "text"  Matrix output format, usually "text", "mm", or "csv"
-# 
--------------------------------------------------------------------------------------------
-# OUTPUT: Matrix with the generated dataset, Xcid and Ycid, and possibly other 
auxiliaries
-
-num_records   = ifdef ($nr, 100000);
-num_features  = ifdef ($nf, 10);
-min_stratumID = ifdef ($smin, 10000);
-max_stratumID = ifdef ($smax, 20000);
-prob_ratio_min_to_max_stratumID = ifdef ($prs, 100);
-prob_NaN_in_X = ifdef ($pxnan, 0.05);
-prob_NaN_in_Y = ifdef ($pynan, 0.05);
-prob_NaN_in_stratum = ifdef ($psnan, 0.05);
-
-mean_X_min = ifdef ($mxmin, 31.0);
-mean_X_max = ifdef ($mxmax, 40.0);
-mean_Y_min = ifdef ($mymin, 11.0);
-mean_Y_max = ifdef ($mymax, 20.0);
-beta_min   = ifdef ($bmin,   3.0);
-beta_max   = ifdef ($bmax,   3.0);
-
-stdev_X_between_strata_min = ifdef ($sxbmin, 3.0);
-stdev_X_between_strata_max = ifdef ($sxbmax, 3.0);
-stdev_X_within_strata_min  = ifdef ($sxwmin, 4.0);
-stdev_X_within_strata_max  = ifdef ($sxwmax, 4.0);
-stdev_Y_between_strata_min = ifdef ($sybmin, sqrt(28.0));
-stdev_Y_between_strata_max = ifdef ($sybmax, sqrt(28.0));
-stdev_Y_within_strata_min  = ifdef ($sywmin, 6.0);
-stdev_Y_within_strata_max  = ifdef ($sywmax, 6.0);
-
-fileData = ifdef ($D,    "Data");
-fileXcid = ifdef ($Xcid, "Xcid");
-fileYcid = ifdef ($Ycid, "Ycid");
-fileAux  = ifdef ($A,    "Aux" );
-fmt      = ifdef ($fmt,  "text");
-
-# Generate the strata, from 1 to (max_stratumID - min_stratumID + 1), as 
multinomial
-# in which 1 is less likely than (max_stratumID - min_stratumID + 1) by a 
factor of
-# prob_ratio_min_to_max_stratumID
-
-r_power = (max_stratumID - min_stratumID) / log 
(prob_ratio_min_to_max_stratumID);
-r_bound = prob_ratio_min_to_max_stratumID ^ (1.0 + 1.0 / (max_stratumID - 
min_stratumID));
-
-if (r_bound < 1.0) {
-    R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = 
"uniform");
-    R_S = r_bound + R_S * (1.0-r_bound);
-} else {
-    R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = 
"uniform");
-    R_S = 1.0 + R_S * (r_bound-1);
-}
-
-SID = round (0.5 + log (R_S) * r_power);
-num_strata = max (SID);
-Smap = table (SID, seq (1, num_records, 1));
-
-# Compute baseline values and standard deviations of X, Y, and beta, at each 
feature
-
-mean_X = mean_X_min + ((mean_X_max - mean_X_min) / (num_features - 1)) * seq 
(0, num_features - 1, 1);
-mean_Y = mean_Y_min + ((mean_Y_max - mean_Y_min) / (num_features - 1)) * seq 
(0, num_features - 1, 1);
-betas  =   beta_min + ((  beta_max -   beta_min) / (num_features - 1)) * seq 
(0, num_features - 1, 1);
-
-stdev_X_within_strata  = stdev_X_within_strata_min  + 
-    ((stdev_X_within_strata_max  - stdev_X_within_strata_min ) / (num_features 
- 1)) * seq (0, num_features - 1, 1);
-stdev_X_between_strata = stdev_X_between_strata_min + 
-    ((stdev_X_between_strata_max - stdev_X_between_strata_min) / (num_features 
- 1)) * seq (0, num_features - 1, 1);
-stdev_Y_within_strata  = stdev_Y_within_strata_min  + 
-    ((stdev_Y_within_strata_max  - stdev_Y_within_strata_min ) / (num_features 
- 1)) * seq (0, num_features - 1, 1);
-stdev_Y_between_strata = stdev_Y_between_strata_min + 
-    ((stdev_Y_between_strata_max - stdev_Y_between_strata_min) / (num_features 
- 1)) * seq (0, num_features - 1, 1);
-
-# Generate X and Y matrices
-
-RX_strata  = Rand (rows = num_features, cols = num_strata,  pdf = "normal");  
# transposed
-RY_strata  = Rand (rows = num_features, cols = num_strata,  pdf = "normal");  
# to allow
-RX_records = Rand (rows = num_features, cols = num_records, pdf = "normal");  
# matrix-vector
-RY_records = Rand (rows = num_features, cols = num_records, pdf = "normal");  
# operations
-
-t_X = RX_records * stdev_X_within_strata + (RX_strata * stdev_X_between_strata 
+ mean_X) %*% Smap;
-t_Y = RY_records * stdev_Y_within_strata + (RY_strata * stdev_Y_between_strata 
+ mean_Y) %*% Smap + (t_X * betas);
-Data = cbind (min_stratumID - 1 + SID, t(t_X), t(t_Y));
-
-# Set up the NaNs
-
-RNaNS = Rand  (rows = num_records, cols = 1, min = 1.0, max = 1.0, sparsity = 
prob_NaN_in_stratum);
-RNaNX = Rand  (rows = num_records, cols = num_features, min = 1.0, max = 1.0, 
sparsity = prob_NaN_in_X);
-RNaNY = Rand  (rows = num_records, cols = num_features, min = 1.0, max = 1.0, 
sparsity = prob_NaN_in_Y);
-Mask = cbind (RNaNS, RNaNX, RNaNY) != 0;
-Data = Data + (1.0 - Mask) / (1.0 - Mask);
-
-# Output the dataset and the auxiliaries
-
-Xcid = t(seq (2, num_features + 1, 1));
-Ycid = t(seq (num_features + 2, 2 * num_features + 1, 1));
-Aux = cbind (mean_X, mean_Y, betas);
-
-write (Data, fileData, format=fmt);
-write (Xcid, fileXcid, format=fmt);
-write (Ycid, fileYcid, format=fmt);
-write (Aux,  fileAux,  format=fmt);
-
diff --git a/scripts/datagen/genRandData4SurvAnalysis.dml 
b/scripts/datagen/genRandData4SurvAnalysis.dml
deleted file mode 100644
index 75117cf6d7..0000000000
--- a/scripts/datagen/genRandData4SurvAnalysis.dml
+++ /dev/null
@@ -1,133 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#  
-# THIS SCRIPT GENERATES RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL 
HAZARD MODELS
-# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTRIBUTION WITH PARAMETERS LAMBDA 
AND V
-#
-# INPUT   PARAMETERS:
-# 
---------------------------------------------------------------------------------------------
-# NAME    TYPE     DEFAULT      MEANING
-# 
---------------------------------------------------------------------------------------------
-# type    String   ---          The type of model for which the data is being 
generated: "kaplan-meier" or "cox"
-# n       Int      ---          Number of records 
-# l       Double   2.0          Scale parameter (lambda) of the Weibull distribution 
used for generating timestamps 
-# v       Double   1.5          Shape parameter of the Weibull distribution 
used for generating timestamps (default is 50 when type=cox)
-# p       Double   0.8          1 - probability of a record being censored
-# g       Int      2            If type=kaplan-meier the number of categorical 
features used for grouping 
-# s       Int      1            If type=kaplan-meier the number of categorical 
features used for stratifying
-# f       Int      10           If type=kaplan-meier maximum number of levels 
(i.e., distinct values) of g+s categorical features
-# m       Int      1000         If type=cox the number of features in the model
-# sp      Double   1.0          If type=cox the sparsity of the feature matrix 
-# O       String   ---          Location to write the output matrix containing 
random data for the kaplan-meier or the cox model 
-# B       String   ---          If type=cox location to write the output 
matrix containing the coefficients for the cox model 
-# TE     String   ---                  Location to store column indices of X 
corresponding to timestamp (first row) and event information (second row)
-# F       String   ---                 Location to store column indices of X 
which are to be used for fitting the Cox model
-# fmt     String   "text"       The output format of results of the 
kaplan-meier analysis, such as "text" or "csv"
-# 
---------------------------------------------------------------------------------------------
-# OUTPUTS: 
-# 1- If type=kaplan-meier an n x (2+g+s) matrix O with      
-#    - column 1 contains timestamps generated randomly from a Weibull 
distribution with parameters lambda and v
-#       - column 2 contains the information whether an event occurred (1) or 
data is censored (0)
-#       - columns 3:2+g contain categorical features used for grouping 
-#    - columns 3+g:2+g+s contain categorical features used for stratifying
-#   if type=cox an n x (2+m) matrix O with 
-#       - column 1 contains timestamps generated randomly from a Weibull 
distribution with parameters lambda and v
-#       - column 2 contains the information whether an event occurred (1) or 
data is censored (0)
-#       - columns 3:2+m contain scale features 
-# 2- If type=cox a coefficient matrix B
-# 3- A column matrix TE containing the column indices of X corresponding to 
timestamp (first row) and event information (second row)
-# 4- A column matrix F containing the column indices of X which are to be used 
for KM analysis or fitting the Cox model
-
-type = $type; # either "kaplan-meier" or "cox" 
-num_records = $n; 
-lambda = ifdef ($l, 2.0); 
-p_event = ifdef ($p, 0.8); # 1 - prob. of a record being censored
-# parameters related to the kaplan-meier model
-n_groups = ifdef ($g, 2);
-n_strata = ifdef ($s, 1);
-max_level = ifdef ($f, 10);
-# parameters related to the cox model
-num_features = ifdef ($m, 1000);  
-sparsity = ifdef ($sp, 1.0); 
-fileO = $O;
-fileB = $B; 
-fileTE = $TE;
-fileF = $F;
-fmtO = ifdef ($fmt, "text"); # $fmt="text" 
-p_censor = 1 - p_event; # prob. that record is censored
-
-if (type == "kaplan-meier") {
-       
-       v = ifdef ($v, 1.5);
-       # generate categorical features used for grouping and stratifying
-       X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min = 
0.000000001, max = max_level - 0.000000001, pdf = "uniform"));
-       
-       # generate timestamps
-       U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); 
-       T = (-log (U) / lambda) ^ (1/v);
-
-} else if (type == "cox") {
-
-       v = ifdef ($v, 50);
-       # generate feature matrix
-       X = rand (rows = num_records, cols = num_features, min = 1, max = 5, 
pdf = "uniform", sparsity = sparsity);
-
-       # generate coefficients
-       B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf = 
"uniform", sparsity = 1.0); # * beta_range;       
-
-       # generate timestamps
-       U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); 
-       T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v);
-
-} else {
-       stop ("Wrong model type!");
-}
-
-Y = matrix (0, rows = num_records, cols = 2);
-event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = 
(1 + p_event)));
-n_time = sum (event);
-Y[,2] = event;
-       
-# binning of event times
-min_T = min (T);
-max_T = max (T);
-# T = T - min_T;
-len = max_T - min_T;
-num_bins = len / n_time;
-T = ceil (T / num_bins);
-
-# print ("min(T) " + min(T) + " max(T) " + max(T));
-Y[,1] = T;
-
-O = cbind (Y, X);
-write (O, fileO, format = fmtO);
-
-if (type == "cox") {
-       write (B, fileB, format = fmtO);
-       
-}
-
-TE = matrix ("1 2", rows = 2, cols = 1);
-F = seq (1, num_features);
-write (TE, fileTE, format = fmtO);
-write (F, fileF, format = fmtO);
-
diff --git a/scripts/datagen/genRandData4Transform.dml 
b/scripts/datagen/genRandData4Transform.dml
deleted file mode 100644
index edab7c2873..0000000000
--- a/scripts/datagen/genRandData4Transform.dml
+++ /dev/null
@@ -1,96 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# Generates random data to test transform with
-#
-# rows, cols: dimensions of the data matrix to be generated
-# prob_cat: probability that a generated column is categorical
-# min_domain, max_domain: provide a range for domain sizes of the generated 
categorical cols
-# prob_missing: percentage of the generated (scale) cols to have missing values
-# prob_missing_cell: probability of a cell to have a missing value
-# out_X, out_missing, out_categorical: output file names
-#
-
-#params for size of data
-num_rows = ifdef($rows, 1000)
-num_cols = ifdef($cols, 25)
-
-#params for kind of cols
-prob_categorical = ifdef($prob_cat, 0.1)
-min_domain_size = ifdef($min_domain, 1)
-max_domain_size = ifdef($max_domain, 10)
-
-#params for missing value cols
-prob_missing_col = ifdef($prob_missing, 0.1)
-prob_missing_val = ifdef($prob_missing_cell, 0.2)
-
-num_scalar_cols = as.double(num_cols)
-num_categorical_cols = 0.0
-scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)
-if(prob_categorical > 0){
-  categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
-  categorical_ind = categorical_ind < prob_categorical
-  categorical_col_ids = removeEmpty(target=seq(1, num_cols, 
1)*categorical_ind, margin="rows")
-  num_categorical_cols = sum(categorical_ind)
-  write(categorical_col_ids, $out_categorical, format="csv")
-  
-  domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, 
pdf="uniform")
-  domain_sizes = round(min_domain_size + (max_domain_size - 
min_domain_size)*domain_sizes)
-  
-  categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, 
pdf="uniform")
-  categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1)))
-
-  scalar_ind = 1-categorical_ind
-}
-
-scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, 
margin="rows")
-num_scalar_cols = sum(scalar_ind)
-scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, 
pdf="uniform")
-  
-if(num_categorical_cols > 0 & num_scalar_cols > 0){
-  X = cbind(scalar_X, categorical_X)
-  permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids, 
num_scalar_cols, num_cols)
-  fill_in = matrix(0, rows=num_cols-num_scalar_cols, cols=num_cols)
-  permut_mat = t(cbind(t(permut_mat), t(fill_in)))
-  X = X %*% permut_mat
-}else{
-  if(num_categorical_cols > 0) X = categorical_X
-  else{
-    if(num_scalar_cols > 0) X = scalar_X
-    else print("somehow, we've managed to compute that precisely 0 cols should 
be categorical and 0 cols should be scale")
-  }
-}
-
-if(prob_missing_col > 0){
-  missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
-  missing_col_ind = missing_col_ind < prob_missing_col
-  #currently only support missing value imputation for scale cols
-  missing_col_ind = missing_col_ind * scalar_ind
-  missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, 
margin="rows")
-  missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, 
max=1, pdf="uniform")
-  missing_values = missing_values < prob_missing_val
-  X = cbind(X, missing_values)
-  
-  write(missing_col_ids, $out_missing, format="csv")
-}
-
-write(X, $out_X, format="csv")
diff --git a/scripts/datagen/genRandData4Univariate.dml 
b/scripts/datagen/genRandData4Univariate.dml
deleted file mode 100644
index bcbd528eb9..0000000000
--- a/scripts/datagen/genRandData4Univariate.dml
+++ /dev/null
@@ -1,61 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random numbers from a distribution
-# with specified mean, standard deviation, 
-# skewness, kurtosis
-# mean and standard deviation are taken in as
-# arguments by this script
-# a,b,c,d are coefficients computed by some
-# equation solver determined from the specified
-# skewness and kurtosis using power method
-# polynomials
-#
-# for more details see:
-# Statistical Simulation: Power Method Polynomials
-# and Other Transformations
-# Author: Todd C. Headrick
-# Chapman & Hall/CRC, Boca Raton, FL, 2010.
-# ISBN 978-1-4200-6490-2
-
-# $1 is the number of random points to be sampled
-# $2 is specified mean
-# $3 is specified standard deviation
-# $4-$7 are a,b,c,d obtained by solving a system
-# of equations using specified kurtosis and skewness
-# $8 is the file to write out the generated data to
-
-numSamples = $1
-mu = $2
-sigma = $3
-a = $4
-b = $5
-c = $6
-d = $7
-
-
-print("a=" + a + " b=" + b + " c=" + c + " d=" + d)
-
-X = Rand(rows=numSamples, cols=1, pdf="normal", seed=0)
-Y = a + b*X + c*X^2 + d*X^3
-
-Z = Y*sigma + mu
-write(Z, $8, format="binary")
diff --git a/scripts/datagen/obsolete/genCorrelatedData.dml 
b/scripts/datagen/obsolete/genCorrelatedData.dml
deleted file mode 100644
index fea33fd2e8..0000000000
--- a/scripts/datagen/obsolete/genCorrelatedData.dml
+++ /dev/null
@@ -1,46 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random correlated data
-# can generate any number of variables/columns
-# used to test univariate stats computation
-# by systemds
-
-# $1 is number of variables/columns
-# $2 is number of samples to create
-# $3 is the location to write out the covariance mat
-# $4 is the location to write out the generated data
-dims = $1
-numSamples = $2
-
-U = Rand(rows=dims, cols=dims, min=-1.0, max=1.0, pdf="uniform", seed=0)
-denoms = sqrt(colSums(U*U))
-parfor(i in 1:dims){
-       U[i,] = U[i,] / denoms
-}
-
-C = t(U)%*%U
-write(C, $3, format="binary")
-
-R = Rand(rows=numSamples, cols=dims, pdf="normal", seed=0)
-Rc = R%*%U
-write(Rc, $4, format="binary")
-
diff --git a/scripts/datagen/obsolete/genLinearRegressionData.dml 
b/scripts/datagen/obsolete/genLinearRegressionData.dml
deleted file mode 100644
index a3689541b0..0000000000
--- a/scripts/datagen/obsolete/genLinearRegressionData.dml
+++ /dev/null
@@ -1,71 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# This script generates random data for linear regression. A matrix is 
generated
-# consisting of a data matrix with a label column appended to it.
-#
-# INPUT PARAMETERS:
-# 
--------------------------------------------------------------------------------------------
-# NAME            TYPE    DEFAULT  MEANING
-# 
--------------------------------------------------------------------------------------------
-# numSamples      Int     ---      Number of samples
-# numFeatures     Int     ---      Number of features (independent variables)
-# maxFeatureValue Int     ---      Maximum feature value (absolute value)
-# maxWeight       Int     ---      Maximum weight (absolute value)
-# addNoise        Boolean ---      Determines whether noise should be added to 
Y
-# b               Double  ---      Intercept
-# sparsity        Double  ---      Controls the sparsity in the generated data 
(a value between 0 and 1)
-# output          String  ---      Location to write the generated data/label 
matrix
-# format          String  ---      Matrix output format
-# perc                   Double  0.8      Percentage of training sample
-# percFile               String  ---      File to store the percentages
-# 
--------------------------------------------------------------------------------------------
-# OUTPUT: Matrix of random data with appended label column
-# 
---------------------------------------------------------------------------------------------
-#
-# Example
-# ./runStandaloneSystemDS.sh algorithms/datagen/genLinearRegressionData.dml 
-nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 
addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv
-#
-
-perc = ifdef($perc, 0.8)
-percFile = ifdef($percFile, "perc.csv")
-p = matrix(0, rows=2, cols=1)
-p[1,1] = perc
-p[2,1] = (1-perc) 
-write(p, percFile, format="csv")
-
-X = Rand(cols=$numFeatures, max=1, min=-1, pdf="uniform", rows=$numSamples, 
seed=0, sparsity=$sparsity)
-X = X * $maxFeatureValue
-
-w = Rand(cols=1, max=1, min=-1, pdf="uniform", rows=$numFeatures, seed=0)
-w = w * $maxWeight
-
-Y = X %*% w
-Y = Y + $b
-
-if ($addNoise == TRUE) {
-    noise = Rand(cols=1, pdf="normal", rows=$numSamples, seed=0)
-    Y = Y + noise
-}
-
-Z = cbind(X,Y)
-write(Z, $output, format=$format)
\ No newline at end of file
diff --git a/scripts/perftest/README.md b/scripts/perftest/README.md
deleted file mode 100755
index 14ea405b3a..0000000000
--- a/scripts/perftest/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements.  See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License.  You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% end comment %}
--->
-
-# Performance Tests SystemDS
-
-To run all performance tests for SystemDS:
- * install systemds,
- * install the prerequisites,
- * navigate to the perftest directory: `cd $SYSTEMDS_ROOT/scripts/perftest`,
- * generate the data,
- * and execute.
-
-There are a few prerequisites:
-
-## Install SystemDS
-
-- First follow the install guide: 
<http://apache.github.io/systemds/site/install> and build the project.
-- Install the python package for python api benchmarks: 
<https://apache.github.io/systemds/api/python/getting_started/install.html>
-- Prepare to run SystemDS: <https://apache.github.io/systemds/site/run>
-
-## Install Additional Prerequisites
-- Setup Intel MKL: <http://apache.github.io/systemds/site/run>
-- Setup OpenBlas: 
<https://github.com/xianyi/OpenBLAS/wiki/Precompiled-installation-packages>
-- Install Perf stat: 
<https://linoxide.com/linux-how-to/install-perf-tool-centos-ubuntu/>
-
-## Generate Test Data
-
-Using the scripts found in `$SYSTEMDS_ROOT/scripts/perftest/datagen`, generate 
the data for the tests you want to run. Note that some parameters/args are 
optional while others are required. Dataset size is likely the most important of 
these.
-
-## Run the Benchmarks
-
-**Reminder: The scripts should be run from the perftest folder.**
-
-Examples:
-
-```bash
-./runAll.sh
-```
-
-Or look inside the runAll script to see how to run individual tests.
-
-Time calculations in the bash scripts may additionally subtract a number, e.g. 
".4".
-This is done to account for time lost to shell script and JVM startup 
overheads, to match the actual application runtime of SystemDS.
diff --git a/scripts/perftest/datagen/genALSData.sh 
b/scripts/perftest/datagen/genALSData.sh
deleted file mode 100755
index 3d1a22a675..0000000000
--- a/scripts/perftest/datagen/genALSData.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
-  echo "Please execute scripts from directory 'perftest'"
-  exit 1;
-fi
-
-CMD=$1
-DATADIR=$2/als
-MAXMEM=$3
-
-FORMAT="text" # can be csv, mm, text, binary
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-echo "-- Generating ALS data." >> results/times.txt;
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
-  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_dense 
rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $DENSE_SP" | 
bc` sigma=0.01 fmt=$FORMAT &
-  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X10k_1k_sparse rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 
10000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
-fi
-
-#generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
-  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X100k_1k_dense rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 
100000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
-  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X100k_1k_sparse rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 
100000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
-fi
-
-#generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
-  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_dense 
rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $DENSE_SP" 
| bc` sigma=0.01 fmt=$FORMAT &
-  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_sparse 
rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $SPARSE_SP" 
| bc` sigma=0.01 fmt=$FORMAT &
-fi
-
-#generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
-  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_dense 
rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * 
$DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT
-  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X10M_1k_sparse rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 
10000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT
-fi
-
-#generate XL scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
-  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X100M_1k_dense rows=100000000 cols=1000 rank=10 nnz=`echo 
"scale=0; 100000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT
-  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X100M_1k_sparse rows=100000000 cols=1000 rank=10 nnz=`echo 
"scale=0; 100000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genBinomialData.sh 
b/scripts/perftest/datagen/genBinomialData.sh
deleted file mode 100755
index 7bf3af96dd..0000000000
--- a/scripts/perftest/datagen/genBinomialData.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
-  echo "Please execute scripts from directory 'perftest'"
-  exit 1;
-fi
-
-CMD=$1
-BASE=$2/binomial
-MAXMEM=$3
-
-FORMAT="binary" # can be csv, mm, text, binary
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-echo -e "\n\n-- Generating binomial data..." >> results/times.txt;
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
-  ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 
5 ${BASE}/w10k_1k_dense ${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense 1 0 
$DENSE_SP $FORMAT 1       & pidDense80=$!
-  ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 
5 ${BASE}/w10k_1k_sparse ${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse 1 0 
$SPARSE_SP $FORMAT 1   & pidSparse80=$!
-  wait $pidDense80;  ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense ${BASE}/X10k_1k_dense_test 
${BASE}/y10k_1k_dense_test $FORMAT     &
-  wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse ${BASE}/X10k_1k_sparse_test 
${BASE}/y10k_1k_sparse_test $FORMAT &
-fi
-
-##generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
-  ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 
5 ${BASE}/w100k_1k_dense ${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense 1 0 
$DENSE_SP $FORMAT 1 & pidDense800=$!
-  ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 
5 ${BASE}/w100k_1k_sparse ${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse 1 0 
$SPARSE_SP $FORMAT 1 & pidSparse800=$!
-  wait $pidDense800;  ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense ${BASE}/X100k_1k_dense_test 
${BASE}/y100k_1k_dense_test $FORMAT &
-  wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse ${BASE}/X100k_1k_sparse_test 
${BASE}/y100k_1k_sparse_test $FORMAT &
-fi
-
-#generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
-  ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 1000000 1000 
5 5 ${BASE}/w1M_1k_dense ${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense 1 0 
$DENSE_SP $FORMAT 1  & pidDense8000=$!
-  ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 1000000 1000 
5 5 ${BASE}/w1M_1k_sparse ${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse 1 0 
$SPARSE_SP $FORMAT 1  & pidSparse8000=$!
-  wait $pidDense8000;  ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense ${BASE}/X1M_1k_dense_test 
${BASE}/y1M_1k_dense_test $FORMAT &
-  wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse ${BASE}/X1M_1k_sparse_test 
${BASE}/y1M_1k_sparse_test $FORMAT &
-fi
-
-#generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
-  ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000000 1000 
5 5 ${BASE}/w10M_1k_dense ${BASE}/X10M_1k_dense ${BASE}/y10M_1k_dense 1 0 
$DENSE_SP $FORMAT 1
-  ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000000 1000 
5 5 ${BASE}/w10M_1k_sparse ${BASE}/X10M_1k_sparse ${BASE}/y10M_1k_sparse 1 0 
$SPARSE_SP $FORMAT 1
-  ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_dense 
${BASE}/y10M_1k_dense ${BASE}/X10M_1k_dense_test ${BASE}/y10M_1k_dense_test 
$FORMAT
-  ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_sparse 
${BASE}/y10M_1k_sparse ${BASE}/X10M_1k_sparse_test ${BASE}/y10M_1k_sparse_test 
$FORMAT
-fi
-
-##generate XL scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
-  ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000000 
1000 5 5 ${BASE}/w100M_1k_dense ${BASE}/X100M_1k_dense ${BASE}/y100M_1k_dense 1 
0 $DENSE_SP $FORMAT 1
-  ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000000 
1000 5 5 ${BASE}/w100M_1k_sparse ${BASE}/X100M_1k_sparse 
${BASE}/y100M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1
-  ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_dense 
${BASE}/y100M_1k_dense ${BASE}/X100M_1k_dense_test ${BASE}/y100M_1k_dense_test 
$FORMAT
-  ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_sparse 
${BASE}/y100M_1k_sparse ${BASE}/X100M_1k_sparse_test 
${BASE}/y100M_1k_sparse_test $FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genClusteringData.sh 
b/scripts/perftest/datagen/genClusteringData.sh
deleted file mode 100755
index 35c49aaa6c..0000000000
--- a/scripts/perftest/datagen/genClusteringData.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
-  echo "Please execute scripts from directory 'perftest'"
-  exit 1;
-fi
-
-CMD=${1:-systemds}
-BASE=${2:-"temp"}/clustering
-MAXMEM=${3:-80}
-
-FORMAT="binary" 
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-echo "-- Generating clustering data..." >> results/times.txt;
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
-  ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=10000 nf=1000 nc=5 
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10k_1k_dense C=$BASE/C10k_1k_dense 
Y=$BASE/y10k_1k_dense YbyC=$BASE/YbyC10k_1k_dense fmt=$FORMAT & pidDense80=$!
-  wait $pidDense80; ${CMD} -f scripts/extractTestData.dml --args 
$BASE/X10k_1k_dense $BASE/y10k_1k_dense $BASE/X10k_1k_dense_test 
$BASE/y10k_1k_dense_test $FORMAT &
-fi
-
-#generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
-  ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=100000 nf=1000 nc=5 
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100k_1k_dense 
C=$BASE/C100k_1k_dense Y=$BASE/y100k_1k_dense YbyC=$BASE/YbyC100k_1k_dense 
fmt=$FORMAT & pidDense800=$!
-  wait $pidDense800; ${CMD} -f scripts/extractTestData.dml --args 
$BASE/X100k_1k_dense $BASE/y100k_1k_dense $BASE/X100k_1k_dense_test 
$BASE/y100k_1k_dense_test $FORMAT &
-fi
-
-#generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
-  ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=1000000 nf=1000 nc=5 
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X1M_1k_dense C=$BASE/C1M_1k_dense 
Y=$BASE/y1M_1k_dense YbyC=$BASE/YbyC1M_1k_dense fmt=$FORMAT & pidDense8000=$!
-  wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml --args 
$BASE/X1M_1k_dense $BASE/y1M_1k_dense $BASE/X1M_1k_dense_test 
$BASE/y1M_1k_dense_test $FORMAT &
-fi
-
-#generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
-  ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=10000000 nf=1000 
nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10M_1k_dense 
C=$BASE/C10M_1k_dense Y=$BASE/y10M_1k_dense YbyC=$BASE/YbyC10M_1k_dense 
fmt=$FORMAT
-  ${CMD} -f scripts/extractTestData.dml --args $BASE/X10M_1k_dense 
$BASE/y10M_1k_dense $BASE/X10M_1k_dense_test $BASE/y10M_1k_dense_test $FORMAT
-fi
-
-#generate LARGE scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
-  ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=100000000 nf=1000 
nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100M_1k_dense 
C=$BASE/C100M_1k_dense Y=$BASE/y100M_1k_dense YbyC=$BASE/YbyC100M_1k_dense 
fmt=$FORMAT
-  ${CMD} -f scripts/extractTestData.dml --args $BASE/X100M_1k_dense 
$BASE/y100M_1k_dense $BASE/X100M_1k_dense_test $BASE/y100M_1k_dense_test $FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genDescriptiveStatisticsData.sh 
b/scripts/perftest/datagen/genDescriptiveStatisticsData.sh
deleted file mode 100755
index 55af5f139c..0000000000
--- a/scripts/perftest/datagen/genDescriptiveStatisticsData.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
-  echo "Please execute scripts from directory 'perftest'"
-  exit 1;
-fi
-
-CMD=$1
-BASE=$2/bivar
-MAXMEM=$3
-
-FORMAT="binary"
-
-c=1000
-nc=100
-mdomain=1100
-set=20
-labelset=10
-
-#XS data 10K rows
-if [ $MAXMEM -ge 80 ]; then
-  ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats 
--nvargs R=10000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10k/data 
TYPES=${BASE}/A_10k/types SETSIZE=$set LABELSETSIZE=$labelset 
TYPES1=${BASE}/A_10k/set1.types TYPES2=${BASE}/A_10k/set2.types 
INDEX1=${BASE}/A_10k/set1.indices INDEX2=${BASE}/A_10k/set2.indices FMT=$FORMAT 
&
-fi
-
-#S data 100K rows
-if [ $MAXMEM -ge 800 ]; then
-  ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats 
--nvargs R=100000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_100k/data 
TYPES=${BASE}/A_100k/types SETSIZE=$set LABELSETSIZE=$labelset 
TYPES1=${BASE}/A_100k/set1.types TYPES2=${BASE}/A_100k/set2.types 
INDEX1=${BASE}/A_100k/set1.indices INDEX2=${BASE}/A_100k/set2.indices 
FMT=$FORMAT &
-fi
-
-#M data 1M rows
-if [ $MAXMEM -ge 8000 ]; then
-  ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats 
--nvargs R=1000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_1M/data 
TYPES=${BASE}/A_1M/types SETSIZE=$set LABELSETSIZE=$labelset 
TYPES1=${BASE}/A_1M/set1.types TYPES2=${BASE}/A_1M/set2.types 
INDEX1=${BASE}/A_1M/set1.indices INDEX2=${BASE}/A_1M/set2.indices FMT=$FORMAT &
-fi
-
-#L data 10M rows
-if [ $MAXMEM -ge 80000 ]; then
-  ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats 
--nvargs R=10000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10M/data 
TYPES=${BASE}/A_10M/types SETSIZE=$set LABELSETSIZE=$labelset 
TYPES1=${BASE}/A_10M/set1.types TYPES2=${BASE}/A_10M/set2.types 
INDEX1=${BASE}/A_10M/set1.indices INDEX2=${BASE}/A_10M/set2.indices FMT=$FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genDimensionReductionData.sh 
b/scripts/perftest/datagen/genDimensionReductionData.sh
deleted file mode 100755
index 2f6cc21b16..0000000000
--- a/scripts/perftest/datagen/genDimensionReductionData.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
-  echo "Please execute scripts from directory 'perftest'"
-  exit 1;
-fi
-
-CMD=${1:-systemds}
-BASE=${2:-"temp"}/dimensionreduction
-MAXMEM=${3:-80}
-
-FORMAT="binary"
-
-echo "-- Generating Dimension Reduction data." >> results/times.txt;
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
-  ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=5000 C=2000 
OUT=$BASE/pcaData5k_2k_dense FMT=$FORMAT &
-fi
-
-#generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
-  ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=50000 C=2000 
OUT=$BASE/pcaData50k_2k_dense FMT=$FORMAT &
-fi
-
-#generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
-  ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=500000 C=2000 
OUT=$BASE/pcaData500k_2k_dense FMT=$FORMAT &
-fi
-
-#generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
-  ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=5000000 C=2000 
OUT=$BASE/pcaData5M_2k_dense FMT=$FORMAT
-fi
-
-#generate XL scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
-  ${CMD} -f ${EXTRADOT}./datagen/genRandData4PCA.dml --nvargs R=50000000 
C=2000 OUT=$BASE/pcaData50M_2k_dense FMT=$FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genIOData.sh 
b/scripts/perftest/datagen/genIOData.sh
deleted file mode 100755
index 46154f8636..0000000000
--- a/scripts/perftest/datagen/genIOData.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
-  echo "Please execute scripts from directory 'perftest'"
-  exit 1;
-fi
-
-CMD=${1:-systemds}
-DATADIR=${2:-"temp"}/io
-MAXMEM=${3:-1}
-
-FORMAT="csv" # can be csv, mm, text, binary
-
-echo "-- Generating IO data." >> results/times.txt;
-
-
-#generate XS scenarios (10MB)
-if [ $MAXMEM -ge 1 ]; then
-  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X500_250_dense 
R=500 C=250 Fmt=$FORMAT &
-fi
-
-#generate XS scenarios (10MB)
-if [ $MAXMEM -ge 10 ]; then
-  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X5k_250_dense 
R=5000 C=250 Fmt=$FORMAT &
-fi
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
-  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X10k_1k_dense 
R=10000 C=1000 Fmt=$FORMAT &
-fi
-
-#generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
-  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X100k_1k_dense 
R=100000 C=1000 Fmt=$FORMAT &
-fi
-
-#generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
-  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X1M_1k_dense 
R=1000000 C=1000 Fmt=$FORMAT &
-fi
-
-#generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
-  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X10M_1k_dense 
R=10000000 C=1000 Fmt=$FORMAT &
-fi
-
-#generate XL scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
-  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X100M_1k_dense 
R=100000000 C=1000 Fmt=$FORMAT &
-fi
-
-wait
diff --git a/scripts/perftest/datagen/genL2SVMData.sh 
b/scripts/perftest/datagen/genL2SVMData.sh
deleted file mode 100755
index d25e433530..0000000000
--- a/scripts/perftest/datagen/genL2SVMData.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
-  echo "Please execute scripts from directory 'perftest'"
-  exit 1;
-fi
-
-CMD=$1
-DATADIR=$2
-
-FORMAT="binary" # can be csv, mm, text, binary
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-BASEPATH=$(dirname $0)
-
-#generate XS scenarios (80MB)
-${CMD} -f ${BASEPATH}/../datagen/genRandData4LogisticRegression.dml --args 
10000 1000 5 5 ${DATADIR}/w10k_1k_dense ${DATADIR}/X10k_1k_dense 
${DATADIR}/Y10k_1k_dense 1 0 $DENSE_SP $FORMAT 1
-${CMD} -f ${BASEPATH}/../datagen/genRandData4LogisticRegression.dml --args 
10000 1000 5 5 ${DATADIR}/w10k_1k_sparse ${DATADIR}/X10k_1k_sparse 
${DATADIR}/Y10k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1
diff --git a/scripts/perftest/datagen/genMultinomialData.sh 
b/scripts/perftest/datagen/genMultinomialData.sh
deleted file mode 100755
index 43dd6ea7ff..0000000000
--- a/scripts/perftest/datagen/genMultinomialData.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
-  echo "Please execute scripts from directory 'perftest'"
-  exit 1;
-fi
-
-CMD=$1
-BASE=$2/multinomial
-MAXMEM=$3
-
-FORMAT="binary" 
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-echo "-- Generating multinomial data..." >> results/times.txt;
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
-  ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 
$DENSE_SP 5 0 $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $FORMAT 1 & 
pidDense80=$!
-  ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 
$SPARSE_SP 5 0 $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $FORMAT 1 & 
pidSparse80=$!
-  wait $pidDense80;  ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $BASE/X10k_1k_dense_k5_test 
$BASE/y10k_1k_dense_k5_test $FORMAT &
-  wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $BASE/X10k_1k_sparse_k5_test 
$BASE/y10k_1k_sparse_k5_test $FORMAT &
-fi
-
-##generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
-  ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 
$DENSE_SP 5 0 $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $FORMAT 1 & 
pidDense800=$!
-  ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 
$SPARSE_SP 5 0 $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $FORMAT 1 & 
pidSparse800=$!
-  wait $pidDense800;  ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $BASE/X100k_1k_dense_k5_test 
$BASE/y100k_1k_dense_k5_test $FORMAT &
-  wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $BASE/X100k_1k_sparse_k5_test 
$BASE/y100k_1k_sparse_k5_test $FORMAT &
-fi
-
-##generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
-  ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 
$DENSE_SP 5 0 $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $FORMAT 1 & 
pidDense8000=$!
-  ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 
$SPARSE_SP 5 0 $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $FORMAT 1 & 
pidSparse8000=$!
-  wait $pidDense8000;  ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $BASE/X1M_1k_dense_k5_test 
$BASE/y1M_1k_dense_k5_test $FORMAT &
-  wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $BASE/X1M_1k_sparse_k5_test 
$BASE/y1M_1k_sparse_k5_test $FORMAT &
-fi
-
-##generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
-  ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 
$DENSE_SP 5 0 $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $FORMAT 1
-  ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 
$SPARSE_SP 5 0 $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 $FORMAT 1
-  ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_dense_k5 
$BASE/y10M_1k_dense_k5 $BASE/X10M_1k_dense_k5_test $BASE/y10M_1k_dense_k5_test 
$FORMAT
-  ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_sparse_k5 
$BASE/y10M_1k_sparse_k5 $BASE/X10M_1k_sparse_k5_test 
$BASE/y10M_1k_sparse_k5_test $FORMAT
-fi
-
-#generate LARGE scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
-  ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 
$DENSE_SP 5 0 $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $FORMAT 1
-  ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 
$SPARSE_SP 5 0 $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $FORMAT 1
-  ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_dense_k5 
$BASE/y100M_1k_dense_k5 $BASE/X100M_1k_dense_k5_test 
$BASE/y100M_1k_dense_k5_test $FORMAT
-  ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_sparse_k5 
$BASE/y100M_1k_sparse_k5 $BASE/X100M_1k_sparse_k5_test 
$BASE/y100M_1k_sparse_k5_test $FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genStratStatisticsData.sh 
b/scripts/perftest/datagen/genStratStatisticsData.sh
deleted file mode 100755
index 19c38e3fc7..0000000000
--- a/scripts/perftest/datagen/genStratStatisticsData.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
-  echo "Please execute scripts from directory 'perftest'"
-  exit 1;
-fi
-
-CMD=$1
-BASE=$2/stratstats
-MAXMEM=$3
-
-FORMAT="binary"
-
-echo "-- Generating stats data..." >> results/times.txt;
-
-#XS data 10K rows
-if [ $MAXMEM -ge 80 ]; then
-  ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs 
nr=10000 nf=100 D=${BASE}/A_10k/data Xcid=${BASE}/A_10k/Xcid 
Ycid=${BASE}/A_10k/Ycid A=${BASE}/A_10k/A fmt=$FORMAT &
-fi
-
-#S data 100K rows
-if [ $MAXMEM -ge 800 ]; then
-  ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs 
nr=100000 nf=100 D=${BASE}/A_100k/data Xcid=${BASE}/A_100k/Xcid 
Ycid=${BASE}/A_100k/Ycid A=${BASE}/A_100k/A fmt=$FORMAT &
-fi
-
-#M data 1M rows
-if [ $MAXMEM -ge 8000 ]; then
-  ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs 
nr=1000000 nf=100 D=${BASE}/A_1M/data Xcid=${BASE}/A_1M/Xcid 
Ycid=${BASE}/A_1M/Ycid A=${BASE}/A_1M/A fmt=$FORMAT &
-fi
-
-#L data 10M rows
-if [ $MAXMEM -ge 80000 ]; then
-  ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs 
nr=10000000 nf=100 D=${BASE}/A_10M/data Xcid=${BASE}/A_10M/Xcid 
Ycid=${BASE}/A_10M/Ycid A=${BASE}/A_10M/A fmt=$FORMAT
-fi
-
-#XL data 100M rows
-if [ $MAXMEM -ge 800000 ]; then
-  ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs 
nr=100000000 nf=100 D=${BASE}/A_10M/data Xcid=${BASE}/A_10M/Xcid 
Ycid=${BASE}/A_10M/Ycid A=${BASE}/A_10M/A fmt=$FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/datagen/genRandData4DecisionTree2.dml 
b/scripts/perftest/log4j.properties
similarity index 60%
rename from scripts/datagen/genRandData4DecisionTree2.dml
rename to scripts/perftest/log4j.properties
index 715924915c..9b751b57ca 100644
--- a/scripts/datagen/genRandData4DecisionTree2.dml
+++ b/scripts/perftest/log4j.properties
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -19,23 +19,13 @@
 #
 #-------------------------------------------------------------
 
+log4j.rootLogger=ERROR,console
 
-transformPath = $tPath;
-transformSpec = $tSpec;
-XCatFile = $XCat;
-XFile = $X;
-num_records = $num_records;
-num_scale_features = $num_scale;
-sparsity = $sp;
-fmt = $fmt;
-
-# generate scale features
-X_scale = rand (rows = num_records, cols = num_scale_features, min = 0, max = 
10, sparsity = sparsity); 
-
-# transform categorical features
-XCF = read (XCatFile);
-specJson = read(transformSpec, data_type="scalar", value_type="string");
-X_cat_transformed = transform (target = XCF, spec = specJson, transformPath = 
transformPath);
+log4j.logger.org.apache.sysds=ERROR
+log4j.logger.org.apache.spark=ERROR
+log4j.logger.org.apache.hadoop=OFF
 
-X = cbind (X_scale, X_cat_transformed);
-write (X, XFile, format = fmt);
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p 
%c{2}: %m%n
diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh
index 6d39043a74..17ad97b085 100755
--- a/scripts/perftest/runAll.sh
+++ b/scripts/perftest/runAll.sh
@@ -27,72 +27,19 @@ then
 fi
 
 # Command to be executed
-CMD="systemds"
+CMD="./sparkDML2.sh"
 TEMPFOLDER="temp"
 
 # Max memory of data to be benchmarked
 # Possible values: 80/80MB, 800/800MB, 8000/8000MB/8GB, 80000/80000MB/80GB, 
800000/800000MB/800GB
-MAXMEM=80
+MAXMEM=80000
 
 # Set properties
 export LOG4JPROP='conf/log4j-off.properties'
-export SYSDS_QUIET=1
-export SYSDS_EXEC_MODE="hybrid"
-export SYSTEMDS_STANDALONE_OPTS="-Xmx10g -Xms10g -Xmn2000m"
-export SYSDS_DISTRIBUTED=0
-
-if [ "$HOSTNAME" = "alpha" ]; then
-  # Just to make it easy to run on our machine without having to change 
anything.
-  export SYSTEMDS_STANDALONE_OPTS="-Xmx500g -Xms500g -Xmn50g"
-  export SYSDS_DISTRIBUTED=1
-  export SYSTEMDS_DISTRIBUTED_OPTS="\
-        --master yarn \
-        --deploy-mode client \
-        --driver-memory 500g \
-        --conf spark.driver.extraJavaOptions=\"-Xms500g -Xmn50g 
-Dlog4j.configuration=file:$LOG4JPROP\" \
-        --conf 
spark.executor.extraJavaOptions=\"-Dlog4j.configuration=file:$LOG4JPROP\" \
-        --conf spark.executor.heartbeatInterval=100s \
-        --files $LOG4JPROP \
-        --conf spark.network.timeout=512s \
-        --num-executors 6 \
-        --executor-memory 105g \
-        --executor-cores 32 \
-        "
-  MAXMEM="80GB"
-elif [ "$HOSTNAME" = "charlie" ]; then
-  export SYSTEMDS_STANDALONE_OPTS="-Xmx100g -Xms100g -Xmn10g"
-  export SYSDS_DISTRIBUTED=1
-  export SYSTEMDS_DISTRIBUTED_OPTS="\
-        --master yarn \
-        --deploy-mode client \
-        --driver-memory 100g \
-        --conf spark.driver.extraJavaOptions=\"-Xms100g -Xmn10g 
-Dlog4j.configuration=file:$LOG4JPROP\" \
-        --conf 
spark.executor.extraJavaOptions=\"-Dlog4j.configuration=file:$LOG4JPROP\" \
-        --conf spark.executor.heartbeatInterval=100s \
-        --files $LOG4JPROP \
-        --conf spark.network.timeout=512s \
-        --num-executors 6 \
-        --executor-memory 105g \
-        --executor-cores 32 \
-        "
-  MAXMEM="80GB"
-elif [ "$HOSTNAME" = "XPS-15-7590" ]; then
-  MAXMEM=800
-fi
-
-# Fix max mem to format.
-MAXMEM=${MAXMEM%"MB"}; MAXMEM=${MAXMEM/GB/"000"}
-
-# Possible lines to initialize Intel MKL, depending on version and install 
location
-if [ -d ~/intel ] && [ -d ~/intel/bin ] && [ -f ~/intel/bin/compilervars.sh ]; 
then
-    . ~/intel/bin/compilervars.sh intel64
-elif [ -d /opt ] && [ -d /opt/intel ] && [ -d /opt/intel/bin ]; then
-    . /opt/intel/bin/compilervars.sh intel64
-fi
 
 # make dirs if not exsisting
-mkdir -p logs 
-mkdir -p results 
+mkdir -p logs
+mkdir -p results
 mkdir -p temp
 
 # init time measurement
@@ -103,13 +50,13 @@ echo -e "\n$HOSTNAME" >> results/times.txt
 echo -e "\n\n" >> results/times.txt
 
 ## Data Gen
-# ./datagen/genBinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> 
logs/genBinomialData.out
-# ./datagen/genMultinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> 
logs/genMultinomialData.out
-# ./datagen/genDescriptiveStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> 
logs/genStatsData.out
-# ./datagen/genStratStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> 
logs/genStratStatsData.out
-# ./datagen/genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> 
logs/genClusteringData.out
-# ./datagen/genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> 
logs/genDimensionReductionData.out
-# ./datagen/genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genALSData.out
+./datagen/genBinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genBinomialData.out
+./datagen/genMultinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genMultinomialData.out
+#./datagen/genDescriptiveStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStatsData.out
+#./datagen/genStratStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStratStatsData.out
+#./datagen/genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genClusteringData.out
+#./datagen/genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genDimensionReductionData.out
+#./datagen/genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genALSData.out
 
 ### Micro Benchmarks:
 #./MatrixMult.sh ${CMD}
@@ -122,17 +69,17 @@ echo -e "\n\n" >> results/times.txt
 ./runAllBinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 ./runAllMultinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 ./runAllRegression.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
-./runAllStats.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
-./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
-./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
-./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
-./KnnMissingValueImputation.sh ${CMD} ${MAXMEM}
+#./runAllStats.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+#./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+#./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+#./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+#./KnnMissingValueImputation.sh ${CMD} ${MAXMEM}
 
 ### IO Benchmarks:
-./runAllIO.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+#./runAllIO.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 
 # TODO The following benchmarks have yet to be written. The decision tree algorithms additionally need to be fixed.
-# add stepwise Linear 
+# add stepwise Linear
 # add stepwise GLM
 #./runAllTrees.sh $CMD $TEMPFOLDER
 # add randomForest
diff --git a/scripts/perftest/runL2SVM.sh b/scripts/perftest/runL2SVM.sh
index b7ddb64d40..622cb95043 100755
--- a/scripts/perftest/runL2SVM.sh
+++ b/scripts/perftest/runL2SVM.sh
@@ -37,8 +37,7 @@ for i in 0 1; do
    #training
    tstart=$(date +%s.%N)
 
-   # /algorithms/l2-svm.dml already calls a built-in function for the l2 svm.
-   ${CMD} -f ./../algorithms/l2-svm.dml \
+   ${CMD} -f scripts/l2-svm.dml \
       "$FEDERATEDCOMPILATION" \
       --config conf/SystemDS-config.xml \
       --stats \
diff --git a/scripts/perftest/scripts/l2-svm-predict.dml b/scripts/perftest/scripts/l2-svm-predict.dml
index 31db539ec7..7ab7c94c6e 100755
--- a/scripts/perftest/scripts/l2-svm-predict.dml
+++ b/scripts/perftest/scripts/l2-svm-predict.dml
@@ -28,58 +28,16 @@ cmdLine_fmt = ifdef($fmt, "text")
 
 X = read($X)
 
-w = read($model)
+W = read($model)
 
-dimensions = as.scalar(w[nrow(w),1])
+dimensions = as.scalar(W[nrow(W),1])
 if(dimensions != ncol(X))
   stop("Stopping due to invalid input: Model dimensions do not seem to match 
input data dimensions")
 
-intercept = as.scalar(w[nrow(w)-1,1])
-negative_label = as.scalar(w[nrow(w)-2,1])
-positive_label = as.scalar(w[nrow(w)-3,1])
-w = w[1:(nrow(w)-4),]
+intercept = as.scalar(W[nrow(W)-1,1])
+W = W[1:(nrow(W)-2),]
 
-[scores, Y] = l2svmPredict(X = X, W = w, verbose = TRUE)
+[scores, Y] = l2svmPredict(X = X, W = W, verbose = TRUE)
 
-if(cmdLine_scores != " ")
-  write(scores, cmdLine_scores, format=cmdLine_fmt)
+write(scores, cmdLine_scores, format=cmdLine_fmt)
 
-if(!cmdLine_scoring_only){
-  Y = read(cmdLine_Y)
-
-  pred = (scores >= 0)
-  pred_labels = pred*positive_label + (1-pred)*negative_label
-  num_correct = sum(pred_labels == Y)
-  acc = 100*num_correct/nrow(X)
-
-  acc_str = "Accuracy (%): " + acc
-  print(acc_str)
-
-  if(cmdLine_accuracy != " ")
-    write(acc_str, cmdLine_accuracy)
-
-  if(cmdLine_confusion != " ") {
-    pred = 2*pred - 1
-    
-    if(negative_label != -1 | positive_label != +1)
-      Y = 2/(positive_label - negative_label)*Y - (negative_label + 
positive_label)/(positive_label - negative_label)
-
-    pred_is_minus = (pred == -1)
-    pred_is_plus = 1 - pred_is_minus
-    y_is_minus = (Y == -1)
-    y_is_plus = 1 - y_is_minus
-
-    check_min_y_minus = sum(pred_is_minus*y_is_minus)
-    check_min_y_plus = sum(pred_is_minus*y_is_plus)
-    check_max_y_minus = sum(pred_is_plus*y_is_minus)
-    check_max_y_plus = sum(pred_is_plus*y_is_plus)
-
-    confusion_mat = matrix(0, rows=2, cols=2)
-    confusion_mat[1,1] = check_min_y_minus
-    confusion_mat[1,2] = check_min_y_plus
-    confusion_mat[2,1] = check_max_y_minus
-    confusion_mat[2,2] = check_max_y_plus
-
-    write(confusion_mat, cmdLine_confusion, format="csv")
-  }
-}
diff --git a/scripts/datagen/genRandData4DecisionTree1.dml b/scripts/perftest/scripts/l2-svm.dml
similarity index 61%
rename from scripts/datagen/genRandData4DecisionTree1.dml
rename to scripts/perftest/scripts/l2-svm.dml
index 7d1dd50d6b..ac64de679d 100644
--- a/scripts/datagen/genRandData4DecisionTree1.dml
+++ b/scripts/perftest/scripts/l2-svm.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -19,22 +19,22 @@
 #
 #-------------------------------------------------------------
 
+fmt = ifdef($fmt, "text")       # output format of the written model
+icpt = ifdef($icpt, 0)          # intercept flag passed to l2svm
+tol = ifdef($tol, 0.001)        # tolerance passed to l2svm as epsilon
+reg = ifdef($reg, 1.0)          # regularization passed to l2svm
+maxiter = ifdef($maxiter, 100)  # iteration cap passed to l2svm
+
+tol = as.double(tol)  # cast the defaulted value; reading $tol again would bypass ifdef
+X = read($X)
+Y = read($Y)
 
-XCatFile = $XCat;
-YFile = $Y;
-num_records = $num_records;
-num_cat_features = $num_cat;
-num_class = $num_class;
-num_distinct = $num_distinct;
-sparsity = $sp;
+model = l2svm(X = X, Y = Y, intercept = icpt, epsilon = tol, reg = reg, maxIterations = maxiter, verbose = FALSE)
 
-# generate class labels
-Y = floor (rand (rows = num_records, cols = 1, min = 1, max = num_class + 
0.99999999999999)); 
-Y_bin = table (seq (1, num_records), Y); 
-write (Y_bin, YFile);
+extra_model_params = matrix(0, rows=2, cols=ncol(model))
+extra_model_params[1, 1] = icpt
+extra_model_params[2, 1] = ncol(X)
 
-# generate categorical features
-X_cat = floor (rand (rows = num_records, cols = num_cat_features, min = 1, max 
= num_distinct + 0.99999999999999, sparsity = sparsity));
-fX_cat = as.frame(X_cat);
-write (fX_cat, XCatFile, format = "csv");
+w = t(cbind(t(model), t(extra_model_params)))
 
+write(w, $model, format=fmt)
diff --git a/scripts/perftest/sparkDML2.sh b/scripts/perftest/sparkDML2.sh
new file mode 100644
index 0000000000..dde9805719
--- /dev/null
+++ b/scripts/perftest/sparkDML2.sh
@@ -0,0 +1,16 @@
+# Client-mode spark-submit wrapper: forwards all arguments to SystemDS.jar on YARN; set LOG4JPROP to override the log4j file.
+export SPARK_HOME=/home/hadoop/spark-3.3.1-bin-hadoop3
+export HADOOP_CONF_DIR=/home/hadoop/hadoop-3.3.1/etc/hadoop
+
+$SPARK_HOME/bin/spark-submit \
+     --master yarn \
+     --deploy-mode client \
+     --driver-memory 20g \
+     --num-executors 6 \
+     --conf spark.driver.extraJavaOptions="-Xms20g -Xmn2g -Dlog4j.configuration=file:${LOG4JPROP:-/home/mboehm/perftest/log4j.properties} " \
+     --conf spark.ui.showConsoleProgress=true \
+     --conf spark.executor.heartbeatInterval=100s \
+     --conf spark.network.timeout=512s \
+     --executor-memory 200g \
+     --executor-cores 48 \
+      SystemDS.jar "$@"
\ No newline at end of file
diff --git a/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmDatagen.java b/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmDatagen.java
index c23f3c5934..4473ef5f0e 100644
--- a/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmDatagen.java
+++ b/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmDatagen.java
@@ -241,13 +241,13 @@ public class AlgorithmDatagen extends AutomatedTestBase
                        double sparsity = sparse ? sparsity2 : sparsity1;
                        
                        if( type ==  DatagenType.LINREG) {
-                               fullDMLScriptName = 
"scripts/datagen/genRandData4LinearRegression.dml";
+                               fullDMLScriptName = 
"scripts/perftest/datagen/genRandData4LinearRegression.dml";
                                programArgs = new String[]{ "-stats", "-args",
                                        String.valueOf(rows), 
String.valueOf(cols), "10", "1", output("w"),
                                        output("X"), output("y"), "1", "1", 
String.valueOf(sparsity), "binary"};
                        }
                        else { //LOGREG
-                               fullDMLScriptName = 
"scripts/datagen/genRandData4LogisticRegression.dml";
+                               fullDMLScriptName = 
"scripts/perftest/datagen/genRandData4LogisticRegression.dml";
                                programArgs = new String[]{ "-stats", "-args",
                                        String.valueOf(rows), 
String.valueOf(cols), "10", "1", output("w"),
                                        output("X"), output("y"), "1", "1", 
String.valueOf(sparsity), "binary", "1"};
diff --git a/src/test/java/org/apache/sysds/test/functions/misc/UnivariateStatsBasicTest.java b/src/test/java/org/apache/sysds/test/functions/misc/UnivariateStatsBasicTest.java
index edb505d6c3..68dd8fd101 100644
--- a/src/test/java/org/apache/sysds/test/functions/misc/UnivariateStatsBasicTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/misc/UnivariateStatsBasicTest.java
@@ -71,7 +71,7 @@ public class UnivariateStatsBasicTest extends AutomatedTestBase
                        loadTestConfiguration(config);
                        
                        //run univariate stats data generator
-                       fullDMLScriptName = 
"./scripts/datagen/"+TEST_NAME_DATAGEN+".dml";
+                       fullDMLScriptName = 
"./scripts/perftest/datagen/"+TEST_NAME_DATAGEN+".dml";
                        programArgs = new String[]{ "-args", "100000", "100", 
"10", "1", "2", "3", "4", input("uni.mtx") };
                        runTest(true, false, null, -1);
                        


Reply via email to