This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 48e6f079fd [SYSTEMDS-3847] Fix non-functional perftest benchmarking suite
48e6f079fd is described below
commit 48e6f079fd93e794cbb8cddacae4b7407ee49ac8
Author: Matthias Boehm <[email protected]>
AuthorDate: Sat Apr 5 18:23:51 2025 +0200
[SYSTEMDS-3847] Fix non-functional perftest benchmarking suite
0) hard-coded server names / properties
1) windows line endings
2) memory configurations
3) datagen scripts
4) missing l2svm script
---
scripts/builtin/msvmPredict.dml | 2 +-
scripts/datagen/genRandData4ALS.dml | 47 -----
scripts/datagen/genRandData4ChisquaredTest.dml | 87 --------
scripts/datagen/genRandData4DecisionTree.sh | 58 -----
scripts/datagen/genRandData4DescriptiveStats.dml | 149 -------------
scripts/datagen/genRandData4FTest.dml | 95 ---------
scripts/datagen/genRandData4Kmeans.dml | 120 -----------
scripts/datagen/genRandData4LinearReg_LTstats.dml | 233 ---------------------
scripts/datagen/genRandData4LinearRegression.dml | 61 ------
scripts/datagen/genRandData4LogReg_LTstats.dml | 233 ---------------------
scripts/datagen/genRandData4LogisticRegression.dml | 72 -------
scripts/datagen/genRandData4MultiClassSVM.dml | 68 ------
scripts/datagen/genRandData4Multinomial.dml | 66 ------
scripts/datagen/genRandData4NMF.dml | 129 ------------
scripts/datagen/genRandData4NMFBlockwise.dml | 138 ------------
scripts/datagen/genRandData4PCA.dml | 61 ------
scripts/datagen/genRandData4StratStats.dml | 155 --------------
scripts/datagen/genRandData4SurvAnalysis.dml | 133 ------------
scripts/datagen/genRandData4Transform.dml | 96 ---------
scripts/datagen/genRandData4Univariate.dml | 61 ------
scripts/datagen/obsolete/genCorrelatedData.dml | 46 ----
.../datagen/obsolete/genLinearRegressionData.dml | 71 -------
scripts/perftest/README.md | 59 ------
scripts/perftest/datagen/genALSData.sh | 68 ------
scripts/perftest/datagen/genBinomialData.sh | 78 -------
scripts/perftest/datagen/genClusteringData.sh | 68 ------
.../datagen/genDescriptiveStatisticsData.sh | 60 ------
.../perftest/datagen/genDimensionReductionData.sh | 61 ------
scripts/perftest/datagen/genIOData.sh | 72 -------
scripts/perftest/datagen/genL2SVMData.sh | 38 ----
scripts/perftest/datagen/genMultinomialData.sh | 78 -------
scripts/perftest/datagen/genStratStatisticsData.sh | 61 ------
.../log4j.properties} | 30 +--
scripts/perftest/runAll.sh | 89 ++------
scripts/perftest/runL2SVM.sh | 3 +-
scripts/perftest/scripts/l2-svm-predict.dml | 54 +----
.../scripts/l2-svm.dml} | 34 +--
scripts/perftest/sparkDML2.sh | 16 ++
.../codegenalg/parttwo/AlgorithmDatagen.java | 4 +-
.../functions/misc/UnivariateStatsBasicTest.java | 2 +-
40 files changed, 72 insertions(+), 2984 deletions(-)
diff --git a/scripts/builtin/msvmPredict.dml b/scripts/builtin/msvmPredict.dml
index f869d7dc05..ed0271b14f 100644
--- a/scripts/builtin/msvmPredict.dml
+++ b/scripts/builtin/msvmPredict.dml
@@ -37,7 +37,7 @@
m_msvmPredict = function(Matrix[Double] X, Matrix[Double] W)
return(Matrix[Double] YRaw, Matrix[Double] Y)
{
- # Robustness for datasets with missing values
+ # Robustness for datasets with missing values
numNaNs = sum(isNaN(X))
if( numNaNs > 0 ) {
print("msvm: matrix X contains "+numNaNs+" missing values, replacing with 0.")
diff --git a/scripts/datagen/genRandData4ALS.dml b/scripts/datagen/genRandData4ALS.dml
deleted file mode 100644
index f6c3562862..0000000000
--- a/scripts/datagen/genRandData4ALS.dml
+++ /dev/null
@@ -1,47 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-Xfile = $X; # input matrix X of size m x n
-Ufile = ifdef($U, " "); # original row factor of size m x r
-Vfile = ifdef($V, " "); # original col factor of size r x n
-m = $rows; # no. of rows of X
-n = $cols; # no. of cols of X
-r = $rank; # rank of factorization
-nnz = $nnz; # no. of nonzeros in X
-sigma = ifdef ($sigma, 0.01); # variance of Gaussian noise
-fmt = ifdef ($fmt, "binary"); # output format
-
-# generate original factors by sampling from a normal(0,1.0) distribution
-U = rand(rows = m, cols = r, pdf = "normal", seed = 123);
-V = rand(rows = n, cols = r, pdf = "normal", seed = 456);
-
-I = floor(rand(rows = nnz, cols = 1, min = 1, max = m + 0.999999999));
-J = floor(rand(rows = nnz, cols = 1, min = 1, max = n + 0.999999999));
-X = rand(rows = nnz, cols = 1, pdf = "normal") * sqrt(sigma);
-N = table(I, J, X);
-X = (N != 0) * (U %*% t(V)) + N;
-write(X, Xfile, format = fmt);
-if( Ufile != " " )
- write(U, Ufile, format = fmt);
-if( Vfile != " " ) {
- V = t(V);
- write(V, Vfile, format = fmt);
-}
diff --git a/scripts/datagen/genRandData4ChisquaredTest.dml b/scripts/datagen/genRandData4ChisquaredTest.dml
deleted file mode 100644
index 8f2b945e01..0000000000
--- a/scripts/datagen/genRandData4ChisquaredTest.dml
+++ /dev/null
@@ -1,87 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates a two column matrix of categorical
-# variables
-# used to test systemds's chi-squared bivariate stat
-# computation
-
-# $1 is number of samples to generate
-# $2 is number of categories for 1st categorical variable
-# $3 is number of categories for 2nd categorical variable
-# $4 is the file to write out the chi-squared statistic to
-# $5 is the file to write out the generated data to
-
-numSamples = $1
-numCategories1 = $2
-numCategories2 = $3
-
-o = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=1.0, pdf="uniform", seed=0)
-o = o / sum(o)
-
-probs1 = rowSums(o)
-probs1 = probs1 / sum(probs1)
-probs2 = colSums(o)
-probs2 = probs2 / sum(probs2)
-e = probs1 %*% probs2
-
-chisquared = sum((o-e)^2/e)
-write(chisquared, $4, format="binary")
-
-oCDF = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=0.0, pdf="uniform", seed=0)
-for(i in 1:numCategories1){
- for(j in 1:numCategories2){
- if(i==1 & j==1){
- oCDF[i,j] = o[1,1]
- }
- if(i != 1 & j == 1){
- oCDF[i,j] = oCDF[i-1,numCategories2] + o[i,j]
- }
- if(j > 1){
- oCDF[i,j] = oCDF[i,j-1] + o[i,j]
- }
- }
-}
-
-one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform", seed=0)
-data = Rand(rows=numSamples, cols=2, min=0.0, max=0.0, pdf="uniform", seed=0)
-parfor(s in 1:numSamples){
- r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0)
- r = as.scalar(r_mat)
-
- cat1 = -1
- cat2 = -1
- continue = 1
- for(i in 1:numCategories1){
- for(j in 1:numCategories2){
- cdf = as.scalar(oCDF[i,j])
- if(continue == 1 & r <= cdf){
- cat1 = i
- cat2 = j
- continue = 0
- }
- }
- }
-
- data[s,1] = cat1*one
- data[s,2] = cat2*one
-}
-write(data, $5, format="binary")
diff --git a/scripts/datagen/genRandData4DecisionTree.sh b/scripts/datagen/genRandData4DecisionTree.sh
deleted file mode 100644
index 6564d518f1..0000000000
--- a/scripts/datagen/genRandData4DecisionTree.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-if [ "$1" == "" -o "$2" == "" ]; then echo "Usage: $0 <hdfsDataDir> <MR | SPARK | ECHO> e.g. $0 perftest SPARK" ; exit 1 ; fi
-# if [ "$2" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$2" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi
-
-BASE=$1/trees
-
-FORMAT="csv"
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-PATH_LOCAL=/tmp/datagen
-PATH_HDFS=$BASE
-
-#### part 1: generating class labels and categorical features
-${CMD} -f ../datagen/genRandData4DecisionTree1.dml $DASH-nvargs XCat=$BASE/XCat Y=$BASE/Y num_records=1000 num_cat=100 num_class=10 num_distinct=100 sp=$DENSE_SP
-
-#### part 2: generating spec.json on HDFS
-NUM_FEATURES=100
-
-echo "{ \"ids\": true
- ,\"recode\": [1 " > $PATH_LOCAL/spec.json
-for i in $(seq 2 $NUM_FEATURES); do
- echo " , "$i >> $PATH_LOCAL/spec.json
-done
-echo " ] , \"dummycode\": [ 1" >> $PATH_LOCAL/spec.json
-for i in $(seq 2 $NUM_FEATURES); do
- echo " , "$i >> $PATH_LOCAL/spec.json
-done
-echo "] }" >> $PATH_LOCAL/spec.json
-
-hadoop fs -rm $PATH_HDFS/spec.json
-hadoop fs -copyFromLocal $PATH_LOCAL/spec.json $PATH_HDFS/spec.json
-
-#### part 3: generating scale feature and transforming categorical features, finally combaning scale and categorical features
-${CMD} -f ../datagen/genRandData4DecisionTree2.dml $DASH-nvargs tPath=$BASE/metadata tSpec=$BASE/spec.json XCat=$BASE/XCat X=$BASE/X num_records=1000 num_scale=100 sp=$DENSE_SP fmt=$FORMAT
-
-
diff --git a/scripts/datagen/genRandData4DescriptiveStats.dml b/scripts/datagen/genRandData4DescriptiveStats.dml
deleted file mode 100644
index 6f96162074..0000000000
--- a/scripts/datagen/genRandData4DescriptiveStats.dml
+++ /dev/null
@@ -1,149 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
-------------------------------------------------
- Parameters
-------------------------------------------------
-$R = #rows
-$C = #columns
-$NC = number of categorical attributes
-$MAXDOMAIN = maximum domain size
-$DATA = output file path on HDFS
-$SETSIZE = Size of one bivariate set
-$LABELSETSIZE= Size of second bivariate set with labels
-$TYPES = output attribute types
-$TYPES1 = Attribute types for Set1
-$TYPES2 = Attribute types for Set2
-$INDEX1 = Indices for Set1
-$INDEX2 = Indices for Set2
-$FMT = output format
-------------------------------------------------
-hadoop jar SystemDS.jar -f genData4Stats.dml -nvargs R=1000000 C=1000 NC=50 MAXDOMAIN=1100 DATA=stats/data TYPES=stats/types SETSIZE=15 LABELSETSIZE=10 TYPES1=... Types2=... INDEX1=.. INDEX2=..FMT=csv
-------------------------------------------------
-*/
-
-
-FMT = ifdef($FMT,"binary"); # default output format
-
-# number of categorical attributes.. numC <= C
-numC = $NC;
-numO = as.integer(numC/2);
-numNominal = numC - numO;
-print("Categorical Mix = (" + numC + "," + numO + "," + numNominal +")");
-
-# maximum domain size among all categorical attributes
-maxDomainSize = $MAXDOMAIN;
-
-# Divide $C attributes according to the following logic:
-#
-# 1 numO numC C
-# |-------|---------|-----------------|
-# ord nominal scale
-#
-# numC+1-$C: scale
-# 1-numC/2: ordinal
-# (numC/2+1)-numC: nominal
-
-types = matrix(1, rows=1, cols=$C);
-ocutoff = numO;
-types[1,1:ocutoff] = matrix(1,rows=1,cols=ocutoff)*3;
-types[1, ocutoff+1:numC] = matrix(1,rows=1,cols=(numC-ocutoff))*2;
-
-# Generate data
-A = rand(rows=$R, cols=$C, sparsity=1);
-B = matrix(0,rows=nrow(A), cols=ncol(A));
-parfor (i in 1:numC) {
- Ai = A[,i];
-
- tmp = round(rand(rows=1,cols=1, min=1, max=maxDomainSize));
- domain = as.scalar(tmp[1,1]);
-
- # for some attributes, choose the maxDomainSize
- tmp = rand(rows=1,cols=1);
- if (as.scalar(tmp[1,1]) < 0.5) {
- domain = maxDomainSize;
- }
-
- B[,i] = round(1+(domain-1)*Ai);
-}
-B[ ,(numC+1):ncol(A)] = A[, (numC+1):ncol(A)];
-
-
-write(B, $DATA, format=FMT);
-write(types, $TYPES, format=FMT);
-
-# ----- Generator for Bivariate ---------
-
-settypes1 = matrix(1, rows=1, cols=$SETSIZE);
-index1 = matrix(0, rows=1, cols=$SETSIZE);
-
-catSetSize = as.integer($SETSIZE/2);
-ocutoff = as.integer(catSetSize/2);
-print("Set Mix = (" + $SETSIZE + "," + catSetSize + "," + ocutoff + ")" );
-settypes1[1, 1:ocutoff] = matrix(1,rows=1,cols=ocutoff)*3;
-settypes1[1, ocutoff+1:catSetSize] = matrix(1,rows=1,cols=(catSetSize-ocutoff))*2;
-
-# select ordinal indices
-tmp = rand(rows=1, cols=ocutoff);
-index1[1, 1:ocutoff] = round(1 + (numO-1)*tmp);
-
-# select nominal indices
-nominalSetSize = catSetSize-ocutoff;
-tmp = rand(rows=1, cols=nominalSetSize);
-index1[1, ocutoff+1:catSetSize] = round(numO+1 + (numC-numO-1)*tmp);
-
-# select scale attributes
-scaleSetSize = $SETSIZE-catSetSize;
-tmp = rand(rows=1, cols=scaleSetSize);
-index1[1, catSetSize+1:$SETSIZE] = round(numC+1 + ($C-numC-1)*tmp);
-
-
-# --- select types and indices for LABELSET
-settypes2 = matrix(2, rows=1, cols=$LABELSETSIZE);
-index2 = matrix(0, rows=1, cols=$LABELSETSIZE);
-if($LABELSETSIZE > 1) {
- settypes2[1,1] = 1;
- r = as.scalar(rand(rows=1,cols=1));
- index2[1,1] = round(numC+1 + ($C-numC-1)*r)
-}
-else {
- r = as.scalar(rand(rows=1,cols=1));
- index2[1,1] = round( numO+1 + (numC-numO-1)*r )
-}
-
-for(i in 2:as.integer($LABELSETSIZE/2)) {
- settypes2[1,i] = 3;
- r = as.scalar(rand(rows=1,cols=1));
- index2[1,i] = round( 1 + (numO-1)*r )
-}
-
-for(i in as.integer($LABELSETSIZE/2)+1:$LABELSETSIZE) {
- settypes2[1,i] = 2;
- r = as.scalar(rand(rows=1,cols=1));
- index2[1,i] = round( numO+1 + (numC-numO-1)*r )
-}
-
-write(settypes1, $TYPES1, format=FMT);
-write(settypes2, $TYPES2, format=FMT);
-write(index1, $INDEX1, format=FMT);
-write(index2, $INDEX2, format=FMT);
-
diff --git a/scripts/datagen/genRandData4FTest.dml b/scripts/datagen/genRandData4FTest.dml
deleted file mode 100644
index 9f0e1d6c68..0000000000
--- a/scripts/datagen/genRandData4FTest.dml
+++ /dev/null
@@ -1,95 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data for F-test
-#
-# $1 is number of groups (some of
-# which may share a gaussian)
-# $2 is number of actual groups
-# $3 is number of points
-# $4 is mean of the gaussian means
-# $5 is mean of the gaussian std. deviations
-# $6 is file to store computed f-statistic
-# $7 is file to store generated data
-
-numGroups = $1
-numActualGroups = $2
-numSamples = $3
-meanOfMeans = $4
-meanOfStddevs = $5
-
-cntProbs = Rand(rows=numGroups, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0)
-cntProbs = cntProbs/sum(cntProbs)
-cntArr = round(cntProbs * numSamples)
-last_cnt = cntArr[numGroups,1]
-cntArr[numGroups,1] = numSamples - (sum(cntArr) - last_cnt)
-
-permut = Rand(rows=numActualGroups, cols=numGroups, min=0.0, max=0.0, pdf="uniform")
-ones = Rand(rows=numActualGroups, cols=1, min=1.0, max=1.0, pdf="uniform")
-permut[,1:numActualGroups] = diag(ones)
-
-one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform")
-copy_start_index = numActualGroups+1
-parfor(i in copy_start_index:numGroups){
- r = Rand(rows=1, cols=1, min=1.0, max=numActualGroups, pdf="uniform", seed=0)
- j = as.scalar(round(r))
- permut[j,i] = one
-}
-
-means_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0)
-abs_means = means_std + meanOfMeans
-means = t(t(abs_means) %*% permut)
-
-stddevs_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0)
-abs_stddevs = stddevs_std + meanOfStddevs
-stddevs = t(t(abs_stddevs) %*% permut)
-
-overall_mean = sum(means*cntArr)/numSamples
-
-explained_variance = sum(cntArr * (means - overall_mean)^2) / (numGroups-1.0)
-unexplained_variance = sum(cntArr * stddevs^2) / (numSamples - numGroups)
-f = explained_variance / unexplained_variance
-write(f, $6, format="binary")
-
-cntCDFs = cntProbs
-for(i in 2:numGroups){
- cntCDFs[i,1] = cntCDFs[i-1,1] + cntProbs[i,1]
-}
-
-data = Rand(rows=numSamples, cols=1, min=0.0, max=0.0, pdf="uniform")
-parfor(i in 1:numSamples){
- r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0)
- r1 = as.scalar(r_mat)
-
- g = -1
- continue = 1
- for(k in 1:numGroups){
- cdf = as.scalar(cntCDFs[k,1])
- if(continue==1 & r1<=cdf){
- g = k
- continue=0
- }
- }
-
- point = Rand(rows=1, cols=1, pdf="normal", seed=0)
- data[i,1] = point*stddevs[g,1] + means[g,1]
-}
-write(data, $7, format="binary")
diff --git a/scripts/datagen/genRandData4Kmeans.dml b/scripts/datagen/genRandData4Kmeans.dml
deleted file mode 100644
index 3098650b26..0000000000
--- a/scripts/datagen/genRandData4Kmeans.dml
+++ /dev/null
@@ -1,120 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# Generates random Gaussian-mixture data to test k-Means clustering algorithms
-#
-# INPUT PARAMETERS:
-# ----------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-# ----------------------------------------------------------------------------
-# nr Int --- Number of records
-# nf Int --- Number of features
-# nc Int --- Number of clusters
-# dc Double --- St.dev. of cluster "centroid" features from zero mean
-# dr Double --- St.dev. of the 1-st feature in a record within cluster
-# fbf Double --- Feature bias factor: Stdev(last) / Stdev(1-st) feature
-# cbf Double --- Cluster bias factor: Prob[1-st clus] / Prob[k-th clus]
-# X String --- Location to write matrix X with generated data records
-# C String --- Location to write cluster "centroids" (Gaussian means)
-# Y String --- Location to write assignment of records to cluster ids
-# YbyC String --- Location to write rec-cluster assigns by min-dist to C
-# ----------------------------------------------------------------------------
-#
-# Example:
-# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100
-# nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx
-
-print ("BEGIN K-MEANS GENERATOR SCRIPT");
-
-num_records = $nr;
-num_features = $nf;
-num_centroids = $nc;
-dist_per_feature_centroids = $dc;
-dist_per_feature_first_record = $dr;
-feature_bias_factor = $fbf;
-cluster_bias_factor = $cbf;
-
-fileX = ifdef ($X, "X");
-fileC = ifdef ($C, "C");
-fileY = ifdef ($Y, "Y");
-fileYbyC = ifdef ($YbyC, "YbyC");
-fmt = ifdef ($fmt, "text");
-
-print ("Generating cluster distribution (mixture) centroids...");
-
-C = Rand (rows = num_centroids, cols = num_features, pdf = "normal");
-C = C * dist_per_feature_centroids;
-
-print ("Generating record-to-cluster assignments...");
-
-# Y is a multinomial in {1, ..., num_centroids} with 1 being more likely
-# than "num_centroids" by the factor of "cluster_bias_factor"
-
-rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");
-if (cluster_bias_factor == 1.0) {
- Y = round (0.5 + rnd * num_centroids);
-} else {
- rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / (num_centroids - 1)));
- Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log (cluster_bias_factor));
-}
-
-print ("Generating within-cluster random shifts...");
-
-X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal");
-feature_factors = dist_per_feature_first_record *
- exp ((seq (1, num_features) - 1) / (num_features - 1) * log (feature_bias_factor));
-X_shift = X_shift %*% diag (feature_factors);
-
-print ("Generating records by shifting from centroids...");
-
-Y_bitmap_raw = table (seq (1, num_records), Y);
-Y_bitmap = matrix (0, rows = num_records, cols = num_centroids);
-Y_bitmap [, 1 : ncol (Y_bitmap_raw)] = Y_bitmap_raw;
-X = Y_bitmap %*% C + X_shift;
-
-print ("Computing record-to-cluster assignments by minimum centroid distance...");
-
-D = t(t(-2 * (X %*% t(C))) + rowSums (C ^ 2));
-P = (D <= rowMins (D));
-aggr_P = t(cumsum (t(P)));
-Y_by_C = rowSums (aggr_P == 0) + 1;
-
-print ("Computing useful statistics...");
-
-sumXsq = sum (X ^ 2);
-default_wcss = sumXsq - sum (colSums (X) ^ 2) / num_records;
-attained_wcss = sumXsq + sum (rowMins (D));
-
-print ("Default (single-cluster) WCSS = " + default_wcss);
-print (num_centroids + "-cluster WCSS attained by the mixture centroids = " + attained_wcss);
-
-print ("Writing out the resulting dataset...");
-
-write (X, fileX, format = fmt);
-write (C, fileC, format = fmt);
-write (Y, fileY, format = fmt);
-write (Y_by_C, fileYbyC, format = fmt);
-
-print ("Please run the scoring script to compare " + fileY + " with " + fileYbyC);
-
-print ("DONE: K-MEANS GENERATOR SCRIPT");
-
diff --git a/scripts/datagen/genRandData4LinearReg_LTstats.dml b/scripts/datagen/genRandData4LinearReg_LTstats.dml
deleted file mode 100644
index 9bb1ca189e..0000000000
--- a/scripts/datagen/genRandData4LinearReg_LTstats.dml
+++ /dev/null
@@ -1,233 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# generates random data to test bi- and multinomial logistic regression
-
-# $N = number of training samples
-# $Nt = number of test samples (or 0 if none)
-# $nf = number of features (independent variables)
-# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
-# $Xmin = minimum feature value
-# $Xmax = maximum feature value
-# $spars = controls sparsity in the generated data
-# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
-# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
-# $stdLT = requested standard deviation for the linear terms
-# $iceptmin = intercept, minimum value (0.0 disables intercept)
-# $iceptmax = intercept, maximum value (0.0 disables intercept)
-# $B = location to store generated regression parameters
-# $X = location to store generated training data
-# $Y = location to store generated training category labels
-# $Xt = location to store generated test data
-# $Yt = location to store generated test category labels
-# $fmt = format of the output
-#
-# Example:
-# hadoop jar SystemDS.jar -f genRandData4LinearReg_LTstats.dml -nvargs
-# N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25
-# iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123 fmt=binary
-
-numTrainingSamples = $N;
-numTestSamples = $Nt;
-numFeatures = $nf;
-numCategories = $nc;
-minIntercept = $iceptmin;
-maxIntercept = $iceptmax;
-minXentry = $Xmin;
-maxXentry = $Xmax;
-minAvgLT = $avgLTmin;
-maxAvgLT = $avgLTmax;
-sparsityLevel = $spars;
-stdevLT = $stdLT;
-fileB = ifdef ($B, "B");
-fileX = ifdef ($X, "X");
-fileY = ifdef ($Y, "Y");
-fileXt = ifdef ($Xt, "Xt");
-fileYt = ifdef ($Yt, "Yt");
-fmt = ifdef ($fmt, "mm");
-
-numSamples = numTrainingSamples + numTestSamples;
-
-isBinomialPMOne = FALSE;
-if (numCategories == 1) {
- numCategories = 2;
- isBinomialPMOne = TRUE;
-}
-do_we_output_intercept = 1;
-if (minIntercept == 0 & maxIntercept == 0) {
- do_we_output_intercept = 0;
-}
-
-X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = maxXentry, pdf = "uniform", sparsity = sparsityLevel);
-
-meanLT = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = maxAvgLT, pdf = "uniform");
-sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
-b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, max = maxIntercept, pdf = "uniform");
-
-meanLT_minus_intercept = meanLT - b_intercept;
-[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
-
-ones = matrix (1.0, rows = numSamples, cols = 1);
-LT = X %*% B + ones %*% b_intercept;
-actual_meanLT = colSums (LT) / numSamples;
-actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
-
-for (i in 1:(numCategories - 1)) {
- if (as.scalar (new_sigmaLT [1, i]) == as.scalar (sigmaLT [1, i])) {
- print ("Category " + i + ": Intercept = " + as.scalar (b_intercept [1, i]));
- } else {
- print ("Category " + i + ": Intercept = " + as.scalar (b_intercept [1, i]) + ", st.dev.(LT) relaxed from " + as.scalar (sigmaLT [1, i]));
- }
- print (" Wanted LT mean = " + as.scalar (meanLT [1, i]) + ", st.dev. = " + as.scalar (new_sigmaLT [1, i]));
- print (" Actual LT mean = " + as.scalar (actual_meanLT [1, i]) + ", st.dev. = " + as.scalar (actual_sigmaLT [1, i]));
-}
-
-
-/*
-ones = matrix (1.0, rows = 1, cols = numCategories - 1);
-Prob = exp (LT);
-Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
-Prob = t(cumsum (t(Prob)));
-
-r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed = 0);
-R = r %*% ones;
-Y = 1 + rowSums (Prob < R);
-if (isBinomialPMOne) {
- Y = 3 - 2 * Y;
-}
-*/
-
-/* USE FOR LINEAR REGRESSION */
-
-r = Rand (rows = numSamples, cols = 1, pdf = "normal");
-Y = LT [, 1] + r;
-
-
-if (do_we_output_intercept == 1) {
- new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
- new_B [1:nrow(B), 1:ncol(B)] = B;
- new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
- write (new_B, fileB, format=fmt);
-} else {
- write (B, fileB, format=fmt);
-}
-
-if (numTestSamples > 0) {
- X_train = X [1:numTrainingSamples,];
- Y_train = Y [1:numTrainingSamples,];
- X_test = X [(numTrainingSamples+1):numSamples,];
- Y_test = Y [(numTrainingSamples+1):numSamples,];
- write (X_train, fileX, format=fmt);
- write (Y_train, fileY, format=fmt);
- write (X_test, fileXt, format=fmt);
- write (Y_test, fileYt, format=fmt);
-} else {
- write (X, fileX, format=fmt);
- write (Y, fileY, format=fmt);
-}
-
-
-
-
-
-
-# Generates weight vectors to ensure the desired statistics for Linear Terms = X %*% W
-# To be used for data generation in the testing of GLM, Logistic Regression, etc.
-# INPUT: meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] are
-# the desired mean and standard deviation for X %*% W[, i]
-# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
-# new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully enforced,
-# new_sigmaLT[1, i] > sigmaLT[1, i] if we had to relax this constraint.
-generateWeights =
- function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
- return (Matrix[double] W, Matrix[double] new_sigmaLT)
-{
- num_w = ncol (meanLT); # Number of output weight vectors
- dim_w = ncol (X); # Number of features / dimensions in a weight vector
- w_X = t(colSums(X)); # "Prohibited" weight shift direction that changes meanLT
- # (all orthogonal shift directions do not affect meanLT)
-
- # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
-
- w_1 = straightenX (X);
- r_1 = (X %*% w_1) - 1.0;
- norm_r_1_sq = sum (r_1 ^ 2);
-
- # For each W[, i] generate uniformly random directions to shift away from "w_1"
-
- DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
- DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X
- XDW = X %*% DW;
-
- # Determine how far to shift in the chosen directions to satisfy the constraints
- # Use the positive root of the quadratic equation; relax sigmaLT where needed
-
- a_qe = colSums (XDW ^ 2);
- b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
- c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
-
- is_sigmaLT_OK = (c_qe <= 0);
- new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) * sqrt (norm_r_1_sq / nrow(X));
- c_qe = is_sigmaLT_OK * c_qe;
- x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
-
- # Scale and shift "w_1" in the "DW" directions to produce the result:
-
- ones = matrix (1.0, rows = dim_w, cols = 1);
- W = w_1 %*% meanLT + DW * (ones %*% x_qe);
-}
-
-# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1
-# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
-# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X).
-straightenX =
- function (Matrix[double] X)
- return (Matrix[double] w)
-{
- w_X = t(colSums(X));
- lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
- eps = 0.000000001 * nrow(X);
-
- # BEGIN LEAST SQUARES
-
- r_LS = - w_X;
- z_LS = matrix (0.0, rows = ncol(X), cols = 1);
- p_LS = - r_LS;
- norm_r2_LS = sum (r_LS ^ 2);
- i_LS = 0;
- while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
- {
- temp_LS = X %*% p_LS;
- q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
- alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
- z_LS = z_LS + alpha_LS * p_LS;
- old_norm_r2_LS = norm_r2_LS;
- r_LS = r_LS + alpha_LS * q_LS;
- norm_r2_LS = sum (r_LS ^ 2);
- p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
- i_LS = i_LS + 1;
- }
-
- # END LEAST SQUARES
-
- w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
-}
diff --git a/scripts/datagen/genRandData4LinearRegression.dml b/scripts/datagen/genRandData4LinearRegression.dml
deleted file mode 100644
index ebce4f30d1..0000000000
--- a/scripts/datagen/genRandData4LinearRegression.dml
+++ /dev/null
@@ -1,61 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates data to test linear regression
-
-# $1 is number of samples
-# $2 is number of features (independent variables)
-# $3 is maximum feature value (absolute value)
-# $4 is maximum weight (absolute value)
-# $5 is location to store generated weights
-# $6 is location to store generated data
-# $7 is location to store generated labels
-# $8 is 0/1. 0 suppresses noise, 1 will add noise to Y
-# $9 is b, 0 disables intercept
-# $10 controls sparsity in the generated data
-# $11 output format
-
-numSamples = $1
-numFeatures = $2
-maxFeatureValue = $3
-maxWeight = $4
-addNoise = $8
-b = $9
-fmt = $11
-
-X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10)
-w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0)
-X = X * maxFeatureValue
-w = w * maxWeight
-Y = X %*% w
-
-if( b != 0 ) {
- b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform")
- w = rbind(w, t(b_mat))
- Y = Y + b
-}
-
-noise = Rand(rows=numSamples, cols=1, pdf="normal", seed=0)
-Y = Y + addNoise*noise
-
-write(w, $5, format=fmt)
-write(X, $6, format=fmt)
-write(Y, $7, format=fmt)
diff --git a/scripts/datagen/genRandData4LogReg_LTstats.dml b/scripts/datagen/genRandData4LogReg_LTstats.dml
deleted file mode 100644
index f95342f708..0000000000
--- a/scripts/datagen/genRandData4LogReg_LTstats.dml
+++ /dev/null
@@ -1,233 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# generates random data to test bi- and multinomial logistic regression
-
-# $N = number of training samples
-# $Nt = number of test samples (or 0 if none)
-# $nf = number of features (independent variables)
-# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
-# $Xmin = minimum feature value
-# $Xmax = maximum feature value
-# $spars = controls sparsity in the generated data
-# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
-# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
-# $stdLT = requested standard deviation for the linear terms
-# $iceptmin = intercept, minimum value (0.0 disables intercept)
-# $iceptmax = intercept, maximum value (0.0 disables intercept)
-# $B = location to store generated regression parameters
-# $X = location to store generated training data
-# $Y = location to store generated training category labels
-# $Xt = location to store generated test data
-# $Yt = location to store generated test category labels
-#
-# Example:
-# hadoop jar SystemDS.jar -f genRandData4LogReg_LTstats.dml -nvargs
-# N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25
-# iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123
-
-numTrainingSamples = $N;
-numTestSamples = $Nt;
-numFeatures = $nf;
-numCategories = $nc;
-minIntercept = $iceptmin;
-maxIntercept = $iceptmax;
-minXentry = $Xmin;
-maxXentry = $Xmax;
-minAvgLT = $avgLTmin;
-maxAvgLT = $avgLTmax;
-sparsityLevel = $spars;
-stdevLT = $stdLT;
-fileB = ifdef ($B, "B");
-fileX = ifdef ($X, "X");
-fileY = ifdef ($Y, "Y");
-fileXt = ifdef ($Xt, "Xt");
-fileYt = ifdef ($Yt, "Yt");
-
-
-numSamples = numTrainingSamples + numTestSamples;
-
-isBinomialPMOne = FALSE;
-if (numCategories == 1) {
- numCategories = 2;
- isBinomialPMOne = TRUE;
-}
-do_we_output_intercept = 1;
-if (minIntercept == 0 & maxIntercept == 0) {
- do_we_output_intercept = 0;
-}
-
-X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = maxXentry, pdf = "uniform", sparsity = sparsityLevel);
-
-meanLT = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = maxAvgLT, pdf = "uniform");
-sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
-b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, max = maxIntercept, pdf = "uniform");
-
-meanLT_minus_intercept = meanLT - b_intercept;
-[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
-
-ones = matrix (1.0, rows = numSamples, cols = 1);
-LT = X %*% B + ones %*% b_intercept;
-actual_meanLT = colSums (LT) / numSamples;
-actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
-
-for (i in 1:(numCategories - 1)) {
- if (as.scalar (new_sigmaLT [1, i]) == as.scalar (sigmaLT [1, i])) {
- print ("Category " + i + ": Intercept = " + as.scalar (b_intercept [1, i]));
- } else {
- print ("Category " + i + ": Intercept = " + as.scalar (b_intercept [1, i]) + ", st.dev.(LT) relaxed from " + as.scalar (sigmaLT [1, i]));
- }
- print (" Wanted LT mean = " + as.scalar (meanLT [1, i]) + ", st.dev. = " + as.scalar (new_sigmaLT [1, i]));
- print (" Actual LT mean = " + as.scalar (actual_meanLT [1, i]) + ", st.dev. = " + as.scalar (actual_sigmaLT [1, i]));
-}
-
-
-ones = matrix (1.0, rows = 1, cols = numCategories - 1);
-Prob = exp (LT);
-Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
-Prob = t(cumsum (t(Prob)));
-
-r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed = 0);
-R = r %*% ones;
-Y = 1 + rowSums (Prob < R);
-if (isBinomialPMOne) {
- Y = 3 - 2 * Y;
-}
-
-
-/* USE FOR LINEAR REGRESSION
-
-r = Rand (rows = numSamples, cols = 1, pdf = "normal");
-Y = LT [, 1] + r;
-
-*/
-
-
-if (do_we_output_intercept == 1) {
- new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
- new_B [1:nrow(B), 1:ncol(B)] = B;
- new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
- write (new_B, fileB, format="mm");
-} else {
- write (B, fileB, format="mm");
-}
-
-if (numTestSamples > 0) {
- X_train = X [1:numTrainingSamples,];
- Y_train = Y [1:numTrainingSamples,];
- X_test = X [(numTrainingSamples+1):numSamples,];
- Y_test = Y [(numTrainingSamples+1):numSamples,];
- write (X_train, fileX, format="mm");
- write (Y_train, fileY, format="mm");
- write (X_test, fileXt, format="mm");
- write (Y_test, fileYt, format="mm");
-} else {
- write (X, fileX, format="mm");
- write (Y, fileY, format="mm");
-}
-
-
-
-
-
-
-# Generates weight vectors to ensure the desired statistics for Linear Terms = X %*% W
-# To be used for data generation in the testing of GLM, Logistic Regression, etc.
-# INPUT: meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] are
-# the desired mean and standard deviation for X %*% W[, i]
-# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
-# new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully enforced,
-# new_sigmaLT[1, i] > sigmaLT[1, i] if we had to relax this constraint.
-generateWeights =
- function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
- return (Matrix[double] W, Matrix[double] new_sigmaLT)
-{
- num_w = ncol (meanLT); # Number of output weight vectors
- dim_w = ncol (X); # Number of features / dimensions in a weight vector
- w_X = t(colSums(X)); # "Prohibited" weight shift direction that changes meanLT
- # (all orthogonal shift directions do not affect meanLT)
-
- # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
-
- w_1 = straightenX (X);
- r_1 = (X %*% w_1) - 1.0;
- norm_r_1_sq = sum (r_1 ^ 2);
-
- # For each W[, i] generate uniformly random directions to shift away from "w_1"
-
- DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
- DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X
- XDW = X %*% DW;
-
- # Determine how far to shift in the chosen directions to satisfy the constraints
- # Use the positive root of the quadratic equation; relax sigmaLT where needed
-
- a_qe = colSums (XDW ^ 2);
- b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
- c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
-
- is_sigmaLT_OK = (c_qe <= 0);
- new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) * sqrt (norm_r_1_sq / nrow(X));
- c_qe = is_sigmaLT_OK * c_qe;
- x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
-
- # Scale and shift "w_1" in the "DW" directions to produce the result:
-
- ones = matrix (1.0, rows = dim_w, cols = 1);
- W = w_1 %*% meanLT + DW * (ones %*% x_qe);
-}
-
-# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1
-# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
-# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X).
-straightenX =
- function (Matrix[double] X)
- return (Matrix[double] w)
-{
- w_X = t(colSums(X));
- lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
- eps = 0.000000001 * nrow(X);
-
- # BEGIN LEAST SQUARES
-
- r_LS = - w_X;
- z_LS = matrix (0.0, rows = ncol(X), cols = 1);
- p_LS = - r_LS;
- norm_r2_LS = sum (r_LS ^ 2);
- i_LS = 0;
- while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
- {
- temp_LS = X %*% p_LS;
- q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
- alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
- z_LS = z_LS + alpha_LS * p_LS;
- old_norm_r2_LS = norm_r2_LS;
- r_LS = r_LS + alpha_LS * q_LS;
- norm_r2_LS = sum (r_LS ^ 2);
- p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
- i_LS = i_LS + 1;
- }
-
- # END LEAST SQUARES
-
- w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
-}
diff --git a/scripts/datagen/genRandData4LogisticRegression.dml b/scripts/datagen/genRandData4LogisticRegression.dml
deleted file mode 100644
index f0850938ad..0000000000
--- a/scripts/datagen/genRandData4LogisticRegression.dml
+++ /dev/null
@@ -1,72 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data to test linear logistic regression
-
-# $1 is number of samples
-# $2 is number of features (independent variables)
-# $3 is maximum feature value (absolute value)
-# $4 is maximum weight (absolute value)
-# $5 is location to store generated weights
-# $6 is location to store generated data
-# $7 is location to store generated labels
-# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
-# $9 is b, 0 disables intercept
-# $10 controls sparsity in the generated data
-# $11 output format
-# $12 transform labels. if 0 then -1/1; otherwise 1/2
-
-numSamples = $1
-numFeatures = $2
-maxFeatureValue = $3
-maxWeight = $4
-addNoise = $8
-b = $9
-
-X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10)
-X = X * maxFeatureValue
-
-w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0)
-w = w * maxWeight
-
-ot = X %*% w
-if( b != 0) {
- b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform")
- w = rbind(w, t(b_mat))
- ot = ot + b
-}
-
-prob = 1 / (1 + exp(-ot))
-if( addNoise == 1 ){
- r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
-}
-else {
- print("this data generator generates the same dataset for both noise=0 and noise=1")
- r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
-}
-
-Y = 1 - 2 * (prob < r)
-if( $12 == 1 )
- Y = (Y + 3) / 2
-
-write(w, $5, format=$11)
-write(X, $6, format=$11)
-write(Y, $7, format=$11)
diff --git a/scripts/datagen/genRandData4MultiClassSVM.dml b/scripts/datagen/genRandData4MultiClassSVM.dml
deleted file mode 100644
index 011b4dab18..0000000000
--- a/scripts/datagen/genRandData4MultiClassSVM.dml
+++ /dev/null
@@ -1,68 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data to test linear logistic regression
-
-# $1 is number of samples
-# $2 is number of features (independent variables)
-# $3 is maximum feature value (absolute value)
-# $4 is maximum weight (absolute value)
-# $5 is location to store generated weights
-# $6 is location to store generated data
-# $7 is location to store generated labels
-# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
-# $9 is b, 0 disables intercept
-# $10 controls sparsity in the generated data
-
-numSamples = $1
-numFeatures = $2
-maxFeatureValue = $3
-maxWeight = $4
-addNoise = $8
-b = $9
-
-X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10)
-X = X * maxFeatureValue
-
-w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0)
-w = w * maxWeight
-
-ot = X%*%w
-if(b!=0) {
- b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform")
- w = t(cbind(t(w), b_mat))
- ot = ot + b
-}
-
-prob = 1/(1+exp(-ot))
-if(addNoise == 1){
- r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
-}else{
- print("this data generator generates the same dataset for both noise=0 and noise=1")
- r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0)
- #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform")
-}
-Y = 1 - 2 * (prob < r)
-Y = (Y+3)/2
-
-write(w, $5, format="binary")
-write(X, $6, format="binary")
-write(Y, $7, format="binary")
diff --git a/scripts/datagen/genRandData4Multinomial.dml b/scripts/datagen/genRandData4Multinomial.dml
deleted file mode 100644
index 93666758b5..0000000000
--- a/scripts/datagen/genRandData4Multinomial.dml
+++ /dev/null
@@ -1,66 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-num_records = $1;
-num_features = $2;
-
-p = $3; #sparsity
-num_categories = $4; #num classes
-is_intercept = $5==1;
-
-stdevLT = 1.0;
-beta_range = 3.0 * stdevLT / sqrt (num_features * p);
-
-if (is_intercept) {
- intercept = Rand (rows = 1, cols = num_categories - 1, min = -1.0, max = 1.0);
-}
-
-X = Rand( rows = num_records,
- cols = num_features,
- min = 1,
- max = 5,
- pdf = "uniform",
- sparsity = p );
-
-B = Rand (rows = num_features,
- cols = num_categories - 1,
- min = -1.0,
- max = 1.0,
- pdf = "uniform",
- sparsity = 1.0) * beta_range;
-
-LT = X %*% B;
-if (is_intercept) {
- LT = LT + matrix (1, rows = num_records, cols = 1) %*% intercept;
-}
-
-Prob = exp (LT);
-Prob = Prob / (1.0 + rowSums(Prob));
-Prob = t(cumsum (t(Prob)));
-
-r = Rand (rows = num_records, cols = 1, min = 0, max = 1, pdf = "uniform");
-Y = 1 + rowSums (Prob < r);
-
-# ensure all classes are represented
-Y[(num_records-num_categories+1):num_records,1] = seq(1,num_categories);
-
-write(X, $6, format=$8)
-write(Y, $7, format=$8);
\ No newline at end of file
diff --git a/scripts/datagen/genRandData4NMF.dml b/scripts/datagen/genRandData4NMF.dml
deleted file mode 100644
index a82ac4e0f1..0000000000
--- a/scripts/datagen/genRandData4NMF.dml
+++ /dev/null
@@ -1,129 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data for non-negative
-# matrix factorization
-#
-# follows lda's generative model
-# see Blei, Ng & Jordan, JMLR'03 paper
-# titled Latent Dirichlet Allocation
-#
-# $1 is number of samples
-# $2 is number of features
-# $3 is number of latent factors
-# $4 is number of features per sample
-# (may overlap). use this to vary
-# sparsity.
-# $5 is file to store sample mixtures
-# $6 is file to store factors
-# $7 is file to store generated data
-
-numDocuments = $1
-numFeatures = $2
-numTopics = $3
-numWordsPerDoc = $4
-
-docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
-denomsTM = rowSums(docTopicMixtures)
-zerosInDenomsTM = denomsTM == 0
-denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
-parfor(i in 1:numTopics){
- docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
-}
-write(docTopicMixtures, $5, format="binary")
-for(j in 2:numTopics){
- docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
-}
-
-topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75)
-parfor(i in 1:numTopics){
- topicDist = topicDistributions[i,]
-
- denom2 = sum(topicDist)
- if(denom2 == 0){
- denom2 = denom2 + 0.1
- }
-
- topicDistributions[i,] = topicDist / denom2
-}
-write(topicDistributions, $6, format="binary")
-for(j in 2:numFeatures){
- topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j]
-}
-
-data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
-
-parfor(i in 1:numDocuments){
- docTopic = docTopicMixtures[i,]
-
- ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform");
-
- r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
- r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
-
- for(j in 1:numWordsPerDoc){
- rz = as.scalar(r_z[j,1])
- continue = 1
-
- z = -1
- #this is a workaround
- #z=1
-
- for(k1 in 1:numTopics){
- prob = as.scalar(docTopic[1,k1])
- if(continue==1 & rz <= prob){
- z=k1
- continue=0
- }
- }
-
- if(z==-1){
- print("z is unassigned: " + z)
- z = numTopics
- }
-
- rw = as.scalar(r_w[j,1])
- continue = 1
-
- w = -1
- #this is a workaround
- #w = 1
-
- for(k2 in 1:numFeatures){
- prob = as.scalar(topicDistributions[z,k2])
- if(continue == 1 & rw <= prob){
- w = k2
- continue = 0
- }
- }
-
- if(w==-1){
- print("w is unassigned: " + w)
- w = numFeatures
- }
-
- ldata[1,w] = ldata[1,w] + 1
- }
-
- data[i,] = ldata;
-}
-
-write(data, $7, format="binary")
diff --git a/scripts/datagen/genRandData4NMFBlockwise.dml b/scripts/datagen/genRandData4NMFBlockwise.dml
deleted file mode 100644
index 0ad548ead2..0000000000
--- a/scripts/datagen/genRandData4NMFBlockwise.dml
+++ /dev/null
@@ -1,138 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random data for non-negative
-# matrix factorization
-#
-# follows lda's generative model
-# see Blei, Ng & Jordan, JMLR'03 paper
-# titled Latent Dirichlet Allocation
-#
-# $1 is number of samples
-# $2 is number of features
-# $3 is number of latent factors
-# $4 is number of features per sample
-# (may overlap). use this to vary
-# sparsity.
-# $5 is file to store sample mixtures
-# $6 is file to store factors
-# $7 is file to store generated data
-#
-# $8 is the blocksize, i.e., number of rows per block
-# (should be set such that $8x$2 fits in mem budget)
-
-numDocuments = $1
-numFeatures = $2
-numTopics = $3
-numWordsPerDoc = $4
-blocksize = $8
-
-docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0,
pdf="uniform", seed=0, sparsity=0.75)
-denomsTM = rowSums(docTopicMixtures)
-zerosInDenomsTM = (denomsTM == 0)
-denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
-parfor(i in 1:numTopics){
- docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
-}
-write(docTopicMixtures, $5, format="binary")
-for(j in 2:numTopics){
- docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
-}
-
-topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0,
pdf="uniform", seed=0, sparsity=0.75)
-parfor(i in 1:numTopics){
- topicDist = topicDistributions[i,]
-
- denom2 = sum(topicDist)
- if(denom2 == 0){
- denom2 = denom2 + 0.1
- }
-
- topicDistributions[i,] = topicDist / denom2
-}
-write(topicDistributions, $6, format="binary")
-for(j in 2:numFeatures){
- topicDistributions[,j] = topicDistributions[,j-1] +
topicDistributions[,j]
-}
-
-data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
-
-#outer-loop for blockwise computation
-for( k in seq(1,numDocuments,blocksize) )
-{
- len = min(blocksize,numDocuments-k); #block length
- data = data0[k:(k+len),]; #obtain block
-
- parfor(i in 1:len){
- docTopic = docTopicMixtures[i,]
-
- r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform",
seed=0)
- r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform",
seed=0)
-
- for(j in 1:numWordsPerDoc){
- rz = as.scalar(r_z[j,1])
- continue = 1
-
- z = -1
- #this is a workaround
- #z=1
-
- for(k1 in 1:numTopics){
- prob = as.scalar(docTopic[1,k1])
- if(continue==1 & rz <= prob){
- z=k1
- continue=0
- }
- }
-
- if(z==-1){
- print("z is unassigned: " + z)
- z = numTopics
- }
-
- rw = as.scalar(r_w[j,1])
- continue = 1
-
- w = -1
- #this is a workaround
- #w = 1
-
- for(k2 in 1:numFeatures){
- prob = as.scalar(topicDistributions[z,k2])
- if(continue == 1 & rw <= prob){
- w = k2
- continue = 0
- }
- }
-
- if(w==-1){
- print("w is unassigned: " + w)
- w = numFeatures
- }
-
- data[i,w] = data[i,w] + 1
- }
- }
-
- data0[k:(k+len),] = data; # write block back
-}
-
-write(data0, $7, format="binary")
diff --git a/scripts/datagen/genRandData4PCA.dml
b/scripts/datagen/genRandData4PCA.dml
deleted file mode 100644
index 413d5c458e..0000000000
--- a/scripts/datagen/genRandData4PCA.dml
+++ /dev/null
@@ -1,61 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# Synthetic data generator for PCA
-# 3 hidden dimensions (V1, V2, V3)
-# generates only "dense" data
-#
-# INPUT PARAMETERS:
-#
--------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-#
--------------------------------------------------------------------------------------------
-# R Int 10000 Number of rows
-# C Int 1000 Number of categorical attributes
-# OUT String --- Location (on HDFS) to store the generated dataset
-# FMT String "csv" Matrix output format, usually "text", "csv" or
"binary"
-#
--------------------------------------------------------------------------------------------
-#
-# Example:
-# hadoop jar SystemDS.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000
OUT=/user/biuser/pcaData.mtx FMT=csv
-
-R = ifdef ($R, 10000)
-C = ifdef ($C, 1000)
-FMT = ifdef ($FMT, "csv");
-
-# Modified version of the procedure from Zou et al., "Sparse Principal
Component Analysis", 2006.
-
-# V1 ~ N(0,290); V2~N(0,300); V3 = -0.3V1+0.925V2 + e, e ~ N(0,1)
-V1 = 0 + 290*rand(rows=R, cols=1, pdf="normal");
-V2 = 0 + 300*rand(rows=R, cols=1, pdf="normal");
-V3 = -0.3*V1 + 0.925*V2 + rand(rows=R, cols=1, pdf="normal");
-
-C1 = ceil(C/2.5);
-C2 = ceil(C/2.5);
-C3 = C - C1 - C2;
-
-M = matrix(0, rows=R, cols=C)
-
-M[,1:C1] = rand(rows=R, cols=C1, pdf="normal") + V1;
-M[,C1+1:C1+C2] = rand(rows=R, cols=C2, pdf="normal") + V2;
-M[,C1+C2+1:C] = rand(rows=R, cols=C3, pdf="normal") + V3;
-
-write(M, $OUT, format=FMT);
diff --git a/scripts/datagen/genRandData4StratStats.dml
b/scripts/datagen/genRandData4StratStats.dml
deleted file mode 100644
index 6a4c07f734..0000000000
--- a/scripts/datagen/genRandData4StratStats.dml
+++ /dev/null
@@ -1,155 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# THIS SCRIPT GENERATES SYNTHETIC DATA FOR STRATSTATS (STRATIFIED STATISTICS)
TESTING
-#
-# INPUT PARAMETERS:
-#
--------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-#
--------------------------------------------------------------------------------------------
-# nr Int 100000 Number of records in the generated dataset
-# nf Int 10 Number of features in the X and the Y parts of the
generated dataset
-# smin Int 10000 Minimum stratum value, a positive integer
-# smax Int 20000 Maximum stratum value, a positive integer
-# prs Double 100.0 How many times more likely to have minimum vs.
maximum stratum value
-# pxnan Double 0.05 Probability of a NaN replacing a value in X
-# pynan Double 0.05 Probability of a NaN replacing a value in Y
-# psnan Double 0.05 Probability of a NaN replacing a value in the stratum
column
-#
--------------------------------------------------------------------------------------------
-# mxmin Double 10.0 Baseline (mean) value for the first feature in X
-# mxmax Double 19.0 Baseline (mean) value for the last feature in X
-# mymin Double 30.0 Baseline (mean) value for the first feature in Y
(before adding X)
-# mymax Double 39.0 Baseline (mean) value for the last feature in Y
(before adding X)
-# bmin Double 3.0 "Beta" multiplied by X before adding to Y, for the
first feature
-# bmax Double 3.0 "Beta" multiplied by X before adding to Y, for the
last feature
-#
--------------------------------------------------------------------------------------------
-# sxbmin Double 3.0 Standard deviation for the first feature in X,
stratum dependent
-# sxbmax Double 3.0 Standard deviation for the last feature in X, stratum
dependent
-# sxwmin Double 4.0 Standard deviation for the first feature in X,
residual
-# sxwmax Double 4.0 Standard deviation for the last feature in X, residual
-# sybmin Double sqrt(28) Standard deviation for the first feature in Y,
stratum dependent
-# sybmax Double sqrt(28) Standard deviation for the last feature in Y, stratum
dependent
-# sywmin Double 6.0 Standard deviation for the first feature in Y,
residual
-# sywmax Double 6.0 Standard deviation for the last feature in Y, residual
-#
--------------------------------------------------------------------------------------------
-# D String "Data" Location (on HDFS) to store the generated dataset
-# Xcid String "Xcid" Location (on HDFS) to store the column indices of X
features
-# Ycid String "Ycid" Location (on HDFS) to store the column indices of Y
features
-# A String "Aux" Location (on HDFS) to store the auxiliary parameter
values, if any
-# fmt String "text" Matrix output format, usually "text", "mm", or "csv"
-#
--------------------------------------------------------------------------------------------
-# OUTPUT: Matrix with the generated dataset, Xcid and Ycid, and possibly other
auxiliaries
-
-num_records = ifdef ($nr, 100000);
-num_features = ifdef ($nf, 10);
-min_stratumID = ifdef ($smin, 10000);
-max_stratumID = ifdef ($smax, 20000);
-prob_ratio_min_to_max_stratumID = ifdef ($prs, 100);
-prob_NaN_in_X = ifdef ($pxnan, 0.05);
-prob_NaN_in_Y = ifdef ($pynan, 0.05);
-prob_NaN_in_stratum = ifdef ($psnan, 0.05);
-
-mean_X_min = ifdef ($mxmin, 31.0);
-mean_X_max = ifdef ($mxmax, 40.0);
-mean_Y_min = ifdef ($mymin, 11.0);
-mean_Y_max = ifdef ($mymax, 20.0);
-beta_min = ifdef ($bmin, 3.0);
-beta_max = ifdef ($bmax, 3.0);
-
-stdev_X_between_strata_min = ifdef ($sxbmin, 3.0);
-stdev_X_between_strata_max = ifdef ($sxbmax, 3.0);
-stdev_X_within_strata_min = ifdef ($sxwmin, 4.0);
-stdev_X_within_strata_max = ifdef ($sxwmax, 4.0);
-stdev_Y_between_strata_min = ifdef ($sybmin, sqrt(28.0));
-stdev_Y_between_strata_max = ifdef ($sybmax, sqrt(28.0));
-stdev_Y_within_strata_min = ifdef ($sywmin, 6.0);
-stdev_Y_within_strata_max = ifdef ($sywmax, 6.0);
-
-fileData = ifdef ($D, "Data");
-fileXcid = ifdef ($Xcid, "Xcid");
-fileYcid = ifdef ($Ycid, "Ycid");
-fileAux = ifdef ($A, "Aux" );
-fmt = ifdef ($fmt, "text");
-
-# Generate the strata, from 1 to (max_stratumID - min_stratumID + 1), as
multinomial
-# in which 1 is less likely than (max_stratumID - min_stratumID + 1) by a
factor of
-# prob_ratio_min_to_max_stratumID
-
-r_power = (max_stratumID - min_stratumID) / log
(prob_ratio_min_to_max_stratumID);
-r_bound = prob_ratio_min_to_max_stratumID ^ (1.0 + 1.0 / (max_stratumID -
min_stratumID));
-
-if (r_bound < 1.0) {
- R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf =
"uniform");
- R_S = r_bound + R_S * (1.0-r_bound);
-} else {
- R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf =
"uniform");
- R_S = 1.0 + R_S * (r_bound-1);
-}
-
-SID = round (0.5 + log (R_S) * r_power);
-num_strata = max (SID);
-Smap = table (SID, seq (1, num_records, 1));
-
-# Compute baseline values and standard deviations of X, Y, and beta, at each
feature
-
-mean_X = mean_X_min + ((mean_X_max - mean_X_min) / (num_features - 1)) * seq
(0, num_features - 1, 1);
-mean_Y = mean_Y_min + ((mean_Y_max - mean_Y_min) / (num_features - 1)) * seq
(0, num_features - 1, 1);
-betas = beta_min + (( beta_max - beta_min) / (num_features - 1)) * seq
(0, num_features - 1, 1);
-
-stdev_X_within_strata = stdev_X_within_strata_min +
- ((stdev_X_within_strata_max - stdev_X_within_strata_min ) / (num_features
- 1)) * seq (0, num_features - 1, 1);
-stdev_X_between_strata = stdev_X_between_strata_min +
- ((stdev_X_between_strata_max - stdev_X_between_strata_min) / (num_features
- 1)) * seq (0, num_features - 1, 1);
-stdev_Y_within_strata = stdev_Y_within_strata_min +
- ((stdev_Y_within_strata_max - stdev_Y_within_strata_min ) / (num_features
- 1)) * seq (0, num_features - 1, 1);
-stdev_Y_between_strata = stdev_Y_between_strata_min +
- ((stdev_Y_between_strata_max - stdev_Y_between_strata_min) / (num_features
- 1)) * seq (0, num_features - 1, 1);
-
-# Generate X and Y matrices
-
-RX_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal");
# transposed
-RY_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal");
# to allow
-RX_records = Rand (rows = num_features, cols = num_records, pdf = "normal");
# matrix-vector
-RY_records = Rand (rows = num_features, cols = num_records, pdf = "normal");
# operations
-
-t_X = RX_records * stdev_X_within_strata + (RX_strata * stdev_X_between_strata
+ mean_X) %*% Smap;
-t_Y = RY_records * stdev_Y_within_strata + (RY_strata * stdev_Y_between_strata
+ mean_Y) %*% Smap + (t_X * betas);
-Data = cbind (min_stratumID - 1 + SID, t(t_X), t(t_Y));
-
-# Set up the NaNs
-
-RNaNS = Rand (rows = num_records, cols = 1, min = 1.0, max = 1.0, sparsity =
prob_NaN_in_stratum);
-RNaNX = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0,
sparsity = prob_NaN_in_X);
-RNaNY = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0,
sparsity = prob_NaN_in_Y);
-Mask = cbind (RNaNS, RNaNX, RNaNY) != 0;
-Data = Data + (1.0 - Mask) / (1.0 - Mask);
-
-# Output the dataset and the auxiliaries
-
-Xcid = t(seq (2, num_features + 1, 1));
-Ycid = t(seq (num_features + 2, 2 * num_features + 1, 1));
-Aux = cbind (mean_X, mean_Y, betas);
-
-write (Data, fileData, format=fmt);
-write (Xcid, fileXcid, format=fmt);
-write (Ycid, fileYcid, format=fmt);
-write (Aux, fileAux, format=fmt);
-
diff --git a/scripts/datagen/genRandData4SurvAnalysis.dml
b/scripts/datagen/genRandData4SurvAnalysis.dml
deleted file mode 100644
index 75117cf6d7..0000000000
--- a/scripts/datagen/genRandData4SurvAnalysis.dml
+++ /dev/null
@@ -1,133 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT GENERATES RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL
HAZARD MODELS
-# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTRIBUTION WITH PARAMETERS LAMBDA
AND V
-#
-# INPUT PARAMETERS:
-#
---------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-#
---------------------------------------------------------------------------------------------
-# type   String  ---       The type of model for which the data is being
generated: "kaplan-meier" or "cox"
-# n      Int     ---       Number of records
-# lambda Double 2.0 Scale parameter of the Weibull distribution
used for generating timestamps
-# v Double 1.5 Shape parameter of the Weibull distribution
used for generating timestamps
-# p Double 0.8 1 - probability of a record being censored
-# g Int 2 If type=kaplan-meier the number of categorical
features used for grouping
-# s Int 1 If type=kaplan-meier the number of categorical
features used for stratifying
-# f Int 10 If type=kaplan-meier maximum number of levels
(i.e., distinct values) of g+s categorical features
-# m Int 100 If type=cox the number of features in the model
-# sp Double 1.0 If type=cox the sparsity of the feature matrix
-# O String --- Location to write the output matrix containing
random data for the kaplan-meier or the cox model
-# B String --- If type=cox location to write the output
matrix containing the coefficients for the cox model
-# TE String --- Location to store column indices of X
corresponding to timestamp (first row) and event information (second row)
-# F String --- Location to store column indices of X
which are to be used for fitting the Cox model
-# fmt String "text" The output format of results of the
kaplan-meier analysis, such as "text" or "csv"
-#
---------------------------------------------------------------------------------------------
-# OUTPUTS:
-# 1- If type=kaplan-meier an n x (2+g+s) matrix O with
-# - column 1 contains timestamps generated randomly from a Weibull
distribution with parameters lambda and v
-# - column 2 contains the information whether an event occurred (1) or
data is censored (0)
-# - columns 3:2+g contain categorical features used for grouping
-# - columns 3+g:2+g+s contain categorical features used for stratifying
-# if type=cox an n x (2+m) matrix O with
-# - column 1 contains timestamps generated randomly from a Weibull
distribution with parameters lambda and v
-# - column 2 contains the information whether an event occurred (1) or
data is censored (0)
-# - columns 3:2+m contain scale features
-# 2- If type=cox a coefficient matrix B
-# 3- A column matrix TE containing the column indices of X corresponding to
timestamp (first row) and event information (second row)
-# 4- A column matrix F containing the column indices of X which are to be used
for KM analysis or fitting the Cox model
-
-type = $type; # either "kaplan-meier" or "cox"
-num_records = $n;
-lambda = ifdef ($l, 2.0);
-p_event = ifdef ($p, 0.8); # 1 - prob. of a record being censored
-# parameters related to the kaplan-meier model
-n_groups = ifdef ($g, 2);
-n_strata = ifdef ($s, 1);
-max_level = ifdef ($f, 10);
-# parameters related to the cox model
-num_features = ifdef ($m, 1000);
-sparsity = ifdef ($sp, 1.0);
-fileO = $O;
-fileB = $B;
-fileTE = $TE;
-fileF = $F;
-fmtO = ifdef ($fmt, "text"); # $fmt="text"
-p_censor = 1 - p_event; # prob. that record is censored
-
-if (type == "kaplan-meier") {
-
- v = ifdef ($v, 1.5);
- # generate categorical features used for grouping and stratifying
- X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min =
0.000000001, max = max_level - 0.000000001, pdf = "uniform"));
-
- # generate timestamps
- U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1);
- T = (-log (U) / lambda) ^ (1/v);
-
-} else if (type == "cox") {
-
- v = ifdef ($v, 50);
- # generate feature matrix
- X = rand (rows = num_records, cols = num_features, min = 1, max = 5,
pdf = "uniform", sparsity = sparsity);
-
- # generate coefficients
- B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf =
"uniform", sparsity = 1.0); # * beta_range;
-
- # generate timestamps
- U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1);
- T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v);
-
-} else {
- stop ("Wrong model type!");
-}
-
-Y = matrix (0, rows = num_records, cols = 2);
-event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max =
(1 + p_event)));
-n_time = sum (event);
-Y[,2] = event;
-
-# binning of event times
-min_T = min (T);
-max_T = max (T);
-# T = T - min_T;
-len = max_T - min_T;
-num_bins = len / n_time;
-T = ceil (T / num_bins);
-
-# print ("min(T) " + min(T) + " max(T) " + max(T));
-Y[,1] = T;
-
-O = cbind (Y, X);
-write (O, fileO, format = fmtO);
-
-if (type == "cox") {
- write (B, fileB, format = fmtO);
-
-}
-
-TE = matrix ("1 2", rows = 2, cols = 1);
-F = seq (1, num_features);
-write (TE, fileTE, format = fmtO);
-write (F, fileF, format = fmtO);
-
diff --git a/scripts/datagen/genRandData4Transform.dml
b/scripts/datagen/genRandData4Transform.dml
deleted file mode 100644
index edab7c2873..0000000000
--- a/scripts/datagen/genRandData4Transform.dml
+++ /dev/null
@@ -1,96 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# Generates random data to test transform with
-#
-# rows, cols: dimensions of the data matrix to be generated
-# prob_categorical: percentage of the generated cols to be categorical
-# min_domain, max_domain: provide a range for domain sizes of the generated
categorical cols
-# prob_missing: percentage of the generated (scale) cols to have missing values
-# prob_missing_cell: probability of a cell to have a missing value
-# out_X, out_missing, out_categorical: output file names
-#
-
-#params for size of data
-num_rows = ifdef($rows, 1000)
-num_cols = ifdef($cols, 25)
-
-#params for kind of cols
-prob_categorical = ifdef($prob_cat, 0.1)
-min_domain_size = ifdef($min_domain, 1)
-max_domain_size = ifdef($max_domain, 10)
-
-#params for missing value cols
-prob_missing_col = ifdef($prob_missing, 0.1)
-prob_missing_val = ifdef($prob_missing_cell, 0.2)
-
-num_scalar_cols = as.double(num_cols)
-num_categorical_cols = 0.0
-scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)
-if(prob_categorical > 0){
- categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
- categorical_ind = categorical_ind < prob_categorical
- categorical_col_ids = removeEmpty(target=seq(1, num_cols,
1)*categorical_ind, margin="rows")
- num_categorical_cols = sum(categorical_ind)
- write(categorical_col_ids, $out_categorical, format="csv")
-
- domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1,
pdf="uniform")
- domain_sizes = round(min_domain_size + (max_domain_size -
min_domain_size)*domain_sizes)
-
- categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1,
pdf="uniform")
- categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1)))
-
- scalar_ind = 1-categorical_ind
-}
-
-scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind,
margin="rows")
-num_scalar_cols = sum(scalar_ind)
-scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1,
pdf="uniform")
-
-if(num_categorical_cols > 0 & num_scalar_cols > 0){
- X = cbind(scalar_X, categorical_X)
- permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids,
num_scalar_cols, num_cols)
- fill_in = matrix(0, rows=num_cols-num_scalar_cols, cols=num_cols)
- permut_mat = t(cbind(t(permut_mat), t(fill_in)))
- X = X %*% permut_mat
-}else{
- if(num_categorical_cols > 0) X = categorical_X
- else{
- if(num_scalar_cols > 0) X = scalar_X
- else print("somehow, we've managed to compute that precisely 0 cols should
be categorical and 0 cols should be scale")
- }
-}
-
-if(prob_missing_col > 0){
- missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
- missing_col_ind = missing_col_ind < prob_missing_col
- #currently only support missing value imputation for scale cols
- missing_col_ind = missing_col_ind * scalar_ind
- missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind,
margin="rows")
- missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0,
max=1, pdf="uniform")
- missing_values = missing_values < prob_missing_val
- X = cbind(X, missing_values)
-
- write(missing_col_ids, $out_missing, format="csv")
-}
-
-write(X, $out_X, format="csv")
diff --git a/scripts/datagen/genRandData4Univariate.dml
b/scripts/datagen/genRandData4Univariate.dml
deleted file mode 100644
index bcbd528eb9..0000000000
--- a/scripts/datagen/genRandData4Univariate.dml
+++ /dev/null
@@ -1,61 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random numbers from a distribution
-# with specified mean, standard deviation,
-# skewness, kurtosis
-# mean and standard deviation are taken in as
-# arguments by this script
-# a, b, c, d are coefficients obtained from an
-# equation solver, determined by the specified
-# skewness and kurtosis using power-method
-# polynomials
-#
-# for more details see:
-# Statistical Simulation: Power Method Polynomials
-# and Other Transformations
-# Author: Todd C. Headrick
-# Chapman & Hall/CRC, Boca Raton, FL, 2010.
-# ISBN 978-1-4200-6490-2
-
-# $1 is the number of random points to be sampled
-# $2 is specified mean
-# $3 is specified standard deviation
-# $4-$7 are a,b,c,d obtained by solving a system
-# of equations using specified kurtosis and skewness
-# $8 is the file to write out the generated data to
-
-numSamples = $1
-mu = $2
-sigma = $3
-a = $4
-b = $5
-c = $6
-d = $7
-
-
-print("a=" + a + " b=" + b + " c=" + c + " d=" + d)
-
-X = Rand(rows=numSamples, cols=1, pdf="normal", seed=0)
-Y = a + b*X + c*X^2 + d*X^3
-
-Z = Y*sigma + mu
-write(Z, $8, format="binary")
diff --git a/scripts/datagen/obsolete/genCorrelatedData.dml
b/scripts/datagen/obsolete/genCorrelatedData.dml
deleted file mode 100644
index fea33fd2e8..0000000000
--- a/scripts/datagen/obsolete/genCorrelatedData.dml
+++ /dev/null
@@ -1,46 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-# generates random correlated data
-# can generate any number of variables/columns
-# used to test univariate stats computation
-# by systemds
-
-# $1 is number of variables/columns
-# $2 is number of samples to create
-# $3 is the location to write out the covariance mat
-# $4 is the location to write out the generated data
-dims = $1
-numSamples = $2
-
-U = Rand(rows=dims, cols=dims, min=-1.0, max=1.0, pdf="uniform", seed=0)
-denoms = sqrt(colSums(U*U))
-parfor(i in 1:dims){
- U[i,] = U[i,] / denoms
-}
-
-C = t(U)%*%U
-write(C, $3, format="binary")
-
-R = Rand(rows=numSamples, cols=dims, pdf="normal", seed=0)
-Rc = R%*%U
-write(Rc, $4, format="binary")
-
diff --git a/scripts/datagen/obsolete/genLinearRegressionData.dml
b/scripts/datagen/obsolete/genLinearRegressionData.dml
deleted file mode 100644
index a3689541b0..0000000000
--- a/scripts/datagen/obsolete/genLinearRegressionData.dml
+++ /dev/null
@@ -1,71 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# This script generates random data for linear regression. A matrix is
generated
-# consisting of a data matrix with a label column appended to it.
-#
-# INPUT PARAMETERS:
-#
--------------------------------------------------------------------------------------------
-# NAME TYPE DEFAULT MEANING
-#
--------------------------------------------------------------------------------------------
-# numSamples Int --- Number of samples
-# numFeatures Int --- Number of features (independent variables)
-# maxFeatureValue Int --- Maximum feature value (absolute value)
-# maxWeight Int --- Maximum weight (absolute value)
-# addNoise Boolean --- Determines whether noise should be added to
Y
-# b Double --- Intercept
-# sparsity Double --- Controls the sparsity in the generated data
(a value between 0 and 1)
-# output String --- Location to write the generated data/label
matrix
-# format String --- Matrix output format
-# perc Double 0.8 Percentage of training sample
-# percFile String --- File to store the percentages
-#
--------------------------------------------------------------------------------------------
-# OUTPUT: Matrix of random data with appended label column
-#
---------------------------------------------------------------------------------------------
-#
-# Example
-# ./runStandaloneSystemDS.sh algorithms/datagen/genLinearRegressionData.dml
-nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5
addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv
-#
-
-perc = ifdef($perc, 0.8)
-percFile = ifdef($percFile, "perc.csv")
-p = matrix(0, rows=2, cols=1)
-p[1,1] = perc
-p[2,1] = (1-perc)
-write(p, percFile, format="csv")
-
-X = Rand(cols=$numFeatures, max=1, min=-1, pdf="uniform", rows=$numSamples,
seed=0, sparsity=$sparsity)
-X = X * $maxFeatureValue
-
-w = Rand(cols=1, max=1, min=-1, pdf="uniform", rows=$numFeatures, seed=0)
-w = w * $maxWeight
-
-Y = X %*% w
-Y = Y + $b
-
-if ($addNoise == TRUE) {
- noise = Rand(cols=1, pdf="normal", rows=$numSamples, seed=0)
- Y = Y + noise
-}
-
-Z = cbind(X,Y)
-write(Z, $output, format=$format)
\ No newline at end of file
diff --git a/scripts/perftest/README.md b/scripts/perftest/README.md
deleted file mode 100755
index 14ea405b3a..0000000000
--- a/scripts/perftest/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-<!--
-{% comment %}
-Licensed to the Apache Software Foundation (ASF) under one or more
-contributor license agreements. See the NOTICE file distributed with
-this work for additional information regarding copyright ownership.
-The ASF licenses this file to you under the Apache License, Version 2.0
-(the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-{% end comment %}
--->
-
-# Performance Tests SystemDS
-
-To run all performance tests for SystemDS:
- * install SystemDS,
- * install the prerequisites,
- * navigate to the perftest directory: `cd $SYSTEMDS_ROOT/scripts/perftest`,
- * generate the data,
- * and execute.
-
-There are a few prerequisites:
-
-## Install SystemDS
-
-- First follow the install guide:
<http://apache.github.io/systemds/site/install> and build the project.
-- Install the Python package for the Python API benchmarks:
<https://apache.github.io/systemds/api/python/getting_started/install.html>
-- Prepare to run SystemDS: <https://apache.github.io/systemds/site/run>
-
-## Install Additional Prerequisites
-- Setup Intel MKL: <http://apache.github.io/systemds/site/run>
-- Setup OpenBlas:
<https://github.com/xianyi/OpenBLAS/wiki/Precompiled-installation-packages>
-- Install Perf stat:
<https://linoxide.com/linux-how-to/install-perf-tool-centos-ubuntu/>
-
-## Generate Test Data
-
-Using the scripts found in `$SYSTEMDS_ROOT/scripts/perftest/datagen`, generate
the data for the tests you want to run. Note that some parameters/arguments are
optional while others are required; the dataset size is typically the most
important one. A minimal example invocation is sketched below.
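For illustration, a minimal sketch that generates only the smallest (80MB)
binomial scenario; the `systemds` command name, the `temp` data directory, and
the `80` MB memory budget are placeholder values, not requirements:

```bash
# run from the perftest directory; arguments: <command> <data dir> <max memory in MB>
cd $SYSTEMDS_ROOT/scripts/perftest
./datagen/genBinomialData.sh systemds temp 80
```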
-
-## Run the Benchmarks
-
-**Reminder: The scripts should be run from the perftest folder.**
-
-Examples:
-
-```bash
-./runAll.sh
-```
-
-Or look inside the runAll script to see how to run individual tests.
-
-Time calculations in the bash scripts may additionally subtract a constant, e.g.
".4".
-This is done to account for time lost to shell script and JVM startup
overheads, so that the reported times match the actual application runtime of
SystemDS.
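As an illustration (a sketch of the idea, not the exact code used by the
scripts), such a correction might look like:

```bash
# subtract an assumed 0.4s of shell/JVM startup overhead from a wall-clock measurement
start=$(date +%s.%N)
# ... run the benchmarked SystemDS invocation here ...
end=$(date +%s.%N)
echo "runtime: $(echo "$end - $start - .4" | bc) s"
```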
diff --git a/scripts/perftest/datagen/genALSData.sh
b/scripts/perftest/datagen/genALSData.sh
deleted file mode 100755
index 3d1a22a675..0000000000
--- a/scripts/perftest/datagen/genALSData.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
- echo "Please execute scripts from directory 'perftest'"
- exit 1;
-fi
-
-CMD=$1
-DATADIR=$2/als
-MAXMEM=$3
-
-FORMAT="text" # can be csv, mm, text, binary
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-echo "-- Generating ALS data." >> results/times.txt;
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
- ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_dense
rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $DENSE_SP" |
bc` sigma=0.01 fmt=$FORMAT &
- ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X10k_1k_sparse rows=10000 cols=1000 rank=10 nnz=`echo "scale=0;
10000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
-fi
-
-#generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
- ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X100k_1k_dense rows=100000 cols=1000 rank=10 nnz=`echo "scale=0;
100000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
- ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X100k_1k_sparse rows=100000 cols=1000 rank=10 nnz=`echo "scale=0;
100000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
-fi
-
-#generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
- ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_dense
rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $DENSE_SP"
| bc` sigma=0.01 fmt=$FORMAT &
- ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_sparse
rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $SPARSE_SP"
| bc` sigma=0.01 fmt=$FORMAT &
-fi
-
-#generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
- ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_dense
rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 *
$DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT
- ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X10M_1k_sparse rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0;
10000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT
-fi
-
-#generate XL scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
- ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X100M_1k_dense rows=100000000 cols=1000 rank=10 nnz=`echo
"scale=0; 100000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT
- ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X100M_1k_sparse rows=100000000 cols=1000 rank=10 nnz=`echo
"scale=0; 100000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genBinomialData.sh
b/scripts/perftest/datagen/genBinomialData.sh
deleted file mode 100755
index 7bf3af96dd..0000000000
--- a/scripts/perftest/datagen/genBinomialData.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
- echo "Please execute scripts from directory 'perftest'"
- exit 1;
-fi
-
-CMD=$1
-BASE=$2/binomial
-MAXMEM=$3
-
-FORMAT="binary" # can be csv, mm, text, binary
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-echo -e "\n\n-- Generating binomial data..." >> results/times.txt;
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
- ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000 1000 5
5 ${BASE}/w10k_1k_dense ${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense 1 0
$DENSE_SP $FORMAT 1 & pidDense80=$!
- ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000 1000 5
5 ${BASE}/w10k_1k_sparse ${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse 1 0
$SPARSE_SP $FORMAT 1 & pidSparse80=$!
- wait $pidDense80; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense ${BASE}/X10k_1k_dense_test
${BASE}/y10k_1k_dense_test $FORMAT &
- wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse ${BASE}/X10k_1k_sparse_test
${BASE}/y10k_1k_sparse_test $FORMAT &
-fi
-
-##generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
- ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000 1000 5
5 ${BASE}/w100k_1k_dense ${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense 1 0
$DENSE_SP $FORMAT 1 & pidDense800=$!
- ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000 1000 5
5 ${BASE}/w100k_1k_sparse ${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse 1 0
$SPARSE_SP $FORMAT 1 & pidSparse800=$!
- wait $pidDense800; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense ${BASE}/X100k_1k_dense_test
${BASE}/y100k_1k_dense_test $FORMAT &
- wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse ${BASE}/X100k_1k_sparse_test
${BASE}/y100k_1k_sparse_test $FORMAT &
-fi
-
-#generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
- ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 1000000 1000
5 5 ${BASE}/w1M_1k_dense ${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense 1 0
$DENSE_SP $FORMAT 1 & pidDense8000=$!
- ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 1000000 1000
5 5 ${BASE}/w1M_1k_sparse ${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse 1 0
$SPARSE_SP $FORMAT 1 & pidSparse8000=$!
- wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense ${BASE}/X1M_1k_dense_test
${BASE}/y1M_1k_dense_test $FORMAT &
- wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse ${BASE}/X1M_1k_sparse_test
${BASE}/y1M_1k_sparse_test $FORMAT &
-fi
-
-#generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
- ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000000 1000
5 5 ${BASE}/w10M_1k_dense ${BASE}/X10M_1k_dense ${BASE}/y10M_1k_dense 1 0
$DENSE_SP $FORMAT 1
- ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000000 1000
5 5 ${BASE}/w10M_1k_sparse ${BASE}/X10M_1k_sparse ${BASE}/y10M_1k_sparse 1 0
$SPARSE_SP $FORMAT 1
- ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_dense
${BASE}/y10M_1k_dense ${BASE}/X10M_1k_dense_test ${BASE}/y10M_1k_dense_test
$FORMAT
- ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_sparse
${BASE}/y10M_1k_sparse ${BASE}/X10M_1k_sparse_test ${BASE}/y10M_1k_sparse_test
$FORMAT
-fi
-
-##generate XL scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
- ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000000
1000 5 5 ${BASE}/w100M_1k_dense ${BASE}/X100M_1k_dense ${BASE}/y100M_1k_dense 1
0 $DENSE_SP $FORMAT 1
- ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000000
1000 5 5 ${BASE}/w100M_1k_sparse ${BASE}/X100M_1k_sparse
${BASE}/y100M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1
- ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_dense
${BASE}/y100M_1k_dense ${BASE}/X100M_1k_dense_test ${BASE}/y100M_1k_dense_test
$FORMAT
- ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_sparse
${BASE}/y100M_1k_sparse ${BASE}/X100M_1k_sparse_test
${BASE}/y100M_1k_sparse_test $FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genClusteringData.sh
b/scripts/perftest/datagen/genClusteringData.sh
deleted file mode 100755
index 35c49aaa6c..0000000000
--- a/scripts/perftest/datagen/genClusteringData.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
- echo "Please execute scripts from directory 'perftest'"
- exit 1;
-fi
-
-CMD=${1:-systemds}
-BASE=${2:-"temp"}/clustering
-MAXMEM=${3:-80}
-
-FORMAT="binary"
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-echo "-- Generating clustering data..." >> results/times.txt;
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
- ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=10000 nf=1000 nc=5
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10k_1k_dense C=$BASE/C10k_1k_dense
Y=$BASE/y10k_1k_dense YbyC=$BASE/YbyC10k_1k_dense fmt=$FORMAT & pidDense80=$!
- wait $pidDense80; ${CMD} -f scripts/extractTestData.dml --args
$BASE/X10k_1k_dense $BASE/y10k_1k_dense $BASE/X10k_1k_dense_test
$BASE/y10k_1k_dense_test $FORMAT &
-fi
-
-#generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
- ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=100000 nf=1000 nc=5
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100k_1k_dense
C=$BASE/C100k_1k_dense Y=$BASE/y100k_1k_dense YbyC=$BASE/YbyC100k_1k_dense
fmt=$FORMAT & pidDense800=$!
- wait $pidDense800; ${CMD} -f scripts/extractTestData.dml --args
$BASE/X100k_1k_dense $BASE/y100k_1k_dense $BASE/X100k_1k_dense_test
$BASE/y100k_1k_dense_test $FORMAT &
-fi
-
-#generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
- ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=1000000 nf=1000 nc=5
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X1M_1k_dense C=$BASE/C1M_1k_dense
Y=$BASE/y1M_1k_dense YbyC=$BASE/YbyC1M_1k_dense fmt=$FORMAT & pidDense8000=$!
- wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml --args
$BASE/X1M_1k_dense $BASE/y1M_1k_dense $BASE/X1M_1k_dense_test
$BASE/y1M_1k_dense_test $FORMAT &
-fi
-
-#generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
- ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=10000000 nf=1000
nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10M_1k_dense
C=$BASE/C10M_1k_dense Y=$BASE/y10M_1k_dense YbyC=$BASE/YbyC10M_1k_dense
fmt=$FORMAT
- ${CMD} -f scripts/extractTestData.dml --args $BASE/X10M_1k_dense
$BASE/y10M_1k_dense $BASE/X10M_1k_dense_test $BASE/y10M_1k_dense_test $FORMAT
-fi
-
-#generate LARGE scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
- ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=100000000 nf=1000
nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100M_1k_dense
C=$BASE/C100M_1k_dense Y=$BASE/y100M_1k_dense YbyC=$BASE/YbyC100M_1k_dense
fmt=$FORMAT
- ${CMD} -f scripts/extractTestData.dml --args $BASE/X100M_1k_dense
$BASE/y100M_1k_dense $BASE/X100M_1k_dense_test $BASE/y100M_1k_dense_test $FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genDescriptiveStatisticsData.sh
b/scripts/perftest/datagen/genDescriptiveStatisticsData.sh
deleted file mode 100755
index 55af5f139c..0000000000
--- a/scripts/perftest/datagen/genDescriptiveStatisticsData.sh
+++ /dev/null
@@ -1,60 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
- echo "Please execute scripts from directory 'perftest'"
- exit 1;
-fi
-
-CMD=$1
-BASE=$2/bivar
-MAXMEM=$3
-
-FORMAT="binary"
-
-c=1000
-nc=100
-mdomain=1100
-set=20
-labelset=10
-
-#XS data 10K rows
-if [ $MAXMEM -ge 80 ]; then
- ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats
--nvargs R=10000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10k/data
TYPES=${BASE}/A_10k/types SETSIZE=$set LABELSETSIZE=$labelset
TYPES1=${BASE}/A_10k/set1.types TYPES2=${BASE}/A_10k/set2.types
INDEX1=${BASE}/A_10k/set1.indices INDEX2=${BASE}/A_10k/set2.indices FMT=$FORMAT
&
-fi
-
-#S data 100K rows
-if [ $MAXMEM -ge 800 ]; then
- ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats
--nvargs R=100000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_100k/data
TYPES=${BASE}/A_100k/types SETSIZE=$set LABELSETSIZE=$labelset
TYPES1=${BASE}/A_100k/set1.types TYPES2=${BASE}/A_100k/set2.types
INDEX1=${BASE}/A_100k/set1.indices INDEX2=${BASE}/A_100k/set2.indices
FMT=$FORMAT &
-fi
-
-#M data 1M rows
-if [ $MAXMEM -ge 8000 ]; then
- ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats
--nvargs R=1000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_1M/data
TYPES=${BASE}/A_1M/types SETSIZE=$set LABELSETSIZE=$labelset
TYPES1=${BASE}/A_1M/set1.types TYPES2=${BASE}/A_1M/set2.types
INDEX1=${BASE}/A_1M/set1.indices INDEX2=${BASE}/A_1M/set2.indices FMT=$FORMAT &
-fi
-
-#L data 10M rows
-if [ $MAXMEM -ge 80000 ]; then
- ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats
--nvargs R=10000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10M/data
TYPES=${BASE}/A_10M/types SETSIZE=$set LABELSETSIZE=$labelset
TYPES1=${BASE}/A_10M/set1.types TYPES2=${BASE}/A_10M/set2.types
INDEX1=${BASE}/A_10M/set1.indices INDEX2=${BASE}/A_10M/set2.indices FMT=$FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genDimensionReductionData.sh
b/scripts/perftest/datagen/genDimensionReductionData.sh
deleted file mode 100755
index 2f6cc21b16..0000000000
--- a/scripts/perftest/datagen/genDimensionReductionData.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
- echo "Please execute scripts from directory 'perftest'"
- exit 1;
-fi
-
-CMD=${1:-systemds}
-BASE=${2:-"temp"}/dimensionreduction
-MAXMEM=${3:-80}
-
-FORMAT="binary"
-
-echo "-- Generating Dimension Reduction data." >> results/times.txt;
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
- ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=5000 C=2000
OUT=$BASE/pcaData5k_2k_dense FMT=$FORMAT &
-fi
-
-#generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
- ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=50000 C=2000
OUT=$BASE/pcaData50k_2k_dense FMT=$FORMAT &
-fi
-
-#generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
- ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=500000 C=2000
OUT=$BASE/pcaData500k_2k_dense FMT=$FORMAT &
-fi
-
-#generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
- ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=5000000 C=2000
OUT=$BASE/pcaData5M_2k_dense FMT=$FORMAT
-fi
-
-#generate XL scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
- ${CMD} -f ${EXTRADOT}./datagen/genRandData4PCA.dml --nvargs R=50000000
C=2000 OUT=$BASE/pcaData50M_2k_dense FMT=$FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genIOData.sh
b/scripts/perftest/datagen/genIOData.sh
deleted file mode 100755
index 46154f8636..0000000000
--- a/scripts/perftest/datagen/genIOData.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
- echo "Please execute scripts from directory 'perftest'"
- exit 1;
-fi
-
-CMD=${1:-systemds}
-DATADIR=${2:-"temp"}/io
-MAXMEM=${3:-1}
-
-FORMAT="csv" # can be csv, mm, text, binary
-
-echo "-- Generating IO data." >> results/times.txt;
-
-
-#generate XS scenarios (10MB)
-if [ $MAXMEM -ge 1 ]; then
- ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X500_250_dense
R=500 C=250 Fmt=$FORMAT &
-fi
-
-#generate XS scenarios (10MB)
-if [ $MAXMEM -ge 10 ]; then
- ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X5k_250_dense
R=5000 C=250 Fmt=$FORMAT &
-fi
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
- ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X10k_1k_dense
R=10000 C=1000 Fmt=$FORMAT &
-fi
-
-#generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
- ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X100k_1k_dense
R=100000 C=1000 Fmt=$FORMAT &
-fi
-
-#generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
- ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X1M_1k_dense
R=1000000 C=1000 Fmt=$FORMAT &
-fi
-
-#generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
- ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X10M_1k_dense
R=10000000 C=1000 Fmt=$FORMAT &
-fi
-
-#generate XL scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
- ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X100M_1k_dense
R=100000000 C=1000 Fmt=$FORMAT &
-fi
-
-wait
diff --git a/scripts/perftest/datagen/genL2SVMData.sh
b/scripts/perftest/datagen/genL2SVMData.sh
deleted file mode 100755
index d25e433530..0000000000
--- a/scripts/perftest/datagen/genL2SVMData.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
- echo "Please execute scripts from directory 'perftest'"
- exit 1;
-fi
-
-CMD=$1
-DATADIR=$2
-
-FORMAT="binary" # can be csv, mm, text, binary
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-BASEPATH=$(dirname $0)
-
-#generate XS scenarios (80MB)
-${CMD} -f ${BASEPATH}/../datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 ${DATADIR}/w10k_1k_dense ${DATADIR}/X10k_1k_dense ${DATADIR}/Y10k_1k_dense 1 0 $DENSE_SP $FORMAT 1
-${CMD} -f ${BASEPATH}/../datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 ${DATADIR}/w10k_1k_sparse ${DATADIR}/X10k_1k_sparse ${DATADIR}/Y10k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1
diff --git a/scripts/perftest/datagen/genMultinomialData.sh b/scripts/perftest/datagen/genMultinomialData.sh
deleted file mode 100755
index 43dd6ea7ff..0000000000
--- a/scripts/perftest/datagen/genMultinomialData.sh
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
- echo "Please execute scripts from directory 'perftest'"
- exit 1;
-fi
-
-CMD=$1
-BASE=$2/multinomial
-MAXMEM=$3
-
-FORMAT="binary"
-DENSE_SP=0.9
-SPARSE_SP=0.01
-
-echo "-- Generating multinomial data..." >> results/times.txt;
-
-#generate XS scenarios (80MB)
-if [ $MAXMEM -ge 80 ]; then
- ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 $DENSE_SP 5 0 $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $FORMAT 1 & pidDense80=$!
- ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 $SPARSE_SP 5 0 $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $FORMAT 1 & pidSparse80=$!
- wait $pidDense80; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $BASE/X10k_1k_dense_k5_test $BASE/y10k_1k_dense_k5_test $FORMAT &
- wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $BASE/X10k_1k_sparse_k5_test $BASE/y10k_1k_sparse_k5_test $FORMAT &
-fi
-
-##generate S scenarios (800MB)
-if [ $MAXMEM -ge 800 ]; then
- ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 $DENSE_SP 5 0 $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $FORMAT 1 & pidDense800=$!
- ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 $SPARSE_SP 5 0 $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $FORMAT 1 & pidSparse800=$!
- wait $pidDense800; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $BASE/X100k_1k_dense_k5_test $BASE/y100k_1k_dense_k5_test $FORMAT &
- wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $BASE/X100k_1k_sparse_k5_test $BASE/y100k_1k_sparse_k5_test $FORMAT &
-fi
-
-##generate M scenarios (8GB)
-if [ $MAXMEM -ge 8000 ]; then
- ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 $DENSE_SP 5 0 $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $FORMAT 1 & pidDense8000=$!
- ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 $SPARSE_SP 5 0 $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $FORMAT 1 & pidSparse8000=$!
- wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $BASE/X1M_1k_dense_k5_test $BASE/y1M_1k_dense_k5_test $FORMAT &
- wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $BASE/X1M_1k_sparse_k5_test $BASE/y1M_1k_sparse_k5_test $FORMAT &
-fi
-
-##generate L scenarios (80GB)
-if [ $MAXMEM -ge 80000 ]; then
- ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 $DENSE_SP 5 0 $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $FORMAT 1
- ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 $SPARSE_SP 5 0 $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 $FORMAT 1
- ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $BASE/X10M_1k_dense_k5_test $BASE/y10M_1k_dense_k5_test $FORMAT
- ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 $BASE/X10M_1k_sparse_k5_test $BASE/y10M_1k_sparse_k5_test $FORMAT
-fi
-
-#generate LARGE scenarios (800GB)
-if [ $MAXMEM -ge 800000 ]; then
- ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 $DENSE_SP 5 0 $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $FORMAT 1
- ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 $SPARSE_SP 5 0 $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $FORMAT 1
- ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $BASE/X100M_1k_dense_k5_test $BASE/y100M_1k_dense_k5_test $FORMAT
- ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $BASE/X100M_1k_sparse_k5_test $BASE/y100M_1k_sparse_k5_test $FORMAT
-fi
-
-wait
\ No newline at end of file
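
Note: the smaller scenarios above overlap data generation and test-set extraction by recording the PID of each background generator and waiting only on that PID before extracting. A minimal sketch of this pattern (generate_data and extract_test are placeholders for the genRandData4Multinomial.dml and extractTestData.dml calls):

  # Overlap dense/sparse generation with per-dataset test extraction.
  generate_data dense  & pidDense=$!    # start dense generation in the background
  generate_data sparse & pidSparse=$!   # start sparse generation in parallel

  wait $pidDense;  extract_test dense  &   # extract as soon as the dense data exists
  wait $pidSparse; extract_test sparse &   # likewise for the sparse data
  wait                                     # join the two extraction jobs
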
diff --git a/scripts/perftest/datagen/genStratStatisticsData.sh b/scripts/perftest/datagen/genStratStatisticsData.sh
deleted file mode 100755
index 19c38e3fc7..0000000000
--- a/scripts/perftest/datagen/genStratStatisticsData.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-if [ "$(basename $PWD)" != "perftest" ];
-then
- echo "Please execute scripts from directory 'perftest'"
- exit 1;
-fi
-
-CMD=$1
-BASE=$2/stratstats
-MAXMEM=$3
-
-FORMAT="binary"
-
-echo "-- Generating stats data..." >> results/times.txt;
-
-#XS data 10K rows
-if [ $MAXMEM -ge 80 ]; then
- ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=10000 nf=100 D=${BASE}/A_10k/data Xcid=${BASE}/A_10k/Xcid Ycid=${BASE}/A_10k/Ycid A=${BASE}/A_10k/A fmt=$FORMAT &
-fi
-
-#S data 100K rows
-if [ $MAXMEM -ge 800 ]; then
- ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=100000 nf=100 D=${BASE}/A_100k/data Xcid=${BASE}/A_100k/Xcid Ycid=${BASE}/A_100k/Ycid A=${BASE}/A_100k/A fmt=$FORMAT &
-fi
-
-#M data 1M rows
-if [ $MAXMEM -ge 8000 ]; then
- ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=1000000 nf=100 D=${BASE}/A_1M/data Xcid=${BASE}/A_1M/Xcid Ycid=${BASE}/A_1M/Ycid A=${BASE}/A_1M/A fmt=$FORMAT &
-fi
-
-#L data 10M rows
-if [ $MAXMEM -ge 80000 ]; then
- ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=10000000 nf=100 D=${BASE}/A_10M/data Xcid=${BASE}/A_10M/Xcid Ycid=${BASE}/A_10M/Ycid A=${BASE}/A_10M/A fmt=$FORMAT
-fi
-
-#XL data 100M rows
-if [ $MAXMEM -ge 800000 ]; then
- ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=100000000 nf=100 D=${BASE}/A_10M/data Xcid=${BASE}/A_10M/Xcid Ycid=${BASE}/A_10M/Ycid A=${BASE}/A_10M/A fmt=$FORMAT
-fi
-
-wait
\ No newline at end of file
diff --git a/scripts/datagen/genRandData4DecisionTree2.dml b/scripts/perftest/log4j.properties
similarity index 60%
rename from scripts/datagen/genRandData4DecisionTree2.dml
rename to scripts/perftest/log4j.properties
index 715924915c..9b751b57ca 100644
--- a/scripts/datagen/genRandData4DecisionTree2.dml
+++ b/scripts/perftest/log4j.properties
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -19,23 +19,13 @@
#
#-------------------------------------------------------------
+log4j.rootLogger=ERROR,console
-transformPath = $tPath;
-transformSpec = $tSpec;
-XCatFile = $XCat;
-XFile = $X;
-num_records = $num_records;
-num_scale_features = $num_scale;
-sparsity = $sp;
-fmt = $fmt;
-
-# generate scale features
-X_scale = rand (rows = num_records, cols = num_scale_features, min = 0, max = 10, sparsity = sparsity);
-
-# transform categorical features
-XCF = read (XCatFile);
-specJson = read(transformSpec, data_type="scalar", value_type="string");
-X_cat_transformed = transform (target = XCF, spec = specJson, transformPath = transformPath);
+log4j.logger.org.apache.sysds=ERROR
+log4j.logger.org.apache.spark=ERROR
+log4j.logger.org.apache.hadoop=OFF
-X = cbind (X_scale, X_cat_transformed);
-write (X, XFile, format = fmt);
+log4j.appender.console=org.apache.log4j.ConsoleAppender
+log4j.appender.console.target=System.err
+log4j.appender.console.layout=org.apache.log4j.PatternLayout
+log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n
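
Note: the new scripts/perftest/log4j.properties silences Spark/Hadoop logging during benchmark runs, but it only takes effect if the JVMs are pointed at it, e.g. via -Dlog4j.configuration as sparkDML2.sh does below. A hedged example (paths and the script name are placeholders):

  # Point driver and executors at the perftest log4j config.
  export LOG4JPROP=/path/to/scripts/perftest/log4j.properties
  spark-submit \
    --conf spark.driver.extraJavaOptions="-Dlog4j.configuration=file:$LOG4JPROP" \
    --conf spark.executor.extraJavaOptions="-Dlog4j.configuration=file:$LOG4JPROP" \
    --files "$LOG4JPROP" \
    SystemDS.jar -f some_script.dml
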
diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh
index 6d39043a74..17ad97b085 100755
--- a/scripts/perftest/runAll.sh
+++ b/scripts/perftest/runAll.sh
@@ -27,72 +27,19 @@ then
fi
# Command to be executed
-CMD="systemds"
+CMD="./sparkDML2.sh"
TEMPFOLDER="temp"
# Max memory of data to be benchmarked
# Possible values: 80/80MB, 800/800MB, 8000/8000MB/8GB, 80000/80000MB/80GB, 800000/800000MB/800GB
-MAXMEM=80
+MAXMEM=80000
# Set properties
export LOG4JPROP='conf/log4j-off.properties'
-export SYSDS_QUIET=1
-export SYSDS_EXEC_MODE="hybrid"
-export SYSTEMDS_STANDALONE_OPTS="-Xmx10g -Xms10g -Xmn2000m"
-export SYSDS_DISTRIBUTED=0
-
-if [ "$HOSTNAME" = "alpha" ]; then
- # Just to make it easy to run on our machine without having to change anything.
- export SYSTEMDS_STANDALONE_OPTS="-Xmx500g -Xms500g -Xmn50g"
- export SYSDS_DISTRIBUTED=1
- export SYSTEMDS_DISTRIBUTED_OPTS="\
- --master yarn \
- --deploy-mode client \
- --driver-memory 500g \
- --conf spark.driver.extraJavaOptions=\"-Xms500g -Xmn50g
-Dlog4j.configuration=file:$LOG4JPROP\" \
- --conf
spark.executor.extraJavaOptions=\"-Dlog4j.configuration=file:$LOG4JPROP\" \
- --conf spark.executor.heartbeatInterval=100s \
- --files $LOG4JPROP \
- --conf spark.network.timeout=512s \
- --num-executors 6 \
- --executor-memory 105g \
- --executor-cores 32 \
- "
- MAXMEM="80GB"
-elif [ "$HOSTNAME" = "charlie" ]; then
- export SYSTEMDS_STANDALONE_OPTS="-Xmx100g -Xms100g -Xmn10g"
- export SYSDS_DISTRIBUTED=1
- export SYSTEMDS_DISTRIBUTED_OPTS="\
- --master yarn \
- --deploy-mode client \
- --driver-memory 100g \
- --conf spark.driver.extraJavaOptions=\"-Xms100g -Xmn10g
-Dlog4j.configuration=file:$LOG4JPROP\" \
- --conf
spark.executor.extraJavaOptions=\"-Dlog4j.configuration=file:$LOG4JPROP\" \
- --conf spark.executor.heartbeatInterval=100s \
- --files $LOG4JPROP \
- --conf spark.network.timeout=512s \
- --num-executors 6 \
- --executor-memory 105g \
- --executor-cores 32 \
- "
- MAXMEM="80GB"
-elif [ "$HOSTNAME" = "XPS-15-7590" ]; then
- MAXMEM=800
-fi
-
-# Fix max mem to format.
-MAXMEM=${MAXMEM%"MB"}; MAXMEM=${MAXMEM/GB/"000"}
-
-# Possible lines to initialize Intel MKL, depending on version and install location
-if [ -d ~/intel ] && [ -d ~/intel/bin ] && [ -f ~/intel/bin/compilervars.sh ]; then
- . ~/intel/bin/compilervars.sh intel64
-elif [ -d /opt ] && [ -d /opt/intel ] && [ -d /opt/intel/bin ]; then
- . /opt/intel/bin/compilervars.sh intel64
-fi
# make dirs if not exsisting
-mkdir -p logs
-mkdir -p results
+mkdir -p logs
+mkdir -p results
mkdir -p temp
# init time measurement
@@ -103,13 +50,13 @@ echo -e "\n$HOSTNAME" >> results/times.txt
echo -e "\n\n" >> results/times.txt
## Data Gen
-# ./datagen/genBinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genBinomialData.out
-# ./datagen/genMultinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genMultinomialData.out
-# ./datagen/genDescriptiveStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStatsData.out
-# ./datagen/genStratStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStratStatsData.out
-# ./datagen/genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genClusteringData.out
-# ./datagen/genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genDimensionReductionData.out
-# ./datagen/genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genALSData.out
+./datagen/genBinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genBinomialData.out
+./datagen/genMultinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genMultinomialData.out
+#./datagen/genDescriptiveStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStatsData.out
+#./datagen/genStratStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStratStatsData.out
+#./datagen/genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genClusteringData.out
+#./datagen/genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genDimensionReductionData.out
+#./datagen/genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genALSData.out
### Micro Benchmarks:
#./MatrixMult.sh ${CMD}
@@ -122,17 +69,17 @@ echo -e "\n\n" >> results/times.txt
./runAllBinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
./runAllMultinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
./runAllRegression.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
-./runAllStats.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
-./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
-./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
-./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
-./KnnMissingValueImputation.sh ${CMD} ${MAXMEM}
+#./runAllStats.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+#./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+#./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+#./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+#./KnnMissingValueImputation.sh ${CMD} ${MAXMEM}
### IO Benchmarks:
-./runAllIO.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+#./runAllIO.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
# TODO The following benchmarks have yet to be written. The decision tree algorithms additionally need to be fixed.
-# add stepwise Linear
+# add stepwise Linear
# add stepwise GLM
#./runAllTrees.sh $CMD $TEMPFOLDER
# add randomForest
diff --git a/scripts/perftest/runL2SVM.sh b/scripts/perftest/runL2SVM.sh
index b7ddb64d40..622cb95043 100755
--- a/scripts/perftest/runL2SVM.sh
+++ b/scripts/perftest/runL2SVM.sh
@@ -37,8 +37,7 @@ for i in 0 1; do
#training
tstart=$(date +%s.%N)
- # /algorithms/l2-svm.dml already calls a built-in function for the l2 svm.
- ${CMD} -f ./../algorithms/l2-svm.dml \
+ ${CMD} -f scripts/l2-svm.dml \
"$FEDERATEDCOMPILATION" \
--config conf/SystemDS-config.xml \
--stats \
diff --git a/scripts/perftest/scripts/l2-svm-predict.dml b/scripts/perftest/scripts/l2-svm-predict.dml
index 31db539ec7..7ab7c94c6e 100755
--- a/scripts/perftest/scripts/l2-svm-predict.dml
+++ b/scripts/perftest/scripts/l2-svm-predict.dml
@@ -28,58 +28,16 @@ cmdLine_fmt = ifdef($fmt, "text")
X = read($X)
-w = read($model)
+W = read($model)
-dimensions = as.scalar(w[nrow(w),1])
+dimensions = as.scalar(W[nrow(W),1])
if(dimensions != ncol(X))
stop("Stopping due to invalid input: Model dimensions do not seem to match
input data dimensions")
-intercept = as.scalar(w[nrow(w)-1,1])
-negative_label = as.scalar(w[nrow(w)-2,1])
-positive_label = as.scalar(w[nrow(w)-3,1])
-w = w[1:(nrow(w)-4),]
+intercept = as.scalar(W[nrow(W)-1,1])
+W = W[1:(nrow(W)-2),]
-[scores, Y] = l2svmPredict(X = X, W = w, verbose = TRUE)
+[scores, Y] = l2svmPredict(X = X, W = W, verbose = TRUE)
-if(cmdLine_scores != " ")
- write(scores, cmdLine_scores, format=cmdLine_fmt)
+write(scores, cmdLine_scores, format=cmdLine_fmt)
-if(!cmdLine_scoring_only){
- Y = read(cmdLine_Y)
-
- pred = (scores >= 0)
- pred_labels = pred*positive_label + (1-pred)*negative_label
- num_correct = sum(pred_labels == Y)
- acc = 100*num_correct/nrow(X)
-
- acc_str = "Accuracy (%): " + acc
- print(acc_str)
-
- if(cmdLine_accuracy != " ")
- write(acc_str, cmdLine_accuracy)
-
- if(cmdLine_confusion != " ") {
- pred = 2*pred - 1
-
- if(negative_label != -1 | positive_label != +1)
- Y = 2/(positive_label - negative_label)*Y - (negative_label + positive_label)/(positive_label - negative_label)
-
- pred_is_minus = (pred == -1)
- pred_is_plus = 1 - pred_is_minus
- y_is_minus = (Y == -1)
- y_is_plus = 1 - y_is_minus
-
- check_min_y_minus = sum(pred_is_minus*y_is_minus)
- check_min_y_plus = sum(pred_is_minus*y_is_plus)
- check_max_y_minus = sum(pred_is_plus*y_is_minus)
- check_max_y_plus = sum(pred_is_plus*y_is_plus)
-
- confusion_mat = matrix(0, rows=2, cols=2)
- confusion_mat[1,1] = check_min_y_minus
- confusion_mat[1,2] = check_min_y_plus
- confusion_mat[2,1] = check_max_y_minus
- confusion_mat[2,2] = check_max_y_plus
-
- write(confusion_mat, cmdLine_confusion, format="csv")
- }
-}
diff --git a/scripts/datagen/genRandData4DecisionTree1.dml b/scripts/perftest/scripts/l2-svm.dml
similarity index 61%
rename from scripts/datagen/genRandData4DecisionTree1.dml
rename to scripts/perftest/scripts/l2-svm.dml
index 7d1dd50d6b..ac64de679d 100644
--- a/scripts/datagen/genRandData4DecisionTree1.dml
+++ b/scripts/perftest/scripts/l2-svm.dml
@@ -7,9 +7,9 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -19,22 +19,22 @@
#
#-------------------------------------------------------------
+fmt = ifdef($fmt, "text")
+icpt = ifdef($icpt, 0)
+tol = ifdef($tol, 0.001)
+reg = ifdef($reg, 1.0)
+maxiter = ifdef($maxiter, 100)
+
+tol = as.double ($tol);
+X = read($X)
+Y = read($Y)
-XCatFile = $XCat;
-YFile = $Y;
-num_records = $num_records;
-num_cat_features = $num_cat;
-num_class = $num_class;
-num_distinct = $num_distinct;
-sparsity = $sp;
+model = l2svm(X = X, Y = Y, intercept = icpt, epsilon = tol, reg = reg, maxIterations = maxiter, verbose = FALSE)
-# generate class labels
-Y = floor (rand (rows = num_records, cols = 1, min = 1, max = num_class + 0.99999999999999));
-Y_bin = table (seq (1, num_records), Y);
-write (Y_bin, YFile);
+extra_model_params = matrix(0, rows=2, cols=ncol(model))
+extra_model_params[1, 1] = icpt
+extra_model_params[2, 1] = ncol(X)
-# generate categorical features
-X_cat = floor (rand (rows = num_records, cols = num_cat_features, min = 1, max = num_distinct + 0.99999999999999, sparsity = sparsity));
-fX_cat = as.frame(X_cat);
-write (fX_cat, XCatFile, format = "csv");
+w = t(cbind(t(model), t(extra_model_params)))
+write(w, $model, format=fmt)
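
Note: the new perftest l2-svm.dml wraps the l2svm builtin and appends two bookkeeping rows to the weight vector (the intercept flag and ncol(X)), which l2-svm-predict.dml above strips off again. A hedged invocation example (nvarg names as defined in the script; file paths are placeholders):

  # Train an L2-SVM and write the packed model (weights + [icpt; ncol(X)]) to $model.
  systemds -f scripts/l2-svm.dml --nvargs X=temp/X10k_1k_dense Y=temp/Y10k_1k_dense \
    model=temp/l2svm_model icpt=0 tol=0.001 reg=1.0 maxiter=100 fmt=binary
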
diff --git a/scripts/perftest/sparkDML2.sh b/scripts/perftest/sparkDML2.sh
new file mode 100644
index 0000000000..dde9805719
--- /dev/null
+++ b/scripts/perftest/sparkDML2.sh
@@ -0,0 +1,16 @@
+ #Client mode spark-submit script
+export SPARK_HOME=/home/hadoop/spark-3.3.1-bin-hadoop3
+export HADOOP_CONF_DIR=/home/hadoop/hadoop-3.3.1/etc/hadoop
+
+$SPARK_HOME/bin/spark-submit \
+ --master yarn \
+ --deploy-mode client \
+ --driver-memory 20g \
+ --num-executors 6 \
+ --conf spark.driver.extraJavaOptions="-Xms20g -Xmn2g
-Dlog4j.configuration=file:/home/mboehm/perftest/log4j.properties " \
+ --conf spark.ui.showConsoleProgress=true \
+ --conf spark.executor.heartbeatInterval=100s \
+ --conf spark.network.timeout=512s \
+ --executor-memory 200g \
+ --executor-cores 48 \
+ SystemDS.jar "$@"
\ No newline at end of file
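
Note: sparkDML2.sh is a thin spark-submit wrapper, so every ${CMD} -f <script>.dml ... call in the runner scripts becomes a YARN client-mode submission of SystemDS.jar with the DML arguments appended. SPARK_HOME, HADOOP_CONF_DIR, the log4j path, and the memory settings are cluster-specific and will typically need to be adapted. A hedged usage sketch (paths are placeholders):

  # From scripts/perftest, with the wrapper as CMD, a runner invocation such as
  CMD=./sparkDML2.sh
  ${CMD} -f scripts/l2-svm.dml --nvargs X=temp/X10k_1k_dense Y=temp/Y10k_1k_dense model=temp/l2svm_model
  # is forwarded as: $SPARK_HOME/bin/spark-submit --master yarn ... SystemDS.jar -f scripts/l2-svm.dml --nvargs ...
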
diff --git a/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmDatagen.java b/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmDatagen.java
index c23f3c5934..4473ef5f0e 100644
--- a/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmDatagen.java
+++ b/src/test/java/org/apache/sysds/test/functions/codegenalg/parttwo/AlgorithmDatagen.java
@@ -241,13 +241,13 @@ public class AlgorithmDatagen extends AutomatedTestBase
double sparsity = sparse ? sparsity2 : sparsity1;
if( type == DatagenType.LINREG) {
- fullDMLScriptName = "scripts/datagen/genRandData4LinearRegression.dml";
+ fullDMLScriptName = "scripts/perftest/datagen/genRandData4LinearRegression.dml";
programArgs = new String[]{ "-stats", "-args",
String.valueOf(rows),
String.valueOf(cols), "10", "1", output("w"),
output("X"), output("y"), "1", "1",
String.valueOf(sparsity), "binary"};
}
else { //LOGREG
- fullDMLScriptName = "scripts/datagen/genRandData4LogisticRegression.dml";
+ fullDMLScriptName = "scripts/perftest/datagen/genRandData4LogisticRegression.dml";
programArgs = new String[]{ "-stats", "-args",
String.valueOf(rows),
String.valueOf(cols), "10", "1", output("w"),
output("X"), output("y"), "1", "1",
String.valueOf(sparsity), "binary", "1"};
diff --git a/src/test/java/org/apache/sysds/test/functions/misc/UnivariateStatsBasicTest.java b/src/test/java/org/apache/sysds/test/functions/misc/UnivariateStatsBasicTest.java
index edb505d6c3..68dd8fd101 100644
--- a/src/test/java/org/apache/sysds/test/functions/misc/UnivariateStatsBasicTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/misc/UnivariateStatsBasicTest.java
@@ -71,7 +71,7 @@ public class UnivariateStatsBasicTest extends AutomatedTestBase
loadTestConfiguration(config);
//run univariate stats data generator
- fullDMLScriptName = "./scripts/datagen/"+TEST_NAME_DATAGEN+".dml";
+ fullDMLScriptName = "./scripts/perftest/datagen/"+TEST_NAME_DATAGEN+".dml";
programArgs = new String[]{ "-args", "100000", "100", "10", "1", "2", "3", "4", input("uni.mtx") };
runTest(true, false, null, -1);