This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 56c782384b [SYSTEMDS-3847] Fix perftest refactoring (datagen scripts)
56c782384b is described below
commit 56c782384b73b560a1cdee7ff8c04dacefa2ec76
Author: Matthias Boehm <[email protected]>
AuthorDate: Sat Apr 5 19:04:19 2025 +0200
[SYSTEMDS-3847] Fix perftest refactoring (datagen scripts)
---
scripts/perftest/datagen/genALSData.sh | 68 ++++++
scripts/perftest/datagen/genBinomialData.sh | 78 +++++++
scripts/perftest/datagen/genClusteringData.sh | 68 ++++++
.../datagen/genDescriptiveStatisticsData.sh | 60 ++++++
.../perftest/datagen/genDimensionReductionData.sh | 61 ++++++
scripts/perftest/datagen/genIOData.sh | 72 +++++++
scripts/perftest/datagen/genL2SVMData.sh | 38 ++++
scripts/perftest/datagen/genMultinomialData.sh | 78 +++++++
scripts/perftest/datagen/genRandData4ALS.dml | 47 +++++
.../datagen/genRandData4ChisquaredTest.dml | 87 ++++++++
.../perftest/datagen/genRandData4DecisionTree.sh | 58 +++++
.../perftest/datagen/genRandData4DecisionTree1.dml | 40 ++++
.../perftest/datagen/genRandData4DecisionTree2.dml | 41 ++++
.../datagen/genRandData4DescriptiveStats.dml | 149 +++++++++++++
scripts/perftest/datagen/genRandData4FTest.dml | 95 +++++++++
scripts/perftest/datagen/genRandData4Kmeans.dml | 120 +++++++++++
.../datagen/genRandData4LinearReg_LTstats.dml | 233 +++++++++++++++++++++
.../datagen/genRandData4LinearRegression.dml | 61 ++++++
.../datagen/genRandData4LogReg_LTstats.dml | 233 +++++++++++++++++++++
.../datagen/genRandData4LogisticRegression.dml | 72 +++++++
.../perftest/datagen/genRandData4MultiClassSVM.dml | 68 ++++++
.../perftest/datagen/genRandData4Multinomial.dml | 66 ++++++
scripts/perftest/datagen/genRandData4NMF.dml | 129 ++++++++++++
.../perftest/datagen/genRandData4NMFBlockwise.dml | 138 ++++++++++++
scripts/perftest/datagen/genRandData4PCA.dml | 61 ++++++
.../perftest/datagen/genRandData4StratStats.dml | 155 ++++++++++++++
.../perftest/datagen/genRandData4SurvAnalysis.dml | 133 ++++++++++++
scripts/perftest/datagen/genRandData4Transform.dml | 96 +++++++++
.../perftest/datagen/genRandData4Univariate.dml | 61 ++++++
scripts/perftest/datagen/genStratStatisticsData.sh | 61 ++++++
scripts/perftest/sparkDML2.sh | 24 ++-
31 files changed, 2750 insertions(+), 1 deletion(-)
diff --git a/scripts/perftest/datagen/genALSData.sh
b/scripts/perftest/datagen/genALSData.sh
new file mode 100644
index 0000000000..3d1a22a675
--- /dev/null
+++ b/scripts/perftest/datagen/genALSData.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+ echo "Please execute scripts from directory 'perftest'"
+ exit 1;
+fi
+
+CMD=$1
+DATADIR=$2/als
+MAXMEM=$3
+
+FORMAT="text" # can be csv, mm, text, binary
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+echo "-- Generating ALS data." >> results/times.txt;
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+ ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_dense
rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $DENSE_SP" |
bc` sigma=0.01 fmt=$FORMAT &
+ ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X10k_1k_sparse rows=10000 cols=1000 rank=10 nnz=`echo "scale=0;
10000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
+fi
+
+#generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+ ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X100k_1k_dense rows=100000 cols=1000 rank=10 nnz=`echo "scale=0;
100000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
+ ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X100k_1k_sparse rows=100000 cols=1000 rank=10 nnz=`echo "scale=0;
100000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+ ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_dense
rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $DENSE_SP"
| bc` sigma=0.01 fmt=$FORMAT &
+ ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_sparse
rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $SPARSE_SP"
| bc` sigma=0.01 fmt=$FORMAT &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+ ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_dense
rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 *
$DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT
+ ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X10M_1k_sparse rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0;
10000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+ ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X100M_1k_dense rows=100000000 cols=1000 rank=10 nnz=`echo
"scale=0; 100000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT
+ ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs
X=${DATADIR}/X100M_1k_sparse rows=100000000 cols=1000 rank=10 nnz=`echo
"scale=0; 100000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genBinomialData.sh
b/scripts/perftest/datagen/genBinomialData.sh
new file mode 100644
index 0000000000..c911175ace
--- /dev/null
+++ b/scripts/perftest/datagen/genBinomialData.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+ echo "Please execute scripts from directory 'perftest'"
+ exit 1;
+fi
+
+CMD=$1
+BASE=$2/binomial
+MAXMEM=$3
+
+FORMAT="binary" # can be csv, mm, text, binary
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+echo -e "\n\n-- Generating binomial data..." >> results/times.txt;
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5
${BASE}/w10k_1k_dense ${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense 1 0 $DENSE_SP
$FORMAT 1 & pidDense80=$!
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5
${BASE}/w10k_1k_sparse ${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse 1 0
$SPARSE_SP $FORMAT 1 & pidSparse80=$!
+ wait $pidDense80; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense ${BASE}/X10k_1k_dense_test
${BASE}/y10k_1k_dense_test $FORMAT &
+ wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse ${BASE}/X10k_1k_sparse_test
${BASE}/y10k_1k_sparse_test $FORMAT &
+fi
+
+##generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 5
${BASE}/w100k_1k_dense ${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense 1 0
$DENSE_SP $FORMAT 1 & pidDense800=$!
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 5
${BASE}/w100k_1k_sparse ${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse 1 0
$SPARSE_SP $FORMAT 1 & pidSparse800=$!
+ wait $pidDense800; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense ${BASE}/X100k_1k_dense_test
${BASE}/y100k_1k_dense_test $FORMAT &
+ wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse ${BASE}/X100k_1k_sparse_test
${BASE}/y100k_1k_sparse_test $FORMAT &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 1000000 1000 5 5
${BASE}/w1M_1k_dense ${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense 1 0 $DENSE_SP
$FORMAT 1 & pidDense8000=$!
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 1000000 1000 5 5
${BASE}/w1M_1k_sparse ${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse 1 0
$SPARSE_SP $FORMAT 1 & pidSparse8000=$!
+ wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense ${BASE}/X1M_1k_dense_test
${BASE}/y1M_1k_dense_test $FORMAT &
+ wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml --args
${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse ${BASE}/X1M_1k_sparse_test
${BASE}/y1M_1k_sparse_test $FORMAT &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000000 1000 5
5 ${BASE}/w10M_1k_dense ${BASE}/X10M_1k_dense ${BASE}/y10M_1k_dense 1 0
$DENSE_SP $FORMAT 1
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000000 1000 5
5 ${BASE}/w10M_1k_sparse ${BASE}/X10M_1k_sparse ${BASE}/y10M_1k_sparse 1 0
$SPARSE_SP $FORMAT 1
+ ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_dense
${BASE}/y10M_1k_dense ${BASE}/X10M_1k_dense_test ${BASE}/y10M_1k_dense_test
$FORMAT
+ ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_sparse
${BASE}/y10M_1k_sparse ${BASE}/X10M_1k_sparse_test ${BASE}/y10M_1k_sparse_test
$FORMAT
+fi
+
+##generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000000 1000 5
5 ${BASE}/w100M_1k_dense ${BASE}/X100M_1k_dense ${BASE}/y100M_1k_dense 1 0
$DENSE_SP $FORMAT 1
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000000 1000 5
5 ${BASE}/w100M_1k_sparse ${BASE}/X100M_1k_sparse ${BASE}/y100M_1k_sparse 1 0
$SPARSE_SP $FORMAT 1
+ ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_dense
${BASE}/y100M_1k_dense ${BASE}/X100M_1k_dense_test ${BASE}/y100M_1k_dense_test
$FORMAT
+ ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_sparse
${BASE}/y100M_1k_sparse ${BASE}/X100M_1k_sparse_test
${BASE}/y100M_1k_sparse_test $FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genClusteringData.sh
b/scripts/perftest/datagen/genClusteringData.sh
new file mode 100644
index 0000000000..46adffb9e3
--- /dev/null
+++ b/scripts/perftest/datagen/genClusteringData.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+ echo "Please execute scripts from directory 'perftest'"
+ exit 1;
+fi
+
+CMD=${1:-systemds}
+BASE=${2:-"temp"}/clustering
+MAXMEM=${3:-80}
+
+FORMAT="binary"
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+echo "-- Generating clustering data..." >> results/times.txt;
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+ ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=10000 nf=1000 nc=5
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10k_1k_dense C=$BASE/C10k_1k_dense
Y=$BASE/y10k_1k_dense YbyC=$BASE/YbyC10k_1k_dense fmt=$FORMAT & pidDense80=$!
+ wait $pidDense80; ${CMD} -f scripts/extractTestData.dml --args
$BASE/X10k_1k_dense $BASE/y10k_1k_dense $BASE/X10k_1k_dense_test
$BASE/y10k_1k_dense_test $FORMAT &
+fi
+
+#generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+ ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=100000 nf=1000 nc=5
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100k_1k_dense
C=$BASE/C100k_1k_dense Y=$BASE/y100k_1k_dense YbyC=$BASE/YbyC100k_1k_dense
fmt=$FORMAT & pidDense800=$!
+ wait $pidDense800; ${CMD} -f scripts/extractTestData.dml --args
$BASE/X100k_1k_dense $BASE/y100k_1k_dense $BASE/X100k_1k_dense_test
$BASE/y100k_1k_dense_test $FORMAT &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+ ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=1000000 nf=1000 nc=5
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X1M_1k_dense C=$BASE/C1M_1k_dense
Y=$BASE/y1M_1k_dense YbyC=$BASE/YbyC1M_1k_dense fmt=$FORMAT & pidDense8000=$!
+ wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml --args
$BASE/X1M_1k_dense $BASE/y1M_1k_dense $BASE/X1M_1k_dense_test
$BASE/y1M_1k_dense_test $FORMAT &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+ ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=10000000 nf=1000 nc=5
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10M_1k_dense C=$BASE/C10M_1k_dense
Y=$BASE/y10M_1k_dense YbyC=$BASE/YbyC10M_1k_dense fmt=$FORMAT
+ ${CMD} -f scripts/extractTestData.dml --args $BASE/X10M_1k_dense
$BASE/y10M_1k_dense $BASE/X10M_1k_dense_test $BASE/y10M_1k_dense_test $FORMAT
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+ ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=100000000 nf=1000 nc=5
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100M_1k_dense
C=$BASE/C100M_1k_dense Y=$BASE/y100M_1k_dense YbyC=$BASE/YbyC100M_1k_dense
fmt=$FORMAT
+ ${CMD} -f scripts/extractTestData.dml --args $BASE/X100M_1k_dense
$BASE/y100M_1k_dense $BASE/X100M_1k_dense_test $BASE/y100M_1k_dense_test $FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genDescriptiveStatisticsData.sh
b/scripts/perftest/datagen/genDescriptiveStatisticsData.sh
new file mode 100644
index 0000000000..c59fdc6a2a
--- /dev/null
+++ b/scripts/perftest/datagen/genDescriptiveStatisticsData.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+ echo "Please execute scripts from directory 'perftest'"
+ exit 1;
+fi
+
+CMD=$1
+BASE=$2/bivar
+MAXMEM=$3
+
+FORMAT="binary"
+
+c=1000
+nc=100
+mdomain=1100
+set=20
+labelset=10
+
+#XS data 10K rows
+if [ $MAXMEM -ge 80 ]; then
+ ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats
--nvargs R=10000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10k/data
TYPES=${BASE}/A_10k/types SETSIZE=$set LABELSETSIZE=$labelset
TYPES1=${BASE}/A_10k/set1.types TYPES2=${BASE}/A_10k/set2.types
INDEX1=${BASE}/A_10k/set1.indices INDEX2=${BASE}/A_10k/set2.indices FMT=$FORMAT
&
+fi
+
+#S data 100K rows
+if [ $MAXMEM -ge 800 ]; then
+ ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats
--nvargs R=100000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_100k/data
TYPES=${BASE}/A_100k/types SETSIZE=$set LABELSETSIZE=$labelset
TYPES1=${BASE}/A_100k/set1.types TYPES2=${BASE}/A_100k/set2.types
INDEX1=${BASE}/A_100k/set1.indices INDEX2=${BASE}/A_100k/set2.indices
FMT=$FORMAT &
+fi
+
+#M data 1M rows
+if [ $MAXMEM -ge 8000 ]; then
+ ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats
--nvargs R=1000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_1M/data
TYPES=${BASE}/A_1M/types SETSIZE=$set LABELSETSIZE=$labelset
TYPES1=${BASE}/A_1M/set1.types TYPES2=${BASE}/A_1M/set2.types
INDEX1=${BASE}/A_1M/set1.indices INDEX2=${BASE}/A_1M/set2.indices FMT=$FORMAT &
+fi
+
+#L data 10M rows
+if [ $MAXMEM -ge 80000 ]; then
+ ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats
--nvargs R=10000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10M/data
TYPES=${BASE}/A_10M/types SETSIZE=$set LABELSETSIZE=$labelset
TYPES1=${BASE}/A_10M/set1.types TYPES2=${BASE}/A_10M/set2.types
INDEX1=${BASE}/A_10M/set1.indices INDEX2=${BASE}/A_10M/set2.indices FMT=$FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genDimensionReductionData.sh
b/scripts/perftest/datagen/genDimensionReductionData.sh
new file mode 100644
index 0000000000..cd90aa1758
--- /dev/null
+++ b/scripts/perftest/datagen/genDimensionReductionData.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+ echo "Please execute scripts from directory 'perftest'"
+ exit 1;
+fi
+
+CMD=${1:-systemds}
+BASE=${2:-"temp"}/dimensionreduction
+MAXMEM=${3:-80}
+
+FORMAT="binary"
+
+echo "-- Generating Dimension Reduction data." >> results/times.txt;
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+ ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=5000 C=2000
OUT=$BASE/pcaData5k_2k_dense FMT=$FORMAT &
+fi
+
+#generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+ ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=50000 C=2000
OUT=$BASE/pcaData50k_2k_dense FMT=$FORMAT &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+ ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=500000 C=2000
OUT=$BASE/pcaData500k_2k_dense FMT=$FORMAT &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+ ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=5000000 C=2000
OUT=$BASE/pcaData5M_2k_dense FMT=$FORMAT
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+ ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=50000000 C=2000
OUT=$BASE/pcaData50M_2k_dense FMT=$FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genIOData.sh
b/scripts/perftest/datagen/genIOData.sh
new file mode 100644
index 0000000000..46154f8636
--- /dev/null
+++ b/scripts/perftest/datagen/genIOData.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+ echo "Please execute scripts from directory 'perftest'"
+ exit 1;
+fi
+
+CMD=${1:-systemds}
+DATADIR=${2:-"temp"}/io
+MAXMEM=${3:-1}
+
+FORMAT="csv" # can be csv, mm, text, binary
+
+echo "-- Generating IO data." >> results/times.txt;
+
+
+#generate XS scenarios (1MB)
+if [ $MAXMEM -ge 1 ]; then
+ ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X500_250_dense
R=500 C=250 Fmt=$FORMAT &
+fi
+
+#generate XS scenarios (10MB)
+if [ $MAXMEM -ge 10 ]; then
+ ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X5k_250_dense
R=5000 C=250 Fmt=$FORMAT &
+fi
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+ ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X10k_1k_dense
R=10000 C=1000 Fmt=$FORMAT &
+fi
+
+#generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+ ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X100k_1k_dense
R=100000 C=1000 Fmt=$FORMAT &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+ ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X1M_1k_dense
R=1000000 C=1000 Fmt=$FORMAT &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+ ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X10M_1k_dense
R=10000000 C=1000 Fmt=$FORMAT &
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+ ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X100M_1k_dense
R=100000000 C=1000 Fmt=$FORMAT &
+fi
+
+wait
diff --git a/scripts/perftest/datagen/genL2SVMData.sh
b/scripts/perftest/datagen/genL2SVMData.sh
new file mode 100644
index 0000000000..d25e433530
--- /dev/null
+++ b/scripts/perftest/datagen/genL2SVMData.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+ echo "Please execute scripts from directory 'perftest'"
+ exit 1;
+fi
+
+CMD=$1
+DATADIR=$2
+
+FORMAT="binary" # can be csv, mm, text, binary
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+BASEPATH=$(dirname $0)
+
+#generate XS scenarios (80MB)
+${CMD} -f ${BASEPATH}/../datagen/genRandData4LogisticRegression.dml --args
10000 1000 5 5 ${DATADIR}/w10k_1k_dense ${DATADIR}/X10k_1k_dense
${DATADIR}/Y10k_1k_dense 1 0 $DENSE_SP $FORMAT 1
+${CMD} -f ${BASEPATH}/../datagen/genRandData4LogisticRegression.dml --args
10000 1000 5 5 ${DATADIR}/w10k_1k_sparse ${DATADIR}/X10k_1k_sparse
${DATADIR}/Y10k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1
diff --git a/scripts/perftest/datagen/genMultinomialData.sh
b/scripts/perftest/datagen/genMultinomialData.sh
new file mode 100644
index 0000000000..95c42f87dd
--- /dev/null
+++ b/scripts/perftest/datagen/genMultinomialData.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+ echo "Please execute scripts from directory 'perftest'"
+ exit 1;
+fi
+
+CMD=$1
+BASE=$2/multinomial
+MAXMEM=$3
+
+FORMAT="binary"
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+echo "-- Generating multinomial data..." >> results/times.txt;
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000 1000
$DENSE_SP 5 0 $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $FORMAT 1 &
pidDense80=$!
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000 1000
$SPARSE_SP 5 0 $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $FORMAT 1 &
pidSparse80=$!
+ wait $pidDense80; ${CMD} -f scripts/extractTestData.dml $DASH-args
$BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $BASE/X10k_1k_dense_k5_test
$BASE/y10k_1k_dense_k5_test $FORMAT &
+ wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml $DASH-args
$BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $BASE/X10k_1k_sparse_k5_test
$BASE/y10k_1k_sparse_k5_test $FORMAT &
+fi
+
+##generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000 1000
$DENSE_SP 5 0 $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $FORMAT 1 &
pidDense800=$!
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000 1000
$SPARSE_SP 5 0 $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $FORMAT 1 &
pidSparse800=$!
+ wait $pidDense800; ${CMD} -f scripts/extractTestData.dml $DASH-args
$BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $BASE/X100k_1k_dense_k5_test
$BASE/y100k_1k_dense_k5_test $FORMAT &
+ wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml $DASH-args
$BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $BASE/X100k_1k_sparse_k5_test
$BASE/y100k_1k_sparse_k5_test $FORMAT &
+fi
+
+##generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000
$DENSE_SP 5 0 $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $FORMAT 1 &
pidDense8000=$!
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000
$SPARSE_SP 5 0 $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $FORMAT 1 &
pidSparse8000=$!
+ wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml $DASH-args
$BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $BASE/X1M_1k_dense_k5_test
$BASE/y1M_1k_dense_k5_test $FORMAT &
+ wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml $DASH-args
$BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $BASE/X1M_1k_sparse_k5_test
$BASE/y1M_1k_sparse_k5_test $FORMAT &
+fi
+
+##generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000
$DENSE_SP 5 0 $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $FORMAT 1
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000
$SPARSE_SP 5 0 $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 $FORMAT 1
+ ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_dense_k5
$BASE/y10M_1k_dense_k5 $BASE/X10M_1k_dense_k5_test $BASE/y10M_1k_dense_k5_test
$FORMAT
+ ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_sparse_k5
$BASE/y10M_1k_sparse_k5 $BASE/X10M_1k_sparse_k5_test
$BASE/y10M_1k_sparse_k5_test $FORMAT
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000
$DENSE_SP 5 0 $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $FORMAT 1
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000
$SPARSE_SP 5 0 $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $FORMAT 1
+ ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_dense_k5
$BASE/y100M_1k_dense_k5 $BASE/X100M_1k_dense_k5_test
$BASE/y100M_1k_dense_k5_test $FORMAT
+ ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_sparse_k5
$BASE/y100M_1k_sparse_k5 $BASE/X100M_1k_sparse_k5_test
$BASE/y100M_1k_sparse_k5_test $FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genRandData4ALS.dml
b/scripts/perftest/datagen/genRandData4ALS.dml
new file mode 100644
index 0000000000..f6c3562862
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4ALS.dml
@@ -0,0 +1,47 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+Xfile = $X; # output file for the generated matrix X of size m x n
+Ufile = ifdef($U, " "); # optional output file for the original row factor of size m x r
+Vfile = ifdef($V, " "); # optional output file for the original col factor of size r x n
+m = $rows; # no. of rows of X
+n = $cols; # no. of cols of X
+r = $rank; # rank of factorization
+nnz = $nnz; # no. of nonzeros in X
+sigma = ifdef ($sigma, 0.01); # variance of Gaussian noise
+fmt = ifdef ($fmt, "binary"); # output format
+
+# generate original factors by sampling from a normal(0,1.0) distribution
+U = rand(rows = m, cols = r, pdf = "normal", seed = 123);
+V = rand(rows = n, cols = r, pdf = "normal", seed = 456);
+
+I = floor(rand(rows = nnz, cols = 1, min = 1, max = m + 0.999999999));
+J = floor(rand(rows = nnz, cols = 1, min = 1, max = n + 0.999999999));
+X = rand(rows = nnz, cols = 1, pdf = "normal") * sqrt(sigma);
+N = table(I, J, X);
+X = (N != 0) * (U %*% t(V)) + N;
+write(X, Xfile, format = fmt);
+if( Ufile != " " )
+ write(U, Ufile, format = fmt);
+if( Vfile != " " ) {
+ V = t(V);
+ write(V, Vfile, format = fmt);
+}
diff --git a/scripts/perftest/datagen/genRandData4ChisquaredTest.dml
b/scripts/perftest/datagen/genRandData4ChisquaredTest.dml
new file mode 100644
index 0000000000..8f2b945e01
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4ChisquaredTest.dml
@@ -0,0 +1,87 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates a two column matrix of categorical
+# variables
+# used to test systemds's chi-squared bivariate stat
+# computation
+
+# $1 is number of samples to generate
+# $2 is number of categories for 1st categorical variable
+# $3 is number of categories for 2nd categorical variable
+# $4 is the file to write out the chi-squared statistic to
+# $5 is the file to write out the generated data to
+
+numSamples = $1
+numCategories1 = $2
+numCategories2 = $3
+
+# o: random table over (category1, category2), normalized so its cells sum to 1
+o = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=1.0,
pdf="uniform", seed=0)
+o = o / sum(o)
+
+# e: outer product of the row and column marginals of o
+probs1 = rowSums(o)
+probs1 = probs1 / sum(probs1)
+probs2 = colSums(o)
+probs2 = probs2 / sum(probs2)
+e = probs1 %*% probs2
+
+# statistic is computed from the probability tables o and e,
+# not from the samples generated further below
+chisquared = sum((o-e)^2/e)
+write(chisquared, $4, format="binary")
+
+# oCDF: running sum of o in row-major order; it starts as an all-zero matrix
+# (min = max = 0) and the first cell of row i continues from the last cell of row i-1
+oCDF = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=0.0,
pdf="uniform", seed=0)
+for(i in 1:numCategories1){
+ for(j in 1:numCategories2){
+ if(i==1 & j==1){
+ oCDF[i,j] = o[1,1]
+ }
+ if(i != 1 & j == 1){
+ oCDF[i,j] = oCDF[i-1,numCategories2] + o[i,j]
+ }
+ if(j > 1){
+ oCDF[i,j] = oCDF[i,j-1] + o[i,j]
+ }
+ }
+}
+
+# one: 1x1 matrix holding 1.0, used to turn the scalar category ids into
+# 1x1 matrices for the indexed assignments at the end of the loop
+one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform", seed=0)
+# data: all-zero numSamples x 2 output matrix, filled row by row
+data = Rand(rows=numSamples, cols=2, min=0.0, max=0.0, pdf="uniform", seed=0)
+parfor(s in 1:numSamples){
+ # NOTE(review): seed=0 is passed on every iteration; presumably each sample
+ # then draws the same r and lands in the same cell - confirm against the
+ # rand() seed semantics
+ r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0)
+ r = as.scalar(r_mat)
+
+ # pick the first cell (row-major) whose cumulative probability reaches r;
+ # "continue" is a plain flag variable that stops later cells from overwriting
+ # the pick; cat1/cat2 stay -1 if no cell qualifies
+ cat1 = -1
+ cat2 = -1
+ continue = 1
+ for(i in 1:numCategories1){
+ for(j in 1:numCategories2){
+ cdf = as.scalar(oCDF[i,j])
+ if(continue == 1 & r <= cdf){
+ cat1 = i
+ cat2 = j
+ continue = 0
+ }
+ }
+ }
+
+ data[s,1] = cat1*one
+ data[s,2] = cat2*one
+}
+write(data, $5, format="binary")
diff --git a/scripts/perftest/datagen/genRandData4DecisionTree.sh
b/scripts/perftest/datagen/genRandData4DecisionTree.sh
new file mode 100644
index 0000000000..44978192fe
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4DecisionTree.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+if [ "$1" == "" -o "$2" == "" ]; then echo "Usage: $0 <hdfsDataDir> <MR |
SPARK | ECHO> e.g. $0 perftest SPARK" ; exit 1 ; fi
+if [ "$2" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$2" ==
"MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi
+
+BASE=$1/trees
+
+FORMAT="csv"
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+PATH_LOCAL=/tmp/datagen
+PATH_HDFS=$BASE
+
+#### part 1: generating class labels and categorical features
+${CMD} -f ../datagen/genRandData4DecisionTree1.dml $DASH-nvargs
XCat=$BASE/XCat Y=$BASE/Y num_records=1000 num_cat=100 num_class=10
num_distinct=100 sp=$DENSE_SP
+
+#### part 2: generating spec.json on HDFS
+NUM_FEATURES=100
+
+echo "{ \"ids\": true
+ ,\"recode\": [1 " > $PATH_LOCAL/spec.json
+for i in $(seq 2 $NUM_FEATURES); do
+ echo " , "$i >> $PATH_LOCAL/spec.json
+done
+echo " ] , \"dummycode\": [ 1" >> $PATH_LOCAL/spec.json
+for i in $(seq 2 $NUM_FEATURES); do
+ echo " , "$i >> $PATH_LOCAL/spec.json
+done
+echo "] }" >> $PATH_LOCAL/spec.json
+
+hadoop fs -rm $PATH_HDFS/spec.json
+hadoop fs -copyFromLocal $PATH_LOCAL/spec.json $PATH_HDFS/spec.json
+
+#### part 3: generating scale feature and transforming categorical features,
finally combaning scale and categorical features
+${CMD} -f ../datagen/genRandData4DecisionTree2.dml $DASH-nvargs
tPath=$BASE/metadata tSpec=$BASE/spec.json XCat=$BASE/XCat X=$BASE/X
num_records=1000 num_scale=100 sp=$DENSE_SP fmt=$FORMAT
+
+
diff --git a/scripts/perftest/datagen/genRandData4DecisionTree1.dml
b/scripts/perftest/datagen/genRandData4DecisionTree1.dml
new file mode 100644
index 0000000000..7d1dd50d6b
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4DecisionTree1.dml
@@ -0,0 +1,40 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# Generates one-hot encoded class labels and a frame of integer-valued
+# categorical features.
+#
+# $XCat         = output location of the categorical features (csv frame)
+# $Y            = output location of the one-hot encoded labels
+# $num_records  = number of rows
+# $num_cat      = number of categorical feature columns
+# $num_class    = number of class labels
+# $num_distinct = upper bound of the generated category ids
+# $sp           = sparsity passed to rand() for the categorical features
+XCatFile = $XCat;
+YFile = $Y;
+num_records = $num_records;
+num_cat_features = $num_cat;
+num_class = $num_class;
+num_distinct = $num_distinct;
+sparsity = $sp;
+
+# generate class labels in 1..num_class
+Y = floor (rand (rows = num_records, cols = 1, min = 1, max = num_class + 0.99999999999999));
+# Fix the output size to num_records x num_class: without explicit dimensions
+# table() stops at the largest label that was actually drawn, so the
+# indicator matrix could end up with fewer than num_class columns.
+Y_bin = table (seq (1, num_records), Y, num_records, num_class);
+write (Y_bin, YFile);
+
+# generate categorical features; cells left empty by the sparsity setting stay 0
+X_cat = floor (rand (rows = num_records, cols = num_cat_features, min = 1, max = num_distinct + 0.99999999999999, sparsity = sparsity));
+fX_cat = as.frame(X_cat);
+write (fX_cat, XCatFile, format = "csv");
+
diff --git a/scripts/perftest/datagen/genRandData4DecisionTree2.dml
b/scripts/perftest/datagen/genRandData4DecisionTree2.dml
new file mode 100644
index 0000000000..715924915c
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4DecisionTree2.dml
@@ -0,0 +1,41 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# Builds the final feature matrix: random scale features column-bound with
+# the transformed categorical features read from $XCat.
+
+# ---- named arguments ----
+meta_dir = $tPath;
+spec_file = $tSpec;
+cat_file = $XCat;
+out_file = $X;
+n_rows = $num_records;
+n_scale = $num_scale;
+density = $sp;
+out_fmt = $fmt;
+
+# ---- scale features: values in [0, 10] with the requested sparsity ----
+scale_part = rand (rows = n_rows, cols = n_scale, min = 0, max = 10, sparsity = density);
+
+# ---- categorical features: apply the transform spec to the input ----
+cat_in = read (cat_file);
+spec_str = read (spec_file, data_type="scalar", value_type="string");
+cat_part = transform (target = cat_in, spec = spec_str, transformPath = meta_dir);
+
+# ---- combine (scale columns first) and write ----
+combined = cbind (scale_part, cat_part);
+write (combined, out_file, format = out_fmt);
diff --git a/scripts/perftest/datagen/genRandData4DescriptiveStats.dml
b/scripts/perftest/datagen/genRandData4DescriptiveStats.dml
new file mode 100644
index 0000000000..6f96162074
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4DescriptiveStats.dml
@@ -0,0 +1,149 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+------------------------------------------------
+ Parameters
+------------------------------------------------
+$R = #rows
+$C = #columns
+$NC = number of categorical attributes
+$MAXDOMAIN = maximum domain size
+$DATA = output file path on HDFS
+$SETSIZE = Size of one bivariate set
+$LABELSETSIZE= Size of second bivariate set with labels
+$TYPES = output attribute types
+$TYPES1 = Attribute types for Set1
+$TYPES2 = Attribute types for Set2
+$INDEX1 = Indices for Set1
+$INDEX2 = Indices for Set2
+$FMT = output format
+------------------------------------------------
+hadoop jar SystemDS.jar -f genRandData4DescriptiveStats.dml -nvargs R=1000000 C=1000 NC=50
MAXDOMAIN=1100 DATA=stats/data TYPES=stats/types SETSIZE=15 LABELSETSIZE=10
TYPES1=... TYPES2=... INDEX1=.. INDEX2=.. FMT=csv
+------------------------------------------------
+*/
+
+
+FMT = ifdef($FMT,"binary"); # default output format
+
+# number of categorical attributes.. numC <= C
+numC = $NC;
+numO = as.integer(numC/2);
+numNominal = numC - numO;
+print("Categorical Mix = (" + numC + "," + numO + "," + numNominal +")");
+
+# maximum domain size among all categorical attributes
+maxDomainSize = $MAXDOMAIN;
+
+# Divide $C attributes according to the following logic:
+#
+# 1 numO numC C
+# |-------|---------|-----------------|
+# ord nominal scale
+#
+# numC+1-$C: scale
+# 1-numC/2: ordinal
+# (numC/2+1)-numC: nominal
+
+# type codes written below: 3 for the ordinal range, 2 for the nominal range,
+# and 1 (the initial fill value) for the remaining scale columns
+types = matrix(1, rows=1, cols=$C);
+ocutoff = numO;
+types[1,1:ocutoff] = matrix(1,rows=1,cols=ocutoff)*3;
+types[1, ocutoff+1:numC] = matrix(1,rows=1,cols=(numC-ocutoff))*2;
+
+# Generate data
+# A holds the raw random values; B is the output: the first numC columns are
+# mapped to integer category ids, the remaining columns are copied from A
+A = rand(rows=$R, cols=$C, sparsity=1);
+B = matrix(0,rows=nrow(A), cols=ncol(A));
+parfor (i in 1:numC) {
+ Ai = A[,i];
+
+ # random domain size for this column, rounded from [1, maxDomainSize]
+ tmp = round(rand(rows=1,cols=1, min=1, max=maxDomainSize));
+ domain = as.scalar(tmp[1,1]);
+
+ # for some attributes, choose the maxDomainSize
+ tmp = rand(rows=1,cols=1);
+ if (as.scalar(tmp[1,1]) < 0.5) {
+ domain = maxDomainSize;
+ }
+
+ # rescale the column from A to 1 + (domain-1)*value and round to integers
+ B[,i] = round(1+(domain-1)*Ai);
+}
+# NOTE(review): the range (numC+1):ncol(A) presumably requires numC < $C - confirm
+B[ ,(numC+1):ncol(A)] = A[, (numC+1):ncol(A)];
+
+
+write(B, $DATA, format=FMT);
+write(types, $TYPES, format=FMT);
+
+# ----- Generator for Bivariate ---------
+
+# Set1 layout: positions 1..ocutoff get type 3 with indices drawn from the
+# ordinal columns, positions ocutoff+1..catSetSize get type 2 with indices
+# from the nominal columns, and the rest keep type 1 with indices from the
+# scale columns
+settypes1 = matrix(1, rows=1, cols=$SETSIZE);
+index1 = matrix(0, rows=1, cols=$SETSIZE);
+
+catSetSize = as.integer($SETSIZE/2);
+ocutoff = as.integer(catSetSize/2);
+print("Set Mix = (" + $SETSIZE + "," + catSetSize + "," + ocutoff + ")" );
+settypes1[1, 1:ocutoff] = matrix(1,rows=1,cols=ocutoff)*3;
+settypes1[1, ocutoff+1:catSetSize] =
matrix(1,rows=1,cols=(catSetSize-ocutoff))*2;
+
+# select ordinal indices
+tmp = rand(rows=1, cols=ocutoff);
+index1[1, 1:ocutoff] = round(1 + (numO-1)*tmp);
+
+# select nominal indices
+nominalSetSize = catSetSize-ocutoff;
+tmp = rand(rows=1, cols=nominalSetSize);
+index1[1, ocutoff+1:catSetSize] = round(numO+1 + (numC-numO-1)*tmp);
+
+# select scale attributes
+scaleSetSize = $SETSIZE-catSetSize;
+tmp = rand(rows=1, cols=scaleSetSize);
+index1[1, catSetSize+1:$SETSIZE] = round(numC+1 + ($C-numC-1)*tmp);
+
+
+# --- select types and indices for LABELSET
+# every entry starts as type 2; with more than one label the first entry
+# becomes type 1 with an index drawn from the scale columns, otherwise it
+# keeps type 2 with an index drawn from the nominal columns
+settypes2 = matrix(2, rows=1, cols=$LABELSETSIZE);
+index2 = matrix(0, rows=1, cols=$LABELSETSIZE);
+if($LABELSETSIZE > 1) {
+ settypes2[1,1] = 1;
+ r = as.scalar(rand(rows=1,cols=1));
+ index2[1,1] = round(numC+1 + ($C-numC-1)*r)
+}
+else {
+ r = as.scalar(rand(rows=1,cols=1));
+ index2[1,1] = round( numO+1 + (numC-numO-1)*r )
+}
+
+# entries 2..LABELSETSIZE/2: type 3, index drawn from the ordinal columns
+# NOTE(review): for $LABELSETSIZE < 4 the upper bound is below 2; confirm how
+# the loop behaves for such a descending range
+for(i in 2:as.integer($LABELSETSIZE/2)) {
+ settypes2[1,i] = 3;
+ r = as.scalar(rand(rows=1,cols=1));
+ index2[1,i] = round( 1 + (numO-1)*r )
+}
+
+# remaining entries: type 2, index drawn from the nominal columns
+for(i in as.integer($LABELSETSIZE/2)+1:$LABELSETSIZE) {
+ settypes2[1,i] = 2;
+ r = as.scalar(rand(rows=1,cols=1));
+ index2[1,i] = round( numO+1 + (numC-numO-1)*r )
+}
+
+write(settypes1, $TYPES1, format=FMT);
+write(settypes2, $TYPES2, format=FMT);
+write(index1, $INDEX1, format=FMT);
+write(index2, $INDEX2, format=FMT);
+
diff --git a/scripts/perftest/datagen/genRandData4FTest.dml
b/scripts/perftest/datagen/genRandData4FTest.dml
new file mode 100644
index 0000000000..9f0e1d6c68
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4FTest.dml
@@ -0,0 +1,95 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data for F-test
+#
+# $1 is number of groups (some of
+# which may share a gaussian)
+# $2 is number of actual groups
+# $3 is number of points
+# $4 is mean of the gaussian means
+# $5 is mean of the gaussian std. deviations
+# $6 is file to store computed f-statistic
+# $7 is file to store generated data
+
+numGroups = $1
+numActualGroups = $2
+numSamples = $3
+meanOfMeans = $4
+meanOfStddevs = $5
+
+# cntProbs: random group probabilities; cntArr: rounded per-group counts,
+# with the last group adjusted so that the counts sum to numSamples
+cntProbs = Rand(rows=numGroups, cols=1, min=0.0, max=1.0, pdf="uniform",
seed=0)
+cntProbs = cntProbs/sum(cntProbs)
+cntArr = round(cntProbs * numSamples)
+last_cnt = cntArr[numGroups,1]
+cntArr[numGroups,1] = numSamples - (sum(cntArr) - last_cnt)
+
+# permut: numActualGroups x numGroups 0/1 matrix mapping each group (column)
+# to an actual group (row); the first numActualGroups groups map to themselves
+permut = Rand(rows=numActualGroups, cols=numGroups, min=0.0, max=0.0,
pdf="uniform")
+ones = Rand(rows=numActualGroups, cols=1, min=1.0, max=1.0, pdf="uniform")
+permut[,1:numActualGroups] = diag(ones)
+
+one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform")
+copy_start_index = numActualGroups+1
+# each remaining group is assigned a drawn actual group j
+# NOTE(review): seed=0 is passed on every iteration; presumably j is then the
+# same for all of these groups - confirm against the rand() seed semantics
+parfor(i in copy_start_index:numGroups){
+ r = Rand(rows=1, cols=1, min=1.0, max=numActualGroups, pdf="uniform",
seed=0)
+ j = as.scalar(round(r))
+ permut[j,i] = one
+}
+
+# per-group means: standard-normal draws shifted by meanOfMeans, expanded
+# from actual groups to all groups through permut
+means_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0)
+abs_means = means_std + meanOfMeans
+means = t(t(abs_means) %*% permut)
+
+# per-group std. deviations, built the same way around meanOfStddevs
+# NOTE(review): same seed and shape as means_std above, so the two draws are
+# presumably identical - confirm
+stddevs_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0)
+abs_stddevs = stddevs_std + meanOfStddevs
+stddevs = t(t(abs_stddevs) %*% permut)
+
+overall_mean = sum(means*cntArr)/numSamples
+
+# the f-statistic is computed from the group parameters and counts,
+# not from the samples generated further below
+explained_variance = sum(cntArr * (means - overall_mean)^2) / (numGroups-1.0)
+unexplained_variance = sum(cntArr * stddevs^2) / (numSamples - numGroups)
+f = explained_variance / unexplained_variance
+write(f, $6, format="binary")
+
+# cntCDFs: running sum of the group probabilities
+cntCDFs = cntProbs
+for(i in 2:numGroups){
+ cntCDFs[i,1] = cntCDFs[i-1,1] + cntProbs[i,1]
+}
+
+# data: all-zero numSamples x 1 output vector, filled row by row
+data = Rand(rows=numSamples, cols=1, min=0.0, max=0.0, pdf="uniform")
+parfor(i in 1:numSamples){
+ # NOTE(review): fixed seed=0 here and for "point" below; presumably every
+ # iteration draws the same values - confirm
+ r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0)
+ r1 = as.scalar(r_mat)
+
+ # pick the first group whose cumulative probability reaches r1;
+ # "continue" is a plain flag variable; g stays -1 if no group qualifies
+ g = -1
+ continue = 1
+ for(k in 1:numGroups){
+ cdf = as.scalar(cntCDFs[k,1])
+ if(continue==1 & r1<=cdf){
+ g = k
+ continue=0
+ }
+ }
+
+ # standard-normal draw scaled and shifted by the parameters of group g
+ point = Rand(rows=1, cols=1, pdf="normal", seed=0)
+ data[i,1] = point*stddevs[g,1] + means[g,1]
+}
+write(data, $7, format="binary")
diff --git a/scripts/perftest/datagen/genRandData4Kmeans.dml
b/scripts/perftest/datagen/genRandData4Kmeans.dml
new file mode 100644
index 0000000000..3098650b26
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4Kmeans.dml
@@ -0,0 +1,120 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Generates random Gaussian-mixture data to test k-Means clustering algorithms
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+# ----------------------------------------------------------------------------
+# nr Int --- Number of records
+# nf Int --- Number of features
+# nc Int --- Number of clusters
+# dc Double --- St.dev. of cluster "centroid" features from zero mean
+# dr Double --- St.dev. of the 1-st feature in a record within cluster
+# fbf Double --- Feature bias factor: Stdev(last) / Stdev(1-st) feature
+# cbf Double --- Cluster bias factor: Prob[1-st clus] / Prob[k-th clus]
+# X String "X" Location to write matrix X with generated data records
+# C String "C" Location to write cluster "centroids" (Gaussian means)
+# Y String "Y" Location to write assignment of records to cluster ids
+# YbyC String "YbyC" Location to write rec-cluster assigns by min-dist to C
+# fmt String "text" Output format used for all four written matrices
+# ----------------------------------------------------------------------------
+#
+# Example:
+# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100
+# nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx
YbyC=YbyC.mtx
+
+print ("BEGIN K-MEANS GENERATOR SCRIPT");
+
+num_records = $nr;
+num_features = $nf;
+num_centroids = $nc;
+dist_per_feature_centroids = $dc;
+dist_per_feature_first_record = $dr;
+feature_bias_factor = $fbf;
+cluster_bias_factor = $cbf;
+
+fileX = ifdef ($X, "X");
+fileC = ifdef ($C, "C");
+fileY = ifdef ($Y, "Y");
+fileYbyC = ifdef ($YbyC, "YbyC");
+fmt = ifdef ($fmt, "text");
+
+print ("Generating cluster distribution (mixture) centroids...");
+
+# standard-normal centroid coordinates scaled by the requested st.dev.
+C = Rand (rows = num_centroids, cols = num_features, pdf = "normal");
+C = C * dist_per_feature_centroids;
+
+print ("Generating record-to-cluster assignments...");
+
+# Y is a multinomial in {1, ..., num_centroids} with 1 being more likely
+# than "num_centroids" by the factor of "cluster_bias_factor"
+
+# a bias factor of exactly 1.0 takes the separate first branch, since the
+# formula in the else-branch divides by log (cluster_bias_factor)
+rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf =
"uniform");
+if (cluster_bias_factor == 1.0) {
+ Y = round (0.5 + rnd * num_centroids);
+} else {
+ rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids /
(num_centroids - 1)));
+ Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log
(cluster_bias_factor));
+}
+
+print ("Generating within-cluster random shifts...");
+
+# feature_factors grows geometrically from dist_per_feature_first_record
+# (feature 1) to dist_per_feature_first_record * feature_bias_factor (last)
+X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal");
+feature_factors = dist_per_feature_first_record *
+ exp ((seq (1, num_features) - 1) / (num_features - 1) * log
(feature_bias_factor));
+X_shift = X_shift %*% diag (feature_factors);
+
+print ("Generating records by shifting from centroids...");
+
+# Y_bitmap_raw is copied into the leading columns of a zero matrix with
+# exactly num_centroids columns, so that Y_bitmap %*% C is well-formed
+Y_bitmap_raw = table (seq (1, num_records), Y);
+Y_bitmap = matrix (0, rows = num_records, cols = num_centroids);
+Y_bitmap [, 1 : ncol (Y_bitmap_raw)] = Y_bitmap_raw;
+X = Y_bitmap %*% C + X_shift;
+
+print ("Computing record-to-cluster assignments by minimum centroid
distance...");
+
+# D[i,k] = -2 * <x_i, c_k> + ||c_k||^2, i.e. the squared distance without the
+# ||x_i||^2 term, which is the same for every centroid of a given record
+D = t(t(-2 * (X %*% t(C))) + rowSums (C ^ 2));
+# P flags the minimal entries of each row; counting the leading zeros of the
+# row-wise running sum yields the index of the first minimal centroid
+P = (D <= rowMins (D));
+aggr_P = t(cumsum (t(P)));
+Y_by_C = rowSums (aggr_P == 0) + 1;
+
+print ("Computing useful statistics...");
+
+# attained_wcss adds the ||x_i||^2 terms (sumXsq) back to the row minima of D
+sumXsq = sum (X ^ 2);
+default_wcss = sumXsq - sum (colSums (X) ^ 2) / num_records;
+attained_wcss = sumXsq + sum (rowMins (D));
+
+print ("Default (single-cluster) WCSS = " + default_wcss);
+print (num_centroids + "-cluster WCSS attained by the mixture centroids = " +
attained_wcss);
+
+print ("Writing out the resulting dataset...");
+
+write (X, fileX, format = fmt);
+write (C, fileC, format = fmt);
+write (Y, fileY, format = fmt);
+write (Y_by_C, fileYbyC, format = fmt);
+
+print ("Please run the scoring script to compare " + fileY + " with " +
fileYbyC);
+
+print ("DONE: K-MEANS GENERATOR SCRIPT");
+
diff --git a/scripts/perftest/datagen/genRandData4LinearReg_LTstats.dml
b/scripts/perftest/datagen/genRandData4LinearReg_LTstats.dml
new file mode 100644
index 0000000000..9bb1ca189e
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4LinearReg_LTstats.dml
@@ -0,0 +1,233 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# generates random data to test linear regression: the response is the first
+# linear-term column plus standard-normal noise (the categorical label
+# sampling for logistic regression is kept commented out further below)
+
+# $N = number of training samples
+# $Nt = number of test samples (or 0 if none)
+# $nf = number of features (independent variables)
+# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
+#       (here it only sets the number of linear-term columns, nc - 1;
+#        Y is built from the first column alone)
+# $Xmin = minimum feature value
+# $Xmax = maximum feature value
+# $spars = controls sparsity in the generated data
+# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
+# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
+# $stdLT = requested standard deviation for the linear terms
+# $iceptmin = intercept, minimum value (0.0 disables intercept)
+# $iceptmax = intercept, maximum value (0.0 disables intercept)
+# $B = location to store generated regression parameters
+# $X = location to store generated training data
+# $Y = location to store generated training category labels
+# $Xt = location to store generated test data
+# $Yt = location to store generated test category labels
+# $fmt = format of the output
+#
+# Example:
+# hadoop jar SystemDS.jar -f genRandData4LinearReg_LTstats.dml -nvargs
+# N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0
avgLTmax=5.0 stdLT=1.25
+# iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123
Yt=./Yt123 fmt=binary
+
+numTrainingSamples = $N;
+numTestSamples = $Nt;
+numFeatures = $nf;
+numCategories = $nc;
+minIntercept = $iceptmin;
+maxIntercept = $iceptmax;
+minXentry = $Xmin;
+maxXentry = $Xmax;
+minAvgLT = $avgLTmin;
+maxAvgLT = $avgLTmax;
+sparsityLevel = $spars;
+stdevLT = $stdLT;
+fileB = ifdef ($B, "B");
+fileX = ifdef ($X, "X");
+fileY = ifdef ($Y, "Y");
+fileXt = ifdef ($Xt, "Xt");
+fileYt = ifdef ($Yt, "Yt");
+fmt = ifdef ($fmt, "mm");
+
+# training and test rows are generated together and split before writing
+numSamples = numTrainingSamples + numTestSamples;
+
+# nc = 1 is treated as two categories, i.e. one linear-term column;
+# isBinomialPMOne is only read inside the commented-out block below
+isBinomialPMOne = FALSE;
+if (numCategories == 1) {
+ numCategories = 2;
+ isBinomialPMOne = TRUE;
+}
+# the intercept row is appended to B only if the intercept range is not 0..0
+do_we_output_intercept = 1;
+if (minIntercept == 0 & maxIntercept == 0) {
+ do_we_output_intercept = 0;
+}
+
+X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max =
maxXentry, pdf = "uniform", sparsity = sparsityLevel);
+
+# per linear-term column: target mean, target std. deviation, and intercept
+meanLT = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max =
maxAvgLT, pdf = "uniform");
+sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
+b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept,
max = maxIntercept, pdf = "uniform");
+
+# the weights only need to produce the mean that is left after the intercept
+meanLT_minus_intercept = meanLT - b_intercept;
+[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
+
+# actual linear terms and their column-wise mean / std. deviation, for the report
+ones = matrix (1.0, rows = numSamples, cols = 1);
+LT = X %*% B + ones %*% b_intercept;
+actual_meanLT = colSums (LT) / numSamples;
+actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
+
+# print wanted vs. actual statistics; a changed new_sigmaLT means the
+# requested std. deviation was relaxed by generateWeights
+for (i in 1:(numCategories - 1)) {
+ if (as.scalar (new_sigmaLT [1, i]) == as.scalar (sigmaLT [1, i])) {
+ print ("Category " + i + ": Intercept = " + as.scalar (b_intercept
[1, i]));
+ } else {
+ print ("Category " + i + ": Intercept = " + as.scalar (b_intercept
[1, i]) + ", st.dev.(LT) relaxed from " + as.scalar (sigmaLT [1, i]));
+ }
+ print (" Wanted LT mean = " + as.scalar (meanLT [1, i]) + ",
st.dev. = " + as.scalar (new_sigmaLT [1, i]));
+ print (" Actual LT mean = " + as.scalar (actual_meanLT [1, i]) + ",
st.dev. = " + as.scalar (actual_sigmaLT [1, i]));
+}
+
+
+/*
+ones = matrix (1.0, rows = 1, cols = numCategories - 1);
+Prob = exp (LT);
+Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
+Prob = t(cumsum (t(Prob)));
+
+r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed
= 0);
+R = r %*% ones;
+Y = 1 + rowSums (Prob < R);
+if (isBinomialPMOne) {
+ Y = 3 - 2 * Y;
+}
+*/
+
+/* USE FOR LINEAR REGRESSION */
+
+# response = first linear-term column plus standard-normal noise
+r = Rand (rows = numSamples, cols = 1, pdf = "normal");
+Y = LT [, 1] + r;
+
+
+# with an intercept, it is written as one extra last row of B
+if (do_we_output_intercept == 1) {
+ new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
+ new_B [1:nrow(B), 1:ncol(B)] = B;
+ new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
+ write (new_B, fileB, format=fmt);
+} else {
+ write (B, fileB, format=fmt);
+}
+
+# the first numTrainingSamples rows form the training set, the rest the test set
+if (numTestSamples > 0) {
+ X_train = X [1:numTrainingSamples,];
+ Y_train = Y [1:numTrainingSamples,];
+ X_test = X [(numTrainingSamples+1):numSamples,];
+ Y_test = Y [(numTrainingSamples+1):numSamples,];
+ write (X_train, fileX, format=fmt);
+ write (Y_train, fileY, format=fmt);
+ write (X_test, fileXt, format=fmt);
+ write (Y_test, fileYt, format=fmt);
+} else {
+ write (X, fileX, format=fmt);
+ write (Y, fileY, format=fmt);
+}
+
+
+
+
+
+
+# Generates weight vectors to ensure the desired statistics for Linear Terms =
X %*% W
+# To be used for data generation in the testing of GLM, Logistic Regression,
etc.
+# INPUT: meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i]
are
+# the desired mean and standard deviation for X %*% W[, i]
+# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
+# new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully
enforced,
+# new_sigmaLT[1, i] > sigmaLT[1, i] if we had to relax this
constraint.
+generateWeights =
+ function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
+ return (Matrix[double] W, Matrix[double] new_sigmaLT)
+{
+ num_w = ncol (meanLT); # Number of output weight vectors
+ dim_w = ncol (X); # Number of features / dimensions in a weight
vector
+ w_X = t(colSums(X)); # "Prohibited" weight shift direction that changes
meanLT
+ # (all orthogonal shift directions do not affect
meanLT)
+
+ # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
+
+ # r_1 is the deviation of X %*% w_1 from the all-ones target
+ w_1 = straightenX (X);
+ r_1 = (X %*% w_1) - 1.0;
+ norm_r_1_sq = sum (r_1 ^ 2);
+
+ # For each W[, i] generate uniformly random directions to shift away from
"w_1"
+
+ # DW is DW_raw with its component along w_X removed
+ DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
+ DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to
w_X
+ XDW = X %*% DW;
+
+ # Determine how far to shift in the chosen directions to satisfy the
constraints
+ # Use the positive root of the quadratic equation; relax sigmaLT where
needed
+
+ # coefficients of a_qe * x^2 + b_qe * x + c_qe = 0, one equation per column
+ a_qe = colSums (XDW ^ 2);
+ b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
+ c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
+
+ # c_qe > 0 marks the columns whose sigmaLT is replaced by
+ # |meanLT| * sqrt (norm_r_1_sq / nrow(X)); their c_qe is then set to 0
+ is_sigmaLT_OK = (c_qe <= 0);
+ new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT)
* sqrt (norm_r_1_sq / nrow(X));
+ c_qe = is_sigmaLT_OK * c_qe;
+ x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
+
+ # Scale and shift "w_1" in the "DW" directions to produce the result:
+
+ # (ones %*% x_qe) replicates each column's shift length over all dim_w rows
+ ones = matrix (1.0, rows = dim_w, cols = 1);
+ W = w_1 %*% meanLT + DW * (ones %*% x_qe);
+}
+
+# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1
+# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
+# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X).
+# INPUT: X is the data matrix (rows = records, columns = features)
+# OUTPUT: w is a column vector with ncol(X) entries
+straightenX =
+ function (Matrix[double] X)
+ return (Matrix[double] w)
+{
+ # w_X = t(X) %*% 1, the right-hand side of the regularized normal equations
+ # (t(X) %*% X + lambda_LS * I) %*% z_LS = w_X that the loop below solves
+ w_X = t(colSums(X));
+ lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
+ eps = 0.000000001 * nrow(X);
+
+ # BEGIN LEAST SQUARES
+
+ # conjugate-gradient iteration starting from z_LS = 0; r_LS is the residual,
+ # p_LS the search direction, norm_r2_LS the squared residual norm
+ r_LS = - w_X;
+ z_LS = matrix (0.0, rows = ncol(X), cols = 1);
+ p_LS = - r_LS;
+ norm_r2_LS = sum (r_LS ^ 2);
+ i_LS = 0;
+ # stop after min(50, ncol(X)) iterations or once norm_r2_LS drops below eps
+ while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
+ {
+ # q_LS = (t(X) %*% X + lambda_LS * I) %*% p_LS without forming t(X) %*% X
+ temp_LS = X %*% p_LS;
+ q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
+ alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
+ z_LS = z_LS + alpha_LS * p_LS;
+ old_norm_r2_LS = norm_r2_LS;
+ r_LS = r_LS + alpha_LS * q_LS;
+ norm_r2_LS = sum (r_LS ^ 2);
+ p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
+ i_LS = i_LS + 1;
+ }
+
+ # END LEAST SQUARES
+
+ # sum (w_X * z_LS) equals sum (X %*% z_LS), so this scaling makes
+ # sum (X %*% w) = nrow(X)
+ w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
+}
diff --git a/scripts/perftest/datagen/genRandData4LinearRegression.dml
b/scripts/perftest/datagen/genRandData4LinearRegression.dml
new file mode 100644
index 0000000000..ebce4f30d1
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4LinearRegression.dml
@@ -0,0 +1,61 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates data to test linear regression
+
+# $1 is number of samples
+# $2 is number of features (independent variables)
+# $3 is maximum feature value (absolute value)
+# $4 is maximum weight (absolute value)
+# $5 is location to store generated weights
+# $6 is location to store generated data
+# $7 is location to store generated labels
+# $8 is 0/1. 0 suppresses noise, 1 will add noise to Y
+# $9 is b, 0 disables intercept
+# $10 controls sparsity in the generated data
+# $11 output format
+
+# positional script arguments
+nRows = $1
+nCols = $2
+featureScale = $3
+weightScale = $4
+noiseFlag = $8
+icpt = $9
+dataSparsity = $10
+outFmt = $11
+
+# features and weights: uniform draws from [-1, 1] with a fixed seed,
+# multiplied by the requested maximum absolute values
+X = Rand(rows=nRows, cols=nCols, min=-1, max=1, pdf="uniform",
+  seed=0, sparsity=dataSparsity) * featureScale
+w = Rand(rows=nCols, cols=1, min=-1, max=1, pdf="uniform", seed=0) * weightScale
+
+# noise-free response
+Y = X %*% w
+
+# optional intercept: shift the response and append the intercept to the
+# stored weight vector as its last entry
+if( icpt != 0 ) {
+  Y = Y + icpt
+  w = rbind(w, t(Rand(rows=1, cols=1, min=icpt, max=icpt, pdf="uniform")))
+}
+
+# normal noise is always drawn; noiseFlag (0/1) decides whether it is added
+Y = Y + noiseFlag * Rand(rows=nRows, cols=1, pdf="normal", seed=0)
+
+write(w, $5, format=outFmt)
+write(X, $6, format=outFmt)
+write(Y, $7, format=outFmt)
diff --git a/scripts/perftest/datagen/genRandData4LogReg_LTstats.dml
b/scripts/perftest/datagen/genRandData4LogReg_LTstats.dml
new file mode 100644
index 0000000000..f95342f708
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4LogReg_LTstats.dml
@@ -0,0 +1,233 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# generates random data to test bi- and multinomial logistic regression
+
+# $N = number of training samples
+# $Nt = number of test samples (or 0 if none)
+# $nf = number of features (independent variables)
+# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
+# $Xmin = minimum feature value
+# $Xmax = maximum feature value
+# $spars = controls sparsity in the generated data
+# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
+# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
+# $stdLT = requested standard deviation for the linear terms
+# $iceptmin = intercept, minimum value (0.0 disables intercept)
+# $iceptmax = intercept, maximum value (0.0 disables intercept)
+# $B = location to store generated regression parameters
+# $X = location to store generated training data
+# $Y = location to store generated training category labels
+# $Xt = location to store generated test data
+# $Yt = location to store generated test category labels
+#
+# Example:
+# hadoop jar SystemDS.jar -f genRandData4LogReg_LTstats.dml -nvargs
+# N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0
+# avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25 iceptmin=1.0 iceptmax=1.0
+# B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123
+
+# Read the named script arguments; the five output locations fall back to
+# default file names when they are not supplied.
+numTrainingSamples = $N;
+numTestSamples = $Nt;
+numFeatures = $nf;
+numCategories = $nc;
+minIntercept = $iceptmin;
+maxIntercept = $iceptmax;
+minXentry = $Xmin;
+maxXentry = $Xmax;
+minAvgLT = $avgLTmin;
+maxAvgLT = $avgLTmax;
+sparsityLevel = $spars;
+stdevLT = $stdLT;
+fileB = ifdef ($B, "B");
+fileX = ifdef ($X, "X");
+fileY = ifdef ($Y, "Y");
+fileXt = ifdef ($Xt, "Xt");
+fileYt = ifdef ($Yt, "Yt");
+
+
+# Training and test records are generated together and split at the end
+numSamples = numTrainingSamples + numTestSamples;
+
+# nc = 1 requests the binomial case: two categories internally, with the
+# labels converted to +1/-1 further below
+isBinomialPMOne = FALSE;
+if (numCategories == 1) {
+ numCategories = 2;
+ isBinomialPMOne = TRUE;
+}
+# The intercept row is written with B unless both intercept bounds are zero
+do_we_output_intercept = 1;
+if (minIntercept == 0 & maxIntercept == 0) {
+ do_we_output_intercept = 0;
+}
+
+X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max =
maxXentry, pdf = "uniform", sparsity = sparsityLevel);
+
+# One target mean, target standard deviation and intercept per non-baseline
+# category (numCategories - 1 columns)
+meanLT = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max =
maxAvgLT, pdf = "uniform");
+sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
+b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept,
max = maxIntercept, pdf = "uniform");
+
+# The weights only have to produce the part of the mean that the intercept
+# does not already contribute
+meanLT_minus_intercept = meanLT - b_intercept;
+[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
+
+# Linear terms and their actual per-category mean and standard deviation
+ones = matrix (1.0, rows = numSamples, cols = 1);
+LT = X %*% B + ones %*% b_intercept;
+actual_meanLT = colSums (LT) / numSamples;
+actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
+
+# Report requested vs. achieved statistics; a changed new_sigmaLT entry means
+# the standard deviation constraint was relaxed for that category
+for (i in 1:(numCategories - 1)) {
+ if (as.scalar (new_sigmaLT [1, i]) == as.scalar (sigmaLT [1, i])) {
+ print ("Category " + i + ": Intercept = " + as.scalar (b_intercept
[1, i]));
+ } else {
+ print ("Category " + i + ": Intercept = " + as.scalar (b_intercept
[1, i]) + ", st.dev.(LT) relaxed from " + as.scalar (sigmaLT [1, i]));
+ }
+ print (" Wanted LT mean = " + as.scalar (meanLT [1, i]) + ",
st.dev. = " + as.scalar (new_sigmaLT [1, i]));
+ print (" Actual LT mean = " + as.scalar (actual_meanLT [1, i]) + ",
st.dev. = " + as.scalar (actual_sigmaLT [1, i]));
+}
+
+
+# Category probabilities exp(LT) / (1 + rowSums(exp(LT))); the last category
+# is the baseline. The row-wise cumulative sums are used for sampling below.
+ones = matrix (1.0, rows = 1, cols = numCategories - 1);
+Prob = exp (LT);
+Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
+Prob = t(cumsum (t(Prob)));
+
+# Sample one label per record: 1 + number of cumulative probabilities that
+# lie below the uniform draw (fixed seed)
+r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed
= 0);
+R = r %*% ones;
+Y = 1 + rowSums (Prob < R);
+if (isBinomialPMOne) {
+ # maps label 1 to +1 and label 2 to -1
+ Y = 3 - 2 * Y;
+}
+
+
+/* USE FOR LINEAR REGRESSION
+
+r = Rand (rows = numSamples, cols = 1, pdf = "normal");
+Y = LT [, 1] + r;
+
+*/
+
+
+# Write the regression parameters; with an intercept, it is appended to B as
+# one extra last row
+if (do_we_output_intercept == 1) {
+ new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
+ new_B [1:nrow(B), 1:ncol(B)] = B;
+ new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
+ write (new_B, fileB, format="mm");
+} else {
+ write (B, fileB, format="mm");
+}
+
+# The first numTrainingSamples rows form the training set, the remaining rows
+# the test set; without test samples everything is written as training data
+if (numTestSamples > 0) {
+ X_train = X [1:numTrainingSamples,];
+ Y_train = Y [1:numTrainingSamples,];
+ X_test = X [(numTrainingSamples+1):numSamples,];
+ Y_test = Y [(numTrainingSamples+1):numSamples,];
+ write (X_train, fileX, format="mm");
+ write (Y_train, fileY, format="mm");
+ write (X_test, fileXt, format="mm");
+ write (Y_test, fileYt, format="mm");
+} else {
+ write (X, fileX, format="mm");
+ write (Y, fileY, format="mm");
+}
+
+
+
+
+
+
+# Generates weight vectors to ensure the desired statistics for
+# Linear Terms = X %*% W
+# To be used for data generation in the testing of GLM, Logistic Regression,
+# etc.
+# INPUT: meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i]
+# are the desired mean and standard deviation for X %*% W[, i]
+# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
+# new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully
+# enforced,
+# new_sigmaLT[1, i] > sigmaLT[1, i] if we had to relax this constraint.
+generateWeights =
+ function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
+ return (Matrix[double] W, Matrix[double] new_sigmaLT)
+{
+ num_w = ncol (meanLT); # Number of output weight vectors
+ dim_w = ncol (X); # Number of features / dimensions in a weight vector
+ w_X = t(colSums(X)); # "Prohibited" weight shift direction that changes meanLT
+ # (all orthogonal shift directions do not affect meanLT)
+
+ # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
+
+ w_1 = straightenX (X);
+ # r_1 is the deviation of X %*% w_1 from the constant 1
+ r_1 = (X %*% w_1) - 1.0;
+ norm_r_1_sq = sum (r_1 ^ 2);
+
+ # For each W[, i] generate uniformly random directions to shift away from
+ # "w_1"
+
+ DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
+ DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X
+ XDW = X %*% DW;
+
+ # Determine how far to shift in the chosen directions to satisfy the
+ # constraints
+ # Use the positive root of the quadratic equation; relax sigmaLT where
+ # needed
+ #
+ # For column i the shift length x solves a_qe * x^2 + b_qe * x + c_qe = 0,
+ # which states that the sum of squares of meanLT * r_1 + x * XDW[, i]
+ # equals sigmaLT^2 * nrow(X).
+
+ a_qe = colSums (XDW ^ 2);
+ b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
+ c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
+
+ # c_qe > 0 means the spread of meanLT * (X %*% w_1) alone already exceeds
+ # the requested sigmaLT; in that case that spread is reported instead and
+ # c_qe is set to zero
+ is_sigmaLT_OK = (c_qe <= 0);
+ new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT)
* sqrt (norm_r_1_sq / nrow(X));
+ c_qe = is_sigmaLT_OK * c_qe;
+ x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
+
+ # Scale and shift "w_1" in the "DW" directions to produce the result:
+
+ # ones %*% x_qe repeats each shift length down its column of DW
+ ones = matrix (1.0, rows = dim_w, cols = 1);
+ W = w_1 %*% meanLT + DW * (ones %*% x_qe);
+}
+
+# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1
+# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
+# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X).
+#
+# INPUT:  X is the feature matrix (rows = records, columns = features)
+# OUTPUT: w is a column vector with ncol(X) entries
+#
+# The unconditional problem is solved approximately by a conjugate gradient
+# iteration on the regularized normal equations
+#   (t(X) %*% X + lambda_LS * I) %*% z_LS = t(X) %*% 1
+# which runs for at most min(50, ncol(X)) steps.
+straightenX =
+ function (Matrix[double] X)
+ return (Matrix[double] w)
+{
+ # Right-hand side t(X) %*% 1, i.e. the column sums of X as a column vector
+ w_X = t(colSums(X));
+ # Regularization weight, proportional to the average squared column norm of X
+ lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
+ # Stopping threshold for the squared residual norm, proportional to nrow(X)
+ eps = 0.000000001 * nrow(X);
+
+ # BEGIN LEAST SQUARES
+
+ # Start at z_LS = 0, so the residual is -w_X and the first direction is w_X
+ r_LS = - w_X;
+ z_LS = matrix (0.0, rows = ncol(X), cols = 1);
+ p_LS = - r_LS;
+ norm_r2_LS = sum (r_LS ^ 2);
+ i_LS = 0;
+ while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
+ {
+ # q_LS = (t(X) %*% X + lambda_LS * I) %*% p_LS without forming t(X) %*% X
+ temp_LS = X %*% p_LS;
+ q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
+ # Step length along p_LS, then update the solution and the residual
+ alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
+ z_LS = z_LS + alpha_LS * p_LS;
+ old_norm_r2_LS = norm_r2_LS;
+ r_LS = r_LS + alpha_LS * q_LS;
+ norm_r2_LS = sum (r_LS ^ 2);
+ # Next direction: negative residual plus the scaled previous direction
+ p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
+ i_LS = i_LS + 1;
+ }
+
+ # END LEAST SQUARES
+
+ # sum (w_X * z_LS) equals sum (X %*% z_LS), so this rescaling makes
+ # sum (X %*% w) = nrow(X), i.e. avg (X %*% w) = 1
+ w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
+}
diff --git a/scripts/perftest/datagen/genRandData4LogisticRegression.dml
b/scripts/perftest/datagen/genRandData4LogisticRegression.dml
new file mode 100644
index 0000000000..f0850938ad
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4LogisticRegression.dml
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data to test linear logistic regression
+
+# $1 is number of samples
+# $2 is number of features (independent variables)
+# $3 is maximum feature value (absolute value)
+# $4 is maximum weight (absolute value)
+# $5 is location to store generated weights
+# $6 is location to store generated data
+# $7 is location to store generated labels
+# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
+# $9 is b, 0 disables intercept
+# $10 controls sparsity in the generated data
+# $11 output format
+# $12 transform labels. if 1 then labels are 1/2; otherwise -1/1
+
+# positional script arguments
+nRows = $1
+nCols = $2
+featureScale = $3
+weightScale = $4
+noiseFlag = $8
+icpt = $9
+dataSparsity = $10
+outFmt = $11
+labelMode = $12
+
+# features and weights: uniform draws from [-1, 1] with a fixed seed,
+# multiplied by the requested maximum absolute values
+X = Rand(rows=nRows, cols=nCols, min=-1, max=1, pdf="uniform",
+  seed=0, sparsity=dataSparsity) * featureScale
+w = Rand(rows=nCols, cols=1, min=-1, max=1, pdf="uniform", seed=0) * weightScale
+
+# linear term; a non-zero intercept shifts it and is appended to the stored
+# weight vector as its last entry
+linTerm = X %*% w
+if( icpt != 0 ) {
+  linTerm = linTerm + icpt
+  w = rbind(w, t(Rand(rows=1, cols=1, min=icpt, max=icpt, pdf="uniform")))
+}
+
+# logistic link
+pPos = 1 / (1 + exp(-linTerm))
+
+# the uniform thresholds are drawn the same way for both noise settings;
+# only the notice differs
+if( noiseFlag != 1 ) {
+  print("this data generator generates the same dataset for both noise=0 and noise=1")
+}
+thresholds = Rand(rows=nRows, cols=1, min=0, max=1, pdf="uniform", seed=0)
+
+# label is -1 where the probability is below the threshold, +1 otherwise
+Y = 1 - 2 * (pPos < thresholds)
+if( labelMode == 1 ) {
+  # map -1/+1 to 1/2
+  Y = (Y + 3) / 2
+}
+
+write(w, $5, format=outFmt)
+write(X, $6, format=outFmt)
+write(Y, $7, format=outFmt)
diff --git a/scripts/perftest/datagen/genRandData4MultiClassSVM.dml
b/scripts/perftest/datagen/genRandData4MultiClassSVM.dml
new file mode 100644
index 0000000000..011b4dab18
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4MultiClassSVM.dml
@@ -0,0 +1,68 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random two-class data (labels 1/2) to test multi-class SVM
+
+# $1 is number of samples
+# $2 is number of features (independent variables)
+# $3 is maximum feature value (absolute value)
+# $4 is maximum weight (absolute value)
+# $5 is location to store generated weights
+# $6 is location to store generated data
+# $7 is location to store generated labels
+# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
+# $9 is b, 0 disables intercept
+# $10 controls sparsity in the generated data
+
+# positional script arguments
+nRows = $1
+nCols = $2
+featureScale = $3
+weightScale = $4
+noiseFlag = $8
+icpt = $9
+dataSparsity = $10
+
+# features and weights: uniform draws from [-1, 1] with a fixed seed,
+# multiplied by the requested maximum absolute values
+X = Rand(rows=nRows, cols=nCols, min=-1, max=1, pdf="uniform",
+  seed=0, sparsity=dataSparsity) * featureScale
+w = Rand(rows=nCols, cols=1, min=-1, max=1, pdf="uniform", seed=0) * weightScale
+
+# linear term; a non-zero intercept shifts it and is appended to the stored
+# weight vector as its last entry
+linTerm = X %*% w
+if( icpt != 0 ) {
+  linTerm = linTerm + icpt
+  w = rbind(w, t(Rand(rows=1, cols=1, min=icpt, max=icpt, pdf="uniform")))
+}
+
+# logistic link
+pPos = 1 / (1 + exp(-linTerm))
+
+# the uniform thresholds are drawn the same way for both noise settings;
+# only the notice differs
+# (disabled alternative for the no-noise case: a constant threshold of 0.5,
+#  i.e. Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform"))
+if( noiseFlag != 1 ) {
+  print("this data generator generates the same dataset for both noise=0 and noise=1")
+}
+thresholds = Rand(rows=nRows, cols=1, min=0, max=1, pdf="uniform", seed=0)
+
+# -1/+1 labels (-1 where the probability is below the threshold), then
+# mapped to the class ids 1/2
+Y = ((1 - 2 * (pPos < thresholds)) + 3) / 2
+
+write(w, $5, format="binary")
+write(X, $6, format="binary")
+write(Y, $7, format="binary")
diff --git a/scripts/perftest/datagen/genRandData4Multinomial.dml
b/scripts/perftest/datagen/genRandData4Multinomial.dml
new file mode 100644
index 0000000000..93666758b5
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4Multinomial.dml
@@ -0,0 +1,66 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Positional arguments as used below:
+# $1 number of records, $2 number of features, $3 sparsity of X,
+# $4 number of classes, $5 intercept switch (1 adds an intercept),
+# $6 location to store X, $7 location to store Y, $8 output format
+nRows = $1;
+nCols = $2;
+sparsityX = $3;
+nClasses = $4;
+useIcpt = ($5 == 1);
+
+# range of the regression coefficients
+ltStd = 1.0;
+betaScale = 3.0 * ltStd / sqrt (nCols * sparsityX);
+
+# one intercept per non-baseline class
+if (useIcpt) {
+  icpt = Rand (rows = 1, cols = nClasses - 1, min = -1.0, max = 1.0);
+}
+
+# features in [1, 5] with the requested sparsity
+X = Rand (rows = nRows, cols = nCols, min = 1, max = 5,
+  pdf = "uniform", sparsity = sparsityX);
+
+# dense coefficients, uniform in [-1, 1] and scaled by betaScale
+B = Rand (rows = nCols, cols = nClasses - 1, min = -1.0, max = 1.0,
+  pdf = "uniform", sparsity = 1.0) * betaScale;
+
+linTerms = X %*% B;
+if (useIcpt) {
+  linTerms = linTerms + matrix (1, rows = nRows, cols = 1) %*% icpt;
+}
+
+# class probabilities (the last class is the baseline) and their
+# row-wise cumulative sums
+expLT = exp (linTerms);
+classProb = expLT / (1.0 + rowSums (expLT));
+cumProb = t(cumsum (t(classProb)));
+
+# label = 1 + number of cumulative probabilities below the uniform draw
+u = Rand (rows = nRows, cols = 1, min = 0, max = 1, pdf = "uniform");
+Y = 1 + rowSums (cumProb < u);
+
+# ensure all classes are represented
+Y[(nRows-nClasses+1):nRows,1] = seq(1,nClasses);
+
+write(X, $6, format=$8)
+write(Y, $7, format=$8);
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genRandData4NMF.dml
b/scripts/perftest/datagen/genRandData4NMF.dml
new file mode 100644
index 0000000000..a82ac4e0f1
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4NMF.dml
@@ -0,0 +1,129 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data for non-negative
+# matrix factorization
+#
+# follows lda's generative model
+# see Blei, Ng & Jordan, JMLR'03 paper
+# titled Latent Dirichlet Allocation
+#
+# $1 is number of samples
+# $2 is number of features
+# $3 is number of latent factors
+# $4 is number of features per sample
+# (may overlap). use this to vary
+# sparsity.
+# $5 is file to store sample mixtures
+# $6 is file to store factors
+# $7 is file to store generated data
+
+numDocuments = $1
+numFeatures = $2
+numTopics = $3
+numWordsPerDoc = $4
+
+# Per-document topic mixtures: random non-negative rows, normalized so each
+# row sums to one. Rows that sum to zero are divided by 0.1 instead, which
+# avoids a division by zero and leaves them all-zero.
+docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0,
pdf="uniform", seed=0, sparsity=0.75)
+denomsTM = rowSums(docTopicMixtures)
+zerosInDenomsTM = denomsTM == 0
+denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
+parfor(i in 1:numTopics){
+ docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
+}
+write(docTopicMixtures, $5, format="binary")
+# After writing, turn each row into running (cumulative) sums; these are
+# what the sampling loop below compares its uniform draws against.
+for(j in 2:numTopics){
+ docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
+}
+
+# Per-topic feature distributions: same construction, one row per topic
+topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0,
pdf="uniform", seed=0, sparsity=0.75)
+parfor(i in 1:numTopics){
+ topicDist = topicDistributions[i,]
+
+ denom2 = sum(topicDist)
+ if(denom2 == 0){
+ denom2 = denom2 + 0.1
+ }
+
+ topicDistributions[i,] = topicDist / denom2
+}
+write(topicDistributions, $6, format="binary")
+# Cumulative sums along each row, as for the topic mixtures above
+for(j in 2:numFeatures){
+ topicDistributions[,j] = topicDistributions[,j-1] +
topicDistributions[,j]
+}
+
+# All-zero count matrix (min = max = 0), one row per document
+data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
+
+parfor(i in 1:numDocuments){
+ docTopic = docTopicMixtures[i,]
+
+ # feature counts of the current document
+ ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform");
+
+ # one uniform draw per word for the topic (r_z) and for the feature (r_w)
+ # NOTE(review): both calls use seed=0 with identical dimensions, so r_z and
+ # r_w presumably hold the same values, for every document - confirm that
+ # this is intended
+ r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform",
seed=0)
+ r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform",
seed=0)
+
+ for(j in 1:numWordsPerDoc){
+ rz = as.scalar(r_z[j,1])
+ # "continue" is an ordinary flag here: 1 until the first match is found
+ continue = 1
+
+ z = -1
+ #this is a workaround
+ #z=1
+
+ # topic = first index whose cumulative mixture weight reaches the draw
+ for(k1 in 1:numTopics){
+ prob = as.scalar(docTopic[1,k1])
+ if(continue==1 & rz <= prob){
+ z=k1
+ continue=0
+ }
+ }
+
+ # no index matched: report it and fall back to the last topic
+ if(z==-1){
+ print("z is unassigned: " + z)
+ z = numTopics
+ }
+
+ rw = as.scalar(r_w[j,1])
+ continue = 1
+
+ w = -1
+ #this is a workaround
+ #w = 1
+
+ # feature = first index whose cumulative weight within topic z reaches
+ # the draw
+ for(k2 in 1:numFeatures){
+ prob = as.scalar(topicDistributions[z,k2])
+ if(continue == 1 & rw <= prob){
+ w = k2
+ continue = 0
+ }
+ }
+
+ # no index matched: report it and fall back to the last feature
+ if(w==-1){
+ print("w is unassigned: " + w)
+ w = numFeatures
+ }
+
+ # count one occurrence of feature w
+ ldata[1,w] = ldata[1,w] + 1
+ }
+
+ data[i,] = ldata;
+}
+
+write(data, $7, format="binary")
diff --git a/scripts/perftest/datagen/genRandData4NMFBlockwise.dml
b/scripts/perftest/datagen/genRandData4NMFBlockwise.dml
new file mode 100644
index 0000000000..0ad548ead2
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4NMFBlockwise.dml
@@ -0,0 +1,138 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data for non-negative
+# matrix factorization
+#
+# follows lda's generative model
+# see Blei, Ng & Jordan, JMLR'03 paper
+# titled Latent Dirichlet Allocation
+#
+# $1 is number of samples
+# $2 is number of features
+# $3 is number of latent factors
+# $4 is number of features per sample
+# (may overlap). use this to vary
+# sparsity.
+# $5 is file to store sample mixtures
+# $6 is file to store factors
+# $7 is file to store generated data
+#
+# $8 is the blocksize, i.e., number of rows per block
+# (should be set such that $8x$2 fits in mem budget)
+
+numDocuments = $1
+numFeatures = $2
+numTopics = $3
+numWordsPerDoc = $4
+blocksize = $8
+
+# Per-document topic mixtures: random non-negative rows, normalized so each
+# row sums to one. Rows that sum to zero are divided by 0.1 instead, which
+# avoids a division by zero and leaves them all-zero.
+docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0,
+  pdf="uniform", seed=0, sparsity=0.75)
+denomsTM = rowSums(docTopicMixtures)
+zerosInDenomsTM = (denomsTM == 0)
+denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
+parfor(i in 1:numTopics){
+  docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
+}
+write(docTopicMixtures, $5, format="binary")
+# After writing, turn each row into running (cumulative) sums; these are
+# what the sampling loop below compares its uniform draws against.
+for(j in 2:numTopics){
+  docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
+}
+
+# Per-topic feature distributions: same construction, one row per topic
+topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0,
+  pdf="uniform", seed=0, sparsity=0.75)
+parfor(i in 1:numTopics){
+  topicDist = topicDistributions[i,]
+
+  denom2 = sum(topicDist)
+  if(denom2 == 0){
+    denom2 = denom2 + 0.1
+  }
+
+  topicDistributions[i,] = topicDist / denom2
+}
+write(topicDistributions, $6, format="binary")
+# Cumulative sums along each row, as for the topic mixtures above
+for(j in 2:numFeatures){
+  topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j]
+}
+
+# All-zero count matrix (min = max = 0), one row per document
+data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
+
+#outer-loop for blockwise computation
+for( k in seq(1,numDocuments,blocksize) )
+{
+  # The block covers documents k .. k+len-1. The "+1" keeps the last document
+  # inside the final block and guarantees len >= 1; index ranges are
+  # inclusive on both ends, hence the "-1" on the upper bound.
+  len = min(blocksize,numDocuments-k+1); #block length
+  data = data0[k:(k+len-1),]; #obtain block
+
+  parfor(i in 1:len){
+    # i is the row inside the block; the mixture has to be read at the
+    # global document index
+    docTopic = docTopicMixtures[k+i-1,]
+
+    # one uniform draw per word for the topic (r_z) and for the feature (r_w)
+    # NOTE(review): both calls use seed=0 with identical dimensions, so r_z
+    # and r_w presumably hold the same values, for every document - confirm
+    # that this is intended
+    r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
+    r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0)
+
+    for(j in 1:numWordsPerDoc){
+      rz = as.scalar(r_z[j,1])
+      # "continue" is an ordinary flag here: 1 until the first match is found
+      continue = 1
+
+      z = -1
+      #this is a workaround
+      #z=1
+
+      # topic = first index whose cumulative mixture weight reaches the draw
+      for(k1 in 1:numTopics){
+        prob = as.scalar(docTopic[1,k1])
+        if(continue==1 & rz <= prob){
+          z=k1
+          continue=0
+        }
+      }
+
+      # no index matched: report it and fall back to the last topic
+      if(z==-1){
+        print("z is unassigned: " + z)
+        z = numTopics
+      }
+
+      rw = as.scalar(r_w[j,1])
+      continue = 1
+
+      w = -1
+      #this is a workaround
+      #w = 1
+
+      # feature = first index whose cumulative weight within topic z reaches
+      # the draw
+      for(k2 in 1:numFeatures){
+        prob = as.scalar(topicDistributions[z,k2])
+        if(continue == 1 & rw <= prob){
+          w = k2
+          continue = 0
+        }
+      }
+
+      # no index matched: report it and fall back to the last feature
+      if(w==-1){
+        print("w is unassigned: " + w)
+        w = numFeatures
+      }
+
+      # count one occurrence of feature w for the current document
+      data[i,w] = data[i,w] + 1
+    }
+  }
+
+  data0[k:(k+len-1),] = data; # write block back
+}
+
+write(data0, $7, format="binary")
diff --git a/scripts/perftest/datagen/genRandData4PCA.dml
b/scripts/perftest/datagen/genRandData4PCA.dml
new file mode 100644
index 0000000000..413d5c458e
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4PCA.dml
@@ -0,0 +1,61 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Synthetic data generator for PCA
+# 3 hidden dimensions (V1, V2, V3)
+# generates only "dense" data
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------
+# NAME  TYPE    DEFAULT  MEANING
+# ---------------------------------------------------------------------------
+# R     Int     10000    Number of rows
+# C     Int     1000     Number of columns (attributes)
+# OUT   String  ---      Location (on HDFS) to store the generated dataset
+# FMT   String  "csv"    Matrix output format, usually "text", "csv" or "binary"
+# ---------------------------------------------------------------------------
+#
+# Example:
+# hadoop jar SystemDS.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000
OUT=/user/biuser/pcaData.mtx FMT=csv
+
+# named script arguments with their defaults
+numRows = ifdef ($R, 10000)
+numCols = ifdef ($C, 1000)
+outFormat = ifdef ($FMT, "csv");
+
+# Modified version of the procedure from Zou et.al., "Sparse Principal
+# Component Analysis", 2006.
+
+# Hidden factors: V1 and V2 are pdf="normal" draws scaled by 290 and 300;
+# V3 = -0.3*V1 + 0.925*V2 + e, with e another pdf="normal" draw
+V1 = 0 + 290*rand(rows=numRows, cols=1, pdf="normal");
+V2 = 0 + 300*rand(rows=numRows, cols=1, pdf="normal");
+V3 = -0.3*V1 + 0.925*V2 + rand(rows=numRows, cols=1, pdf="normal");
+
+# Column groups: the first two have ceil(C/2.5) columns each, the third
+# takes the remaining columns
+groupWidth = ceil(numCols/2.5);
+lastWidth = numCols - groupWidth - groupWidth;
+endGroup1 = groupWidth;
+endGroup2 = groupWidth + groupWidth;
+
+# Every column is its group's hidden factor plus independent normal noise
+M = matrix(0, rows=numRows, cols=numCols)
+
+M[,1:endGroup1] = rand(rows=numRows, cols=groupWidth, pdf="normal") + V1;
+M[,(endGroup1+1):endGroup2] = rand(rows=numRows, cols=groupWidth, pdf="normal") + V2;
+M[,(endGroup2+1):numCols] = rand(rows=numRows, cols=lastWidth, pdf="normal") + V3;
+
+write(M, $OUT, format=outFormat);
diff --git a/scripts/perftest/datagen/genRandData4StratStats.dml
b/scripts/perftest/datagen/genRandData4StratStats.dml
new file mode 100644
index 0000000000..6a4c07f734
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4StratStats.dml
@@ -0,0 +1,155 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# THIS SCRIPT GENERATES SYNTHETIC DATA FOR STRATSTATS (STRATIFIED STATISTICS)
TESTING
+#
+# INPUT PARAMETERS:
+#
--------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+#
--------------------------------------------------------------------------------------------
+# nr Int 100000 Number of records in the generated dataset
+# nf Int 10 Number of features in the X and the Y parts of the
generated dataset
+# smin Int 10000 Minimum stratum value, a positive integer
+# smax Int 20000 Maximum stratum value, a positive integer
+# prs Double 100.0 How many times more likely to have minimum vs.
maximum stratum value
+# pxnan Double 0.05 Probability of a NaN replacing a value in X
+# pynan Double 0.05 Probability of a NaN replacing a value in Y
+# psnan Double 0.05 Probability of a NaN replacing a value in the stratum
column
+#
--------------------------------------------------------------------------------------------
+# mxmin Double 31.0 Baseline (mean) value for the first feature in X
+# mxmax Double 40.0 Baseline (mean) value for the last feature in X
+# mymin Double 11.0 Baseline (mean) value for the first feature in Y
(before adding X)
+# mymax Double 20.0 Baseline (mean) value for the last feature in Y
(before adding X)
+# bmin Double 3.0 "Beta" multiplied by X before adding to Y, for the
first feature
+# bmax Double 3.0 "Beta" multiplied by X before adding to Y, for the
last feature
+#
--------------------------------------------------------------------------------------------
+# sxbmin Double 3.0 Standard deviation for the first feature in X,
stratum dependent
+# sxbmax Double 3.0 Standard deviation for the last feature in X, stratum
dependent
+# sxwmin Double 4.0 Standard deviation for the first feature in X,
residual
+# sxwmax Double 4.0 Standard deviation for the last feature in X, residual
+# sybmin Double sqrt(28) Standard deviation for the first feature in Y,
stratum dependent
+# sybmax Double sqrt(28) Standard deviation for the last feature in Y, stratum
dependent
+# sywmin Double 6.0 Standard deviation for the first feature in Y,
residual
+# sywmax Double 6.0 Standard deviation for the last feature in Y, residual
+#
--------------------------------------------------------------------------------------------
+# D String "Data" Location (on HDFS) to store the generated dataset
+# Xcid String "Xcid" Location (on HDFS) to store the column indices of X
features
+# Ycid String "Ycid" Location (on HDFS) to store the column indices of Y
features
+# A String "Aux" Location (on HDFS) to store the auxiliary parameter
values, if any
+# fmt String "text" Matrix output format, usually "text", "mm", or "csv"
+#
--------------------------------------------------------------------------------------------
+# OUTPUT: Matrix with the generated dataset, Xcid and Ycid, and possibly other
auxiliaries
+
+num_records = ifdef ($nr, 100000); # number of generated records (rows of Data)
+num_features = ifdef ($nf, 10); # number of X features and, separately, of Y features
+min_stratumID = ifdef ($smin, 10000);
+max_stratumID = ifdef ($smax, 20000);
+prob_ratio_min_to_max_stratumID = ifdef ($prs, 100);
+prob_NaN_in_X = ifdef ($pxnan, 0.05);
+prob_NaN_in_Y = ifdef ($pynan, 0.05);
+prob_NaN_in_stratum = ifdef ($psnan, 0.05);
+
+mean_X_min = ifdef ($mxmin, 31.0); # baseline of the first X feature when $mxmin is not passed
+mean_X_max = ifdef ($mxmax, 40.0); # baseline of the last X feature when $mxmax is not passed
+mean_Y_min = ifdef ($mymin, 11.0); # baseline of the first Y feature when $mymin is not passed
+mean_Y_max = ifdef ($mymax, 20.0); # baseline of the last Y feature when $mymax is not passed
+beta_min = ifdef ($bmin, 3.0);
+beta_max = ifdef ($bmax, 3.0);
+
+stdev_X_between_strata_min = ifdef ($sxbmin, 3.0);
+stdev_X_between_strata_max = ifdef ($sxbmax, 3.0);
+stdev_X_within_strata_min = ifdef ($sxwmin, 4.0);
+stdev_X_within_strata_max = ifdef ($sxwmax, 4.0);
+stdev_Y_between_strata_min = ifdef ($sybmin, sqrt(28.0));
+stdev_Y_between_strata_max = ifdef ($sybmax, sqrt(28.0));
+stdev_Y_within_strata_min = ifdef ($sywmin, 6.0);
+stdev_Y_within_strata_max = ifdef ($sywmax, 6.0);
+
+fileData = ifdef ($D, "Data");
+fileXcid = ifdef ($Xcid, "Xcid");
+fileYcid = ifdef ($Ycid, "Ycid");
+fileAux = ifdef ($A, "Aux" );
+fmt = ifdef ($fmt, "text");
+
+# Generate the strata, from 1 to (max_stratumID - min_stratumID + 1), as
multinomial
+# in which 1 is less likely than (max_stratumID - min_stratumID + 1) by a
factor of
+# prob_ratio_min_to_max_stratumID
+
+r_power = (max_stratumID - min_stratumID) / log
(prob_ratio_min_to_max_stratumID);
+r_bound = prob_ratio_min_to_max_stratumID ^ (1.0 + 1.0 / (max_stratumID -
min_stratumID));
+
+if (r_bound < 1.0) {
+ R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf =
"uniform");
+ R_S = r_bound + R_S * (1.0-r_bound); # rescale the uniform draw to [r_bound, 1]
+} else {
+ R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf =
"uniform");
+ R_S = 1.0 + R_S * (r_bound-1); # rescale the uniform draw to [1, r_bound]
+}
+
+SID = round (0.5 + log (R_S) * r_power); # stratum index of each record
+num_strata = max (SID); # largest stratum index that was actually drawn
+Smap = table (SID, seq (1, num_records, 1)); # indicator matrix: Smap[s, r] = 1 when record r belongs to stratum s
+
+# Compute baseline values and standard deviations of X, Y, and beta, at each
feature
+
+mean_X = mean_X_min + ((mean_X_max - mean_X_min) / (num_features - 1)) * seq
(0, num_features - 1, 1);
+mean_Y = mean_Y_min + ((mean_Y_max - mean_Y_min) / (num_features - 1)) * seq
(0, num_features - 1, 1);
+betas = beta_min + (( beta_max - beta_min) / (num_features - 1)) * seq
(0, num_features - 1, 1);
+
+stdev_X_within_strata = stdev_X_within_strata_min +
+ ((stdev_X_within_strata_max - stdev_X_within_strata_min ) / (num_features
- 1)) * seq (0, num_features - 1, 1);
+stdev_X_between_strata = stdev_X_between_strata_min +
+ ((stdev_X_between_strata_max - stdev_X_between_strata_min) / (num_features
- 1)) * seq (0, num_features - 1, 1);
+stdev_Y_within_strata = stdev_Y_within_strata_min +
+ ((stdev_Y_within_strata_max - stdev_Y_within_strata_min ) / (num_features
- 1)) * seq (0, num_features - 1, 1);
+stdev_Y_between_strata = stdev_Y_between_strata_min +
+ ((stdev_Y_between_strata_max - stdev_Y_between_strata_min) / (num_features
- 1)) * seq (0, num_features - 1, 1);
+
+# Generate X and Y matrices (built transposed, features x records, and transposed back when assembling Data)
+
+RX_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal");
# transposed
+RY_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal");
# to allow
+RX_records = Rand (rows = num_features, cols = num_records, pdf = "normal");
# matrix-vector
+RY_records = Rand (rows = num_features, cols = num_records, pdf = "normal");
# operations
+
+t_X = RX_records * stdev_X_within_strata + (RX_strata * stdev_X_between_strata
+ mean_X) %*% Smap;
+t_Y = RY_records * stdev_Y_within_strata + (RY_strata * stdev_Y_between_strata
+ mean_Y) %*% Smap + (t_X * betas);
+Data = cbind (min_stratumID - 1 + SID, t(t_X), t(t_Y)); # column 1: stratum ID, then the X features, then the Y features
+
+# Set up the NaNs: Mask is nonzero where a NaN is wanted, and (1.0 - Mask) / (1.0 - Mask) is 0/0 there. NOTE(review): the same term evaluates to 1.0 in every other cell, so 1.0 is also added to all non-masked values - confirm this is intended
+
+RNaNS = Rand (rows = num_records, cols = 1, min = 1.0, max = 1.0, sparsity =
prob_NaN_in_stratum);
+RNaNX = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0,
sparsity = prob_NaN_in_X);
+RNaNY = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0,
sparsity = prob_NaN_in_Y);
+Mask = cbind (RNaNS, RNaNX, RNaNY) != 0; # same column layout as Data
+Data = Data + (1.0 - Mask) / (1.0 - Mask);
+
+# Output the dataset and the auxiliaries
+
+Xcid = t(seq (2, num_features + 1, 1)); # row vector with the Data column indices of the X features
+Ycid = t(seq (num_features + 2, 2 * num_features + 1, 1)); # row vector with the Data column indices of the Y features
+Aux = cbind (mean_X, mean_Y, betas); # one row per feature: baseline of X, baseline of Y, beta
+
+write (Data, fileData, format=fmt);
+write (Xcid, fileXcid, format=fmt);
+write (Ycid, fileYcid, format=fmt);
+write (Aux, fileAux, format=fmt);
+
diff --git a/scripts/perftest/datagen/genRandData4SurvAnalysis.dml
b/scripts/perftest/datagen/genRandData4SurvAnalysis.dml
new file mode 100644
index 0000000000..75117cf6d7
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4SurvAnalysis.dml
@@ -0,0 +1,133 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT GENERATES RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL
HAZARD MODELS
+# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTRIBUTION WITH PARAMETERS LAMBDA
AND V
+#
+# INPUT PARAMETERS:
+#
---------------------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+#
---------------------------------------------------------------------------------------------
+# type String --- The type of model for which the data is being
generated: "kaplan-meier" or "cox"
+# n Int --- Number of records
+# l Double 2.0 Scale parameter (lambda) of the Weibull distribution
used for generating timestamps
+# v Double 1.5 Shape parameter of the Weibull distribution
used for generating timestamps (default is 50 if type=cox)
+# p Double 0.8 1 - probability of a record being censored
+# g Int 2 If type=kaplan-meier the number of categorical
features used for grouping
+# s Int 1 If type=kaplan-meier the number of categorical
features used for stratifying
+# f Int 10 If type=kaplan-meier maximum number of levels
(i.e., distinct values) of g+s categorical features
+# m Int 1000 If type=cox the number of features in the model
+# sp Double 1.0 If type=cox the sparsity of the feature matrix
+# O String --- Location to write the output matrix containing
random data for the kaplan-meier or the cox model
+# B String --- If type=cox location to write the output
matrix containing the coefficients for the cox model
+# TE String --- Location to store column indices of X
corresponding to timestamp (first row) and event information (second row)
+# F String --- Location to store column indices of X
which are to be used for fitting the Cox model
+# fmt String "text" The output format of results of the
kaplan-meier analysis, such as "text" or "csv"
+#
---------------------------------------------------------------------------------------------
+# OUTPUTS:
+# 1- If type=kaplan-meier an n x (2+g+s) matrix O with
+# - column 1 contains timestamps generated randomly from a Weibull
distribution with parameters lambda and v
+# - column 2 contains the information whether an event occurred (1) or
data is censored (0)
+# - columns 3:2+g contain categorical features used for grouping
+# - columns 3+g:2+g+s contain categorical features used for stratifying
+# if type=cox an n x (2+m) matrix O with
+# - column 1 contains timestamps generated randomly from a Weibull
distribution with parameters lambda and v
+# - column 2 contains the information whether an event occurred (1) or
data is censored (0)
+# - columns 3:2+m contain scale features
+# 2- If type=cox a coefficient matrix B
+# 3- A column matrix TE containing the column indices of X corresponding to
timestamp (first row) and event information (second row)
+# 4- A column matrix F containing the column indices of X which are to be used
for KM analysis or fitting the Cox model
+
+type = $type; # either "kaplan-meier" or "cox"
+num_records = $n; # number of generated records; no default, must be passed
+lambda = ifdef ($l, 2.0); # Weibull scale parameter; note that it is read from the nvarg named "l"
+p_event = ifdef ($p, 0.8); # 1 - prob. of a record being censored
+# parameters related to the kaplan-meier model
+n_groups = ifdef ($g, 2);
+n_strata = ifdef ($s, 1);
+max_level = ifdef ($f, 10);
+# parameters related to the cox model
+num_features = ifdef ($m, 1000); # also used to build F at the end of the script
+sparsity = ifdef ($sp, 1.0);
+fileO = $O;
+fileB = $B;
+fileTE = $TE;
+fileF = $F;
+fmtO = ifdef ($fmt, "text"); # output format for all written matrices, "text" if $fmt is not passed
+p_censor = 1 - p_event; # prob. that record is censored
+
+if (type == "kaplan-meier") {
+
+ v = ifdef ($v, 1.5); # Weibull shape parameter, default 1.5 for this model type
+ # generate categorical features used for grouping and stratifying
+ X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min =
0.000000001, max = max_level - 0.000000001, pdf = "uniform"));
+
+ # generate timestamps
+ U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1);
+ T = (-log (U) / lambda) ^ (1/v);
+
+} else if (type == "cox") {
+
+ v = ifdef ($v, 50); # Weibull shape parameter, default 50 for this model type
+ # generate feature matrix
+ X = rand (rows = num_records, cols = num_features, min = 1, max = 5,
pdf = "uniform", sparsity = sparsity);
+
+ # generate coefficients
+ B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf =
"uniform", sparsity = 1.0); # * beta_range;
+
+ # generate timestamps
+ U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1);
+ T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v);
+
+} else {
+ stop ("Wrong model type!");
+}
+
+Y = matrix (0, rows = num_records, cols = 2); # column 1 receives the binned timestamp, column 2 the event indicator
+event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max =
(1 + p_event)));
+n_time = sum (event); # number of records with event = 1
+Y[,2] = event;
+
+# binning of event times; despite its name, num_bins below holds (max_T - min_T) / n_time and is used as the bin width
+min_T = min (T);
+max_T = max (T);
+# T = T - min_T;
+len = max_T - min_T;
+num_bins = len / n_time;
+T = ceil (T / num_bins);
+
+# print ("min(T) " + min(T) + " max(T) " + max(T));
+Y[,1] = T;
+
+O = cbind (Y, X); # timestamp, event indicator, then the generated features
+write (O, fileO, format = fmtO);
+
+if (type == "cox") {
+ write (B, fileB, format = fmtO); # coefficients exist only for the cox model
+
+}
+
+TE = matrix ("1 2", rows = 2, cols = 1); # indices of the timestamp and event columns
+F = seq (1, num_features); # built from num_features for both model types
+write (TE, fileTE, format = fmtO);
+write (F, fileF, format = fmtO);
+
diff --git a/scripts/perftest/datagen/genRandData4Transform.dml
b/scripts/perftest/datagen/genRandData4Transform.dml
new file mode 100644
index 0000000000..edab7c2873
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4Transform.dml
@@ -0,0 +1,96 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Generates random data to test transform with
+#
+# rows, cols: dimensions of the data matrix to be generated
+# prob_cat: probability of each generated col being categorical
+# min_domain, max_domain: provide a range for domain sizes of the generated
categorical cols
+# prob_missing: probability of each generated (scale) col having missing values
+# prob_missing_cell: probability of a cell to have a missing value
+# out_X, out_missing, out_categorical: output file names
+#
+
+#params for size of data
+num_rows = ifdef($rows, 1000)
+num_cols = ifdef($cols, 25)
+
+#params for kind of cols
+prob_categorical = ifdef($prob_cat, 0.1) # read from the nvarg named "prob_cat"
+min_domain_size = ifdef($min_domain, 1)
+max_domain_size = ifdef($max_domain, 10)
+
+#params for missing value cols
+prob_missing_col = ifdef($prob_missing, 0.1)
+prob_missing_val = ifdef($prob_missing_cell, 0.2)
+
+num_scalar_cols = as.double(num_cols) # start by treating every column as a scale column
+num_categorical_cols = 0.0
+scalar_ind = matrix(1, rows=num_scalar_cols, cols=1) # 1 marks a scale column
+if(prob_categorical > 0){
+ categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
+ categorical_ind = categorical_ind < prob_categorical # 1 where the column was drawn as categorical
+ categorical_col_ids = removeEmpty(target=seq(1, num_cols,
1)*categorical_ind, margin="rows")
+ num_categorical_cols = sum(categorical_ind)
+ write(categorical_col_ids, $out_categorical, format="csv")
+
+ domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1,
pdf="uniform")
+ domain_sizes = round(min_domain_size + (max_domain_size -
min_domain_size)*domain_sizes)
+
+ categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1,
pdf="uniform")
+ categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1))) # scale each column to integer codes between 1 and its domain size
+
+ scalar_ind = 1-categorical_ind # every column that is not categorical is a scale column
+}
+
+scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind,
margin="rows")
+num_scalar_cols = sum(scalar_ind)
+scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1,
pdf="uniform")
+
+if(num_categorical_cols > 0 & num_scalar_cols > 0){
+ X = cbind(scalar_X, categorical_X) # scale columns first, categorical columns last
+ permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids,
num_scalar_cols, num_cols)
+ fill_in = matrix(0, rows=num_cols-num_scalar_cols, cols=num_cols) # all-zero rows, one per categorical column
+ permut_mat = t(cbind(t(permut_mat), t(fill_in))) # stacks fill_in below permut_mat
+ X = X %*% permut_mat # NOTE(review): the rows matching categorical_X are all zero, so the categorical values appear to be dropped here - confirm
+}else{
+ if(num_categorical_cols > 0) X = categorical_X
+ else{
+ if(num_scalar_cols > 0) X = scalar_X
+ else print("somehow, we've managed to compute that precisely 0 cols should
be categorical and 0 cols should be scale")
+ }
+}
+
+if(prob_missing_col > 0){
+ missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
+ missing_col_ind = missing_col_ind < prob_missing_col
+ #currently only support missing value imputation for scale cols
+ missing_col_ind = missing_col_ind * scalar_ind
+ missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind,
margin="rows")
+ missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0,
max=1, pdf="uniform")
+ missing_values = missing_values < prob_missing_val # 0/1 indicator per cell
+ X = cbind(X, missing_values) # appends one indicator column per selected column
+
+ write(missing_col_ids, $out_missing, format="csv")
+}
+
+write(X, $out_X, format="csv")
diff --git a/scripts/perftest/datagen/genRandData4Univariate.dml
b/scripts/perftest/datagen/genRandData4Univariate.dml
new file mode 100644
index 0000000000..bcbd528eb9
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4Univariate.dml
@@ -0,0 +1,61 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random numbers from a distribution
+# with specified mean, standard deviation,
+# skewness, kurtosis
+# mean and standard deviation are taken in as
+# arguments by this script
+# a,b,c,d are coefficients computed by some
+# equation solver determined from the specified
+# skewness and kurtosis using power method
+# polynomials
+#
+# for more details see:
+# Statistical Simulation: Power Method Polynomials
+# and Other Transformations
+# Author: Todd C. Headrick
+# Chapman & Hall/CRC, Boca Raton, FL, 2010.
+# ISBN 978-1-4200-6490-2
+
+# $1 is the number of random points to be sampled
+# $2 is specified mean
+# $3 is specified standard deviation
+# $4-$7 are a,b,c,d obtained by solving a system
+# of equations using specified kurtosis and skewness
+# $8 is the file to write out the generated data to
+
+numSamples = $1 # number of points to sample
+mu = $2 # target mean
+sigma = $3 # target standard deviation
+a = $4 # polynomial coefficient of the constant term
+b = $5 # polynomial coefficient of X
+c = $6 # polynomial coefficient of X^2
+d = $7 # polynomial coefficient of X^3
+
+
+print("a=" + a + " b=" + b + " c=" + c + " d=" + d)
+
+X = Rand(rows=numSamples, cols=1, pdf="normal", seed=0) # normal draws with a fixed seed
+Y = a + b*X + c*X^2 + d*X^3 # cubic polynomial transform of the normal draws
+
+Z = Y*sigma + mu # scale and shift to the requested standard deviation and mean
+write(Z, $8, format="binary") # $8 is the output file; the format is always binary
diff --git a/scripts/perftest/datagen/genStratStatisticsData.sh
b/scripts/perftest/datagen/genStratStatisticsData.sh
new file mode 100644
index 0000000000..330247cce0
--- /dev/null
+++ b/scripts/perftest/datagen/genStratStatisticsData.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+ echo "Please execute scripts from directory 'perftest'"
+ exit 1;
+fi
+
+CMD=$1
+BASE=$2/stratstats
+MAXMEM=$3
+
+FORMAT="binary"
+
+echo "-- Generating stats data..." >> results/times.txt;
+
+#XS data 10K rows
+if [ $MAXMEM -ge 80 ]; then
+ ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs
nr=10000 nf=100 D=${BASE}/A_10k/data Xcid=${BASE}/A_10k/Xcid
Ycid=${BASE}/A_10k/Ycid A=${BASE}/A_10k/A fmt=$FORMAT &
+fi
+
+#S data 100K rows
+if [ $MAXMEM -ge 800 ]; then
+ ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs
nr=100000 nf=100 D=${BASE}/A_100k/data Xcid=${BASE}/A_100k/Xcid
Ycid=${BASE}/A_100k/Ycid A=${BASE}/A_100k/A fmt=$FORMAT &
+fi
+
+#M data 1M rows
+if [ $MAXMEM -ge 8000 ]; then
+ ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs
nr=1000000 nf=100 D=${BASE}/A_1M/data Xcid=${BASE}/A_1M/Xcid
Ycid=${BASE}/A_1M/Ycid A=${BASE}/A_1M/A fmt=$FORMAT &
+fi
+
+#L data 10M rows
+if [ $MAXMEM -ge 80000 ]; then
+ ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs
nr=10000000 nf=100 D=${BASE}/A_10M/data Xcid=${BASE}/A_10M/Xcid
Ycid=${BASE}/A_10M/Ycid A=${BASE}/A_10M/A fmt=$FORMAT
+fi
+
+#XL data 100M rows
+if [ $MAXMEM -ge 800000 ]; then
+ ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs
nr=100000000 nf=100 D=${BASE}/A_10M/data Xcid=${BASE}/A_10M/Xcid
Ycid=${BASE}/A_10M/Ycid A=${BASE}/A_10M/A fmt=$FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/sparkDML2.sh b/scripts/perftest/sparkDML2.sh
index dde9805719..6102fb3d8a 100644
--- a/scripts/perftest/sparkDML2.sh
+++ b/scripts/perftest/sparkDML2.sh
@@ -1,3 +1,25 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
#Client mode spark-submit script
export SPARK_HOME=/home/hadoop/spark-3.3.1-bin-hadoop3
export HADOOP_CONF_DIR=/home/hadoop/hadoop-3.3.1/etc/hadoop
@@ -13,4 +35,4 @@ $SPARK_HOME/bin/spark-submit \
--conf spark.network.timeout=512s \
--executor-memory 200g \
--executor-cores 48 \
- SystemDS.jar "$@"
\ No newline at end of file
+ SystemDS.jar "$@"