This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push: new 436562767b [SYSTEMDS-3535] Scalable Linear Algebra Benchmark (SLAB) 436562767b is described below commit 436562767b5e98d231fb714f01a561b3f19c6d0f Author: ReneEnjilian <enjilianr...@gmail.com> AuthorDate: Sun Aug 18 19:01:39 2024 +0200 [SYSTEMDS-3535] Scalable Linear Algebra Benchmark (SLAB) Closes #2055. --- scripts/perftest/slab/data/gen_dense_data.py | 68 ++++ scripts/perftest/slab/data/gen_sparse_data.py | 77 +++++ .../distributed/run_distributed_ml_algorithms.sh | 89 +++++ ...HeteroscedasticityRobustStandardErrorsDistr.dml | 56 ++++ .../distributed/slabLogisticRegressionDistr.dml | 56 ++++ .../slabNonNegativeMatrixFactorizationDistr.dml | 55 +++ .../slabOrdinaryLeastSquaresRegressionDistr.dml | 41 +++ .../slab/mlAlgorithms/distributed/slabPCADistr.dml | 57 ++++ .../native/run_native_ml_algorithms.sh | 74 ++++ .../slab/mlAlgorithms/native/slabLinearRegCG.dml | 193 +++++++++++ .../slab/mlAlgorithms/native/slabMultiLogitReg.dml | 373 +++++++++++++++++++++ .../slab/mlAlgorithms/native/slabNativePCA.dml | 130 +++++++ .../single_node_dense/run_single_node_dense_ml.sh | 68 ++++ .../slabHeteroscedasticityRobustStandardErrors.dml | 55 +++ .../single_node_dense/slabLogisticRegression.dml | 55 +++ .../slabNonNegativeMatrixFactorization.dml | 54 +++ .../slabOrdinaryLeastSquaresRegression.dml | 40 +++ .../mlAlgorithms/single_node_dense/slabPCA.dml | 56 ++++ .../run_distributed_matrix_sparse.sh | 102 ++++++ .../distributed_sparse/slabFrobeniusNormSparse.dml | 35 ++ .../distributed_sparse/slabGramMatrixSparse.dml | 35 ++ .../slabMatrixAdditionSparse.dml | 37 ++ .../distributed_sparse/slabMatrixMultSparse.dml | 37 ++ .../slabMatrixVectorMultSparse.dml | 36 ++ .../distributed_sparse/slabTransposeSparse.dml | 34 ++ .../run_single_node_matrix_dense.sh | 64 ++++ .../single_node_dense/slabFrobeniusNorm.dml | 29 ++ .../operators/single_node_dense/slabGramMatrix.dml | 29 ++ 
.../single_node_dense/slabMatrixAddition.dml | 34 ++ .../operators/single_node_dense/slabMatrixMult.dml | 32 ++ .../single_node_dense/slabMatrixVectorMult.dml | 33 ++ .../operators/single_node_dense/slabTranspose.dml | 30 ++ .../perftest/slab/pipeline/run_slab_pipeline.sh | 78 +++++ .../slab/pipeline/slabMultiplicationChain.dml | 38 +++ scripts/perftest/slab/pipeline/slabSVD.dml | 38 +++ scripts/perftest/slab/slabUtils.dml | 49 +++ src/main/java/org/apache/sysds/api/DMLScript.java | 2 +- 37 files changed, 2368 insertions(+), 1 deletion(-) diff --git a/scripts/perftest/slab/data/gen_dense_data.py b/scripts/perftest/slab/data/gen_dense_data.py new file mode 100644 index 0000000000..82d0e7db91 --- /dev/null +++ b/scripts/perftest/slab/data/gen_dense_data.py @@ -0,0 +1,68 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +import os +import numpy as np +import pandas as pd + +def gen_data_dense(rows, cols, path, chunk_size=10000): + """ + Generate a dense matrix and save it to a CSV file. + + Parameters: + rows (int): Number of rows. + cols (int): Number of columns. 
+ path (str): Path to save the generated matrix. + chunk_size (int): Number of rows per chunk to generate and save. + """ + with open(path, 'w') as f: + for start_row in range(0, rows, chunk_size): + end_row = min(start_row + chunk_size, rows) + chunk_rows = end_row - start_row + + # Generate a dense matrix with random values + chunk_matrix = np.random.random((chunk_rows, cols)) + + # Save the chunk to the CSV file + np.savetxt(f, chunk_matrix, delimiter=',') + # np.savetxt(f, chunk_matrix, delimiter=',', fmt='%.10f') + print(f"Saved chunk {start_row} to {end_row} to {path}") + +def main(): + # Hardcoded parameters + dense_gb = 0.0001 + + current_directory = os.getcwd() + target_directory = os.path.abspath(os.path.join(current_directory, '../../../../src/test/resources/datasets/slab/dense')) + os.makedirs(target_directory, exist_ok=True) + + k = int(np.ceil((dense_gb * 1e9) / float(8 * 100))) + + # Paths for saving the matrices + mpath_tall = os.path.join(target_directory, 'M_dense_tall.csv') + mpath_wide = os.path.join(target_directory, 'M_dense_wide.csv') + + # Generate and save dense matrices + gen_data_dense(k, 100, mpath_tall) + gen_data_dense(100, k, mpath_wide) + +if __name__ == "__main__": + main() diff --git a/scripts/perftest/slab/data/gen_sparse_data.py b/scripts/perftest/slab/data/gen_sparse_data.py new file mode 100644 index 0000000000..3279a46f14 --- /dev/null +++ b/scripts/perftest/slab/data/gen_sparse_data.py @@ -0,0 +1,77 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +import os +import numpy as np +import pandas as pd + +def gen_data_sparse(rows, cols, density, path, chunk_size=10000): + """ + Generate a sparse matrix with given density and save it to a CSV file in a dense format. + + Parameters: + rows (int): Number of rows. + cols (int): Number of columns. + density (float): Fraction of non-zero elements. + path (str): Path to save the generated matrix. + chunk_size (int): Number of rows per chunk to generate and save. + """ + with open(path, 'w') as f: + for start_row in range(0, rows, chunk_size): + end_row = min(start_row + chunk_size, rows) + chunk_rows = end_row - start_row + + chunk_matrix = np.zeros((chunk_rows, cols)) + + n_nonzero = int(density * chunk_rows * cols) + nonzero_indices = (np.random.randint(chunk_rows, size=n_nonzero), np.random.randint(cols, size=n_nonzero)) + chunk_matrix[nonzero_indices] = np.random.random(n_nonzero) + + np.savetxt(f, chunk_matrix, delimiter=',') + #np.savetxt(f, chunk_matrix, delimiter=',', fmt='%.10f') + + print(f"Saved chunk {start_row} to {end_row} to {path}") + +def main(): + # Hardcoded parameters + sparse_gb = 0.0001 + sparsity_values = [0.0001, 0.001, 0.01, 0.1] + + current_directory = os.getcwd() + target_directory = os.path.abspath(os.path.join(current_directory, '../../../../src/test/resources/datasets/slab/sparse')) + os.makedirs(target_directory, exist_ok=True) + + + for sr in sparsity_values: + stub = str(sr).replace('.', '_') + stub = "sparsity_"+stub + k = int(np.ceil((sparse_gb * 1e9) / 
float(8 * 100))) + + # Paths for saving the matrices + mpath_tall = os.path.join(target_directory, f'M_{stub}_tall.csv') + mpath_wide = os.path.join(target_directory, f'M_{stub}_wide.csv') + + # Generate and save sparse matrices + gen_data_sparse(k, 100, sr, mpath_tall) + gen_data_sparse(100, k, sr, mpath_wide) + +if __name__ == "__main__": + main() diff --git a/scripts/perftest/slab/mlAlgorithms/distributed/run_distributed_ml_algorithms.sh b/scripts/perftest/slab/mlAlgorithms/distributed/run_distributed_ml_algorithms.sh new file mode 100755 index 0000000000..29f5c35877 --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/distributed/run_distributed_ml_algorithms.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Ensure script is run from the 'distributed' directory +if [ "$(basename $PWD)" != "distributed" ]; then + echo "Please execute scripts from directory 'distributed'" + exit 1 +fi + + +# Set up the output directory +OUTPUT_DIR="output" +mkdir -p $OUTPUT_DIR + +# Define datasets +DATASET_PATH_DENSE="../../../../../src/test/resources/datasets/slab/dense" +DATASET_PATH_SPARSE="../../../../../src/test/resources/datasets/slab/sparse" +DENSE_DATASETS=("M_dense_tall.csv" "M_dense_wide.csv") +SPARSE_DATASETS=("M_sparsity_0_0001_tall.csv" "M_sparsity_0_0001_wide.csv" "M_sparsity_0_001_tall.csv" "M_sparsity_0_001_wide.csv" "M_sparsity_0_01_tall.csv" "M_sparsity_0_01_wide.csv" "M_sparsity_0_1_tall.csv" "M_sparsity_0_1_wide.csv") + +# Define DML files and corresponding output files +DML_FILES=("slabHeteroscedasticityRobustStandardErrorsDistr.dml" "slabLogisticRegressionDistr.dml" "slabNonNegativeMatrixFactorizationDistr.dml" "slabOrdinaryLeastSquaresRegressionDistr.dml" "slabPCADistr.dml") +OUTPUT_FILES=("slabHeteroscedasticityRobustStandardErrorsDistr_stats.txt" "slabLogisticRegressionDistr_stats.txt" "slabNonNegativeMatrixFactorizationDistr_stats.txt" "slabOrdinaryLeastSquaresRegressionDistr_stats.txt" "slabPCADistr_stats.txt") + +# Function to run DML script and handle errors +run_dml() { + local DML_FILE=$1 + local ARGS=$2 + local SPARSITY=$3 + local SHAPE=$4 + local OUTPUT_FILE=$5 + + # Run the DML script with -exec spark and -stats flag, and capture the output + TEMP_FILE=$(mktemp) + if systemds $DML_FILE -exec spark -args $ARGS -stats > $TEMP_FILE 2>&1; then + # Write the sparsity, shape, and SystemDS Statistics section to the output file + echo "Sparsity: $SPARSITY, Shape: $SHAPE" >> $OUTPUT_FILE + awk '/SystemDS Statistics:/{flag=1}flag' $TEMP_FILE >> $OUTPUT_FILE + else + echo "An error occurred while executing ${DML_FILE} with arguments ${ARGS}. Check ${TEMP_FILE} for details." 
>> $OUTPUT_FILE + fi + echo -e "\n\n\n\n" >> $OUTPUT_FILE # Add empty lines for separation + rm $TEMP_FILE +} + +# Iterate over each DML file +for index in ${!DML_FILES[@]}; do + DML_FILE=${DML_FILES[$index]} + OUTPUT_FILE=${OUTPUT_DIR}/${OUTPUT_FILES[$index]} + + # Clear the output file before writing + > $OUTPUT_FILE + + # Run with dense datasets + for DATASET in ${DENSE_DATASETS[@]}; do + SHAPE=$(echo $DATASET | grep -oP '(tall|wide)') + SPARSITY="dense" + run_dml $DML_FILE "${DATASET_PATH_DENSE}/${DATASET}" $SPARSITY $SHAPE $OUTPUT_FILE + echo "Execution of ${DML_FILE} with dataset ${DATASET} completed. Statistics appended to ${OUTPUT_FILE}" + done + + # Run with sparse datasets + for DATASET in ${SPARSE_DATASETS[@]}; do + SHAPE=$(echo $DATASET | grep -oP '(tall|wide)') + SPARSITY=$(echo $DATASET | grep -oP '0_\d+') + SPARSITY=${SPARSITY//_/\.} # Replace underscore with dot + run_dml $DML_FILE "${DATASET_PATH_SPARSE}/${DATASET}" $SPARSITY $SHAPE $OUTPUT_FILE + echo "Execution of ${DML_FILE} with dataset ${DATASET} completed. Statistics appended to ${OUTPUT_FILE}" + done +done diff --git a/scripts/perftest/slab/mlAlgorithms/distributed/slabHeteroscedasticityRobustStandardErrorsDistr.dml b/scripts/perftest/slab/mlAlgorithms/distributed/slabHeteroscedasticityRobustStandardErrorsDistr.dml new file mode 100644 index 0000000000..ef0044f5c3 --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/distributed/slabHeteroscedasticityRobustStandardErrorsDistr.dml @@ -0,0 +1,56 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +dataPath = $1 +X = read(dataPath, format="csv") +rvect = rand(rows=nrow(X), cols=1, pdf='uniform') +y = rvect > 0.80 +p = sum( X ) +q = sum( y ) +print(p) +print(q) +b = reg(X,y) +y_hat = X %*% b +r2 = (y - y_hat)^2 + +for(ix in 1:5) { + tmp = robust_se(X, r2) + utils::printRandElements(tmp, 10) +} + +reg = function(matrix[double] X, matrix[double] y) + return (matrix[double] b) { + b = solve(t(X) %*% X, t(X) %*% y) +} + + +robust_se = function(matrix[double] X, + matrix[double] r2) + return (matrix[double] se) { + # NOTE: SVD is cheap since XTX is small! + [U, H, V] = svd(t(X) %*% X) + h = diag(H) + XTX_INV = U %*% diag(h^-1) %*% t(V) + S = diag(r2) + se = XTX_INV %*% (t(X) %*% S %*% X) %*% XTX_INV +} diff --git a/scripts/perftest/slab/mlAlgorithms/distributed/slabLogisticRegressionDistr.dml b/scripts/perftest/slab/mlAlgorithms/distributed/slabLogisticRegressionDistr.dml new file mode 100644 index 0000000000..265317ff64 --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/distributed/slabLogisticRegressionDistr.dml @@ -0,0 +1,56 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +dataPath = $1 +X = read(dataPath, format="csv") +rvect = rand(rows=nrow(X), cols=1, pdf='uniform') +y = rvect > 0.80 +p = sum( X ) +q = sum( y ) +print(p) +print(q) + +for(ix in 1:5){ + tmp = logit(X, y, 10) + utils::printRandElements(tmp, 10) +} + +logit = function(matrix[double] X, + matrix[double] y, + Integer iterations) + return (matrix[double] w) { + + N = nrow(X) + w = matrix(0, rows=ncol(X), cols=1) + iteration = 0 + stepSize = 10 + + while (iteration < iterations) { + xb = X %*% w + delta = 1/(1+exp(-xb)) - y + stepSize = stepSize / 2 + w = w - ((stepSize * t(X) %*% delta)/N) + + iteration = iteration + 1 + } +} diff --git a/scripts/perftest/slab/mlAlgorithms/distributed/slabNonNegativeMatrixFactorizationDistr.dml b/scripts/perftest/slab/mlAlgorithms/distributed/slabNonNegativeMatrixFactorizationDistr.dml new file mode 100644 index 0000000000..f844695cb3 --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/distributed/slabNonNegativeMatrixFactorizationDistr.dml @@ -0,0 +1,55 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +dataPath = $1 +X = read(dataPath, format="csv") +rvect = rand(rows=1, cols=1) +y = rvect > 0.80 +p = sum( X ) +q = sum( y ) +print(p) +print(q) + + +for(ix in 1:5){ + tmp = gnmf(X, 10, 10) + print(tmp) +} + +gnmf = function(matrix[double] X, Integer r, Integer iterations) + return (integer iteration) { + + W = rand(rows = nrow(X), cols = r, pdf = 'uniform') + H = rand(rows = r, cols = ncol(X), pdf = 'uniform') + + for (i in 1:3) { + W = W * ((X %*% t(H)) / (W %*% (H %*% t(H)))) + H = H * ((t(W) %*% X) / ((t(W) %*% W) %*% H)) + } + if ((as.scalar(W[1,1]) > 0) & (as.scalar(H[1,1]) > 0)) { + print(as.scalar(H[1,1])) + print(as.scalar(W[1,1])) + } + + iteration = 0 +} diff --git a/scripts/perftest/slab/mlAlgorithms/distributed/slabOrdinaryLeastSquaresRegressionDistr.dml b/scripts/perftest/slab/mlAlgorithms/distributed/slabOrdinaryLeastSquaresRegressionDistr.dml new file mode 100644 index 0000000000..d6607a5e0d --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/distributed/slabOrdinaryLeastSquaresRegressionDistr.dml @@ -0,0 +1,41 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +dataPath = $1 +X = read(dataPath, format="csv") +rvect = rand(rows=nrow(X), cols=1, pdf='uniform') +y = rvect > 0.80 +p = sum( X ) +q = sum( y ) +print(p) +print(q) + +for(ix in 1:5){ + tmp = reg(X, y) + utils::printRandElements(tmp, 10) +} + +reg = function(matrix[double] X, matrix[double] y) + return (matrix[double] b) { + b = solve(t(X) %*% X, t(X) %*% y) +} diff --git a/scripts/perftest/slab/mlAlgorithms/distributed/slabPCADistr.dml b/scripts/perftest/slab/mlAlgorithms/distributed/slabPCADistr.dml new file mode 100644 index 0000000000..72c9415d8b --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/distributed/slabPCADistr.dml @@ -0,0 +1,57 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +dataPath = $1 +X = read(dataPath, format="csv") +rvect = rand(rows=nrow(X), cols=1, pdf='uniform') +y = rvect > 0.80 +p = sum( X ) +q = sum( y ) +print(p) +print(q) + +for(ix in 1:5){ + tmp = pca(X, 5) + utils::printRandElements(tmp, 10) +} + +pca = function(matrix[double] X, Integer k) + return (matrix[double] PRJ) { + N = nrow(X) + K = ncol(X) + XS = X - colMeans(X) + S = (1/(N-1)) * (t(XS) %*% XS) + [eigvals, eigvects] = eigen(S) + + # Thanks to the Sysml implementation for this helpful bit + # of code to sort the eigenvectors + eigssorted = order(target=eigvals, by=1, + decreasing=TRUE, + index.return=TRUE) + diagmat = table(seq(1, K), eigssorted) + eigvals = diagmat %*% eigvals + eigvects = eigvects %*% diagmat + eigvects = eigvects[, 1:k] + + PRJ = XS %*% eigvects +} diff --git a/scripts/perftest/slab/mlAlgorithms/native/run_native_ml_algorithms.sh b/scripts/perftest/slab/mlAlgorithms/native/run_native_ml_algorithms.sh new file mode 100755 index 0000000000..f45bcddcb5 --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/native/run_native_ml_algorithms.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Ensure script is run from the 'native' directory +if [ "$(basename $PWD)" != "native" ]; then + echo "Please execute scripts from directory 'native'" + exit 1 +fi + + + +# Set up the output directory +OUTPUT_DIR="output" +mkdir -p $OUTPUT_DIR + +# Define row numbers for the DML scripts +ROW_NUMBERS=("1000" "10000" "100000" "1000000") + +# Define DML files and corresponding output files +DML_FILES=("slabLinearRegCG.dml" "slabMultiLogitReg.dml" "slabNativePCA.dml") +OUTPUT_FILES=("slabLinearRegCG_stats.txt" "slabMultiLogitReg_stats.txt" "slabNativePCA_stats.txt") + +# Function to run DML script and handle errors +run_dml() { + local DML_FILE=$1 + local ARGS=$2 + local OUTPUT_FILE=$3 + + # Run the DML script with -stats flag and capture the output + TEMP_FILE=$(mktemp) + if systemds $DML_FILE -args $ARGS -stats > $TEMP_FILE 2>&1; then + # Write the number of rows and SystemDS Statistics section to the output file + echo "Number of rows: $ARGS" >> $OUTPUT_FILE + awk '/SystemDS Statistics:/{flag=1}flag' $TEMP_FILE >> $OUTPUT_FILE + else + echo "An error occurred while executing ${DML_FILE} with arguments ${ARGS}. Check ${TEMP_FILE} for details." 
>> $OUTPUT_FILE + fi + echo -e "\n\n\n\n" >> $OUTPUT_FILE # Add empty lines for separation + rm $TEMP_FILE +} + +# Iterate over each DML file +for index in ${!DML_FILES[@]}; do + DML_FILE=${DML_FILES[$index]} + OUTPUT_FILE=${OUTPUT_DIR}/${OUTPUT_FILES[$index]} + + # Clear the output file before writing + > $OUTPUT_FILE + + # Iterate over each row number and execute the DML file + for ROW in ${ROW_NUMBERS[@]}; do + run_dml $DML_FILE $ROW $OUTPUT_FILE + echo "Execution of ${DML_FILE} with ${ROW} rows completed. Statistics appended to ${OUTPUT_FILE}" + done +done diff --git a/scripts/perftest/slab/mlAlgorithms/native/slabLinearRegCG.dml b/scripts/perftest/slab/mlAlgorithms/native/slabLinearRegCG.dml new file mode 100644 index 0000000000..c04519b60e --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/native/slabLinearRegCG.dml @@ -0,0 +1,193 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# THIS SCRIPT SOLVES LINEAR REGRESSION USING THE CONJUGATE GRADIENT ALGORITHM +# + + + +# Parameters +intercept_status = 2; # 0: no intercept, 1: add intercept, 2: add intercept, shift & rescale +tolerance = 0.000001; # Tolerance for convergence +max_iteration = 100; # Maximum number of iterations +regularization = 0.000001; # Regularization constant + +print ("BEGIN LINEAR REGRESSION SCRIPT"); + +# Generate data internally +n = $1; # number of rows +m = 100; # number of columns +X = rand(rows=n, cols=m, min=0, max=1, sparsity=0.9, seed=42) +y = rand(rows=n, cols=1, min=0, max=1, sparsity=0.9, seed=24) + +sum_x = sum(X) +sum_y = sum(y) +for (ix in 1:5) { + + + ones_n = matrix(1, rows = n, cols = 1) + zero_cell = matrix(0, rows = 1, cols = 1) + + # Introduce the intercept, shift and rescale the columns of X if needed + m_ext = m + if (intercept_status == 1 | intercept_status == 2) { # add the intercept column + X = cbind(X, ones_n) + m_ext = ncol(X) + } + + scale_lambda = matrix(1, rows = m_ext, cols = 1) + if (intercept_status == 1 | intercept_status == 2) { + scale_lambda[m_ext, 1] = 0 + } + + if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1 + avg_X_cols = t(colSums(X)) / n + var_X_cols = (t(colSums(X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1) + is_unsafe = (var_X_cols <= 0) + scale_X = 1.0 / sqrt(var_X_cols * (1 - is_unsafe) + is_unsafe) + scale_X[m_ext, 1] = 1 + shift_X = -avg_X_cols * scale_X + shift_X[m_ext, 1] = 0 + } else { + scale_X = matrix(1, rows = m_ext, cols = 1) + shift_X = matrix(0, rows = m_ext, cols = 1) + } + + lambda = scale_lambda * regularization + beta_unscaled = matrix(0, rows = m_ext, cols = 1) + + if (max_iteration == 0) { + max_iteration = m_ext + } + i = 0 + + # BEGIN THE CONJUGATE GRADIENT ALGORITHM + print ("Running the CG algorithm...") + + r = -t(X) %*% y + + if (intercept_status == 2) { + r = scale_X * r + shift_X %*% r[m_ext, ] + } + + 
p = -r + norm_r2 = sum(r ^ 2) + norm_r2_initial = norm_r2 + norm_r2_target = norm_r2_initial * tolerance ^ 2 + print ("||r|| initial value = " + sqrt(norm_r2_initial) + ", target value = " + sqrt(norm_r2_target)) + + while (i < max_iteration & norm_r2 > norm_r2_target) { + if (intercept_status == 2) { + ssX_p = scale_X * p + ssX_p[m_ext, ] = ssX_p[m_ext, ] + t(shift_X) %*% p + } else { + ssX_p = p + } + + q = t(X) %*% (X %*% ssX_p) + + if (intercept_status == 2) { + q = scale_X * q + shift_X %*% q[m_ext, ] + } + + q = q + lambda * p + a = norm_r2 / sum(p * q) + beta_unscaled = beta_unscaled + a * p + r = r + a * q + old_norm_r2 = norm_r2 + norm_r2 = sum(r ^ 2) + p = -r + (norm_r2 / old_norm_r2) * p + i = i + 1 + print ("Iteration " + i + ": ||r|| / ||r init|| = " + sqrt(norm_r2 / norm_r2_initial)) + } + + if (i >= max_iteration) { + print ("Warning: the maximum number of iterations has been reached.") + } + print ("The CG algorithm is done.") + # END THE CONJUGATE GRADIENT ALGORITHM + + if (intercept_status == 2) { + beta = scale_X * beta_unscaled + beta[m_ext, ] = beta[m_ext, ] + t(shift_X) %*% beta_unscaled + } else { + beta = beta_unscaled + } + + print ("Computing the statistics...") + + avg_tot = sum(y) / n + ss_tot = sum(y ^ 2) + ss_avg_tot = ss_tot - n * avg_tot ^ 2 + var_tot = ss_avg_tot / (n - 1) + y_residual = y - X %*% beta + avg_res = sum(y_residual) / n + ss_res = sum(y_residual ^ 2) + ss_avg_res = ss_res - n * avg_res ^ 2 + + R2 = 1 - ss_res / ss_avg_tot + if (n > m_ext) { + dispersion = ss_res / (n - m_ext) + adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1)) + } else { + dispersion = 0.0 / 0.0 + adjusted_R2 = 0.0 / 0.0 + } + + R2_nobias = 1 - ss_avg_res / ss_avg_tot + deg_freedom = n - m - 1 + if (deg_freedom > 0) { + var_res = ss_avg_res / deg_freedom + adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1)) + } else { + var_res = 0.0 / 0.0 + adjusted_R2_nobias = 0.0 / 0.0 + print ("Warning: zero or negative number of degrees of freedom.") + } 
+ + R2_vs_0 = 1 - ss_res / ss_tot + if (n > m) { + adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n) + } else { + adjusted_R2_vs_0 = 0.0 / 0.0 + } + + str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y + str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y + str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias + str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X) + str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f. + str = append (str, "R2," + R2); # R^2 of residual with bias included vs. total average + str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average + str = append (str, "R2_NOBIAS," + R2_nobias); # R^2 of residual with bias subtracted vs. total average + str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average + if (intercept_status == 0) { + str = append (str, "R2_VS_0," + R2_vs_0); # R^2 of residual with bias included vs. zero constant + str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant + } + + print (str); + + +} + +
diff --git a/scripts/perftest/slab/mlAlgorithms/native/slabMultiLogitReg.dml b/scripts/perftest/slab/mlAlgorithms/native/slabMultiLogitReg.dml
new file mode 100644
index 0000000000..cde11b5ea1
--- /dev/null
+++ b/scripts/perftest/slab/mlAlgorithms/native/slabMultiLogitReg.dml
@@ -0,0 +1,373 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Solves Multinomial Logistic Regression using Trust Region methods.
+# (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650)
+
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME  TYPE    DEFAULT   MEANING
+# --------------------------------------------------------------------------------------------
+# X     String  ---       Location to read the matrix of feature vectors
+# Y     String  ---       Location to read the matrix with category labels
+# B     String  ---       Location to store estimated regression parameters (the betas)
+# Log   String  " "       Location to write per-iteration variables for log/debugging purposes
+# icpt  Int     0         Intercept presence, shifting and rescaling X columns:
+#                         0 = no intercept, no shifting, no rescaling;
+#                         1 = add intercept, but neither shift nor rescale X;
+#                         2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# reg   Double  0.0       regularization parameter (lambda = 1/C); intercept is not regularized
+# tol   Double  0.000001  tolerance ("epsilon")
+# moi   Int     100       max. number of outer (Newton) iterations
+# mii   Int     0         max. number of inner (conjugate gradient) iterations, 0 = no max
+# fmt   String  "text"    Matrix output format, usually "text" or "csv" (for matrices only)
+# --------------------------------------------------------------------------------------------
+# The largest label represents the baseline category; if label -1 or 0 is present, then it is
+# the baseline label (and it is converted to the largest label).
+#
+# The Log file, when requested, contains the following per-iteration variables in CSV format,
+# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values:
+#
+# NAME              MEANING
+# -------------------------------------------------------------------------------------------
+# LINEAR_TERM_MIN   The minimum value of X %*% B, used to check for overflows
+# LINEAR_TERM_MAX   The maximum value of X %*% B, used to check for overflows
+# NUM_CG_ITERS      Number of inner (Conj.Gradient) iterations in this outer iteration
+# IS_TRUST_REACHED  1 = trust region boundary was reached, 0 = otherwise
+# POINT_STEP_NORM   L2-norm of iteration step from old point (i.e. matrix B) to new point
+# OBJECTIVE         The loss function we minimize (negative regularized log-likelihood)
+# OBJ_DROP_REAL     Reduction in the objective during this iteration, actual value
+# OBJ_DROP_PRED     Reduction in the objective predicted by a quadratic approximation
+# OBJ_DROP_RATIO    Actual-to-predicted reduction ratio, used to update the trust region
+# IS_POINT_UPDATED  1 = new point accepted; 0 = new point rejected, old point restored
+# GRADIENT_NORM     L2-norm of the loss function gradient (omitted if point is rejected)
+# TRUST_DELTA       Updated trust region size, the "delta"
+# -------------------------------------------------------------------------------------------
+#
+# Script invocation example:
+# hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs icpt=2 reg=1.0 tol=0.000001 moi=100 mii=20
+# X=INPUT_DIR/X123 Y=INPUT_DIR/Y123 B=OUTPUT_DIR/B123 fmt=csv Log=OUTPUT_DIR/log
+#
+# NOTE: this SLAB benchmark variant does not read X or Y and does not write B. It generates
+# X and Y_vec with rand() from the row count given as positional argument $1, repeats the
+# whole training procedure five times, and prints B[1,1]; the write of B is commented out.
+
+
+
+fileLog = ifdef($Log, " ")
+fmtB = ifdef($fmt, "text")
+
+intercept_status = ifdef($icpt, 0) # $icpt = 0
+regularization = ifdef($reg, 0.0) # $reg = 0.0
+tol = ifdef($tol, 0.000001) # $tol = 0.000001
+maxiter = ifdef($moi, 100) # $moi = 100
+maxinneriter = ifdef($mii, 0) # $mii = 0
+tol = as.double(tol)
+
+print("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT")
+
+# Generate data internally
+n = $1 # number of rows
+m = 100 # number of columns
+X = rand(rows=n, cols=m, min=0, max=1, sparsity=0.9, seed=42)
+# NOTE(review): the labels come from rand(), not from a label file; confirm that table()
+# below treats these generated values as the intended category ids.
+Y_vec = rand(rows=n, cols=1, min=1, max=3, sparsity=0.9, seed=24) # Random labels between 1 and 3
+
+# force a pass over the data
+sum_x = sum(X)
+sum_y = sum(Y_vec)
+
+# Repeat the complete training procedure five times on the same generated data.
+for (ix in 1:5) {
+
+
+  # Trust-region constants: eta* are thresholds on rho (actual/predicted reduction),
+  # sigma* are the factors used to shrink/grow delta, psi scales the inner CG stop test.
+  eta0 = 0.0001
+  eta1 = 0.25
+  eta2 = 0.75
+  sigma1 = 0.25
+  sigma2 = 0.5
+  sigma3 = 4.0
+  psi = 0.1
+
+  N = nrow(X)
+  D = ncol(X)
+
+  # Introduce the intercept, shift and rescale the columns of X if needed
+  # NOTE(review): X is reassigned here inside the repetition loop, so with icpt=1 or icpt=2
+  # every repetition appends another column of ones to the X left by the previous one.
+  # With the default icpt=0 this branch is not taken.
+  if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
+    X = cbind(X, matrix(1, rows=N, cols=1))
+    D = ncol(X)
+  }
+
+  # 1 for regularized coefficients, 0 for the intercept row (row D)
+  scale_lambda = matrix(1, rows=D, cols=1)
+  if (intercept_status == 1 | intercept_status == 2) {
+    scale_lambda[D, 1] = 0
+  }
+
+  if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
+    avg_X_cols = t(colSums(X)) / N
+    var_X_cols = (t(colSums(X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1)
+    # columns with non-positive variance get scale 1 instead of a division by zero
+    is_unsafe = var_X_cols <= 0
+    scale_X = 1.0 / sqrt(var_X_cols * (1 - is_unsafe) + is_unsafe)
+    scale_X[D, 1] = 1
+    shift_X = -avg_X_cols * scale_X
+    shift_X[D, 1] = 0
+    rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum(shift_X ^ 2)
+  } else {
+    scale_X = matrix(1, rows=D, cols=1)
+    shift_X = matrix(0, rows=D, cols=1)
+    rowSums_X_sq = rowSums(X ^ 2)
+  }
+
+  # Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
+  # with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
+  # The transform is then associatively applied to the other side of the expression,
+  # and is rewritten via "scale_X" and "shift_X" as follows:
+  #
+  # ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
+  # ssX_A = diag(scale_X) %*% A;
+  # ssX_A[D, ] = ssX_A[D, ] + t(shift_X) %*% A;
+  #
+  # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as:
+  # tssX_A = diag(scale_X) %*% A + shift_X %*% A[D, ];
+
+  # Convert "Y_vec" into indicator matrix:
+  max_y = max(Y_vec)
+  if (min(Y_vec) <= 0) {
+    # Category labels "0", "-1" etc. are converted into the largest label
+    Y_vec = Y_vec + (-Y_vec + max_y + 1) * (Y_vec <= 0)
+    max_y = max_y + 1
+  }
+  Y = table(seq(1, N, 1), Y_vec, N, max_y)
+  K = ncol(Y) - 1 # The number of non-baseline categories
+
+  lambda = (scale_lambda %*% matrix(1, rows=1, cols=K)) * regularization
+  delta = 0.5 * sqrt(D) / max(sqrt(rowSums_X_sq))
+
+  # Initial point B = 0; the "###" comments show the general formulas that the
+  # closed-form initial values of P and obj below stand in for.
+  B = matrix(0, rows=D, cols=K) ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B;
+  ### LT = cbind(LT, matrix(0, rows=N, cols=1));
+  ### LT = LT - rowMaxs(LT) %*% matrix(1, rows=1, cols=K+1);
+  P = matrix(1, rows=N, cols=K+1) ### exp_LT = exp(LT);
+  P = P / (K + 1) ### P = exp_LT / (rowSums(exp_LT) %*% matrix(1, rows=1, cols=K+1));
+  obj = N * log(K + 1) ### obj = - sum(Y * LT) + sum(log(rowSums(exp_LT))) + 0.5 * sum(lambda * (B_new ^ 2));
+
+  Grad = t(X) %*% (P[, 1:K] - Y[, 1:K])
+  if (intercept_status == 2) {
+    Grad = diag(scale_X) %*% Grad + shift_X %*% Grad[D, ]
+  }
+  Grad = Grad + lambda * B
+  norm_Grad = sqrt(sum(Grad ^ 2))
+  norm_Grad_initial = norm_Grad
+
+  # mii = 0 means "no max": fall back to D * K inner iterations
+  if (maxinneriter == 0) {
+    maxinneriter = D * K
+  }
+  iter = 1
+
+  # boolean for convergence check
+  converge = (norm_Grad < tol) | (iter > maxiter)
+
+  print("-- Initially: Objective = " + obj + ", Gradient Norm = " + norm_Grad + ", Trust Delta = " + delta)
+
+  # NOTE(review): log_str is re-initialized in every repetition, so the log written after
+  # the loop only contains the entries of the last repetition.
+  if (fileLog != " ") {
+    log_str = "OBJECTIVE,0," + obj
+    log_str = append(log_str, "GRADIENT_NORM,0," + norm_Grad)
+    log_str = append(log_str, "TRUST_DELTA,0," + delta)
+  } else {
+    log_str = " "
+  }
+
+  # Outer (Newton) iterations
+  while (!converge) {
+    # SOLVE TRUST REGION SUB-PROBLEM
+    # S: step being built, R: residual, V: conjugate search direction
+    S = matrix(0, rows=D, cols=K)
+    R = -Grad
+    V = R
+    delta2 = delta ^ 2
+    inneriter = 1
+    norm_R2 = sum(R ^ 2)
+    innerconverge = (sqrt(norm_R2) <= psi * norm_Grad)
+    is_trust_boundary_reached = 0
+
+    # Inner (conjugate gradient) iterations
+    while (!innerconverge) {
+      if (intercept_status == 2) {
+        ssX_V = diag(scale_X) %*% V
+        ssX_V[D, ] = ssX_V[D, ] + t(shift_X) %*% V
+      } else {
+        ssX_V = V
+      }
+      # HV: the Hessian-vector product for direction V, formed from X, P and lambda
+      Q = P[, 1:K] * (X %*% ssX_V)
+      HV = t(X) %*% (Q - P[, 1:K] * (rowSums(Q) %*% matrix(1, rows=1, cols=K)))
+      if (intercept_status == 2) {
+        HV = diag(scale_X) %*% HV + shift_X %*% HV[D, ]
+      }
+      HV = HV + lambda * V
+      alpha = norm_R2 / sum(V * HV)
+      Snew = S + alpha * V
+      norm_Snew2 = sum(Snew ^ 2)
+      if (norm_Snew2 <= delta2) {
+        # step stays inside the trust region: regular CG update
+        S = Snew
+        R = R - alpha * HV
+        old_norm_R2 = norm_R2
+        norm_R2 = sum(R ^ 2)
+        V = R + (norm_R2 / old_norm_R2) * V
+        innerconverge = (sqrt(norm_R2) <= psi * norm_Grad)
+      } else {
+        # step would leave the trust region: pick alpha so that ||S + alpha * V|| = delta
+        # and stop the inner loop
+        is_trust_boundary_reached = 1
+        sv = sum(S * V)
+        v2 = sum(V ^ 2)
+        s2 = sum(S ^ 2)
+        rad = sqrt(sv ^ 2 + v2 * (delta2 - s2))
+        if (sv >= 0) {
+          alpha = (delta2 - s2) / (sv + rad)
+        } else {
+          alpha = (rad - sv) / v2
+        }
+        S = S + alpha * V
+        R = R - alpha * HV
+        innerconverge = TRUE
+      }
+      inneriter = inneriter + 1
+      innerconverge = innerconverge | (inneriter > maxinneriter)
+    }
+
+    # END TRUST REGION SUB-PROBLEM
+
+    # compute rho, update B, obtain delta
+    gs = sum(S * Grad)
+    qk = -0.5 * (gs - sum(S * R))
+    B_new = B + S
+    if (intercept_status == 2) {
+      ssX_B_new = diag(scale_X) %*% B_new
+      ssX_B_new[D, ] = ssX_B_new[D, ] + t(shift_X) %*% B_new
+    } else {
+      ssX_B_new = B_new
+    }
+
+    # linear terms for the K non-baseline categories plus a zero column for the baseline
+    LT = cbind((X %*% ssX_B_new), matrix(0, rows=N, cols=1))
+    if (fileLog != " ") {
+      log_str = append(log_str, "LINEAR_TERM_MIN," + iter + "," + min(LT))
+      log_str = append(log_str, "LINEAR_TERM_MAX," + iter + "," + max(LT))
+    }
+    # subtract the row maximum before exp()
+    LT = LT - rowMaxs(LT) %*% matrix(1, rows=1, cols=K+1)
+    exp_LT = exp(LT)
+    P_new = exp_LT / (rowSums(exp_LT) %*% matrix(1, rows=1, cols=K+1))
+    obj_new = -sum(Y * LT) + sum(log(rowSums(exp_LT))) + 0.5 * sum(lambda * (B_new ^ 2))
+
+    # Consider updating LT in the inner loop
+    # Consider the big "obj" and "obj_new" rounding-off their small difference below:
+
+    actred = (obj - obj_new)
+
+    rho = actred / qk
+    is_rho_accepted = (rho > eta0)
+    snorm = sqrt(sum(S ^ 2))
+
+    if (fileLog != " ") {
+      log_str = append(log_str, "NUM_CG_ITERS," + iter + "," + (inneriter - 1))
+      log_str = append(log_str, "IS_TRUST_REACHED," + iter + "," + is_trust_boundary_reached)
+      log_str = append(log_str, "POINT_STEP_NORM," + iter + "," + snorm)
+      log_str = append(log_str, "OBJECTIVE," + iter + "," + obj_new)
+      log_str = append(log_str, "OBJ_DROP_REAL," + iter + "," + actred)
+      log_str = append(log_str, "OBJ_DROP_PRED," + iter + "," + qk)
+      log_str = append(log_str, "OBJ_DROP_RATIO," + iter + "," + rho)
+    }
+
+    if (iter == 1) {
+      delta = min(delta, snorm)
+    }
+
+    alpha2 = obj_new - obj - gs
+    if (alpha2 <= 0) {
+      alpha = sigma3
+    } else {
+      alpha = max(sigma1, -0.5 * gs / alpha2)
+    }
+
+    # update the trust region size according to which eta band rho falls into
+    if (rho < eta0) {
+      delta = min(max(alpha, sigma1) * snorm, sigma2 * delta)
+    } else {
+      if (rho < eta1) {
+        delta = max(sigma1 * delta, min(alpha * snorm, sigma2 * delta))
+      } else {
+        if (rho < eta2) {
+          delta = max(sigma1 * delta, min(alpha * snorm, sigma3 * delta))
+        } else {
+          delta = max(delta, min(alpha * snorm, sigma3 * delta))
+        }
+      }
+    }
+
+    if (is_trust_boundary_reached == 1) {
+      print("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED")
+    } else {
+      print("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations")
+    }
+    print(" -- Obj.Reduction: Actual = " + actred + ", Predicted = " + qk +
+      " (A/P: " + (round(10000.0 * rho) / 10000.0) + "), Trust Delta = " + delta)
+
+    if (is_rho_accepted) {
+      # accept the new point and recompute the gradient there
+      B = B_new
+      P = P_new
+      Grad = t(X) %*% (P[, 1:K] - Y[, 1:K])
+      if (intercept_status == 2) {
+        Grad = diag(scale_X) %*% Grad + shift_X %*% Grad[D, ]
+      }
+      Grad = Grad + lambda * B
+      norm_Grad = sqrt(sum(Grad ^ 2))
+      obj = obj_new
+      print(" -- New Objective = " + obj + ", Beta Change Norm = " + snorm + ", Gradient Norm = " + norm_Grad)
+      if (fileLog != " ") {
+        log_str = append(log_str, "IS_POINT_UPDATED," + iter + ",1")
+        log_str = append(log_str, "GRADIENT_NORM," + iter + "," + norm_Grad)
+      }
+    } else {
+      # reject the new point: B, P, Grad and obj keep their old values
+      if (fileLog != " ") {
+        log_str = append(log_str, "IS_POINT_UPDATED," + iter + ",0")
+      }
+    }
+
+    if (fileLog != " ") {
+      log_str = append(log_str, "TRUST_DELTA," + iter + "," + delta)
+    }
+
+    iter = iter + 1
+    # stop on small relative gradient, on the outer iteration limit, or when an interior
+    # step changed the objective by less than 1e-14 times (|obj| + |obj_new|)
+    converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) |
+      ((is_trust_boundary_reached == 0) & (abs(actred) < (abs(obj) + abs(obj_new)) * 0.00000000000001)))
+    if (converge) {
+      print("Termination / Convergence condition satisfied.")
+    } else {
+      print(" ")
+    }
+  }
+
+  # map the coefficients back to the unscaled feature space
+  if (intercept_status == 2) {
+    B_out = diag(scale_X) %*% B
+    B_out[D, ] = B_out[D, ] + t(shift_X) %*% B
+  } else {
+    B_out = B
+  }
+  # NOTE(review): fileB is not defined anywhere in this script, so the write stays disabled.
+  # write(B_out, fileB, format=fmtB)
+
+  # print one coefficient of the result
+  if (sum_x > 0.0) {
+    print(as.scalar(B[1, 1]))
+  }
+
+
+}
+
+if (fileLog != " ") {
+  write(log_str, fileLog)
+}
+
+
diff --git a/scripts/perftest/slab/mlAlgorithms/native/slabNativePCA.dml b/scripts/perftest/slab/mlAlgorithms/native/slabNativePCA.dml new file mode 100644 index 0000000000..3f0da1848f --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/native/slabNativePCA.dml @@ -0,0 +1,130 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# This script performs Principal Component Analysis (PCA) on the given input data.
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME      TYPE    DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# INPUT     String  ---      Location to read the matrix A of feature vectors
+#                            (not read by this benchmark variant; A is generated from $1)
+# K         Int     ncol(A)  Indicates dimension of the new vector space constructed from eigen vectors
+# CENTER    Int     0        Indicates whether or not to center data
+# SCALE     Int     0        Indicates whether or not to scale data
+# OFMT      String  "CSV"    Output data format
+# PROJDATA  Int     1        This argument indicates if the data should be projected or not
+# MODEL     String  ""       Location to already existing model: eigenvectors and eigenvalues
+# OUTPUT    String  /        Location to write output matrices (covariance matrix, new basis vectors,
+#                            and data projected onto new basis vectors)
+# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000
+# OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1
+# ---------------------------------------------------------------------------------------------
+
+# Generate data internally instead of reading from a file
+n = $1 # number of rows
+m = 100 # number of columns
+A = rand(rows=n, cols=m, min=0, max=1, sparsity=0.9, seed=42)
+
+
+
+# sum the generated matrix once before the repetitions start
+sum_A = sum(A)
+# Repeat the complete PCA five times.
+for (ix in 1:5) {
+
+
+  K = ifdef($K, ncol(A))
+  ofmt = ifdef($OFMT, "CSV")
+  projectData = ifdef($PROJDATA, 1)
+  model = ifdef($MODEL, "")
+  center = ifdef($CENTER, 0)
+  scale = ifdef($SCALE, 0)
+  output = ifdef($OUTPUT, "/")
+
+  # placeholder; overwritten below when the eigenvectors are computed
+  evec_dominant = matrix(0, cols=1, rows=1)
+
+  if (model != "") {
+    # loading an existing model is disabled here, so evec_dominant keeps the 1x1 placeholder
+    pass = 1.0
+    # reuse existing model to project data
+    #evec_dominant = read(model+"/dominant.eigen.vectors")
+  } else {
+    if (model == "") {
+      model = output
+    }
+
+    N = nrow(A)
+    D = ncol(A)
+
+    # perform z-scoring (centering and scaling)
+    # NOTE(review): A is overwritten inside the repetition loop, so with CENTER=1 or SCALE=1
+    # later repetitions start from the already transformed matrix. With the defaults
+    # (0/0) neither branch is taken.
+    if (center == 1) {
+      cm = colMeans(A)
+      A = A - cm
+    }
+    if (scale == 1) {
+      cvars = colSums(A^2)
+      if (center == 1) {
+        cm = colMeans(A)
+        cvars = (cvars - N * (cm^2)) / (N - 1)
+      }
+      # without CENTER=1, cvars is the plain column sum of squares
+      Azscored = A / sqrt(cvars)
+      A = Azscored
+    }
+
+    # co-variance matrix
+    mu = colSums(A) / N
+    C = (t(A) %*% A) / (N - 1) - (N / (N - 1)) * t(mu) %*% mu
+
+    # compute eigen vectors and values
+    [evalues, evectors] = eigen(C)
+
+    # build a reordering matrix from the indexes of the eigenvalues in decreasing order
+    decreasing_Idx = order(target=evalues, by=1, decreasing=TRUE, index.return=TRUE)
+    diagmat = table(seq(1, D), decreasing_Idx)
+    # sorts eigenvalues by decreasing order
+    evalues = diagmat %*% evalues
+    # sorts eigenvectors column-wise in the order of decreasing eigenvalues
+    evectors = evectors %*% diagmat
+
+    # select K dominant eigen vectors
+    nvec = ncol(evectors)
+
+    eval_dominant = evalues[1:K, 1]
+    evec_dominant = evectors[, 1:K]
+
+    # the square root of eigenvalues
+    eval_stdev_dominant = sqrt(eval_dominant)
+
+    #write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt)
+    #write(eval_dominant, model+"/dominant.eigen.values", format=ofmt)
+    #write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt)
+  }
+  # model is never empty at this point (an empty MODEL was replaced by output above),
+  # so this condition holds regardless of PROJDATA
+  if (projectData == 1 | model != "") {
+    # Construct new data set by treating computed dominant eigenvectors as the basis vectors
+    newA = A %*% evec_dominant
+    # consume the projection so that it is actually computed
+    sum_newA = sum(newA)
+    if (sum_newA > 0) {
+      print(sum_newA)
+    }
+    #write(newA, output+"/projected.data", format=ofmt)
+  }
+
+
+}
+
+
diff --git a/scripts/perftest/slab/mlAlgorithms/single_node_dense/run_single_node_dense_ml.sh b/scripts/perftest/slab/mlAlgorithms/single_node_dense/run_single_node_dense_ml.sh new file mode 100755 index 0000000000..0d527da0d2 --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/single_node_dense/run_single_node_dense_ml.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership.
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Runs every single-node dense SLAB ML script once per row count and collects
+# the "SystemDS Statistics" section of each run in output/<script>_stats.txt.
+
+# Set up the output directory
+OUTPUT_DIR="output"
+mkdir -p "$OUTPUT_DIR"
+
+# Define row numbers for the DML scripts
+ROW_NUMBERS=("1000" "10000" "100000" "1000000")
+
+# Define DML files and corresponding output files
+DML_FILES=("slabLogisticRegression.dml" "slabOrdinaryLeastSquaresRegression.dml" "slabHeteroscedasticityRobustStandardErrors.dml" "slabNonNegativeMatrixFactorization.dml" "slabPCA.dml")
+OUTPUT_FILES=("slabLogisticRegression_stats.txt" "slabOrdinaryLeastSquaresRegression_stats.txt" "slabHeteroscedasticityRobustStandardErrors_stats.txt" "slabNonNegativeMatrixFactorization_stats.txt" "slabPCA_stats.txt")
+
+# Function to run DML script and handle errors
+# Arguments:
+#   $1  DML script to execute
+#   $2  argument string passed to the script after -args (the row count)
+#   $3  stats file the result is appended to
+# Returns 0 when systemds succeeded and 1 otherwise.
+run_dml() {
+  local DML_FILE=$1
+  local ARGS=$2
+  local OUTPUT_FILE=$3
+  local TEMP_FILE
+  local STATUS=0
+
+  # Run the DML script with -stats flag and capture the output
+  TEMP_FILE=$(mktemp)
+  # $ARGS is deliberately unquoted so that a string with several arguments still splits
+  if systemds "$DML_FILE" -args $ARGS -stats > "$TEMP_FILE" 2>&1; then
+    # Write the number of rows and SystemDS Statistics section to the output file
+    echo "Number of rows: $ARGS" >> "$OUTPUT_FILE"
+    awk '/SystemDS Statistics:/{flag=1}flag' "$TEMP_FILE" >> "$OUTPUT_FILE"
+    rm -f "$TEMP_FILE"
+  else
+    # Keep the captured log on failure: the message below points the user at it
+    echo "An error occurred while executing ${DML_FILE} with arguments ${ARGS}. Check ${TEMP_FILE} for details." >> "$OUTPUT_FILE"
+    STATUS=1
+  fi
+  echo -e "\n\n\n\n" >> "$OUTPUT_FILE" # Add empty lines for separation
+  return $STATUS
+}
+
+# Iterate over each DML file
+for index in "${!DML_FILES[@]}"; do
+  DML_FILE=${DML_FILES[$index]}
+  OUTPUT_FILE=${OUTPUT_DIR}/${OUTPUT_FILES[$index]}
+
+  # Clear the output file before writing
+  : > "$OUTPUT_FILE"
+
+  # Iterate over each row number and execute the DML file
+  for ROW in "${ROW_NUMBERS[@]}"; do
+    if run_dml "$DML_FILE" "$ROW" "$OUTPUT_FILE"; then
+      echo "Execution of ${DML_FILE} with ${ROW} rows completed. Statistics appended to ${OUTPUT_FILE}"
+    else
+      echo "Execution of ${DML_FILE} with ${ROW} rows FAILED. Error note appended to ${OUTPUT_FILE}"
+    fi
+  done
+done
diff --git a/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabHeteroscedasticityRobustStandardErrors.dml b/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabHeteroscedasticityRobustStandardErrors.dml new file mode 100644 index 0000000000..4c012f2eef --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabHeteroscedasticityRobustStandardErrors.dml @@ -0,0 +1,55 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+#
+#-------------------------------------------------------------
+
+# SLAB benchmark: heteroscedasticity-robust standard errors on generated dense data.
+# $1 is the number of rows; X has 100 columns. An OLS model is fitted once and the
+# robust covariance estimate is then computed five times.
+
+source("../../slabUtils.dml") as utils
+
+X = rand(rows=$1, cols=100)
+rvect = rand(rows=$1, cols=1, pdf='uniform')
+# binary response: 1 where the uniform draw exceeds 0.80
+y = rvect > 0.80
+# sum and print both inputs before the measured work starts
+p = sum( X )
+q = sum( y )
+print(p)
+print(q)
+# fit once and keep the squared residuals as input of the repeated computation
+b = reg(X,y)
+y_hat = X %*% b
+r2 = (y - y_hat)^2
+
+for(ix in 1:5) {
+  tmp = robust_se(X, r2)
+  # presumably prints 10 randomly chosen elements of tmp - helper defined in slabUtils.dml
+  utils::printRandElements(tmp, 10)
+}
+
+# Ordinary least squares via the normal equations: solves (t(X) X) b = t(X) y.
+reg = function(matrix[double] X, matrix[double] y)
+  return (matrix[double] b) {
+  b = solve(t(X) %*% X, t(X) %*% y)
+}
+
+
+# Computes XTX_INV %*% (t(X) %*% diag(r2) %*% X) %*% XTX_INV, where XTX_INV is
+# assembled from the SVD factors of t(X) %*% X.
+#   X   feature matrix
+#   r2  column vector of squared residuals, one entry per row of X
+robust_se = function(matrix[double] X,
+  matrix[double] r2)
+  return (matrix[double] se) {
+  # NOTE: SVD is cheap since XTX is small!
+  [U, H, V] = svd(t(X) %*% X)
+  # assumes H is returned as a diagonal matrix, so that h is the vector of singular values
+  h = diag(H)
+  XTX_INV = U %*% diag(h^-1) %*% t(V)
+  # S puts the squared residuals on the diagonal of an nrow(X) x nrow(X) matrix
+  S = diag(r2)
+  se = XTX_INV %*% (t(X) %*% S %*% X) %*% XTX_INV
+}
diff --git a/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabLogisticRegression.dml b/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabLogisticRegression.dml new file mode 100644 index 0000000000..a568f7a0b6 --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabLogisticRegression.dml @@ -0,0 +1,55 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied.
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# SLAB benchmark: logistic regression by gradient descent on generated dense data.
+# $1 is the number of rows; X has 100 columns. The training run is repeated five times.
+
+source("../../slabUtils.dml") as utils
+
+X = rand(rows=$1, cols=100)
+rvect = rand(rows=$1, cols=1, pdf='uniform')
+# binary response: 1 where the uniform draw exceeds 0.80
+y = rvect > 0.80
+# sum and print both inputs before the measured work starts
+p = sum( X )
+q = sum( y )
+print(p)
+print(q)
+
+for(ix in 1:5){
+  tmp = logit(X, y, 10)
+  # presumably prints 10 randomly chosen elements of tmp - helper defined in slabUtils.dml
+  utils::printRandElements(tmp, 10)
+}
+
+# Runs a fixed number of gradient descent steps for logistic regression.
+#   X           feature matrix
+#   y           0/1 response, one entry per row of X
+#   iterations  number of update steps
+# Returns the weight vector w (ncol(X) x 1), starting from all zeros.
+logit = function(matrix[double] X,
+  matrix[double] y,
+  Integer iterations)
+  return (matrix[double] w) {
+
+  N = nrow(X)
+  w = matrix(0, rows=ncol(X), cols=1)
+  iteration = 0
+  stepSize = 10
+
+  while (iteration < iterations) {
+    xb = X %*% w
+    # sigmoid(xb) - y: difference between predicted probability and label
+    delta = 1/(1+exp(-xb)) - y
+    # the step size is halved before every update, so the first step uses 5
+    stepSize = stepSize / 2
+    w = w - ((stepSize * t(X) %*% delta)/N)
+
+    iteration = iteration + 1
+  }
+}
diff --git a/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabNonNegativeMatrixFactorization.dml b/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabNonNegativeMatrixFactorization.dml new file mode 100644 index 0000000000..2c32cb9081 --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabNonNegativeMatrixFactorization.dml @@ -0,0 +1,54 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied.
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# SLAB benchmark: non-negative matrix factorization on generated dense data.
+# $1 is the number of rows; X has 100 columns. The factorization is repeated five times.
+
+source("../../slabUtils.dml") as utils
+
+X = rand(rows=$1, cols=100)
+# y is a single 1x1 indicator here; it is only summed and printed below
+rvect = rand(rows=1, cols=1)
+y = rvect > 0.80
+# sum and print both values before the measured work starts
+p = sum( X )
+q = sum( y )
+print(p)
+print(q)
+
+
+for(ix in 1:5){
+  tmp = gnmf(X, 10, 10)
+  print(tmp)
+}
+
+# Factorizes X into W (nrow(X) x r) and H (r x ncol(X)) with multiplicative updates.
+#   X           matrix to factorize
+#   r           rank of the factorization
+#   iterations  NOTE(review): not used - the update loop below always runs 3 times
+# Returns the scalar 0; W and H are not returned, only W[1,1] and H[1,1] are printed.
+gnmf = function(matrix[double] X, Integer r, Integer iterations)
+  return (integer iteration) {
+
+  W = rand(rows = nrow(X), cols = r, pdf = 'uniform')
+  H = rand(rows = r, cols = ncol(X), pdf = 'uniform')
+
+  for (i in 1:3) {
+    W = W * ((X %*% t(H)) / (W %*% (H %*% t(H))))
+    H = H * ((t(W) %*% X) / ((t(W) %*% W) %*% H))
+  }
+  # read one cell of each factor so that both are actually computed
+  if ((as.scalar(W[1,1]) > 0) & (as.scalar(H[1,1]) > 0)) {
+    print(as.scalar(H[1,1]))
+    print(as.scalar(W[1,1]))
+  }
+
+  iteration = 0
+}
diff --git a/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabOrdinaryLeastSquaresRegression.dml b/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabOrdinaryLeastSquaresRegression.dml new file mode 100644 index 0000000000..eee7aa7dc5 --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabOrdinaryLeastSquaresRegression.dml @@ -0,0 +1,40 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# SLAB benchmark: ordinary least squares regression on generated dense data.
+# $1 is the number of rows; X has 100 columns. The fit is repeated five times.
+
+source("../../slabUtils.dml") as utils
+
+X = rand(rows=$1, cols=100)
+rvect = rand(rows=$1, cols=1, pdf='uniform')
+# binary response: 1 where the uniform draw exceeds 0.80
+y = rvect > 0.80
+# sum and print both inputs before the measured work starts
+p = sum( X )
+q = sum( y )
+print(p)
+print(q)
+
+for(ix in 1:5){
+  tmp = reg(X, y)
+  # presumably prints 10 randomly chosen elements of tmp - helper defined in slabUtils.dml
+  utils::printRandElements(tmp, 10)
+}
+
+# Ordinary least squares via the normal equations: solves (t(X) X) b = t(X) y.
+#   X  feature matrix
+#   y  response, one entry per row of X
+# Returns the coefficient vector b.
+reg = function(matrix[double] X, matrix[double] y)
+  return (matrix[double] b) {
+  b = solve(t(X) %*% X, t(X) %*% y)
+}
diff --git a/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabPCA.dml b/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabPCA.dml new file mode 100644 index 0000000000..7058073409 --- /dev/null +++ b/scripts/perftest/slab/mlAlgorithms/single_node_dense/slabPCA.dml @@ -0,0 +1,56 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied.
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# SLAB benchmark: principal component analysis on generated dense data.
+# $1 is the number of rows; X has 100 columns. The projection onto the first
+# five components is computed five times.
+
+source("../../slabUtils.dml") as utils
+
+X = rand(rows=$1, cols=100)
+# y is not used by pca(); it is only summed and printed below
+rvect = rand(rows=$1, cols=1, pdf='uniform')
+y = rvect > 0.80
+# sum and print both inputs before the measured work starts
+p = sum( X )
+q = sum( y )
+print(p)
+print(q)
+
+for(ix in 1:5){
+  tmp = pca(X, 5)
+  # presumably prints 10 randomly chosen elements of tmp - helper defined in slabUtils.dml
+  utils::printRandElements(tmp, 10)
+}
+
+# Projects the column-centered X onto k eigenvectors of its covariance matrix.
+#   X  data matrix, one observation per row
+#   k  number of eigenvector columns to keep after reordering
+# Returns PRJ, the nrow(X) x k projection.
+pca = function(matrix[double] X, Integer k)
+  return (matrix[double] PRJ) {
+  N = nrow(X)
+  K = ncol(X)
+  # center the columns and form the covariance matrix
+  XS = X - colMeans(X)
+  S = (1/(N-1)) * (t(XS) %*% XS)
+  [eigvals, eigvects] = eigen(S)
+
+  # Thanks to the Sysml implementation for this helpful bit
+  # of code to sort the eigenvectors
+  eigssorted = order(target=eigvals, by=1,
+    decreasing=TRUE,
+    index.return=TRUE)
+  # reordering matrix built from the indexes of the eigenvalues in decreasing order
+  diagmat = table(seq(1, K), eigssorted)
+  eigvals = diagmat %*% eigvals
+  eigvects = eigvects %*% diagmat
+  eigvects = eigvects[, 1:k]
+
+  PRJ = XS %*% eigvects
+}
diff --git a/scripts/perftest/slab/operators/distributed_sparse/run_distributed_matrix_sparse.sh b/scripts/perftest/slab/operators/distributed_sparse/run_distributed_matrix_sparse.sh new file mode 100755 index 0000000000..acc6397d9c --- /dev/null +++ b/scripts/perftest/slab/operators/distributed_sparse/run_distributed_matrix_sparse.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Directory guard (currently disabled): script is meant to run from the 'distributed_sparse' directory +#if [ "$(basename $PWD)" != "distributed_sparse" ]; then +# echo "Please execute scripts from directory 'distributed_sparse'" +# exit 1 +#fi + + +# Set up the output directory +OUTPUT_DIR="output" +mkdir -p $OUTPUT_DIR + +# Define datasets (sparsity levels and shapes used to build the CSV file names) +SPARSITY=("0_0001" "0_001" "0_01" "0_1") +SHAPES=("tall" "wide") + +# Define DML files and corresponding output files +DML_FILES=("slabFrobeniusNormSparse.dml" "slabGramMatrixSparse.dml" "slabMatrixAdditionSparse.dml" "slabMatrixMultSparse.dml" "slabMatrixVectorMultSparse.dml" "slabTransposeSparse.dml") +OUTPUT_FILES=("slabFrobeniusNormSparse_stats.txt" "slabGramMatrixSparse_stats.txt" "slabMatrixAdditionSparse_stats.txt" "slabMatrixMultSparse_stats.txt" "slabMatrixVectorMultSparse_stats.txt" "slabTransposeSparse_stats.txt") + +# Base path to datasets +DATASET_PATH="../../../../../src/test/resources/datasets/slab/sparse" + +# Iterate over each DML file +for index in ${!DML_FILES[@]}; do + DML_FILE=${DML_FILES[$index]} + OUTPUT_FILE=${OUTPUT_DIR}/${OUTPUT_FILES[$index]} + + # Clear the output file before writing + > $OUTPUT_FILE + + # Special handling for slabMatrixMultSparse.dml: it takes two inputs, the current shape and the opposite shape + if [ "$DML_FILE" == "slabMatrixMultSparse.dml" ]; then + for SPARSE in ${SPARSITY[@]}; do + for SHAPE in ${SHAPES[@]}; do + if [ "$SHAPE" == "tall" ]; then + CSV_FILE1="${DATASET_PATH}/M_sparsity_${SPARSE}_tall.csv" + CSV_FILE2="${DATASET_PATH}/M_sparsity_${SPARSE}_wide.csv" + else +
CSV_FILE1="${DATASET_PATH}/M_sparsity_${SPARSE}_wide.csv" + CSV_FILE2="${DATASET_PATH}/M_sparsity_${SPARSE}_tall.csv" + fi + + # Run the DML script with -stats flag and capture the output + TEMP_FILE=$(mktemp) + systemds $DML_FILE -exec spark -args $CSV_FILE1 $CSV_FILE2 -stats > $TEMP_FILE 2>&1 + + # Write a sparsity/shape label, then everything from the 'SystemDS Statistics:' line onward, to the output file + echo "Sparsity: ${SPARSE//_/\.}, Shape: $SHAPE" >> $OUTPUT_FILE + awk '/SystemDS Statistics:/{flag=1}flag' $TEMP_FILE >> $OUTPUT_FILE + echo -e "\n\n\n\n" >> $OUTPUT_FILE # Add empty lines for separation + + # Clean up temporary file + rm $TEMP_FILE + + echo "Execution of ${DML_FILE} with ${CSV_FILE1} and ${CSV_FILE2} completed. Statistics appended to ${OUTPUT_FILE}" + done + done + else + # Handling for other DML files (a single input matrix per run) + for SPARSE in ${SPARSITY[@]}; do + for SHAPE in ${SHAPES[@]}; do + CSV_FILE="${DATASET_PATH}/M_sparsity_${SPARSE}_${SHAPE}.csv" + + # Run the DML script with -stats flag and capture the output + TEMP_FILE=$(mktemp) + systemds $DML_FILE -exec spark -args $CSV_FILE -stats > $TEMP_FILE 2>&1 + + # Write a sparsity/shape label, then everything from the 'SystemDS Statistics:' line onward, to the output file + echo "Sparsity: ${SPARSE//_/\.}, Shape: $SHAPE" >> $OUTPUT_FILE + awk '/SystemDS Statistics:/{flag=1}flag' $TEMP_FILE >> $OUTPUT_FILE + echo -e "\n\n\n\n" >> $OUTPUT_FILE # Add empty lines for separation + + # Clean up temporary file + rm $TEMP_FILE + + echo "Execution of ${DML_FILE} with ${CSV_FILE} completed. 
Statistics appended to ${OUTPUT_FILE}" + done + done + fi +done diff --git a/scripts/perftest/slab/operators/distributed_sparse/slabFrobeniusNormSparse.dml b/scripts/perftest/slab/operators/distributed_sparse/slabFrobeniusNormSparse.dml new file mode 100644 index 0000000000..ddd9d15cc5 --- /dev/null +++ b/scripts/perftest/slab/operators/distributed_sparse/slabFrobeniusNormSparse.dml @@ -0,0 +1,35 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +# Example of a path that can be passed as $1: "../../../../../src/test/resources/datasets/slab/sparse/M_sparsity_0_1_tall.csv" +dataPath = $1 +M = read(dataPath, format="csv") +K = sum( M ) +print(K) + +for (ix in 1:5) { # repeat the operation five times + R = sqrt(sum(M^2)) # Frobenius norm of M + if(K > 0.0) { # print only when sum(M) is positive + print(R) + } +} diff --git a/scripts/perftest/slab/operators/distributed_sparse/slabGramMatrixSparse.dml b/scripts/perftest/slab/operators/distributed_sparse/slabGramMatrixSparse.dml new file mode 100644 index 0000000000..21571aa8b4 --- /dev/null +++ b/scripts/perftest/slab/operators/distributed_sparse/slabGramMatrixSparse.dml @@ -0,0 +1,35 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +dataPath = $1 # CSV path passed as the first script argument + +M = read(dataPath, format="csv") +K = sum( M ) +print(K) + +for (ix in 1:5) { # repeat the operation five times + R = t(M) %*% M # Gram matrix of M + if(K > 0.0) { # print sampled cells only when sum(M) is positive + utils::printRandElements(R, 10) + } +} diff --git a/scripts/perftest/slab/operators/distributed_sparse/slabMatrixAdditionSparse.dml b/scripts/perftest/slab/operators/distributed_sparse/slabMatrixAdditionSparse.dml new file mode 100644 index 0000000000..d141cd6ea8 --- /dev/null +++ b/scripts/perftest/slab/operators/distributed_sparse/slabMatrixAdditionSparse.dml @@ -0,0 +1,37 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +dataPath = $1 # CSV path passed as the first script argument +M = read(dataPath, format="csv") +K = sum( M ) +print(K) +N = read(dataPath, format="csv") # second operand is read from the same file as M +print(sum(N)) + +for (ix in 1:5) { # repeat the operation five times + R = M + N + if(K > 0.0) { # print sampled cells only when sum(M) is positive + utils::printRandElements(R, 10) + } +} + diff --git a/scripts/perftest/slab/operators/distributed_sparse/slabMatrixMultSparse.dml b/scripts/perftest/slab/operators/distributed_sparse/slabMatrixMultSparse.dml new file mode 100644 index 0000000000..32f9a3eff7 --- /dev/null +++ b/scripts/perftest/slab/operators/distributed_sparse/slabMatrixMultSparse.dml @@ -0,0 +1,37 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +dataPathM = $1 # CSV path of the left operand (first script argument) +dataPathN = $2 # CSV path of the right operand (second script argument) +M = read(dataPathM, format="csv") +K = sum( M ) +print(K) +N = read(dataPathN, format="csv") +print(sum(N)) + +for (ix in 1:5) { # repeat the operation five times + R = M %*% N + if(K > 0.0) { # print sampled cells only when sum(M) is positive + utils::printRandElements(R, 10) + } +} diff --git a/scripts/perftest/slab/operators/distributed_sparse/slabMatrixVectorMultSparse.dml b/scripts/perftest/slab/operators/distributed_sparse/slabMatrixVectorMultSparse.dml new file mode 100644 index 0000000000..205158368f --- /dev/null +++ b/scripts/perftest/slab/operators/distributed_sparse/slabMatrixVectorMultSparse.dml @@ -0,0 +1,36 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +dataPath = $1 # CSV path passed as the first script argument +M = read(dataPath, format="csv") +K = sum( M ) +print(K) +w = rand(rows=ncol(M), cols=1) # random column vector with one entry per column of M +print(sum(w)) + +for (ix in 1:5) { # repeat the operation five times + R = M %*% w + if(K > 0.0) { # print sampled cells only when sum(M) is positive + utils::printRandElements(R, 10) + } +} diff --git a/scripts/perftest/slab/operators/distributed_sparse/slabTransposeSparse.dml b/scripts/perftest/slab/operators/distributed_sparse/slabTransposeSparse.dml new file mode 100644 index 0000000000..ad0970047b --- /dev/null +++ b/scripts/perftest/slab/operators/distributed_sparse/slabTransposeSparse.dml @@ -0,0 +1,34 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +dataPath = $1 # CSV path passed as the first script argument +M = read(dataPath, format="csv") +K = sum( M ) +print(K) + +for (ix in 1:5) { # repeat the operation five times + R = t(M) + if(K > 0.0) { # print sampled cells only when sum(M) is positive + utils::printRandElements(R, 10) + } +} diff --git a/scripts/perftest/slab/operators/single_node_dense/run_single_node_matrix_dense.sh b/scripts/perftest/slab/operators/single_node_dense/run_single_node_matrix_dense.sh new file mode 100755 index 0000000000..27e4475961 --- /dev/null +++ b/scripts/perftest/slab/operators/single_node_dense/run_single_node_matrix_dense.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Ensure script is run from the 'single_node_dense' directory +if [ "$(basename $PWD)" != "single_node_dense" ]; then + echo "Please execute scripts from directory 'single_node_dense'" + exit 1 +fi + + +# Set up the output directory +OUTPUT_DIR="output" +mkdir -p $OUTPUT_DIR + +# List of row numbers for testing +ROW_NUMBERS=("2500000" "5000000" "10000000" "20000000") + +# List of DML files and corresponding output files +DML_FILES=("slabFrobeniusNorm.dml" "slabGramMatrix.dml" "slabMatrixAddition.dml" "slabMatrixMult.dml" "slabMatrixVectorMult.dml" "slabTranspose.dml") +OUTPUT_FILES=("slabFrobeniusNorm_stats.txt" "slabGramMatrix_stats.txt" "slabMatrixAddition_stats.txt" "slabMatrixMult_stats.txt" "slabMatrixVectorMult_stats.txt" "slabTranspose_stats.txt") + +# Iterate over each DML file and execute it with different row numbers +for index in ${!DML_FILES[@]}; do + DML_FILE=${DML_FILES[$index]} + OUTPUT_FILE=${OUTPUT_DIR}/${OUTPUT_FILES[$index]} + + # Clear the output file before writing + > $OUTPUT_FILE + + for ROW in ${ROW_NUMBERS[@]}; do + # Run the DML script with -stats flag and capture the output + TEMP_FILE=$(mktemp) + systemds $DML_FILE -args $ROW -stats > $TEMP_FILE 2>&1 + + # Write the number of rows and SystemDS Statistics section to the output file + echo "Number of rows: $ROW" >> $OUTPUT_FILE + awk '/SystemDS Statistics:/{flag=1}flag' $TEMP_FILE >> $OUTPUT_FILE + echo -e "\n\n\n\n" >> $OUTPUT_FILE # Add empty lines for separation + + # Clean up temporary file + rm $TEMP_FILE + + echo "Execution of ${DML_FILE} with ${ROW} rows completed. 
Statistics appended to ${OUTPUT_FILE}" + done +done diff --git a/scripts/perftest/slab/operators/single_node_dense/slabFrobeniusNorm.dml b/scripts/perftest/slab/operators/single_node_dense/slabFrobeniusNorm.dml new file mode 100644 index 0000000000..acacd9a7b3 --- /dev/null +++ b/scripts/perftest/slab/operators/single_node_dense/slabFrobeniusNorm.dml @@ -0,0 +1,29 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +M = rand(rows = $1, cols = 100, pdf = 'uniform') +k = sum(M) +for (ix in 1:5) { + R = sqrt(sum(M^2)) + print(R) +} diff --git a/scripts/perftest/slab/operators/single_node_dense/slabGramMatrix.dml b/scripts/perftest/slab/operators/single_node_dense/slabGramMatrix.dml new file mode 100644 index 0000000000..7c3a061525 --- /dev/null +++ b/scripts/perftest/slab/operators/single_node_dense/slabGramMatrix.dml @@ -0,0 +1,29 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +M = rand(rows = $1, cols = 100, pdf = 'uniform') +k = sum(M) # NOTE(review): k is not used afterwards; presumably meant to touch M before the loop - confirm +for (ix in 1:5) { # repeat the operation five times + R = t(M) %*% M # Gram matrix of M + utils::printRandElements(R,10) +} diff --git a/scripts/perftest/slab/operators/single_node_dense/slabMatrixAddition.dml b/scripts/perftest/slab/operators/single_node_dense/slabMatrixAddition.dml new file mode 100644 index 0000000000..1d6489ec44 --- /dev/null +++ b/scripts/perftest/slab/operators/single_node_dense/slabMatrixAddition.dml @@ -0,0 +1,34 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +source("../../slabUtils.dml") as utils + +M = rand(rows = $1, cols = 100, pdf = 'uniform') +X = rand(rows = $1, cols = 100, pdf = 'uniform') +k = sum(M) # NOTE(review): k is not used afterwards; presumably meant to touch M before the loop - confirm + +for (ix in 1:5) { # repeat the operation five times + R = M + X + utils::printRandElements(R,10) +} + + diff --git a/scripts/perftest/slab/operators/single_node_dense/slabMatrixMult.dml b/scripts/perftest/slab/operators/single_node_dense/slabMatrixMult.dml new file mode 100644 index 0000000000..555caae4ed --- /dev/null +++ b/scripts/perftest/slab/operators/single_node_dense/slabMatrixMult.dml @@ -0,0 +1,32 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + + +source("../../slabUtils.dml") as utils + +M = rand(rows = $1, cols = 100, pdf = 'uniform') +N = rand(rows = 100, cols = $1, pdf = 'uniform') +k = sum(M) # NOTE(review): k is not used afterwards; presumably meant to touch M before the loop - confirm + +for (ix in 1:5) { # repeat the operation five times + R = M %*% N # ($1 x 100) times (100 x $1), so R is $1 x $1 + utils::printRandElements(R,10) +} diff --git a/scripts/perftest/slab/operators/single_node_dense/slabMatrixVectorMult.dml b/scripts/perftest/slab/operators/single_node_dense/slabMatrixVectorMult.dml new file mode 100644 index 0000000000..e2cec6a7e7 --- /dev/null +++ b/scripts/perftest/slab/operators/single_node_dense/slabMatrixVectorMult.dml @@ -0,0 +1,33 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + + +M = rand(rows = $1, cols = 100, pdf = 'uniform') +w = rand(rows = 100, cols = 1, pdf = 'uniform') +k = sum(M) # NOTE(review): k is not used afterwards; presumably meant to touch M before the loop - confirm + +for (ix in 1:5) { # repeat the operation five times + R = M %*% w # ($1 x 100) times (100 x 1), so R is $1 x 1 + utils::printRandElements(R,10) +} + diff --git a/scripts/perftest/slab/operators/single_node_dense/slabTranspose.dml b/scripts/perftest/slab/operators/single_node_dense/slabTranspose.dml new file mode 100644 index 0000000000..add5c94578 --- /dev/null +++ b/scripts/perftest/slab/operators/single_node_dense/slabTranspose.dml @@ -0,0 +1,30 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +source("../../slabUtils.dml") as utils + +M = rand(rows = $1, cols = 100, pdf = 'uniform') +k = sum(M) # NOTE(review): k is not used afterwards; presumably meant to touch M before the loop - confirm + +for (ix in 1:5) { # repeat the operation five times + R = t(M) + utils::printRandElements(R,10) +} diff --git a/scripts/perftest/slab/pipeline/run_slab_pipeline.sh b/scripts/perftest/slab/pipeline/run_slab_pipeline.sh new file mode 100755 index 0000000000..9de2ecf2f8 --- /dev/null +++ b/scripts/perftest/slab/pipeline/run_slab_pipeline.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Ensure script is run from the 'pipeline' directory +if [ "$(basename $PWD)" != "pipeline" ]; then + echo "Please execute scripts from directory 'pipeline'" + exit 1 +fi + + +# Set up the output directory +OUTPUT_DIR="output" +mkdir -p $OUTPUT_DIR + +# Define row numbers for slabMultiplicationChain.dml +ROW_NUMBERS=("1000" "10000" "100000" "1000000") +MULTIPLICATION_CHAIN_FILE="slabMultiplicationChain.dml" +MULTIPLICATION_CHAIN_OUTPUT="${OUTPUT_DIR}/slabMultiplicationChain_stats.txt" + +# Clear the output file before writing +> $MULTIPLICATION_CHAIN_OUTPUT + +# Iterate over each row number and execute slabMultiplicationChain.dml +for ROW in ${ROW_NUMBERS[@]}; do + TEMP_FILE=$(mktemp) + if systemds $MULTIPLICATION_CHAIN_FILE -exec spark -args $ROW -stats > $TEMP_FILE 2>&1; then + echo "Number of rows: $ROW" >> $MULTIPLICATION_CHAIN_OUTPUT + awk '/SystemDS Statistics:/{flag=1}flag' $TEMP_FILE >> $MULTIPLICATION_CHAIN_OUTPUT + else + echo "An error occurred while executing ${MULTIPLICATION_CHAIN_FILE} with rows ${ROW}. Check ${TEMP_FILE} for details." >> $MULTIPLICATION_CHAIN_OUTPUT + fi + echo -e "\n\n\n\n" >> $MULTIPLICATION_CHAIN_OUTPUT # Add empty lines for separation + rm $TEMP_FILE + echo "Execution of ${MULTIPLICATION_CHAIN_FILE} with ${ROW} rows completed. 
Statistics appended to ${MULTIPLICATION_CHAIN_OUTPUT}" +done + +# Define datasets for slabSVD.dml +DATASET_PATH="../../../../src/test/resources/datasets/slab/dense" +DATASETS=("M_dense_tall.csv" "M_dense_wide.csv") +SVD_FILE="slabSVD.dml" +SVD_OUTPUT="${OUTPUT_DIR}/slabSVD_stats.txt" + +# Clear the output file before writing +> $SVD_OUTPUT + +# Iterate over each dataset and execute slabSVD.dml +for DATASET in ${DATASETS[@]}; do + SHAPE=$(echo $DATASET | grep -oP '(tall|wide)') + TEMP_FILE=$(mktemp) + if systemds $SVD_FILE -exec spark -args ${DATASET_PATH}/${DATASET} -stats > $TEMP_FILE 2>&1; then + echo "Shape: $SHAPE" >> $SVD_OUTPUT + awk '/SystemDS Statistics:/{flag=1}flag' $TEMP_FILE >> $SVD_OUTPUT + else + echo "An error occurred while executing ${SVD_FILE} with dataset ${DATASET}. Check ${TEMP_FILE} for details." >> $SVD_OUTPUT + fi + echo -e "\n\n\n\n" >> $SVD_OUTPUT # Add empty lines for separation + rm $TEMP_FILE + echo "Execution of ${SVD_FILE} with dataset ${DATASET} completed. Statistics appended to ${SVD_OUTPUT}" +done diff --git a/scripts/perftest/slab/pipeline/slabMultiplicationChain.dml b/scripts/perftest/slab/pipeline/slabMultiplicationChain.dml new file mode 100644 index 0000000000..5da116bbce --- /dev/null +++ b/scripts/perftest/slab/pipeline/slabMultiplicationChain.dml @@ -0,0 +1,38 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +source("../slabUtils.dml") as utils + +t = rand(rows=$1, cols=1, pdf = 'uniform') +u = rand(rows=1, cols=$1, pdf = 'uniform') +v = rand(rows=$1, cols=1, pdf = 'uniform') + +q = sum(t) +r = sum(u) +s = sum(v) + +for(ix in 1:5) { # repeat the chain five times + res = t %*% u %*% v # ($1 x 1) times (1 x $1) times ($1 x 1), so res is $1 x 1 + if ((q != 0) & (r != 0) & (s != 0)) { # print only when all three input sums are non-zero + print(as.scalar(res[1,1])) + } +} + diff --git a/scripts/perftest/slab/pipeline/slabSVD.dml b/scripts/perftest/slab/pipeline/slabSVD.dml new file mode 100644 index 0000000000..36b262c57b --- /dev/null +++ b/scripts/perftest/slab/pipeline/slabSVD.dml @@ -0,0 +1,38 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +source("../slabUtils.dml") as utils + +dataPath = $1 # CSV path passed as the first script argument +M = read(dataPath, format="csv") +K = sum(M) +print(K) + +for(ix in 1:5){ # repeat the decomposition five times + [U,D,V] = svd( M ) + if(K > 0){ # print the first cell of each factor only when sum(M) is positive + print(as.scalar(U[1,1])) + print(as.scalar(D[1,1])) + print(as.scalar(V[1,1])) + } +} + + diff --git a/scripts/perftest/slab/slabUtils.dml b/scripts/perftest/slab/slabUtils.dml new file mode 100644 index 0000000000..5f1659efc1 --- /dev/null +++ b/scripts/perftest/slab/slabUtils.dml @@ -0,0 +1,49 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + + +genClusters = function(Integer nClust, + Integer nObs, + Integer nCol, + Double sd, + Double sep) return (matrix[double] X) { + print("Hi from utils") + X = sample(nClust, nObs, TRUE)%*%matrix(sep, rows = 1, cols = nCol) # sampled values from 1..nClust (one per row) times a 1 x nCol row filled with sep + X = X + rand(rows = nObs, cols = nCol, pdf = 'normal') # NOTE(review): parameter sd is not used anywhere in this function - confirm whether this noise should be scaled by sd + print(sum(X)) +} + +allocMatrix = function(Integer rows, Integer cols) return (matrix[double] X) { + X = rand(rows = rows, cols = cols, pdf = 'uniform') + print(sum(X)) # prints the sum of the generated matrix +} + +printRandElements = function(matrix[double] M, Integer numel) { + for (ix in 1:numel) { # print numel randomly chosen cells of M + r = rand(rows=1,cols=1,min=1,max=nrow(M),pdf="uniform") + row = as.integer(as.scalar(r)) # random row position in [1, nrow(M)] cast to an integer index + + c = rand(rows=1,cols=1,min=1,max=ncol(M),pdf="uniform") + col = as.integer(as.scalar(c)) # random column position in [1, ncol(M)] cast to an integer index + tmp = M[row,col] + print(as.scalar(tmp)) + } +} diff --git a/src/main/java/org/apache/sysds/api/DMLScript.java b/src/main/java/org/apache/sysds/api/DMLScript.java index cd86426a42..cd70760ea7 100644 --- a/src/main/java/org/apache/sysds/api/DMLScript.java +++ b/src/main/java/org/apache/sysds/api/DMLScript.java @@ -156,7 +156,7 @@ public class DMLScript // flag that indicates whether or not to suppress any prints to stdout public static boolean _suppressPrint2Stdout = false; //set default local spark configuration - used for local testing - public static boolean USE_LOCAL_SPARK_CONFIG = false; + public static boolean USE_LOCAL_SPARK_CONFIG = false; public static boolean _activeAM = false; /** * If true, allow DMLProgram to be generated while not halting due to validation errors/warnings