Repository: systemml Updated Branches: refs/heads/master f046051d4 -> cd1ae5b42
[MINOR] Performance test bug fixes Closes #565 Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/cd1ae5b4 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/cd1ae5b4 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/cd1ae5b4 Branch: refs/heads/master Commit: cd1ae5b42499b3b97731de8b28a6d1db9cc9e7f3 Parents: f046051 Author: krishnakalyan3 <krishnakaly...@gmail.com> Authored: Thu Jul 13 14:28:56 2017 -0700 Committer: Nakul Jindal <naku...@gmail.com> Committed: Thu Jul 13 14:28:56 2017 -0700 ---------------------------------------------------------------------- scripts/perftest/python/datagen.py | 27 ++++--- scripts/perftest/python/predict.py | 48 ++++++------ scripts/perftest/python/run_perftest.py | 53 ++++++++----- scripts/perftest/python/train.py | 40 +++++----- scripts/perftest/python/utils.py | 112 +++++++++++++++++++++++---- 5 files changed, 192 insertions(+), 88 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/datagen.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py index d9c49e9..88a71f0 100755 --- a/scripts/perftest/python/datagen.py +++ b/scripts/perftest/python/datagen.py @@ -22,7 +22,7 @@ import itertools from os.path import join -from utils import split_rowcol, config_writer +from utils import split_rowcol, config_writer, mat_type_check # This file contains configuration settings for data generation DATA_FORMAT = 'csv' @@ -181,8 +181,8 @@ def stats1_datagen(matrix_dim, matrix_type, datagen_dir): NC = int(int(col)/2) config = dict(R=row, C=col, NC=NC, MAXDOMAIN=MAXDOMAIN, DATA=DATA, TYPES=TYPES, SETSIZE=SETSIZE, - LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, INDEX1=INDEX1, INDEX2=INDEX2, - fmt=DATA_FORMAT) + 
LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, INDEX1=INDEX1, + INDEX2=INDEX2, fmt=DATA_FORMAT) config_writer(full_path + '.json', config) @@ -207,7 +207,7 @@ def stats2_datagen(matrix_dim, matrix_type, datagen_dir): return full_path -def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir): +def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir, dense_algos): """ This function has two responsibilities. Generate the configuration files for datagen algorithms and return a dictionary that will be used for execution. @@ -217,11 +217,17 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir) family type. matrix_type: String - Type of matrix to generate e.g dense or sparse + Type of matrix to generate e.g dense, sparse, all matrix_shape: String Shape of matrix to generate e.g 100k_10 + datagen_dir: String + Path of the data generation directory + + dense_algos: List + Algorithms that support only dense matrix type + return: Dictionary {string: list} This dictionary contains algorithms to be executed as keys and the path of configuration json files to be executed list of values. @@ -233,13 +239,10 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir) # Cross Product of all configurations for current_family in distinct_families: - if current_family in FAMILY_NO_MATRIX_TYPE: - config = list(itertools.product(matrix_shape, ['dense'])) - config_bundle[current_family] = config - else: - config = list(itertools.product(matrix_shape, matrix_type)) - # clustering : [[10k_1, dense], [10k_2, dense], ...] - config_bundle[current_family] = config + current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos) + config = list(itertools.product(matrix_shape, current_matrix_type)) + # clustering : [[10k_1, dense], [10k_2, dense], ...] 
+ config_bundle[current_family] = config config_packets = {} for current_family, configs in config_bundle.items(): http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/predict.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py index bc034da..92d3af4 100755 --- a/scripts/perftest/python/predict.py +++ b/scripts/perftest/python/predict.py @@ -21,10 +21,8 @@ #------------------------------------------------------------- import sys -import os from os.path import join -import glob -from utils import create_dir, config_writer +from utils import config_writer, relevant_folders, mat_type_check # Contains configuration setting for predicting DATA_FORMAT = 'csv' @@ -221,7 +219,7 @@ def glm_gamma_predict(save_file_name, datagen_dir, train_dir, predict_dir): return full_path_predict -def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir): +def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir, train_dir, predict_dir, dense_algos): """ This function has two responsibilities. Generate the configuration files for prediction algorithms and return a dictionary that will be used for execution. @@ -230,6 +228,12 @@ def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir): The first tuple index contains algorithm name and the second index contains family type. 
+ matrix_type: String + Type of matrix to generate e.g dense, sparse, all + + matrix_shape: String + Shape of matrix to generate e.g 100k_10 + datagen_dir: String Path of the data generation directory @@ -239,45 +243,39 @@ def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir): predict_dir: String Path of the prediction directory + dense_algos: List + Algorithms that support only dense matrix type + return: Dictionary {string: list} This dictionary contains algorithms to be executed as keys and the path of configuration json files to be executed list of values. """ - - algo_payload_distinct = set(map(lambda x: x[0], algo_payload)) - config_bundle = {} - for k, v in algo_payload: + for k, _ in algo_payload: config_bundle[k] = [] - for current_algo in algo_payload_distinct: - # Get all train folders related to the algorithm - train_path = join(train_dir, current_algo) - train_subdir = glob.glob(train_path + "*") - train_folders = list(filter(lambda x: os.path.isdir(x), train_subdir)) + for current_algo, current_family in algo_payload: + current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos) + train_folders = relevant_folders(train_dir, current_algo, current_family, + current_matrix_type, matrix_shape, 'train') if len(train_folders) == 0: print('training folders not present for {}'.format(current_algo)) sys.exit() for current_train_folder in train_folders: - save_name = current_train_folder.split('/')[-1] - # Get all datagen folders - data_gen_folder_name = '.'.join(save_name.split('.')[1:-1]) - data_gen_path = join(datagen_dir, data_gen_folder_name) - data_gen_subdir = glob.glob(data_gen_path + "*") - data_gen_folder = list(filter(lambda x: os.path.isdir(x), data_gen_subdir)) - - if len(data_gen_folder) == 0: + current_data_gen_dir = relevant_folders(datagen_dir, current_algo, current_family, + current_matrix_type, matrix_shape, 'data-gen') + if len(current_data_gen_dir) == 0: print('data-gen folders not present for 
{}'.format(current_family)) sys.exit() - # Ideally we will have more than one datagen directory to be found - current_data_gen_dir = list(data_gen_folder)[0] - + save_name = current_train_folder.split('/')[-1] algo_func = '_'.join([current_algo.lower().replace('-', '_')] + ['predict']) - conf_path = globals()[algo_func](save_name, current_data_gen_dir, + + # current_data_gen_dir has index 0 as we would expect one datagen for each algorithm + conf_path = globals()[algo_func](save_name, current_data_gen_dir[0], current_train_folder, predict_dir) config_bundle[current_algo].append(conf_path) http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/run_perftest.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py index b0257d4..3360285 100755 --- a/scripts/perftest/python/run_perftest.py +++ b/scripts/perftest/python/run_perftest.py @@ -26,13 +26,14 @@ import argparse from functools import reduce import os from os.path import join -from utils import get_families, config_reader, create_dir, get_existence, \ - exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics import logging from datetime import datetime from datagen import config_packets_datagen from train import config_packets_train from predict import config_packets_predict +from utils import get_families, config_reader, create_dir, get_existence, \ + exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics + # A packet is a dictionary # with key as the algorithm @@ -80,6 +81,8 @@ ML_PREDICT = {'Kmeans': 'Kmeans-predict', 'GLM_gamma': 'GLM-predict', 'GLM_binomial': 'GLM-predict'} +DENSE_TYPE_ALGOS = ['clustering', 'stats1', 'stats2'] + # Responsible for execution and metric logging def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode): @@ -125,7 +128,7 @@ def algorithm_workflow(algo, exec_type, config_path, 
dml_file_name, action_mode) print('data already exists {}'.format(config_path)) time = 'data_exists' else: - time = exec_dml_and_parse_time(exec_type, dml_file_name, config_file_name, args) + time = exec_dml_and_parse_time(exec_type, dml_file_name, config_file_name, args) # Write a _SUCCESS file only if time is found and in data-gen action_mode if len(time.split('.')) == 2 and action_mode == 'data-gen': @@ -152,7 +155,7 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode Contains the execution type singlenode / hybrid_spark mat_type: List - Type of matrix to generate dense or sparse + Type of matrix to generate dense, sparse, all mat_shape: List Dimensions of the input matrix with rows and columns @@ -201,12 +204,12 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode if 'data-gen' in mode: data_gen_dir = join(temp_dir, 'data-gen') create_dir(data_gen_dir) - conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir) + conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir, + DENSE_TYPE_ALGOS) for family_name, config_folders in conf_packet.items(): for config in config_folders: file_name = ML_GENDATA[family_name] algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen') - # Statistic family do not require to be split if family_name not in ['stats1', 'stats2']: exec_test_data(exec_type, config) @@ -215,7 +218,8 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode data_gen_dir = join(temp_dir, 'data-gen') train_dir = join(temp_dir, 'train') create_dir(train_dir) - conf_packet = config_packets_train(algos_to_run, data_gen_dir, train_dir) + conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape, data_gen_dir, + train_dir, DENSE_TYPE_ALGOS) for algo_name, config_files in conf_packet.items(): for config in config_files: file_name = ML_TRAIN[algo_name] @@ -227,9 +231,12 @@ def 
perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode predict_dir = join(temp_dir, 'predict') create_dir(predict_dir) algos_to_run_perdict = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run)) - if len(algos_to_run_perdict) < 0: + if len(algos_to_run_perdict) < 1: + # No algorithms with predict found pass - conf_packet = config_packets_predict(algos_to_run_perdict, data_gen_dir, train_dir, predict_dir) + conf_packet = config_packets_predict(algos_to_run_perdict, mat_type, mat_shape, data_gen_dir, + train_dir, predict_dir, DENSE_TYPE_ALGOS) + for algo_name, config_files in conf_packet.items(): for config in config_files: file_name = ML_PREDICT[algo_name] @@ -243,11 +250,12 @@ if __name__ == '__main__': print('SYSTEMML_HOME not found') sys.exit() + # Supported Arguments + mat_type = ['dense', 'sparse', 'all'] + workload = ['data-gen', 'train', 'predict'] + execution_mode = ['hybrid_spark', 'singlenode'] # Default Arguments - default_mat_type = ['dense', 'sparse'] - default_workload = ['data-gen', 'train', 'predict'] default_mat_shape = ['10k_100'] - default_execution_mode = ['hybrid_spark', 'singlenode'] # Default temp directory, contains everything generated in perftest default_temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp') @@ -274,21 +282,21 @@ if __name__ == '__main__': '(Overrides --family, available : ' + ', '.join(sorted(all_algos)) + ')', metavar='', choices=all_algos, nargs='+') - cparser.add_argument('--exec-type', default='singlenode', help='System-ML backend ' - '(available : singlenode, spark-hybrid)', metavar='', - choices=default_execution_mode) - cparser.add_argument('--mat-type', default=default_mat_type, help='space separated list of types of matrix to generate ' - '(available : dense, sparse)', metavar='', choices=default_mat_type, + cparser.add_argument('--exec-type', default='hybrid_spark', help='System-ML backend ' + 'available : ' + ','.join(execution_mode), metavar='', + 
choices=execution_mode) + cparser.add_argument('--mat-type', default=['all'], help='space separated list of types of matrix to generate ' + 'available : ' + ','.join(mat_type), metavar='', choices=mat_type, nargs='+') cparser.add_argument('--mat-shape', default=default_mat_shape, help='space separated list of shapes of matrices ' 'to generate (e.g 10k_1k, 20M_4k)', metavar='', nargs='+') cparser.add_argument('--temp-dir', default=default_temp_dir, help='temporary directory ' - 'where generated, training and prediction data is put', metavar='') + 'where generated, training and prediction data is put', metavar='') cparser.add_argument('--filename', default='perf_test', help='name of the output file for the perf' ' metrics', metavar='') - cparser.add_argument('--mode', default=default_workload, + cparser.add_argument('--mode', default=workload, help='space separated list of types of workloads to run (available: data-gen, train, predict)', - metavar='', choices=default_workload, nargs='+') + metavar='', choices=workload, nargs='+') # Args is a namespace args = cparser.parse_args() @@ -297,6 +305,11 @@ if __name__ == '__main__': # Debug arguments # print(arg_dict) + # default_mat_type validity + if len(args.mat_type) > 2: + print('length of --mat-type argument cannot be greater than two') + sys.exit() + # Check for validity of input arguments if args.family is not None: for fam in args.family: http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/train.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/train.py b/scripts/perftest/python/train.py index 1ab2880..627ba03 100755 --- a/scripts/perftest/python/train.py +++ b/scripts/perftest/python/train.py @@ -21,10 +21,8 @@ #------------------------------------------------------------- import sys -import glob -import os from os.path import join -from utils import config_writer +from utils import config_writer, relevant_folders, 
mat_type_check from functools import reduce # Contains configuration setting for training @@ -48,8 +46,8 @@ def binomial_m_svm_train(save_folder_name, datagen_dir, train_dir): model = join(full_path_train, 'model.data') Log = join(full_path_train, 'Log.data') - config = dict(X=X, Y=Y, icpt=icpt, classes=2, reg=reg, tol=tol, maxiter=maxiter, model=model, - Log=Log, fmt=DATA_FORMAT) + config = dict(X=X, Y=Y, icpt=icpt, classes=2, reg=reg, tol=tol, maxiter=maxiter, + model=model, Log=Log, fmt=DATA_FORMAT) config_writer(full_path_train + '.json', config) return data_folders @@ -117,8 +115,8 @@ def multinomial_m_svm_train(save_folder_name, datagen_dir, train_dir): model = join(full_path_train, 'model.data') Log = join(full_path_train, 'Log.data') - config = dict(X=X, Y=Y, icpt=icpt, classes=150, reg=reg, tol=tol, maxiter=maxiter, model=model, - Log=Log, fmt=DATA_FORMAT) + config = dict(X=X, Y=Y, icpt=icpt, classes=150, reg=reg, tol=tol, maxiter=maxiter, + model=model, Log=Log, fmt=DATA_FORMAT) config_writer(full_path_train + '.json', config) data_folders.append(full_path_train) @@ -358,7 +356,7 @@ def regression2_glm_poisson_train(save_folder_name, datagen_dir, train_dir): return data_folders -def config_packets_train(algo_payload, datagen_dir, train_dir): +def config_packets_train(algo_payload, matrix_type, matrix_shape, datagen_dir, train_dir, dense_algos): """ This function has two responsibilities. Generate the configuration files for input training algorithms and return a dictionary that will be used for execution. @@ -367,39 +365,45 @@ def config_packets_train(algo_payload, datagen_dir, train_dir): The first tuple index contains algorithm name and the second index contains family type. 
+ matrix_type: String + Type of matrix to generate e.g dense, sparse, all + + matrix_shape: String + Shape of matrix to generate e.g 100k_10 + datagen_dir: String Path of the data generation directory train_dir: String Path of the training directory + dense_algos: List + Algorithms that support only dense matrix type + return: {string: list} This dictionary contains algorithms to be executed as keys and the path of configuration json files to be executed list of values. - """ config_bundle = {} - for k, v in algo_payload: + for k, _ in algo_payload: config_bundle[k] = [] for current_algo, current_family in algo_payload: - data_gen_path = join(datagen_dir, current_family) - data_gen_subdir = glob.glob(data_gen_path + "*") - - # Filter for specific data gen - data_gen_folders = list(filter(lambda x: os.path.isdir(x), data_gen_subdir)) + current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos) + data_gen_folders = relevant_folders(datagen_dir, current_algo, current_family, + current_matrix_type, matrix_shape, 'data-gen') if len(data_gen_folders) == 0: print('datagen folders not present for {}'.format(current_family)) sys.exit() - for current_folder in data_gen_folders: - file_path_last = current_folder.split('/')[-1] + for current_datagen_dir in data_gen_folders: + file_path_last = current_datagen_dir.split('/')[-1] save_name = '.'.join([current_algo] + [file_path_last]) algo_func = '_'.join([current_family] + [current_algo.lower().replace('-', '_')] + ['train']) - conf_path = globals()[algo_func](save_name, current_folder, train_dir) + conf_path = globals()[algo_func](save_name, current_datagen_dir, train_dir) config_bundle[current_algo].append(conf_path) config_packets = {} http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/utils.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/utils.py b/scripts/perftest/python/utils.py index 464d7f6..4bba34f 
100755 --- a/scripts/perftest/python/utils.py +++ b/scripts/perftest/python/utils.py @@ -27,11 +27,14 @@ import subprocess import shlex import re import logging +import sys +import glob +from functools import reduce # This file contains all the utility functions required for performance test module -def get_families(current_algo, ML_ALGO): +def get_families(current_algo, ml_algo): """ Given current algorithm we get its families. @@ -46,7 +49,7 @@ def get_families(current_algo, ML_ALGO): """ family_list = [] - for family, algos in ML_ALGO.items(): + for family, algos in ml_algo.items(): if current_algo in algos: family_list.append(family) return family_list @@ -138,7 +141,7 @@ def get_existence(path, action_mode): return exist -def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, args, Time=True): +def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, args, time=True): """ This function is responsible of execution of input arguments via python sub process, We also extract time obtained from the output of this subprocess @@ -181,7 +184,7 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, arg proc1 = subprocess.Popen(shlex.split(cmd_string), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if Time: + if time: proc1_log = [] while proc1.poll() is None: raw_std_out = proc1.stdout.readline() @@ -189,7 +192,7 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, arg proc1_log.append(decode_raw) logging.log(10, decode_raw) - out1, err1 = proc1.communicate() + _, err1 = proc1.communicate() if "Error" in str(err1): print('Error Found in {}'.format(dml_file_name)) @@ -197,9 +200,9 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, arg else: total_time = parse_time(proc1_log) - with open(execution_output_file, 'w') as f: + with open(execution_output_file, 'w') as file: for row in proc1_log: - f.write("%s\n" % str(row)) + file.write("%s\n" % 
str(row)) else: total_time = 'not_specified' @@ -253,20 +256,18 @@ def exec_test_data(exec_type, path): exec_dml_and_parse_time(exec_type, test_split_script, config_file_name, args, False) -def check_predict(current_algo, ML_PREDICT): +def check_predict(current_algo, ml_predict): """ To check if the current algorithm requires to run the predict current_algo: String Algorithm being processed - ML_PREDICT: Dictionary + ml_predict: Dictionary Key value pairs of algorithm and predict file to process """ - if current_algo in ML_PREDICT.keys(): + if current_algo in ml_predict.keys(): return True - else: - return False def get_folder_metrics(folder_name, action_mode): @@ -301,4 +302,89 @@ def get_folder_metrics(folder_name, action_mode): except IndexError: intercept = 'none' - return mat_type, mat_shape, intercept \ No newline at end of file + return mat_type, mat_shape, intercept + + +def mat_type_check(current_family, matrix_types, dense_algos): + """ + Some Algorithms support different matrix_type. 
This function gives us the right matrix_type given + an algorithm + + current_family: String + Current family being processed in this function + + matrix_types: List + Type of matrix to generate dense, sparse, all + + dense_algos: List + Algorithms that support only dense matrix type + + return: List + Return the list of right matrix types supported by the family + """ + current_type = [] + for current_matrix_type in matrix_types: + if current_matrix_type == 'all': + if current_family in dense_algos: + current_type.append('dense') + else: + current_type.append('dense') + current_type.append('sparse') + + if current_matrix_type == 'sparse': + if current_family in dense_algos: + sys.exit('{} does not support {} matrix type'.format(current_family, + current_matrix_type)) + else: + current_type.append(current_matrix_type) + + if current_matrix_type == 'dense': + current_type.append(current_matrix_type) + + return current_type + + +def relevant_folders(path, algo, family, matrix_type, matrix_shape, mode): + """ + Finds the right folder to read the data based on given parameters + + path: String + Location of data-gen and training folders + + algo: String + Current algorithm being processed by this function + + family: String + Current family being processed by this function + + matrix_type: List + Type of matrix to generate dense, sparse, all + + matrix_shape: List + Dimensions of the input matrix with rows and columns + + mode: String + Based on mode and arguments we read the specific folders e.g data-gen folder or train folder + + return: List + List of folder locations to read data from + """ + folders = [] + for current_matrix_type in matrix_type: + for current_matrix_shape in matrix_shape: + if mode == 'data-gen': + data_gen_path = join(path, family) + sub_folder_name = '.'.join([current_matrix_type, current_matrix_shape]) + path_subdir = glob.glob(data_gen_path + '.' 
+ sub_folder_name + "*") + + if mode == 'train': + train_path = join(path, algo) + sub_folder_name = '.'.join([family, current_matrix_type, current_matrix_shape]) + path_subdir = glob.glob(train_path + '.' + sub_folder_name + "*") + + path_folders = list(filter(lambda x: os.path.isdir(x), path_subdir)) + folders.append(path_folders) + + folders_flat = reduce(lambda x, y: x + y, folders) + + return folders_flat