This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 79addc3c552b3f05d1cfbdfd89446bc3246855e3
Author: baunsgaard <[email protected]>
AuthorDate: Tue Sep 14 21:46:23 2021 +0200

    [SYSTEMDS-3132] Update FrameBlock to include double and boolean

    The previous rewrite did not include the boolean and double types.
    With this commit it does.
---
 .../sysds/runtime/matrix/data/FrameBlock.java       |  31 ++++-
 .../python/systemds/examples/tutorials/adult.py     |  22 ++-
 src/main/python/systemds/operator/nodes/frame.py    |  28 ++--
 src/main/python/systemds/operator/nodes/list.py     |   5 +-
 .../python/systemds/operator/nodes/list_access.py   |   9 +-
 src/main/python/systemds/operator/nodes/matrix.py   |  21 +--
 .../python/systemds/operator/nodes/multi_return.py  |  11 +-
 src/main/python/systemds/operator/nodes/scalar.py   |  14 +-
 src/main/python/systemds/operator/nodes/source.py   |  10 +-
 .../python/systemds/operator/operation_node.py      |   2 +-
 src/main/python/systemds/utils/converters.py        |  20 ++-
 src/main/python/systemds/utils/helpers.py           |  13 +-
 .../python/tests/examples/tutorials/test_adult.py   | 149 +++++++++++++--------
 13 files changed, 213 insertions(+), 122 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
index 9ff106c..2a290f3 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
@@ -622,21 +622,44 @@ public class FrameBlock implements CacheBlock, Externalizable {
 	}
 
 	public byte[] getColumnAsBytes(int c){
+		final int nRow = getNumRows();
 		switch(_schema[c]){
 			case INT64:
 				long[] colLong = ((LongArray)_coldata[c])._data;
-				ByteBuffer longBuffer = ByteBuffer.allocate(8 * getNumRows());
+				ByteBuffer longBuffer = ByteBuffer.allocate(8 * nRow);
 				longBuffer.order(ByteOrder.LITTLE_ENDIAN);
-				for(int i = 0; i < getNumRows(); i++)
+				for(int i = 0; i < nRow; i++)
 					longBuffer.putLong(colLong[i]);
 				return longBuffer.array();
 			case INT32:
 				int[] colInt = ((IntegerArray)_coldata[c])._data;
-				ByteBuffer intBuffer = ByteBuffer.allocate(4 * getNumRows());
+				ByteBuffer intBuffer = ByteBuffer.allocate(4 * nRow);
 				intBuffer.order(ByteOrder.LITTLE_ENDIAN);
-				for(int i = 0; i < getNumRows(); i++)
+				for(int i = 0; i < nRow; i++)
 					intBuffer.putInt(colInt[i]);
 				return intBuffer.array();
+			case FP64:
+				double[] colDouble = ((DoubleArray)_coldata[c])._data;
+				ByteBuffer doubleBuffer = ByteBuffer.allocate(8 * nRow);
+				doubleBuffer.order(ByteOrder.nativeOrder());
+				for(int i = 0; i < nRow; i++)
+					doubleBuffer.putDouble(colDouble[i]);
+				return doubleBuffer.array();
+			case FP32:
+				float[] colFloat = ((FloatArray)_coldata[c])._data;
+				ByteBuffer floatBuffer = ByteBuffer.allocate(8 * nRow);
+				floatBuffer.order(ByteOrder.nativeOrder());
+				for(int i = 0; i < nRow; i++)
+					floatBuffer.putDouble(colFloat[i]);
+				return floatBuffer.array();
+			case BOOLEAN:
+				boolean[] colBool = ((BooleanArray)_coldata[c])._data;
+				// over allocating here.. we could maybe bit pack?
+				ByteBuffer booleanBuffer = ByteBuffer.allocate(nRow);
+				booleanBuffer.order(ByteOrder.nativeOrder());
+				for(int i = 0; i < nRow; i++)
+					booleanBuffer.put((byte)(colBool[i]? 1:0));
+				return booleanBuffer.array();
 			default:
 				throw new NotImplementedException();
 		}
diff --git a/src/main/python/systemds/examples/tutorials/adult.py b/src/main/python/systemds/examples/tutorials/adult.py
index c28ed0a..5bd3cc2 100644
--- a/src/main/python/systemds/examples/tutorials/adult.py
+++ b/src/main/python/systemds/examples/tutorials/adult.py
@@ -59,25 +59,41 @@ class DataManager:
 
     def get_train_data(self, sds: SystemDSContext) -> 'Frame':
         self._get_data(self._train_data_loc)
-        return sds.read(self._train_data_loc)
+        return sds.read(self._train_data_loc)[:,0:14]
 
     def get_train_labels_pandas(self) -> pd.DataFrame:
         self._get_data(self._train_data_loc)
         return self._parse_data(self._train_data_loc)["income"]
 
+    def get_train_labels(self, sds: SystemDSContext) -> 'Frame':
+        self._get_data(self._train_data_loc)
+        return sds.read(self._train_data_loc)[:,14]
+
     def get_test_data_pandas(self) -> pd.DataFrame:
         self._get_data(self._test_data_loc)
         return self._parse_data(self._test_data_loc)\
-            .drop(labels=["income"], axis=1).to_numpy()
+            .drop(labels=["income"], axis=1)
+
+    def get_test_data(self, sds: SystemDSContext) -> 'Frame':
+        self._get_data(self._test_data_loc)
+        return sds.read(self._test_data_loc)[:,0:14]
 
     def get_test_labels_pandas(self) -> pd.DataFrame:
         self._get_data(self._test_data_loc)
         return self._parse_data(self._test_data_loc)["income"]
 
-    def get_jspec(self) -> str:
+    def get_test_labels(self, sds: SystemDSContext) -> 'Frame':
+        self._get_data(self._test_data_loc)
+        return sds.read(self._test_data_loc)[:,14]
+
+    def get_jspec_string(self) -> str:
         self._get_data(self._jspec_loc)
         with open(self._jspec_loc, "r") as f:
             return f.read()
+
+    def get_jspec(self, sds: SystemDSContext) -> 'Scalar':
+        self._get_data(self._jspec_loc)
+        return sds.read(self._jspec_loc, data_type="scalar", value_type="string")
 
     def _parse_data(self, loc) -> pd.DataFrame:
         return pd.read_csv(loc)
diff --git a/src/main/python/systemds/operator/nodes/frame.py b/src/main/python/systemds/operator/nodes/frame.py
index 5fe02cc..efa75b0 100644
--- a/src/main/python/systemds/operator/nodes/frame.py
+++ b/src/main/python/systemds/operator/nodes/frame.py
@@ -22,16 +22,18 @@ __all__ = ["Frame"]
 
 import os
-from typing import Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING, Iterable
+from typing import (TYPE_CHECKING, Dict, Iterable, Optional, Sequence, Tuple,
+                    Union)
 
 import numpy as np
 import pandas as pd
 from py4j.java_gateway import JavaObject, JVMView
 
-from systemds.operator import OperationNode, Matrix, MultiReturn
+from systemds.operator import Matrix, MultiReturn, OperationNode
+from systemds.script_building.dag import DAGNode, OutputType
 from systemds.utils.consts import VALID_INPUT_TYPES
+from systemds.utils.converters import (frame_block_to_pandas,
+                                       pandas_to_frame_block)
 from systemds.utils.helpers import get_slice_string
-from systemds.utils.converters import pandas_to_frame_block, frame_block_to_pandas
-from systemds.script_building.dag import OutputType, DAGNode
 
 if TYPE_CHECKING:
     # to avoid cyclic dependencies during runtime
@@ -46,7 +48,7 @@ class Frame(OperationNode):
                  unnamed_input_nodes: Union[str, Iterable[VALID_INPUT_TYPES]] = None,
                  named_input_nodes: Dict[str, VALID_INPUT_TYPES] = None,
-                 local_data: pd.DataFrame = None, brackets:bool = False) -> "Frame":
+                 local_data: pd.DataFrame = None, brackets: bool = False) -> "Frame":
 
         is_python_local_data = False
         if local_data is not None:
             self._pd_dataframe = local_data
@@ -87,11 +89,11 @@ class Frame(OperationNode):
 
     def transform_encode(self, spec:
"Scalar"): params_dict = {"target": self, "spec": spec} - - frame = Frame(self.sds_context,"") - matrix = Matrix(self.sds_context,"") - output_nodes = [matrix,frame] - + + frame = Frame(self.sds_context, "") + matrix = Matrix(self.sds_context, "") + output_nodes = [matrix, frame] + op = MultiReturn( self.sds_context, "transformencode", @@ -125,18 +127,18 @@ class Frame(OperationNode): """ return Frame(self.sds_context, "cbind", [self, other]) - def replace(self, pattern:str, replacement:str) -> 'Frame': + def replace(self, pattern: str, replacement: str) -> 'Frame': """ Replace all instances of string with replacement string :param: pattern the string to replace :param: replacement the string to replace with :return: The Frame containing the replaced values """ - return Frame(self.sds_context, "replace", named_input_nodes={"target": self, "pattern": f"'{pattern}'", "replacement":f"'{replacement}'"}) + return Frame(self.sds_context, "replace", named_input_nodes={"target": self, "pattern": f"'{pattern}'", "replacement": f"'{replacement}'"}) def __str__(self): return "FrameNode" def __getitem__(self, i) -> 'Frame': sliceIns = get_slice_string(i) - return Frame(self.sds_context, '', [self, sliceIns], brackets=True) \ No newline at end of file + return Frame(self.sds_context, '', [self, sliceIns], brackets=True) diff --git a/src/main/python/systemds/operator/nodes/list.py b/src/main/python/systemds/operator/nodes/list.py index 09455a3..6f5bfb1 100644 --- a/src/main/python/systemds/operator/nodes/list.py +++ b/src/main/python/systemds/operator/nodes/list.py @@ -21,12 +21,11 @@ __all__ = ["List"] -from typing import Dict, Sequence, Tuple, Union, Iterable, List +from typing import Dict, Iterable, List, Sequence, Tuple, Union import numpy as np from py4j.java_gateway import JavaObject - -from systemds.operator import OperationNode, ListAccess +from systemds.operator import ListAccess, OperationNode from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES from systemds.utils.converters import numpy_to_matrix_block diff --git a/src/main/python/systemds/operator/nodes/list_access.py b/src/main/python/systemds/operator/nodes/list_access.py index 10d51f5..a954f9c 100644 --- a/src/main/python/systemds/operator/nodes/list_access.py +++ b/src/main/python/systemds/operator/nodes/list_access.py @@ -21,12 +21,11 @@ __all__ = ["ListAccess"] -from typing import Dict, Sequence, Tuple, Union, Iterable +from typing import Dict, Iterable, Sequence, Tuple, Union import numpy as np from py4j.java_gateway import JavaObject - -from systemds.operator import OperationNode, Matrix, Frame, Scalar +from systemds.operator import Frame, Matrix, OperationNode, Scalar from systemds.script_building.dag import OutputType @@ -49,7 +48,7 @@ class ListAccess(OperationNode): res = Matrix(self.sds_context, "as.matrix", [ent]) self._list_source._outputs[self._key] = res return res - + def as_frame(self) -> Frame: ent = self._list_source[self._key] res = Frame(self.sds_context, "as.frame", [ent]) @@ -63,4 +62,4 @@ class ListAccess(OperationNode): return res def __str__(self): - return "ListAccessNode" \ No newline at end of file + return "ListAccessNode" diff --git a/src/main/python/systemds/operator/nodes/matrix.py b/src/main/python/systemds/operator/nodes/matrix.py index 2603dd5..4f08735 100644 --- a/src/main/python/systemds/operator/nodes/matrix.py +++ b/src/main/python/systemds/operator/nodes/matrix.py @@ -22,17 +22,18 @@ __all__ = ["Matrix"] import os -from typing import Dict, 
Optional, Sequence, Tuple, Union, TYPE_CHECKING, Iterable +from typing import (TYPE_CHECKING, Dict, Iterable, Optional, Sequence, Tuple, + Union) import numpy as np from py4j.java_gateway import JavaObject, JVMView from systemds.operator import OperationNode, Scalar -from systemds.utils.consts import VALID_INPUT_TYPES -from systemds.utils.converters import numpy_to_matrix_block, matrix_block_to_numpy -from systemds.utils.helpers import get_slice_string from systemds.script_building.dag import OutputType - -from systemds.utils.consts import VALID_INPUT_TYPES, BINARY_OPERATIONS, VALID_ARITHMETIC_TYPES +from systemds.utils.consts import (BINARY_OPERATIONS, VALID_ARITHMETIC_TYPES, + VALID_INPUT_TYPES) +from systemds.utils.converters import (matrix_block_to_numpy, + numpy_to_matrix_block) +from systemds.utils.helpers import get_slice_string class Matrix(OperationNode): @@ -42,7 +43,7 @@ class Matrix(OperationNode): unnamed_input_nodes: Union[str, Iterable[VALID_INPUT_TYPES]] = None, named_input_nodes: Dict[str, VALID_INPUT_TYPES] = None, - local_data: np.array = None, brackets:bool = False ) -> 'Matrix': + local_data: np.array = None, brackets: bool = False) -> 'Matrix': is_python_local_data = False if local_data is not None: @@ -359,12 +360,12 @@ class Matrix(OperationNode): :return: The Matrix representing the result of this operation """ return Matrix(self.sds_context, "round", [self]) - - def replace(self, pattern:VALID_INPUT_TYPES, replacement:VALID_INPUT_TYPES) -> 'Matrix': + + def replace(self, pattern: VALID_INPUT_TYPES, replacement: VALID_INPUT_TYPES) -> 'Matrix': """ Replace all values with replacement value """ - return Matrix(self.sds_context, "replace", named_input_nodes={"target": self, "pattern": pattern, "replacement":replacement}) + return Matrix(self.sds_context, "replace", named_input_nodes={"target": self, "pattern": pattern, "replacement": replacement}) def __str__(self): return "MatrixNode" diff --git a/src/main/python/systemds/operator/nodes/multi_return.py b/src/main/python/systemds/operator/nodes/multi_return.py index 90b2de5..b766157 100644 --- a/src/main/python/systemds/operator/nodes/multi_return.py +++ b/src/main/python/systemds/operator/nodes/multi_return.py @@ -21,15 +21,15 @@ __all__ = ["MultiReturn"] -from typing import Dict, Sequence, Tuple, Union, Iterable, List +from typing import Dict, Iterable, List, Sequence, Tuple, Union import numpy as np from py4j.java_gateway import JavaObject - from systemds.operator import OperationNode from systemds.script_building.dag import OutputType from systemds.utils.consts import VALID_INPUT_TYPES -from systemds.utils.converters import matrix_block_to_numpy,frame_block_to_pandas +from systemds.utils.converters import (frame_block_to_pandas, + matrix_block_to_numpy) from systemds.utils.helpers import create_params_string @@ -68,7 +68,7 @@ class MultiReturn(OperationNode): result_var = [] jvmV = self.sds_context.java_gateway.jvm for idx, v in enumerate(self._script.out_var_name): - out_type =self._outputs[idx].output_type + out_type = self._outputs[idx].output_type if out_type == OutputType.MATRIX: result_var.append( matrix_block_to_numpy(jvmV, result_variables.getMatrixBlock(v))) @@ -78,7 +78,8 @@ class MultiReturn(OperationNode): elif out_type == OutputType.DOUBLE: result_var.append(result_variables.getDouble(v)) else: - raise NotImplementedError("Not Implemented Support of type" + out_type) + raise NotImplementedError( + "Not Implemented Support of type" + out_type) return result_var def __iter__(self): diff --git 
a/src/main/python/systemds/operator/nodes/scalar.py b/src/main/python/systemds/operator/nodes/scalar.py index 5078ac0..511ad34 100644 --- a/src/main/python/systemds/operator/nodes/scalar.py +++ b/src/main/python/systemds/operator/nodes/scalar.py @@ -22,16 +22,16 @@ __all__ = ["Scalar"] import os -from typing import Dict, Optional, Sequence, Tuple, Union, TYPE_CHECKING, Iterable +from typing import (TYPE_CHECKING, Dict, Iterable, Optional, Sequence, Tuple, + Union) import numpy as np from py4j.java_gateway import JavaObject, JVMView from systemds.operator import OperationNode -from systemds.utils.consts import VALID_INPUT_TYPES -from systemds.utils.converters import numpy_to_matrix_block from systemds.script_building.dag import OutputType - -from systemds.utils.consts import VALID_INPUT_TYPES, BINARY_OPERATIONS, VALID_ARITHMETIC_TYPES +from systemds.utils.consts import (BINARY_OPERATIONS, VALID_ARITHMETIC_TYPES, + VALID_INPUT_TYPES) +from systemds.utils.converters import numpy_to_matrix_block class Scalar(OperationNode): @@ -66,7 +66,8 @@ class Scalar(OperationNode): elif self.output_type == OutputType.STRING: return result_variables.getString(self._script.out_var_name[0]) else: - raise NotImplemented("Not currently support scalar type: " + self.output_type) + raise NotImplemented( + "Not currently support scalar type: " + self.output_type) def __add__(self, other: VALID_ARITHMETIC_TYPES) -> 'Scalar': return Scalar(self.sds_context, '+', [self, other]) @@ -226,4 +227,3 @@ class Scalar(OperationNode): def __str__(self): return "ScalarNode" - diff --git a/src/main/python/systemds/operator/nodes/source.py b/src/main/python/systemds/operator/nodes/source.py index d027209..3a61f11 100644 --- a/src/main/python/systemds/operator/nodes/source.py +++ b/src/main/python/systemds/operator/nodes/source.py @@ -26,7 +26,7 @@ from typing import (TYPE_CHECKING, Dict, Iterable, Optional, Sequence, Tuple, Union) import numpy as np -from systemds.operator import Matrix, OperationNode, Scalar, List +from systemds.operator import List, Matrix, OperationNode, Scalar from systemds.script_building.dag import OutputType @@ -48,7 +48,7 @@ class Func(object): argument_string, named_arguments = self.parse_inputs() named_intput_nodes = f'named_arguments = {{{named_arguments}}}' output_object = self.parse_outputs() - + definition = f'def {self._name}(self{argument_string}):' if self._outputs is None: output = f'out = {output_object}(self.sds_context, {operation}, named_input_nodes=named_arguments, output_type=OutputType.NONE)' @@ -101,13 +101,13 @@ class Func(object): elif var_l[0] == 'i': # integer if "integer" in var_l: return (self.split_to_value_and_def(var[7:]), 'Scalar') - else: # int + else: # int return (self.split_to_value_and_def(var[3:]), 'Scalar') elif var_l[0] == 'b': # boolean return (self.split_to_value_and_def(var[7:], True), 'Scalar') - elif var_l[0] == 'l': # list[unknown] + elif var_l[0] == 'l': # list[unknown] return (self.split_to_value_and_def(var[13:]), 'List') - elif var_l[0] == 's': # string + elif var_l[0] == 's': # string return (self.split_to_value_and_def(var[6:]), 'Scalar') else: raise NotImplementedError( diff --git a/src/main/python/systemds/operator/operation_node.py b/src/main/python/systemds/operator/operation_node.py index acc3988..99d823b 100644 --- a/src/main/python/systemds/operator/operation_node.py +++ b/src/main/python/systemds/operator/operation_node.py @@ -147,7 +147,7 @@ class OperationNode(DAGNode): assert len( unnamed_input_vars) == 2, 'Binary Operations need exactly two 
input variables' return f'{var_name}={unnamed_input_vars[0]}{self.operation}{unnamed_input_vars[1]}' - + inputs_comma_sep = create_params_string( unnamed_input_vars, named_input_vars) diff --git a/src/main/python/systemds/utils/converters.py b/src/main/python/systemds/utils/converters.py index 342c7a3..9f88271 100644 --- a/src/main/python/systemds/utils/converters.py +++ b/src/main/python/systemds/utils/converters.py @@ -19,10 +19,11 @@ # # ------------------------------------------------------------- + import numpy as np import pandas as pd -import time -from py4j.java_gateway import JavaClass, JavaObject, JVMView, JavaGateway +from py4j.java_gateway import JavaClass, JavaGateway, JavaObject, JVMView + def numpy_to_matrix_block(sds: 'SystemDSContext', np_arr: np.array): """Converts a given numpy array, to internal matrix block representation. @@ -33,6 +34,7 @@ def numpy_to_matrix_block(sds: 'SystemDSContext', np_arr: np.array): assert (np_arr.ndim <= 2), "np_arr invalid, because it has more than 2 dimensions" rows = np_arr.shape[0] cols = np_arr.shape[1] if np_arr.ndim == 2 else 1 + # If not numpy array then convert to numpy array if not isinstance(np_arr, np.ndarray): np_arr = np.asarray(np_arr, dtype=np.float64) @@ -133,7 +135,7 @@ def pandas_to_frame_block(sds: "SystemDSContext", pd_df: pd.DataFrame): def frame_block_to_pandas(sds: "SystemDSContext", fb: JavaObject): - start = time.time() + num_rows = fb.getNumRows() num_cols = fb.getNumColumns() data = [] @@ -156,9 +158,17 @@ def frame_block_to_pandas(sds: "SystemDSContext", fb: JavaObject): elif d_type == "Long": byteArray = fb.getColumnAsBytes(c_index) ret = np.frombuffer(byteArray, dtype=np.int64) + elif d_type == "Double": + byteArray = fb.getColumnAsBytes(c_index) + ret = np.frombuffer(byteArray, dtype=np.float64) + elif d_type == "Boolean": + # TODO maybe it is more efficient to bit pack the booleans. + # https://stackoverflow.com/questions/5602155/numpy-boolean-array-with-1-bit-entries + byteArray = fb.getColumnAsBytes(c_index) + ret = np.frombuffer(byteArray, dtype=np.dtype("?")) else: - raise NotImplementedError(f'Not Implemented {d_type} for systemds to pandas parsing') + raise NotImplementedError( + f'Not Implemented {d_type} for systemds to pandas parsing') df[fb.getColumnName(c_index)] = ret - return df diff --git a/src/main/python/systemds/utils/helpers.py b/src/main/python/systemds/utils/helpers.py index 67b4f8d..83ca596 100644 --- a/src/main/python/systemds/utils/helpers.py +++ b/src/main/python/systemds/utils/helpers.py @@ -1,4 +1,4 @@ -#------------------------------------------------------------- +# ------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file @@ -17,12 +17,12 @@ # specific language governing permissions and limitations # under the License. 
# -#------------------------------------------------------------- +# ------------------------------------------------------------- import os -from itertools import chain -from typing import Iterable, Dict from importlib.util import find_spec +from itertools import chain +from typing import Dict, Iterable from systemds.utils.consts import MODULE_NAME @@ -53,7 +53,8 @@ def get_module_dir() -> os.PathLike: def get_slice_string(i): if isinstance(i, tuple): if len(i) > 2: - raise ValueError(f'Invalid number of dimensions to slice {len(i)}, Only 2 dimensions allowed') + raise ValueError( + f'Invalid number of dimensions to slice {len(i)}, Only 2 dimensions allowed') else: return f'{get_slice_string(i[0])},{get_slice_string(i[1])}' elif isinstance(i, slice): @@ -69,4 +70,4 @@ def get_slice_string(i): else: # + 1 since R and systemDS is 1 indexed. sliceIns = i+1 - return sliceIns \ No newline at end of file + return sliceIns diff --git a/src/main/python/tests/examples/tutorials/test_adult.py b/src/main/python/tests/examples/tutorials/test_adult.py index b2e4f84..ddafc96 100644 --- a/src/main/python/tests/examples/tutorials/test_adult.py +++ b/src/main/python/tests/examples/tutorials/test_adult.py @@ -24,8 +24,10 @@ import unittest import numpy as np from systemds.context import SystemDSContext from systemds.examples.tutorials.adult import DataManager -from systemds.operator import OperationNode, Matrix, Frame -from systemds.operator.algorithm import kmeans, multiLogReg, multiLogRegPredict, l2svm, confusionMatrix, scale, scaleApply, split, winsorize +from systemds.operator import Frame, Matrix, OperationNode +from systemds.operator.algorithm import (confusionMatrix, kmeans, l2svm, + multiLogReg, multiLogRegPredict, + scale, scaleApply, split, winsorize) from systemds.script_building import DMLScript @@ -53,62 +55,102 @@ class Test_DMLScript(unittest.TestCase): def tearDownClass(cls): cls.sds.close() - # def test_train_data(self): - # x = self.d.get_train_data_pandas() - # self.assertEqual((32561, 14), x.shape) + def test_train_data(self): + x = self.d.get_train_data_pandas() + self.assertEqual((32561, 14), x.shape) - # def test_train_labels(self): - # y = self.d.get_train_labels_pandas() - # self.assertEqual((32561,), y.shape) + def test_train_labels(self): + y = self.d.get_train_labels_pandas() + self.assertEqual((32561,), y.shape) - # def test_test_data(self): - # x_l = self.d.get_test_data_pandas() - # self.assertEqual((16281, 14), x_l.shape) + def test_test_data(self): + x_l = self.d.get_test_data_pandas() + self.assertEqual((16281, 14), x_l.shape) - # def test_test_labels(self): - # y_l = self.d.get_test_labels_pandas() - # self.assertEqual((16281,), y_l.shape) + def test_test_labels(self): + y_l = self.d.get_test_labels_pandas() + self.assertEqual((16281,), y_l.shape) def test_train_data_pandas_vs_systemds(self): pandas = self.d.get_train_data_pandas() - systemds = self.d.get_train_data(self.sds).compute(verbose=True) - print(pandas) - print(systemds) - # self.assertEqual(pandas, systemds) - - - # def test_multi_log_reg(self): - # # Reduced because we want the tests to finish a bit faster. 
- # train_count = 15000 - # test_count = 5000 - - # train_data, train_labels, test_data, test_labels = self.d.get_preprocessed_dataset() - - # # Train data - # X = self.sds.from_numpy( train_data[:train_count]) - # Y = self.sds.from_numpy( train_labels[:train_count]) - # Y = Y + 1.0 - - # # Test data - # Xt = self.sds.from_numpy(test_data[:test_count]) - # Yt = self.sds.from_numpy(test_labels[:test_count]) - # Yt = Yt + 1.0 - - # betas = multiLogReg(X, Y) - - # [_, y_pred, acc] = multiLogRegPredict(Xt, betas, Yt).compute() - - # self.assertGreater(acc, 80) - - # confusion_matrix_abs, _ = confusionMatrix(self.sds.from_numpy(y_pred), Yt).compute() - - # self.assertTrue( - # np.allclose( - # confusion_matrix_abs, - # np.array([[3503, 503], - # [268, 726]]) - # ) - # ) + systemds = self.d.get_train_data(self.sds).compute() + self.assertTrue(len(pandas.columns.difference(systemds.columns)) == 0) + self.assertEqual(pandas.shape, systemds.shape) + + def test_train_labels_pandas_vs_systemds(self): + # Pandas does not strip the parsed values.. so i have to do it here. + pandas = np.array( + [x.strip() for x in self.d.get_train_labels_pandas().to_numpy().flatten()]) + systemds = self.d.get_train_labels( + self.sds).compute().to_numpy().flatten() + comp = pandas == systemds + self.assertTrue(comp.all()) + + def test_test_labels_pandas_vs_systemds(self): + # Pandas does not strip the parsed values.. so i have to do it here. + pandas = np.array( + [x.strip() for x in self.d.get_test_labels_pandas().to_numpy().flatten()]) + systemds = self.d.get_test_labels( + self.sds).compute().to_numpy().flatten() + comp = pandas == systemds + self.assertTrue(comp.all()) + + def test_transform_encode_train_data(self): + jspec = self.d.get_jspec(self.sds) + train_x, M1 = self.d.get_train_data(self.sds).transform_encode(spec=jspec) + train_x_numpy = train_x.compute() + self.assertEqual((32561, 107), train_x_numpy.shape) + + def test_transform_encode_apply_test_data(self): + jspec = self.d.get_jspec(self.sds) + train_x, M1 = self.d.get_train_data(self.sds).transform_encode(spec=jspec) + test_x = self.d.get_test_data(self.sds).transform_apply(spec=jspec, meta=M1) + test_x_numpy = test_x.compute() + self.assertEqual((16281, 107), test_x_numpy.shape) + + def test_transform_encode_train_labels(self): + jspec_dict = {"recode":["income"]} + jspec = self.sds.scalar(f'"{jspec_dict}"') + train_y, M1 = self.d.get_train_labels(self.sds).transform_encode(spec=jspec) + train_y_numpy = train_y.compute() + self.assertEqual((32561, 1), train_y_numpy.shape) + + def test_transform_encode_test_labels(self): + jspec_dict = {"recode":["income"]} + jspec = self.sds.scalar(f'"{jspec_dict}"') + train_y, M1 = self.d.get_train_labels(self.sds).transform_encode(spec=jspec) + test_y = self.d.get_test_labels(self.sds).transform_apply(spec=jspec, meta=M1) + test_y_numpy = test_y.compute() + self.assertEqual((16281, 1), test_y_numpy.shape) + + def test_multi_log_reg(self): + # Reduced because we want the tests to finish a bit faster. 
+ train_count = 10000 + test_count = 500 + + jspec_data = self.d.get_jspec(self.sds) + train_x_frame = self.d.get_train_data(self.sds)[0:train_count] + train_x, M1 = train_x_frame.transform_encode(spec=jspec_data) + test_x_frame = self.d.get_test_data(self.sds)[0:test_count] + test_x = test_x_frame.transform_apply(spec=jspec_data, meta=M1) + + jspec_dict = {"recode": ["income"]} + jspec_labels = self.sds.scalar(f'"{jspec_dict}"') + train_y_frame = self.d.get_train_labels(self.sds)[0:train_count] + train_y, M2 = train_y_frame.transform_encode(spec=jspec_labels) + test_y_frame = self.d.get_test_labels(self.sds)[0:test_count] + test_y = test_y_frame.transform_apply(spec=jspec_labels, meta=M2) + + betas = multiLogReg(train_x, train_y) + [_, y_pred, acc] = multiLogRegPredict(test_x, betas, test_y) + + [_, conf_avg] = confusionMatrix(y_pred, test_y) + confusion_numpy = conf_avg.compute() + + self.assertTrue(confusion_numpy[0][0] > 0.8) + self.assertTrue(confusion_numpy[0][1] < 0.5) + self.assertTrue(confusion_numpy[1][1] > 0.5) + self.assertTrue(confusion_numpy[1][0] < 0.2) # def test_neural_net(self): # # Reduced because we want the tests to finish a bit faster. @@ -137,8 +179,6 @@ class Test_DMLScript(unittest.TestCase): # #probs = FFN_package.predict(Xt, network).compute(True) # # FFN_package.eval(Yt, Yt).compute() - - # def test_level1(self): # # Reduced because we want the tests to finish a bit faster. # train_count = 15000 @@ -319,6 +359,5 @@ class Test_DMLScript(unittest.TestCase): # ################################################################################################################ - if __name__ == "__main__": unittest.main(exit=False)
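For reference, the Java hunk near the top and the converters.py hunk agree on a simple byte layout for the newly supported column types: FP64 columns travel as 8 native-order bytes per row, and BOOLEAN columns as one byte per row (1 for true, 0 for false). Below is a minimal, self-contained Python sketch of that layout; the byte strings merely stand in for what getColumnAsBytes would return over Py4J, and the variable names are illustrative.

import numpy as np

# FP64 columns: 8 bytes per row in native byte order (ByteOrder.nativeOrder()
# on the Java side), which np.float64.tobytes() mimics on the same machine.
values = np.array([1.5, -2.0, 3.25], dtype=np.float64)
fp64_bytes = values.tobytes()  # stand-in for the byte[] of an FP64 column

# frame_block_to_pandas decodes such a buffer with np.frombuffer, as in the hunk above.
decoded = np.frombuffer(fp64_bytes, dtype=np.float64)
assert (decoded == values).all()

# BOOLEAN columns: one byte per row, 1 for true and 0 for false.
bool_bytes = bytes([1, 0, 1])  # stand-in for the byte[] of a BOOLEAN column
decoded_bool = np.frombuffer(bool_bytes, dtype=np.dtype("?"))
assert decoded_bool.tolist() == [True, False, True]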
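Taken together with the new DataManager helpers in adult.py, the updated test_multi_log_reg above amounts to the following end-to-end flow. This is a condensed, top-level sketch only: row counts and variable names are illustrative, and SystemDSContext is assumed to be usable as a context manager as elsewhere in the Python API.

from systemds.context import SystemDSContext
from systemds.examples.tutorials.adult import DataManager
from systemds.operator.algorithm import (confusionMatrix, multiLogReg,
                                         multiLogRegPredict)

with SystemDSContext() as sds:
    d = DataManager()

    # Features: columns 0-13 of the CSV, encoded with the packaged jspec.
    jspec = d.get_jspec(sds)
    X, M1 = d.get_train_data(sds)[0:10000].transform_encode(spec=jspec)
    Xt = d.get_test_data(sds)[0:500].transform_apply(spec=jspec, meta=M1)

    # Labels: column 14 ("income"), recoded to 1/2 for multiLogReg.
    label_spec_dict = {"recode": ["income"]}
    label_spec = sds.scalar(f'"{label_spec_dict}"')
    Y, M2 = d.get_train_labels(sds)[0:10000].transform_encode(spec=label_spec)
    Yt = d.get_test_labels(sds)[0:500].transform_apply(spec=label_spec, meta=M2)

    betas = multiLogReg(X, Y)
    [_, y_pred, acc] = multiLogRegPredict(Xt, betas, Yt)
    [_, conf_avg] = confusionMatrix(y_pred, Yt)
    print(conf_avg.compute())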
