mboehm7 commented on a change in pull request #881: spark wip for review
URL: https://github.com/apache/systemml/pull/881#discussion_r407234720
 
 

 ##########
 File path: scripts/staging/slicing/sparked/sparked_utils.py
 ##########
 @@ -0,0 +1,122 @@
+from pyspark.sql.functions import udf
+from pyspark.sql.types import FloatType
+
+from slicing.base.SparkedNode import SparkedNode
+from slicing.base.slicer import opt_fun, union
+
+calc_loss = udf(lambda target, prediction, type: calc_loss_fun(target, prediction, type), FloatType())
+model_type_init = udf(lambda type: init_model_type(type))
+
+
+def calc_loss_fun(target, prediction, type):
+    if type == 0:
+        return (prediction - target) ** 2
+    elif type == 1:
+        if target == prediction:
+            return float(1)
+        else:
+            return float(0)
+
+
+def init_model_type(model_type):
+    if model_type == "regression":
+        return 0
+    elif model_type == "classification":
+        return 1
+
+
+def slice_join_nonsense(node_i, node_j, cur_lvl):
+    commons = 0
+    for attr1 in node_i.attributes:
+        for attr2 in node_j.attributes:
+            if attr1 == attr2:
+                commons = commons + 1
+    return commons != cur_lvl - 1
+
+
+def make_first_level(features, predictions, loss, top_k, alpha, k, w, loss_type):
+    first_level = []
+    # First level slices are enumerated in a "classic" way (getting data and not analyzing bounds)
+    for feature in features:
+        new_node = SparkedNode(loss, predictions)
+        new_node.parents = [feature]
+        new_node.attributes.append(feature)
+        new_node.name = new_node.make_name()
+        new_node.key = new_node.make_key()
+        new_node.process_slice(loss_type)
+    new_node.score = opt_fun(new_node.loss, new_node.size, loss, len(predictions), w)
+        new_node.c_upper = new_node.score
+        first_level.append(new_node)
+        new_node.print_debug(top_k, 0)
+        # constraints for 1st level nodes to be problematic candidates
+        #if new_node.check_constraint(top_k, len(predictions), alpha):
+            # this method updates top k slices if needed
+            #top_k.add_new_top_slice(new_node)
+    return first_level
+
+
+def slice_union_nonsense(node_i, node_j):
 
 Review comment:
   Maybe find better names for these methods (drop the 'nonsense' part)
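   For example, a minimal sketch of what the rename could look like (the name `slices_not_joinable` and the docstring are only a suggestion, not part of the patch; the body is unchanged):

```python
def slices_not_joinable(node_i, node_j, cur_lvl):
    """True if node_i and node_j do not share exactly cur_lvl - 1
    attributes (the condition slice_join_nonsense currently tests),
    i.e. the pair should be skipped when building the next level."""
    commons = 0
    for attr1 in node_i.attributes:
        for attr2 in node_j.attributes:
            if attr1 == attr2:
                commons += 1
    return commons != cur_lvl - 1
```

   `slice_union_nonsense` could get a matching name along the same lines, e.g. `slices_not_unionable`.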
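   As additional context for reviewers trying the patch, here is a minimal, self-contained sketch of how the `calc_loss` UDF defined at the top of this file would be applied to a predictions DataFrame. The UDF is re-declared inline so the snippet runs standalone; the DataFrame contents and column names are assumptions for illustration:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, udf
from pyspark.sql.types import FloatType

def calc_loss_fun(target, prediction, loss_type):
    # 0 = squared error (regression); 1 = exact-match indicator (classification)
    if loss_type == 0:
        return (prediction - target) ** 2
    elif loss_type == 1:
        return 1.0 if target == prediction else 0.0

calc_loss = udf(calc_loss_fun, FloatType())

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, 0.8), (0.0, 0.1)], ["target", "prediction"])

# Compute per-row regression loss (loss_type 0) as a new column.
df = df.withColumn("loss", calc_loss(col("target"), col("prediction"), lit(0)))
df.show()
```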

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services
