reductionista commented on a change in pull request #505:
URL: https://github.com/apache/madlib/pull/505#discussion_r457791756
########## File path: src/ports/postgres/modules/dbscan/dbscan.py_in ##########

```diff
@@ -175,9 +383,113 @@ def dbscan(schema_madlib, source_table, output_table, id_column, expr_point, eps
     """.format(**locals())
     plpy.execute(sql)
-    plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}, {3}".format(
+    plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}, {3}, {4}, {5}".format(
         distance_table, core_points_table, core_edge_table,
-        reachable_points_table))
+        reachable_points_table, sf_step_table, sf_edge_table))
+
+def dbscan_kd(schema_madlib, source_table, id_column, expr_point, eps,
+              min_samples, metric, depth):
+
+    n_features = num_features(source_table, expr_point)
+
+    # If squared_dist_norm2 is used, we assume eps is set for the squared distance
+    # That means the border only needs to be sqrt(eps) wide
+    local_eps = sqrt(eps) if metric == DEFAULT_METRIC else eps
+
+    kd_array, case_when_clause, border_cl1, border_cl2 = build_kd_tree(
+        schema_madlib, source_table, expr_point, depth, n_features, local_eps)
+
+    kd_source_table = unique_string(desp='kd_source_table')
+    kd_border_table1 = unique_string(desp='kd_border_table1')
+    kd_border_table2 = unique_string(desp='kd_border_table2')
+
+    dist_leaf_sql = '' if is_platform_pg() else 'DISTRIBUTED BY (__leaf_id__)'
+    plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}".format(
+        kd_source_table, kd_border_table1, kd_border_table2))
+
+    output_sql = """
+        CREATE TABLE {kd_source_table} AS
+        SELECT *,
+               CASE {case_when_clause} END AS __leaf_id__
+        FROM {source_table}
+        {dist_leaf_sql}
+        """.format(**locals())
+    plpy.execute(output_sql)
+
+    border_sql = """
+        CREATE TABLE {kd_border_table1} AS
+        SELECT *
+        FROM {source_table}
+        WHERE {border_cl1}
+        """.format(**locals())
+    plpy.execute(border_sql)
+
+    border_sql = """
+        CREATE TABLE {kd_border_table2} AS
+        SELECT *
+        FROM {source_table}
+        WHERE {border_cl2}
+        """.format(**locals())
+    plpy.execute(border_sql)
+
+    return kd_source_table, kd_border_table1, kd_border_table2
+
+
+def build_kd_tree(schema_madlib, source_table, expr_point,
+                  depth, n_features, eps, **kwargs):
+    """
+        KD-tree function to create a partitioning for KNN
+        Args:
+            @param schema_madlib    Name of the Madlib Schema
+            @param source_table     Training data table
+            @param output_table     Name of the table to store kd tree
+            @param expr_point       Name of the column with training data
+                                    or expression that evaluates to a
```

Review comment:

I downloaded a dataset with long/lat coordinates, and tried running it on that. It says in the usage message you can pass an expression that evaluates to a numeric array. But it fails for one reason for `kd_tree`:

```
dbscan=# select dbscan('midloc_i', 'dbscan_i_out', 'incidnum', 'ARRAY[midloc2_xlongitude, midloc2_ylatitude]', 1.0, 5, 'dist_angle', 'kd_tree', 3);
ERROR:  42883: spiexceptions.UndefinedFunction: function madlib.rtree_step(numeric, numeric[], numeric, integer, unknown, integer[], integer) does not exist
LINE 4:             madlib.rtree_step( incidnum,
                    ^
HINT:  No function matches the given name and argument types. You might need to add explicit type casts.
QUERY:
        CREATE TABLE __madlib_temp_rtree_step_table84420211_1595297086_24983320__ AS
        SELECT __leaf_id__,
            madlib.rtree_step( incidnum,
                ARRAY[midloc2_xlongitude, midloc2_ylatitude],
                1.0, 5, 'dist_angle',
                ARRAY[176, 376, 377, 402, 408, 381, 383, 404, 410],
                __leaf_id__ )
        FROM __madlib_temp_kd_source_table94243596_1595297085_46815714__
        GROUP BY __leaf_id__
        DISTRIBUTED BY (__leaf_id__)
CONTEXT:  Traceback (most recent call last):
  PL/Python function "dbscan", line 21, in <module>
    return dbscan.dbscan(**globals())
  PL/Python function "dbscan", line 127, in dbscan
PL/Python function "dbscan"
LOCATION:  PLy_elog, plpy_elog.c:121

dbscan=# select dbscan('midloc_i', 'dbscan_i_out', 'incidnum', 'ARRAY[midloc2_xlongitude, midloc2_ylatitude]', 1.0, 5, 'dist_angle', 'brute', 3);
```

And for a different reason for `brute_force`:

```
ERROR:  42601: spiexceptions.SyntaxError: syntax error at or near ","
LINE 7: ...              __t1__.ARRAY[midloc2_xlongitude, midloc2_...
                                     ^
QUERY:
        CREATE TABLE __madlib_temp_distance_table56125095_1595296428_16728204__ AS
        SELECT __src__, __dest__ FROM (
            SELECT __t1__.incidnum AS __src__,
                   __t2__.incidnum AS __dest__,
                   madlib.dist_angle(
                       __t1__.ARRAY[midloc2_xlongitude, midloc2_ylatitude],
                       __t2__.ARRAY[midloc2_xlongitude, midloc2_ylatitude]) AS __dist__
            FROM midloc_i AS __t1__, midloc_i AS __t2__)q1
        WHERE __dist__ < 1.0
        DISTRIBUTED BY (__src__)
CONTEXT:  Traceback (most recent call last):
  PL/Python function "dbscan", line 21, in <module>
    return dbscan.dbscan(**globals())
  PL/Python function "dbscan", line 221, in dbscan
PL/Python function "dbscan"
LOCATION:  PLy_elog, plpy_elog.c:121
```
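For the brute-force case, the failure seems to come from prefixing the raw `expr_point` string with a table alias (`__t1__.ARRAY[...]`), which can only work when `expr_point` is a bare column name. One possible direction (rough sketch only, untested against this branch; `expr_source_table` and `__expr__` are placeholder names I'm inventing for illustration) would be to project the expression into a real column first and run the distance query against that:

```python
# Sketch only: materialize expr_point once as a plain column so later queries
# can reference __t1__.__expr__ instead of gluing a table alias onto the raw
# expression.  expr_source_table and __expr__ are placeholder names.
expr_source_table = unique_string(desp='expr_source_table')
plpy.execute("""
    CREATE TABLE {expr_source_table} AS
    SELECT {id_column},
           ({expr_point})::DOUBLE PRECISION[] AS __expr__
    FROM {source_table}
    """.format(expr_source_table=expr_source_table,
               id_column=id_column,
               expr_point=expr_point,
               source_table=source_table))
# The brute-force distance query (and the rtree_step call) would then take
# __expr__ as the point column; the explicit cast might also address the
# numeric[] vs. double precision[] mismatch in the kd_tree error above
# (assuming rtree_step expects double precision[]).
```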
########## File path: src/ports/postgres/modules/dbscan/dbscan.py_in ##########

```diff
@@ -235,6 +726,10 @@ def _validate_dbscan(schema_madlib, source_table, output_table, id_column,
     fn_dist_list = ['dist_norm1', 'dist_norm2', 'squared_dist_norm2',
                     'dist_angle', 'dist_tanimoto']
     _assert(metric in fn_dist_list,
             "dbscan Error: metric has to be one of the madlib defined distance functions")
```

Review comment:

Can we list them here, so the user doesn't have to search through the user guide for them?

########## File path: src/ports/postgres/modules/dbscan/dbscan.py_in ##########

```diff
@@ -282,7 +777,8 @@ SELECT {schema_madlib}.dbscan(
     min_samples,    -- The minimum size of a cluster
     metric,         -- The name of the function to use to calculate the
                     -- distance
-    algorithm       -- The algorithm to use for dbscan.
+    algorithm,      -- The algorithm to use for dbscan
```

Review comment:

I had to look at the source code to figure out what to use for `metric` and `algorithm`, after taking a few guesses. Maybe we can list the valid choices here?
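To make the last two comments concrete, here is roughly what I mean (a sketch only, reusing the `fn_dist_list` and `_assert` already present in `_validate_dbscan`; I'm not sure of the exact accepted strings for `algorithm` -- I used `'kd_tree'` and `'brute'` above -- so treat those as placeholders):

```python
# Sketch only: spell out the valid metrics in the validation message so the
# user doesn't have to dig through the user guide for them.
fn_dist_list = ['dist_norm1', 'dist_norm2', 'squared_dist_norm2',
                'dist_angle', 'dist_tanimoto']
_assert(metric in fn_dist_list,
        "dbscan Error: metric has to be one of the madlib defined distance "
        "functions: {0}".format(', '.join(fn_dist_list)))

# The usage/help message could enumerate the choices the same way, e.g.:
#   metric,     -- Distance function: dist_norm1, dist_norm2,
#               -- squared_dist_norm2, dist_angle or dist_tanimoto
#   algorithm,  -- 'brute' / 'kd_tree' (placeholder spellings; whatever
#               -- strings the module actually accepts)
```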
########## File path: src/ports/postgres/modules/dbscan/dbscan.py_in ##########

```diff
@@ -209,8 +521,187 @@ def dbscan_predict(schema_madlib, dbscan_table, source_table, id_column,
     """.format(**locals())
     result = plpy.execute(sql)
 
+def rtree_transition(state, id_in, expr_points, eps, min_samples, metric, n_rows, leaf_id, **kwargs):
+
+    SD = kwargs['SD']
+    if not state:
+        data = {}
+        SD['counter{0}'.format(leaf_id)] = 0
+    else:
+        data = SD['data{0}'.format(leaf_id)]
+
+    data[id_in] = expr_points
+    SD['counter{0}'.format(leaf_id)] = SD['counter{0}'.format(leaf_id)]+1
+    SD['data{0}'.format(leaf_id)] = data
+    ret = [[-1,-1],[-1,-1]]
+
+    my_n_rows = n_rows[leaf_id]
+
+    if SD['counter{0}'.format(leaf_id)] == my_n_rows:
+
+        core_counts = {}
+        core_lists = {}
+        p = index.Property()
+        p.dimension = len(expr_points)
+        idx = index.Index(properties=p)
+        ret = []
+
+        if metric == 'dist_norm1':
+            fn_dist = distance.cityblock
+        elif metric == 'dist_norm2':
+            fn_dist = distance.euclidean
+        else:
+            fn_dist = distance.sqeuclidean
+
+        for key1, value1 in data.items():
+            idx.add(key1,value1+value1,key1)
+
+        for key1, value1 in data.items():
+
+            v1 = []
+            v2 = []
+            for dim in value1:
+                v1.append(dim-eps)
+                v2.append(dim+eps)
+
+            # Array concat
+            v = v1+v2
+            hits = idx.intersection(v)
+
+            if key1 not in core_counts:
+                core_counts[key1] = 0
+                core_lists[key1] = []
+
+            for key2 in hits:
+                value2 = data[key2]
+                dist = fn_dist(value1, value2)
+                if dist <= eps:
+                    core_counts[key1] += 1
+                    core_lists[key1].append(key2)
+
+        for key1, value1 in core_counts.items():
+            if value1 >= min_samples:
+                for key2 in core_lists[key1]:
+                    ret.append([key1,key2])
+
+    return ret
+
+def rtree_merge(state1, state2, **kwargs):
+
+    if not state1:
+        return state2
+    elif not state2:
+        return state1
```

Review comment:

Probably good to add an `else` branch that raises an error, just in case there is some bug introduced at some point where both state1 and state2 are non-NULL.
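Something like this is what I have in mind (just a rough sketch; it assumes `plpy` is available at module scope, as it is elsewhere in this file):

```python
def rtree_merge(state1, state2, **kwargs):
    if not state1:
        return state2
    elif not state2:
        return state1
    else:
        # Both partial states being non-empty should be impossible given how
        # the transition function is written; fail loudly rather than silently
        # dropping one of them if that assumption ever breaks.
        plpy.error("dbscan Error: rtree_merge() received two non-empty states")
```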