reductionista commented on a change in pull request #505:
URL: https://github.com/apache/madlib/pull/505#discussion_r457791756
########## File path: src/ports/postgres/modules/dbscan/dbscan.py_in ##########

```diff
@@ -175,9 +383,113 @@ def dbscan(schema_madlib, source_table, output_table, id_column, expr_point, eps
     """.format(**locals())
     plpy.execute(sql)
-    plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}, {3}".format(
+    plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}, {3}, {4}, {5}".format(
         distance_table, core_points_table, core_edge_table,
-        reachable_points_table))
+        reachable_points_table, sf_step_table, sf_edge_table))
+
+def dbscan_kd(schema_madlib, source_table, id_column, expr_point, eps,
+              min_samples, metric, depth):
+
+    n_features = num_features(source_table, expr_point)
+
+    # If squared_dist_norm2 is used, we assume eps is set for the squared distance
+    # That means the border only needs to be sqrt(eps) wide
+    local_eps = sqrt(eps) if metric == DEFAULT_METRIC else eps
+
+    kd_array, case_when_clause, border_cl1, border_cl2 = build_kd_tree(
+        schema_madlib, source_table, expr_point, depth, n_features, local_eps)
+
+    kd_source_table = unique_string(desp='kd_source_table')
+    kd_border_table1 = unique_string(desp='kd_border_table1')
+    kd_border_table2 = unique_string(desp='kd_border_table2')
+
+    dist_leaf_sql = '' if is_platform_pg() else 'DISTRIBUTED BY (__leaf_id__)'
+    plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}".format(
+        kd_source_table, kd_border_table1, kd_border_table2))
+
+    output_sql = """
+        CREATE TABLE {kd_source_table} AS
+        SELECT *,
+               CASE {case_when_clause} END AS __leaf_id__
+        FROM {source_table}
+        {dist_leaf_sql}
+        """.format(**locals())
+    plpy.execute(output_sql)
+
+    border_sql = """
+        CREATE TABLE {kd_border_table1} AS
+        SELECT *
+        FROM {source_table}
+        WHERE {border_cl1}
+        """.format(**locals())
+    plpy.execute(border_sql)
+
+    border_sql = """
+        CREATE TABLE {kd_border_table2} AS
+        SELECT *
+        FROM {source_table}
+        WHERE {border_cl2}
+        """.format(**locals())
+    plpy.execute(border_sql)
+
+    return kd_source_table, kd_border_table1, kd_border_table2
+
+
+def build_kd_tree(schema_madlib, source_table, expr_point,
+                  depth, n_features, eps, **kwargs):
+    """
+        KD-tree function to create a partitioning for KNN
+        Args:
+            @param schema_madlib    Name of the Madlib Schema
+            @param source_table     Training data table
+            @param output_table     Name of the table to store kd tree
+            @param expr_point       Name of the column with training data
+                                    or expression that evaluates to a
```

Review comment:

I downloaded a dataset with long/lat coordinates, and tried running it on that. It says in the usage message you can pass an expression that evaluates to a numeric array. But it fails for one reason for `kd_tree`:

```
dbscan=# select dbscan('midloc_i', 'dbscan_i_out', 'incidnum', 'ARRAY[midloc2_xlongitude, midloc2_ylatitude]', 1.0, 5, 'dist_angle', 'kd_tree', 3);
ERROR:  42883: spiexceptions.UndefinedFunction: function madlib.rtree_step(numeric, numeric[], numeric, integer, unknown, integer[], integer) does not exist
LINE 4:             madlib.rtree_step( incidnum,
                    ^
HINT:  No function matches the given name and argument types. You might need to add explicit type casts.
QUERY:
        CREATE TABLE __madlib_temp_rtree_step_table84420211_1595297086_24983320__ AS
        SELECT __leaf_id__,
            madlib.rtree_step( incidnum,
                ARRAY[midloc2_xlongitude, midloc2_ylatitude],
                1.0, 5, 'dist_angle',
                ARRAY[176, 376, 377, 402, 408, 381, 383, 404, 410],
                __leaf_id__ )
        FROM __madlib_temp_kd_source_table94243596_1595297085_46815714__
        GROUP BY __leaf_id__
        DISTRIBUTED BY (__leaf_id__)
CONTEXT:  Traceback (most recent call last):
  PL/Python function "dbscan", line 21, in <module>
    return dbscan.dbscan(**globals())
  PL/Python function "dbscan", line 127, in dbscan
PL/Python function "dbscan"
LOCATION:  PLy_elog, plpy_elog.c:121

dbscan=# select dbscan('midloc_i', 'dbscan_i_out', 'incidnum', 'ARRAY[midloc2_xlongitude, midloc2_ylatitude]', 1.0, 5, 'dist_angle', 'brute', 3);
```

And for a different reason for `brute_force`:

```
ERROR:  42601: spiexceptions.SyntaxError: syntax error at or near ","
LINE 7: ...              __t1__.ARRAY[midloc2_xlongitude, midloc2_...
                                     ^
QUERY:
        CREATE TABLE __madlib_temp_distance_table56125095_1595296428_16728204__ AS
        SELECT __src__, __dest__ FROM (
            SELECT __t1__.incidnum AS __src__,
                   __t2__.incidnum AS __dest__,
                   madlib.dist_angle(
                       __t1__.ARRAY[midloc2_xlongitude, midloc2_ylatitude],
                       __t2__.ARRAY[midloc2_xlongitude, midloc2_ylatitude]) AS __dist__
            FROM midloc_i AS __t1__, midloc_i AS __t2__)q1
        WHERE __dist__ < 1.0
        DISTRIBUTED BY (__src__)
CONTEXT:  Traceback (most recent call last):
  PL/Python function "dbscan", line 21, in <module>
    return dbscan.dbscan(**globals())
  PL/Python function "dbscan", line 221, in dbscan
PL/Python function "dbscan"
LOCATION:  PLy_elog, plpy_elog.c:121
```
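For the brute-force case, the failure seems to come from prefixing the raw `expr_point` string with a table alias (`__t1__.ARRAY[...]`), which can only work when `expr_point` is a bare column name. One possible direction (rough sketch only, untested against this branch; `expr_source_table` and `__expr__` are placeholder names I'm inventing for illustration) would be to project the expression into a real column first and run the distance query against that:

```python
# Sketch only: materialize expr_point once as a plain column so later queries
# can reference __t1__.__expr__ instead of gluing a table alias onto the raw
# expression.  expr_source_table and __expr__ are placeholder names.
expr_source_table = unique_string(desp='expr_source_table')
plpy.execute("""
    CREATE TABLE {expr_source_table} AS
    SELECT {id_column},
           ({expr_point})::DOUBLE PRECISION[] AS __expr__
    FROM {source_table}
    """.format(expr_source_table=expr_source_table,
               id_column=id_column,
               expr_point=expr_point,
               source_table=source_table))
# The brute-force distance query (and the rtree_step call) would then take
# __expr__ as the point column; the explicit cast might also address the
# numeric[] vs. double precision[] mismatch in the kd_tree error above
# (assuming rtree_step expects double precision[]).
```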
########## File path: src/ports/postgres/modules/dbscan/dbscan.py_in ##########

```diff
@@ -235,6 +726,10 @@ def _validate_dbscan(schema_madlib, source_table, output_table, id_column,
     fn_dist_list = ['dist_norm1', 'dist_norm2', 'squared_dist_norm2',
                     'dist_angle', 'dist_tanimoto']
     _assert(metric in fn_dist_list,
             "dbscan Error: metric has to be one of the madlib defined distance functions")
```

Review comment:

Can we list them here, so the user doesn't have to search through the user guide for them?

########## File path: src/ports/postgres/modules/dbscan/dbscan.py_in ##########

```diff
@@ -282,7 +777,8 @@ SELECT {schema_madlib}.dbscan(
     min_samples,    -- The minimum size of a cluster
     metric,         -- The name of the function to use to calculate the
                     -- distance
-    algorithm       -- The algorithm to use for dbscan.
+    algorithm,      -- The algorithm to use for dbscan
```

Review comment:

I had to look at the source code to figure out what to use for `metric` and `algorithm`, after taking a few guesses. Maybe we can list the valid choices here?
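To make the last two comments concrete, here is roughly what I mean (a sketch only, reusing the `fn_dist_list` and `_assert` already present in `_validate_dbscan`; I'm not sure of the exact accepted strings for `algorithm` -- I used `'kd_tree'` and `'brute'` above -- so treat those as placeholders):

```python
# Sketch only: spell out the valid metrics in the validation message so the
# user doesn't have to dig through the user guide for them.
fn_dist_list = ['dist_norm1', 'dist_norm2', 'squared_dist_norm2',
                'dist_angle', 'dist_tanimoto']
_assert(metric in fn_dist_list,
        "dbscan Error: metric has to be one of the madlib defined distance "
        "functions: {0}".format(', '.join(fn_dist_list)))

# The usage/help message could enumerate the choices the same way, e.g.:
#   metric,     -- Distance function: dist_norm1, dist_norm2,
#               -- squared_dist_norm2, dist_angle or dist_tanimoto
#   algorithm,  -- 'brute' / 'kd_tree' (placeholder spellings; whatever
#               -- strings the module actually accepts)
```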
########## File path: src/ports/postgres/modules/dbscan/dbscan.py_in ##########

```diff
@@ -209,8 +521,187 @@ def dbscan_predict(schema_madlib, dbscan_table, source_table, id_column,
     """.format(**locals())
     result = plpy.execute(sql)
 
+def rtree_transition(state, id_in, expr_points, eps, min_samples, metric, n_rows, leaf_id, **kwargs):
+
+    SD = kwargs['SD']
+    if not state:
+        data = {}
+        SD['counter{0}'.format(leaf_id)] = 0
+    else:
+        data = SD['data{0}'.format(leaf_id)]
+
+    data[id_in] = expr_points
+    SD['counter{0}'.format(leaf_id)] = SD['counter{0}'.format(leaf_id)]+1
+    SD['data{0}'.format(leaf_id)] = data
+    ret = [[-1,-1],[-1,-1]]
+
+    my_n_rows = n_rows[leaf_id]
+
+    if SD['counter{0}'.format(leaf_id)] == my_n_rows:
+
+        core_counts = {}
+        core_lists = {}
+        p = index.Property()
+        p.dimension = len(expr_points)
+        idx = index.Index(properties=p)
+        ret = []
+
+        if metric == 'dist_norm1':
+            fn_dist = distance.cityblock
+        elif metric == 'dist_norm2':
+            fn_dist = distance.euclidean
+        else:
+            fn_dist = distance.sqeuclidean
+
+        for key1, value1 in data.items():
+            idx.add(key1,value1+value1,key1)
+
+        for key1, value1 in data.items():
+
+            v1 = []
+            v2 = []
+            for dim in value1:
+                v1.append(dim-eps)
+                v2.append(dim+eps)
+
+            # Array concat
+            v = v1+v2
+            hits = idx.intersection(v)
+
+            if key1 not in core_counts:
+                core_counts[key1] = 0
+                core_lists[key1] = []
+
+            for key2 in hits:
+                value2 = data[key2]
+                dist = fn_dist(value1, value2)
+                if dist <= eps:
+                    core_counts[key1] += 1
+                    core_lists[key1].append(key2)
+
+        for key1, value1 in core_counts.items():
+            if value1 >= min_samples:
+                for key2 in core_lists[key1]:
+                    ret.append([key1,key2])
+
+    return ret
+
+def rtree_merge(state1, state2, **kwargs):
+
+    if not state1:
+        return state2
+    elif not state2:
+        return state1
```

Review comment:

Probably good to add an `else` branch that raises an error, just in case there is some bug introduced at some point where both state1 and state2 are non-NULL.
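Something like this is what I have in mind (just a rough sketch; it assumes `plpy` is available at module scope, as it is elsewhere in this file):

```python
def rtree_merge(state1, state2, **kwargs):
    if not state1:
        return state2
    elif not state2:
        return state1
    else:
        # Both partial states being non-empty should be impossible given how
        # the transition function is written; fail loudly rather than silently
        # dropping one of them if that assumption ever breaks.
        plpy.error("dbscan Error: rtree_merge() received two non-empty states")
```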