reductionista commented on a change in pull request #505: URL: https://github.com/apache/madlib/pull/505#discussion_r458999437
########## File path: src/ports/postgres/modules/dbscan/dbscan.py_in ########## @@ -175,9 +383,113 @@ def dbscan(schema_madlib, source_table, output_table, id_column, expr_point, eps """.format(**locals()) plpy.execute(sql) - plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}, {3}".format( + plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}, {3}, {4}, {5}".format( distance_table, core_points_table, core_edge_table, - reachable_points_table)) + reachable_points_table, sf_step_table, sf_edge_table)) + +def dbscan_kd(schema_madlib, source_table, id_column, expr_point, eps, + min_samples, metric, depth): + + n_features = num_features(source_table, expr_point) + + # If squared_dist_norm2 is used, we assume eps is set for the squared distance + # That means the border only needs to be sqrt(eps) wide + local_eps = sqrt(eps) if metric == DEFAULT_METRIC else eps + + kd_array, case_when_clause, border_cl1, border_cl2 = build_kd_tree( + schema_madlib, source_table, expr_point, depth, n_features, local_eps) + + kd_source_table = unique_string(desp='kd_source_table') + kd_border_table1 = unique_string(desp='kd_border_table1') + kd_border_table2 = unique_string(desp='kd_border_table2') + + dist_leaf_sql = '' if is_platform_pg() else 'DISTRIBUTED BY (__leaf_id__)' + plpy.execute("DROP TABLE IF EXISTS {0}, {1}, {2}".format(kd_source_table, kd_border_table1, kd_border_table2)) + + output_sql = """ + CREATE TABLE {kd_source_table} AS + SELECT *, + CASE {case_when_clause} END AS __leaf_id__ + FROM {source_table} + {dist_leaf_sql} + """.format(**locals()) + plpy.execute(output_sql) + + border_sql = """ + CREATE TABLE {kd_border_table1} AS + SELECT * + FROM {source_table} + WHERE {border_cl1} + """.format(**locals()) + plpy.execute(border_sql) + + border_sql = """ + CREATE TABLE {kd_border_table2} AS + SELECT * + FROM {source_table} + WHERE {border_cl2} + """.format(**locals()) + plpy.execute(border_sql) + + return kd_source_table, kd_border_table1, kd_border_table2 + + +def build_kd_tree(schema_madlib, source_table, expr_point, + depth, n_features, eps, **kwargs): + """ + KD-tree function to create a partitioning for KNN + Args: + @param schema_madlib Name of the Madlib Schema + @param source_table Training data table + @param output_table Name of the table to store kd tree + @param expr_point Name of the column with training data + or expression that evaluates to a Review comment: Update: the first error (for `kd-tree`) was not due to the `point` parameter being an array, it was because the `id` parameter was of type `NUMERIC` instead of `INTEGER`, despite all the values being integers. This raises the question: should we allow the user to use any unique column as an id (a TEXT label perhaps?), or keep the current limitation that its type be INTEGER, SMALLINT, or BIGINT. Unless it's a simple change to make it more general, I think we can just stick with the current limitation. After changing the input column to INTEGER, I found that `kd-tree` also runs into the second error, same as `brute`. However, I was able to get around both of these errors by creating a view: ``` CREATE VIEW midloc_v AS SELECT ROW_NUMBER() OVER() AS id, ARRAY[midloc2_xlongitude, midloc2_ylatitude] AS point FROM midloc_i; SELECT dbscan('midloc_v', 'dbscan_v_eps0point1min5', 'id', 'point', 0.1, 5, 'dist_angle', 'kd_tree', 3); ┌────────┐ │ dbscan │ ├────────┤ │ │ └────────┘ (1 row) Time: 59756.533 ms dbscan=# select cluster_id, count(*) from dbscan_v_eps0point1min5 group by 1 ┌────────────┬───────┐ │ cluster_id │ count │ ├────────────┼───────┤ │ 0 │ 2138 │ │ 1 │ 7 │ │ 2 │ 7 │ │ 3 │ 6 │ │ 4 │ 7 │ │ 5 │ 14 │ │ 6 │ 14 │ │ 7 │ 10 │ │ 8 │ 19 │ │ 9 │ 5 │ │ 10 │ 8 │ │ 11 │ 12 │ │ 12 │ 7 │ │ 13 │ 5 │ │ 14 │ 5 │ │ 15 │ 5 │ │ 16 │ 9 │ │ 17 │ 5 │ │ 18 │ 7 │ │ 19 │ 5 │ │ 20 │ 5 │ │ 21 │ 10 │ │ 22 │ 15 │ │ 23 │ 9 │ │ 24 │ 8 │ │ 25 │ 6 │ │ 26 │ 5 │ │ 27 │ 5 │ │ 28 │ 6 │ └────────────┴───────┘ (29 rows) ``` I was also pleased to see a very nice performance improvement with the index: only `1 min` for `kd-tree` compared to `25 min` for `brute`! ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org