orhankislal commented on a change in pull request #571: URL: https://github.com/apache/madlib/pull/571#discussion_r701859754
########## File path: src/ports/postgres/modules/dbscan/dbscan.py_in ########## @@ -23,136 +23,247 @@ from utilities.control import MinWarning from utilities.utilities import _assert from utilities.utilities import unique_string from utilities.utilities import add_postfix -from utilities.utilities import NUMERIC, ONLY_ARRAY +from utilities.utilities import INTEGER, NUMERIC, ONLY_ARRAY from utilities.utilities import is_valid_psql_type from utilities.utilities import is_platform_pg +from utilities.utilities import num_features +from utilities.utilities import get_seg_number from utilities.validate_args import input_tbl_valid, output_tbl_valid from utilities.validate_args import is_var_valid from utilities.validate_args import cols_in_tbl_valid from utilities.validate_args import get_expr_type from utilities.validate_args import get_algorithm_name from graph.wcc import wcc -BRUTE_FORCE = 'brute_force' -KD_TREE = 'kd_tree' +from math import log +from math import ceil +from math import sqrt +from time import time +from collections import deque -def dbscan(schema_madlib, source_table, output_table, id_column, expr_point, eps, min_samples, metric, algorithm, **kwargs): +from scipy.spatial import distance +import numpy as np + +import utilities.debug as DEBUG +DEBUG.plpy_info_enabled = False +DEBUG.plpy_execute_enabled = False +DEBUG.timings_enabled = False + +try: + from rtree import index +except ImportError: + RTREE_ENABLED=0 +else: + RTREE_ENABLED=1 + +METHOD_BRUTE_FORCE = 'brute_force' +METHOD_OPTIMIZED = 'optimized' +DEFAULT_MIN_SAMPLES = 5 +DEFAULT_METRIC = 'squared_dist_norm2' + +def dbscan(schema_madlib, source_table, output_table, id_column, expr_point, + eps, min_samples, metric, algorithm, max_segmentation_depth, **kwargs): with MinWarning("warning"): + # algorithm=None is handled in get_algorithm_name() + min_samples = DEFAULT_MIN_SAMPLES if not min_samples else min_samples + metric = DEFAULT_METRIC if not metric else metric + num_segs = get_seg_number() - 
min_samples = 5 if not min_samples else min_samples - metric = 'squared_dist_norm2' if not metric else metric - algorithm = 'brute' if not algorithm else algorithm + algorithm = get_algorithm_name(algorithm, METHOD_OPTIMIZED, + [METHOD_BRUTE_FORCE, METHOD_OPTIMIZED], 'DBSCAN') - algorithm = get_algorithm_name(algorithm, BRUTE_FORCE, - [BRUTE_FORCE, KD_TREE], 'DBSCAN') + if max_segmentation_depth is None: + # Default to num_segs + max_depth = num_segs + else: + max_depth = max_segmentation_depth + if algorithm != METHOD_OPTIMIZED: + plpy.warn("Ignoring max_segmentation_depth={} param, " Review comment: plpy doesn't seem to have a warn function; the PL/Python call for emitting a WARNING-level message is plpy.warning(), so this line would raise an AttributeError as written. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@madlib.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org