Github user hpandeycodeit commented on a diff in the pull request: https://github.com/apache/incubator-madlib/pull/168#discussion_r134070999 --- Diff: src/ports/postgres/modules/knn/knn.py_in --- @@ -127,5 +124,102 @@ def knn_validate_src(schema_madlib, point_source, point_column_name, label_colum "Data type '{0}' is not a valid type for column '{1}' in table '{2}'.".format(colType, id_column_name, test_source)) return k -# ---------------------------------------------------------------------- -m4_changequote(<!`!>, <!'!>) + + + + +def knn(schema_madlib, point_source, point_column_name, label_column_name, + test_source, test_column_name, id_column_name, output_table, operation, k): + + """ + KNN function to find the K Nearest neighbours + Args: + @param schema_madlib Name of the Madlib Schema + @param point_source Training data table + @param point_column_name Name of the column with training data points. + @param label_column_name Name of the column with labels/values of training data points. + @param test_source Name of the table containing the test data points. + @param test_column_name Name of the column with testing data points. + @param id_column_name Name of the column having ids of data points in test data table. + @param output_table Name of the table to store final results. + @param k default: 1. Number of nearest neighbors to consider + + + Returns: + VARCHAR Name of the output table. + """ + + + oldClientMinMessages = plpy.execute("SELECT setting FROM pg_settings WHERE name = 'client_min_messages'")[0]['setting']; + + plpy.execute("SET client_min_messages TO warning"); + + + k_val = knn_validate_src(schema_madlib, point_source, point_column_name, + label_column_name, test_source, + test_column_name, id_column_name, + output_table, operation, k) + + + plpy.execute("SELECT {schema_madlib}.create_schema_pg_temp()".format(schema_madlib = schema_madlib)); + + x_temp_table = unique_string(desp='x_temp_table') + y_temp_table = unique_string(desp='y_temp_table') + label_column_name_unique = unique_string(desp='label_column_name_unique') + test_id = unique_string(desp='test_id') + + convert_boolean_to_int = ''; + if operation == 'c': + convert_boolean_to_int = '::INTEGER'; + + madlib_knn_interm = unique_string(desp='madlib_knn_interm') + + plpy.execute("""DROP TABLE IF EXISTS pg_temp.{madlib_knn_interm}""".format(**locals())); + plpy.execute( + """ + CREATE TEMP TABLE pg_temp.{madlib_knn_interm} AS + SELECT * + FROM + ( + SELECT row_number() over (partition by {test_id} order by dist) AS r , {x_temp_table}.* + FROM + ( + SELECT test.{id_column_name} AS {test_id} , {schema_madlib}.squared_dist_norm2(train.{point_column_name} ,test.{test_column_name}) AS dist, train.{label_column_name} {convert_boolean_to_int} AS {label_column_name_unique} + FROM {point_source} AS train, {test_source} AS test + ) {x_temp_table} + ){y_temp_table} + WHERE {y_temp_table}.r <= {k_val}""".format(**locals())); + + if operation == 'c': + plpy.execute( + """ + CREATE TABLE {output_table} AS + SELECT {test_id} AS id, {test_column_name} , {schema_madlib}.mode({label_column_name_unique}) AS prediction + FROM pg_temp.{madlib_knn_interm} join {test_source} ON {test_id} = {id_column_name} + GROUP BY {test_id} , {test_column_name}""".format(**locals())) + + + else: + plpy.execute( + """ + CREATE TABLE {output_table} AS + SELECT {test_id} AS id, {test_column_name} , avg( {label_column_name_unique} ) AS prediction + FROM + pg_temp.{madlib_knn_interm} join {test_source} on {test_id} ={id_column_name} + GROUP BY {test_id} , {test_column_name} + ORDER BY {test_id}""".format(**locals())) + + + plpy.execute("SET client_min_messages TO "+ oldClientMinMessages) + + if operation == 'c': + returnstring = 'The classification results have been written to output table '+ output_table; + else: + returnstring = 'The regression results have been written to output table '+ output_table; + + plpy.execute("""DROP TABLE pg_temp.{madlib_knn_interm}""".format(**locals())); + + return returnstring; + + --- End diff -- I did some changes as per the style guide. Will fix it all in a day or two
--- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. ---