[ https://issues.apache.org/jira/browse/MADLIB-1018?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15767969#comment-15767969 ]
Frank McQuillan commented on MADLIB-1018: ----------------------------------------- i.e., this means accept an expression > Fix K-means support for array input for data points > --------------------------------------------------- > > Key: MADLIB-1018 > URL: https://issues.apache.org/jira/browse/MADLIB-1018 > Project: Apache MADlib > Issue Type: Bug > Components: Module: k-Means Clustering > Reporter: Frank McQuillan > Priority: Minor > Fix For: v1.10 > > > For k-means, normally you should be able to do array[col1, col2…] for the 2nd > parameter, but that does not work. This JIRA is to be able to support > array[col1, col2…]. > {code} > expr_point > TEXT. The name of the column with point coordinates. > {code} > {code} > SELECT madlib.kmeans_random('customers_train', > 'array[creditamount, accountbalance]', > 3 > ); > {code} > produces > {code} > --------------------------------------------------------------------------- > InternalError Traceback (most recent call last) > <ipython-input-50-0b939dd162ef> in <module>() > ----> 1 get_ipython().run_cell_magic(u'sql', u'', u"\nSELECT > madlib.kmeans_random('customers_train',\n 'array[creditamount, > accountbalance]',\n 3\n );\n") > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc > in run_cell_magic(self, magic_name, line, cell) > 2291 magic_arg_s = self.var_expand(line, stack_depth) > 2292 with self.builtin_trap: > -> 2293 result = fn(magic_arg_s, cell) > 2294 return result > 2295 > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sql/magic.pyc in > execute(self, line, cell, local_ns) > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/core/magic.pyc > in <lambda>(f, *a, **k) > 191 # but it's overkill for just that one bit of state. > 192 def magic_deco(arg): > --> 193 call = lambda f, *a, **k: f(*a, **k) > 194 > 195 if callable(arg): > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sql/magic.pyc in > execute(self, line, cell, local_ns) > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/core/magic.pyc > in <lambda>(f, *a, **k) > 191 # but it's overkill for just that one bit of state. > 192 def magic_deco(arg): > --> 193 call = lambda f, *a, **k: f(*a, **k) > 194 > 195 if callable(arg): > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sql/magic.pyc in > execute(self, line, cell, local_ns) > 78 return self._persist_dataframe(parsed['sql'], conn, > user_ns) > 79 try: > ---> 80 result = sql.run.run(conn, parsed['sql'], self, user_ns) > 81 return result > 82 except (ProgrammingError, OperationalError) as e: > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sql/run.pyc in > run(conn, sql, config, user_namespace) > 270 raise Exception("ipython_sql does not support > transactions") > 271 txt = sqlalchemy.sql.text(statement) > --> 272 result = conn.session.execute(txt, user_namespace) > 273 try: > 274 conn.session.execute('commit') > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.pyc > in execute(self, object, *multiparams, **params) > 912 type(object)) > 913 else: > --> 914 return meth(self, multiparams, params) > 915 > 916 def _execute_function(self, func, multiparams, params): > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/sql/elements.pyc > in _execute_on_connection(self, connection, multiparams, params) > 321 > 322 def _execute_on_connection(self, connection, multiparams, params): > --> 323 return connection._execute_clauseelement(self, multiparams, > params) > 324 > 325 def unique_params(self, *optionaldict, **kwargs): > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.pyc > in _execute_clauseelement(self, elem, multiparams, params) > 1008 compiled_sql, > 1009 distilled_params, > -> 1010 compiled_sql, distilled_params > 1011 ) > 1012 if self._has_events or self.engine._has_events: > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.pyc > in _execute_context(self, dialect, constructor, statement, parameters, *args) > 1144 parameters, > 1145 cursor, > -> 1146 context) > 1147 > 1148 if self._has_events or self.engine._has_events: > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.pyc > in _handle_dbapi_exception(self, e, statement, parameters, cursor, context) > 1339 util.raise_from_cause( > 1340 sqlalchemy_exception, > -> 1341 exc_info > 1342 ) > 1343 else: > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/util/compat.pyc > in raise_from_cause(exception, exc_info) > 197 exc_info = sys.exc_info() > 198 exc_type, exc_value, exc_tb = exc_info > --> 199 reraise(type(exception), exception, tb=exc_tb) > 200 > 201 if py3k: > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/base.pyc > in _execute_context(self, dialect, constructor, statement, parameters, *args) > 1137 statement, > 1138 parameters, > -> 1139 context) > 1140 except Exception as e: > 1141 self._handle_dbapi_exception( > /Users/fmcquillan/anaconda/lib/python2.7/site-packages/sqlalchemy/engine/default.pyc > in do_execute(self, cursor, statement, parameters, context) > 448 > 449 def do_execute(self, cursor, statement, parameters, context=None): > --> 450 cursor.execute(statement, parameters) > 451 > 452 def do_execute_no_params(self, cursor, statement, context=None): > InternalError: (psycopg2.InternalError) plpy.SPIError: syntax error at or > near "," (plpython.c:4651) > LINE 44: ... _src.array[creditamount, accountb... > ^ > QUERY: > SELECT > 1 AS _iteration, > madlib.array_to_1d((_state).centroids) AS centroids, > (_state).old_centroid_ids, > (_state).objective_fn, > (_state).frac_reassigned > FROM > ( > SELECT ( > SELECT > CAST(( > madlib.matrix_agg( > _centroid::FLOAT8[] > ORDER BY _new_centroid_id), > array_agg(_new_centroid_id ORDER BY _new_centroid_id), > sum(_objective_fn), > CAST(sum(_num_reassigned) AS DOUBLE PRECISION) > / sum(_num_points) > ) AS madlib.kmeans_state) > FROM ( > SELECT > (_new_centroid).column_id AS _new_centroid_id, > sum((_new_centroid).distance) AS _objective_fn, > count(*) AS _num_points, > sum( > CAST( > coalesce( > (CAST( > (SELECT (CAST > ((madlib.array_to_2d($1), $2, $3, $4) > AS madlib.kmeans_state)).old_centroid_ids) AS > INTEGER[] > ))[(_new_centroid).column_id + 1] != > _old_centroid_id, > TRUE > ) > AS INTEGER > ) > ) AS _num_reassigned, > madlib.avg(_point::FLOAT8[]) AS _centroid > FROM ( > SELECT > -- PostgreSQL/Greenplum tuning: > -- VOLATILE function as optimization fence > madlib.noop(), > _src.array[creditamount, accountbalance] AS > _point, > madlib.closest_column( > (SELECT (CAST ((madlib.array_to_2d($1), $2, > $3, $4) > AS madlib.kmeans_state)).centroids) > , _src.array[creditamount, > accountbalance]::FLOAT8[] > , 'madlib.squared_dist_norm2' > > ) > AS _new_centroid, > (madlib.closest_column((SELECT (CAST > ((madlib.array_to_2d($5), $6, $7, $8) > AS madlib.kmeans_state)).centroids) > , _src.array[creditamount, > accountbalance]::FLOAT8[] > , 'madlib.squared_dist_norm2' > > ) > ).column_id > AS _old_centroid_id > FROM customers_train AS _src > WHERE > abs(coalesce(madlib.svec_elsum(array[creditamount, accountbalance]), > 'Infinity'::FLOAT8)) < 'Infinity'::FLOAT8 > AND NOT > madlib.array_contains_null(_src.array[creditamount, accountbalance]::FLOAT8[]) > ) AS _points_with_assignments > GROUP BY (_new_centroid).column_id > ) AS _new_centroids > ) AS _state > ) q > > CONTEXT: Traceback (most recent call last): > PL/Python function "internal_compute_kmeans", line 22, in <module> > return kmeans.compute_kmeans(**globals()) > PL/Python function "internal_compute_kmeans", line 332, in compute_kmeans > PL/Python function "internal_compute_kmeans", line 227, in update > PL/Python function "internal_compute_kmeans" > SQL statement "SELECT madlib.internal_compute_kmeans( '_madlib_kmeans_args', > '_madlib_kmeans_state', textin(regclassout( $1 )), $2 , textin(regprocout( > $3 )))" > PL/pgSQL function "kmeans" line 103 at assignment > SQL statement "SELECT madlib.kmeans( $1 , $2 , > madlib.kmeans_random_seeding( $1 , $2 , $3 ), 'madlib.squared_dist_norm2', > 'madlib.avg', 20, 0.001)" > PL/pgSQL function "kmeans_random" line 4 at assignment > [SQL: "SELECT madlib.kmeans_random('customers_train',\n > 'array[creditamount, accountbalance]',\n 3\n );"] > {code} > The workaround is to create a view: > {code} > CREATE VIEW cluster_params AS (SELECT *, array[creditamount, accountbalance] > as p1 FROM customers_train); > SELECT madlib.kmeans_random('cluster_params', > 'p1', > 3 > ); > {code} > produces > {code} > ("{{8619.6635514,3490.145919},{2343.72082019,6004.36435331},{2191.06698565,1908.8522488}}",9660868534.24,0.001,11) > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332)