Github user iyerr3 commented on a diff in the pull request:
https://github.com/apache/madlib/pull/289#discussion_r201498820
--- Diff:
src/ports/postgres/modules/recursive_partitioning/random_forest.py_in ---
@@ -1333,42 +1368,69 @@ def _create_group_table(
# -------------------------------------------------------------------------
-def _create_empty_result_table(schema_madlib, output_table_name):
+def _create_empty_result_table(schema_madlib, output_table_name,
importance):
"""Create the result table for all trees in the forest"""
+ impurity_var_imp_str = """, impurity_var_importance double
precision[]);""" if importance else ");"
+
sql_create_empty_result_table = """
CREATE TABLE {output_table_name} (
gid integer,
sample_id integer,
- tree {schema_madlib}.bytea8);
+ tree {schema_madlib}.bytea8
+ {impurity_var_imp_str}
""".format(**locals())
plpy.notice("sql_create_empty_result_table:\n" +
sql_create_empty_result_table)
plpy.execute(sql_create_empty_result_table)
# ------------------------------------------------------------
def _insert_into_result_table(schema_madlib, tree_states,
output_table_name,
- grp_key_to_grp_cols, sample_id):
+ grp_key_to_grp_cols, sample_id, importance,
grouping_cols):
"""Insert one tree to result table"""
+
+ impurity_var_imp_str = ''
+ importance_query = ''
+ importance_results = ''
--- End diff --
IMO these variables in else clause is easier to read. Also
`importance_results` is limited to the `if` block and does not require to be
defined outside that scope.
---