[GitHub] [madlib] reductionista commented on a change in pull request #573: @orhankislal WCC: Optimize subtx count and catalog entry frequency

GitBox Mon, 08 Nov 2021 15:23:10 -0800


reductionista commented on a change in pull request #573:
URL: https://github.com/apache/madlib/pull/573#discussion_r744040215




##########
File path: src/ports/postgres/modules/graph/wcc.py_in
##########
@@ -120,6 +121,7 @@ def wcc(schema_madlib, vertex_table, vertex_id, edge_table, 
edge_args,
     new_to_update_where_condition = ''
     edge_to_update_where_condition = ''
     edge_inverse_to_update_where_condition = ''
+    grouping_cols_select = ''

Review comment:
       I think you can remove this line--looks like it is not used anywhere

##########
File path: src/ports/postgres/modules/graph/wcc.py_in
##########
@@ -162,52 +160,128 @@ def wcc(schema_madlib, vertex_table, vertex_id, 
edge_table, edge_args,
         edge_inverse_to_update_where_condition = ' AND ' + \
             _check_groups(edge_inverse, toupdate, grouping_cols_list)
         join_grouping_cols = _check_groups(subq, distinct_grp_table, 
grouping_cols_list)
-        group_by_clause_newupdate = ('' if not grouping_cols else
-                                     '{0}, 
{1}.{2}'.format(subq_prefixed_grouping_cols,
+        group_by_clause_newupdate = ('{0}, 
{1}.{2}'.format(subq_prefixed_grouping_cols,
                                                            subq, vertex_id))
-        plpy.execute("""
-                CREATE TABLE {newupdate} AS
-                SELECT {subq}.{vertex_id},
-                        CAST({BIGINT_MAX} AS BIGINT) AS {component_id}
-                        {select_grouping_cols}
-                FROM {distinct_grp_table} INNER JOIN (
-                    SELECT {select_grouping_cols_clause} {src} AS {vertex_id}
-                    FROM {edge_table}
-                    UNION
-                    SELECT {select_grouping_cols_clause} {dest} AS {vertex_id}
-                    FROM {edge_inverse}
-                ) {subq}
-                ON {join_grouping_cols}
-                GROUP BY {group_by_clause_newupdate}
-                {distribution}
-            """.format(select_grouping_cols=',' + subq_prefixed_grouping_cols,
-                       select_grouping_cols_clause=grouping_cols_comma,
-                       **locals()))
-        # drop intermediate table
-        plpy.execute("DROP TABLE IF EXISTS {0}".format(distinct_grp_table))
-        plpy.execute("""
-                CREATE TEMP TABLE {message} AS
-                SELECT {vertex_id},
-                        CAST({vertex_id} AS BIGINT) AS {component_id}
-                        {select_grouping_cols_clause}
-                FROM {newupdate}
-                {distribution}
-            """.format(select_grouping_cols_clause=comma_grouping_cols,
-                       **locals()))
+

Review comment:
       Can we move this definition of `group_by_clause_newupdate` into the 
grouping_cols specific conditional block below?

##########
File path: src/ports/postgres/modules/graph/wcc.py_in
##########
@@ -299,24 +315,27 @@ def wcc(schema_madlib, vertex_table, vertex_id, 
edge_table, edge_args,
     rename_table(schema_madlib, newupdate, out_table)
     # Create summary table. We only need the vertex_id and grouping columns
     # in it.
-    plpy.execute("""
-            CREATE TABLE {out_table_summary} (
-                {grouping_cols_summary}
-                vertex_table    TEXT,
-                vertex_id   TEXT,
-                vertex_id_type  TEXT
-            )
-        """.format(grouping_cols_summary='' if not grouping_cols else
-                   'grouping_cols TEXT, ', **locals()))
     vertex_id_type = get_expr_type(vertex_id, vertex_table)
     plpy.execute("""
-            INSERT INTO {out_table_summary} VALUES
-            ({grouping_cols_summary} '{vertex_table}', '{vertex_id}',
-            '{vertex_id_type}')
-        """.format(grouping_cols_summary='' if not grouping_cols else
-                   "'{0}', ".format(grouping_cols), **locals()))
-    plpy.execute("DROP TABLE IF EXISTS {0},{1},{2},{3} ".
-                 format(message, oldupdate, newupdate, toupdate))
+        CREATE TABLE {out_table_summary} (
+            {grouping_cols_summary}
+            vertex_table    TEXT,
+            vertex_id   TEXT,
+            vertex_id_type  TEXT
+        );
+    """.format(grouping_cols_summary='' if not grouping_cols else
+                    'grouping_cols TEXT, ',
+               **locals()))
+
+    plpy.execute("""
+        INSERT INTO {out_table_summary} VALUES
+        ({grouping_cols_txt} '{vertex_table}', '{vertex_id}',
+        '{vertex_id_type}');
+
+        DROP TABLE IF EXISTS {message},{oldupdate},{newupdate},{toupdate};
+    """.format(grouping_cols_txt='' if not grouping_cols else
+                    "'{0}', ".format(grouping_cols),
+               **locals()))

Review comment:
       Since there's nothing that needs to happen in between, why not combine 
these two into a single `plpy.execute()` call?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [madlib] reductionista commented on a change in pull request #573: @orhankislal WCC: Optimize subtx count and catalog entry frequency

Reply via email to