incubator-madlib git commit: Mulltiple: Add grouping support for SSSP and support GPDB5

okislal Mon, 17 Apr 2017 11:23:32 -0700

Repository: incubator-madlib
Updated Branches:
  refs/heads/master c82b9d0ad -> 8faf62263



Mulltiple: Add grouping support for SSSP and support GPDB5

JIRA: MADLIB-1081

- This commit adds grouping support for SSSP as well as its path function.
- Update chi2 test for GPDB5 alpha compatibility.
- Decouple DROP and CREATE statements for various modules.

Closes #113


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/8faf6226
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/8faf6226
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/8faf6226

Branch: refs/heads/master
Commit: 8faf62263f6c5aa4281e2d3dc33e389d41784c0e
Parents: c82b9d0
Author: Orhan Kislal <okis...@pivotal.io>
Authored: Mon Apr 17 11:17:41 2017 -0700
Committer: Orhan Kislal <okis...@pivotal.io>
Committed: Mon Apr 17 11:17:41 2017 -0700

----------------------------------------------------------------------
 .../elastic_net_generate_result.py_in           |   2 +-
 .../postgres/modules/graph/graph_utils.py_in    |   2 +-
 src/ports/postgres/modules/graph/sssp.py_in     | 716 ++++++++++++++-----
 src/ports/postgres/modules/graph/sssp.sql_in    | 132 +++-
 .../postgres/modules/graph/test/sssp.sql_in     |  75 +-
 src/ports/postgres/modules/pca/pca.py_in        |   6 +-
 .../modules/stats/test/chi2_test.sql_in         |   2 +-
 .../validation/internal/cross_validation.py_in  |   6 +-
 8 files changed, 739 insertions(+), 202 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8faf6226/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in 
b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
index c48beca..6246ed9 100644
--- a/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
+++ b/src/ports/postgres/modules/elastic_net/elastic_net_generate_result.py_in
@@ -81,8 +81,8 @@ def _elastic_net_generate_result(optimizer, iteration_run, 
**args):
                    schema_madlib=args["schema_madlib"])
 
     # Create the output table
+    plpy.execute("DROP TABLE IF EXISTS {tbl_result}".format(**args))
     plpy.execute("""
-             DROP TABLE IF EXISTS {tbl_result};
              CREATE TABLE {tbl_result} (
                  {select_grouping_info}
                  family            text,

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8faf6226/src/ports/postgres/modules/graph/graph_utils.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/graph_utils.py_in 
b/src/ports/postgres/modules/graph/graph_utils.py_in
index 2d83301..25f70a5 100644
--- a/src/ports/postgres/modules/graph/graph_utils.py_in
+++ b/src/ports/postgres/modules/graph/graph_utils.py_in
@@ -72,7 +72,7 @@ def validate_graph_coding(vertex_table, vertex_id, 
edge_table, edge_params,
                """Graph {func_name}: The vertex column {vertex_id} is not 
present in vertex table ({vertex_table}) """.
                format(**locals()))
        _assert(columns_exist_in_table(edge_table, edge_params.values()),
-               """Graph {func_name}: Not all columns from {cols} present in 
edge table ({edge_table})""".
+               """Graph {func_name}: Not all columns from {cols} are present 
in edge table ({edge_table})""".
                format(cols=edge_params.values(), **locals()))
 
        return None

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8faf6226/src/ports/postgres/modules/graph/sssp.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/sssp.py_in 
b/src/ports/postgres/modules/graph/sssp.py_in
index 4d27761..2520830 100644
--- a/src/ports/postgres/modules/graph/sssp.py_in
+++ b/src/ports/postgres/modules/graph/sssp.py_in
@@ -33,27 +33,56 @@ from utilities.control import MinWarning
 from utilities.utilities import _assert
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import unique_string
-from utilities.validate_args import get_cols
-from utilities.validate_args import unquote_ident
+from utilities.utilities import _string_to_array
+from utilities.utilities import split_quoted_delimited_str
 from utilities.validate_args import table_exists
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_is_empty
+from utilities.validate_args import get_cols_and_types
+from utilities.validate_args import get_expr_type
 
 m4_changequote(`<!', `!>')
 
+
+def _check_groups(tbl1, tbl2, grp_list):
+
+       """
+       Helper function for joining tables with groups.
+       Args:
+               @param tbl1       Name of the first table
+               @param tbl2       Name of the second table
+               @param grp_list   The list of grouping columns
+       """
+
+       return ' AND '.join([" {tbl1}.{i} = {tbl2}.{i} ".format(**locals())
+               for i in grp_list])
+
+def _grp_from_table(tbl, grp_list):
+
+       """
+       Helper function for selecting grouping columns of a table
+       Args:
+               @param tbl        Name of the table
+               @param grp_list   The list of grouping columns
+       """
+       return ' , '.join([" {tbl}.{i} ".format(**locals())
+               for i in grp_list])
+
 def graph_sssp(schema_madlib, vertex_table, vertex_id, edge_table,
-               edge_args, source_vertex, out_table, **kwargs):
+               edge_args, source_vertex, out_table, grouping_cols, **kwargs):
+
        """
     Single source shortest path function for graphs using the Bellman-Ford
     algorhtm [1].
     Args:
-        @param vertex_table     Name of the table that contains the vertex 
data.
-        @param vertex_id        Name of the column containing the vertex ids.
-        @param edge_table       Name of the table that contains the edge data.
-        @param edge_args        A comma-delimited string containing multiple
-                                                       named arguments of the 
form "name=value".
-        @param source_vertex    The source vertex id for the algorithm to 
start.
-        @param out_table           Name of the table to store the result of 
SSSP.
+        @param vertex_table    Name of the table that contains the vertex data.
+        @param vertex_id       Name of the column containing the vertex ids.
+        @param edge_table      Name of the table that contains the edge data.
+        @param edge_args       A comma-delimited string containing multiple
+                               named arguments of the form "name=value".
+        @param source_vertex   The source vertex id for the algorithm to start.
+        @param out_table       Name of the table to store the result of SSSP.
+        @param grouping_cols   The list of grouping columns.
 
     [1] https://en.wikipedia.org/wiki/Bellman-Ford_algorithm
     """
@@ -61,6 +90,7 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, 
edge_table,
        with MinWarning("warning"):
 
                INT_MAX = 2147483647
+               INFINITY = "'Infinity'"
                EPSILON = 0.000001
 
                message = unique_string(desp='message')
@@ -73,8 +103,23 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, 
edge_table,
                edge_params = extract_keyvalue_params(edge_args,
                                             params_types,
                                             default_args)
+
+               # Prepare the input for recording in the summary table
                if vertex_id is None:
+                       v_st= "NULL"
                        vertex_id = "id"
+               else:
+                       v_st = vertex_id
+               if edge_args is None:
+                       e_st = "NULL"
+               else:
+                       e_st = edge_args
+               if grouping_cols is None:
+                       g_st = "NULL"
+                       glist = None
+               else:
+                       g_st = grouping_cols
+                       glist = split_quoted_delimited_str(grouping_cols)
 
                src = edge_params["src"]
                dest = edge_params["dest"]
@@ -85,47 +130,91 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, 
edge_table,
                local_distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>,
                        <!"DISTRIBUTED BY (id)"!>)
 
-               validate_sssp(vertex_table, vertex_id, edge_table,
-                       edge_params, source_vertex, out_table)
+               is_hawq = m4_ifdef(<!__HAWQ__!>, <!True!>, <!False!>)
+               _validate_sssp(vertex_table, vertex_id, edge_table,
+                       edge_params, source_vertex, out_table, glist)
 
                plpy.execute(" DROP TABLE IF EXISTS {0},{1},{2}".format(
                        message,oldupdate,newupdate))
 
+               # Initialize grouping related variables
+               comma_grp = ""
+               comma_grp_e = ""
+               comma_grp_m = ""
+               grp_comma = ""
+               checkg_oo = ""
+               checkg_eo = ""
+               checkg_ex = ""
+               checkg_om = ""
+               group_by = ""
+
+               if grouping_cols is not None:
+                       comma_grp = " , " + grouping_cols
+                       group_by = " , " + _grp_from_table(edge_table,glist)
+                       comma_grp_e = " , " + _grp_from_table(edge_table,glist)
+                       comma_grp_m = " , " + _grp_from_table("message",glist)
+                       grp_comma = grouping_cols + " , "
+
+                       checkg_oo_sub = 
_check_groups(out_table,"oldupdate",glist)
+                       checkg_oo = " AND " + checkg_oo_sub
+                       checkg_eo = " AND " + 
_check_groups(edge_table,"oldupdate",glist)
+                       checkg_ex = " AND " + 
_check_groups(edge_table,"x",glist)
+                       checkg_om = " AND " + 
_check_groups("out_table","message",glist)
+
+               w_type = get_expr_type(weight,edge_table).lower()
+               init_w = INT_MAX
+               if w_type in ['double precision','float8']:
+                       init_w = INFINITY
+
                # We keep a table of every vertex, the minimum cost to that 
destination
                # seen so far and the parent to this vertex in the associated 
shortest
-               # path. This table will be updated throughtout the execution.
+               # path. This table will be updated throughout the execution.
                plpy.execute(
-                       """ CREATE TABLE {out_table} AS
-                               SELECT {vertex_id} AS {vertex_id},
-                                       CAST('Infinity' AS DOUBLE PRECISION) AS 
{weight},
-                                       NULL::INT AS parent
-                               FROM {vertex_table}
-                               WHERE {vertex_id} IS NOT NULL
+                       """ CREATE TABLE {out_table} AS ( SELECT
+                                       {grp_comma} {src} AS {vertex_id}, 
{weight},
+                                       {src} AS parent FROM {edge_table} LIMIT 
0)
                                {distribution} """.format(**locals()))
 
+               # We keep a summary table to keep track of the parameters used 
for this
+               # SSSP run. This table is used in the path finding function to 
eliminate
+               # the need for repetition.
+               plpy.execute( """ CREATE TABLE {out_table}_summary  (
+                       vertex_table            TEXT,
+                       vertex_id               TEXT,
+                       edge_table              TEXT,
+                       edge_args               TEXT,
+                       source_vertex           INTEGER,
+                       out_table               TEXT,
+                       grouping_cols           TEXT)
+                       """.format(**locals()))
+               plpy.execute( """ INSERT INTO {out_table}_summary VALUES
+                       ('{vertex_table}', '{v_st}', '{edge_table}', '{e_st}',
+                       {source_vertex}, '{out_table}', '{g_st}')
+                       """.format(**locals()))
+
                # We keep 2 update tables and alternate them during the 
execution.
                # This is necessary since we need to know which vertices are 
updated in
                # the previous iteration to calculate the next set of updates.
                plpy.execute(
-                       """ CREATE TEMP TABLE {oldupdate}(
-                               id INT, val DOUBLE PRECISION, parent INT)
+                       """ CREATE TEMP TABLE {oldupdate} AS ( SELECT
+                                       {src} AS id, {weight},
+                                       {src} AS parent {comma_grp} FROM 
{edge_table} LIMIT 0)
                                {local_distribution}
                                """.format(**locals()))
                plpy.execute(
-                       """ CREATE TEMP TABLE {newupdate}(
-                               id INT, val DOUBLE PRECISION, parent INT)
+                       """ CREATE TEMP TABLE {newupdate} AS ( SELECT
+                                       {src} AS id, {weight},
+                                       {src} AS parent {comma_grp} FROM 
{edge_table} LIMIT 0)
                                {local_distribution}
                                """.format(**locals()))
 
                # Since HAWQ does not allow us to update, we create a new table 
and
-               # rename at every iteration
-               temp_table = unique_string(desp='temp')
-               sql = m4_ifdef(<!__HAWQ__!>,
-                       """ CREATE TABLE {temp_table} (
-                                       {vertex_id} INT, {weight} DOUBLE 
PRECISION, parent INT)
-                                       {distribution};
-                       """,  <!''!>)
-               plpy.execute(sql.format(**locals()))
+               # rename at every iteration.
+               if is_hawq:
+                       temp_table = unique_string(desp='temp')
+                       sql =""" CREATE TABLE {temp_table} AS ( SELECT * FROM 
{out_table} )
+                               {distribution} """
+                       plpy.execute(sql.format(**locals()))
 
                # GPDB and HAWQ have distributed by clauses to help them with 
indexing.
                # For Postgres we add the indices manually.
@@ -137,45 +226,117 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, 
edge_table,
                        <!''!>)
                plpy.execute(sql_index)
 
-               # The source can be reached with 0 cost and it has itself as 
the parent.
-               plpy.execute(
-                       """ INSERT INTO {oldupdate}
-                               VALUES({source_vertex},0,{source_vertex})
-                       """.format(**locals()))
+               # The initialization step is quite different when grouping is 
involved
+               # since not every group (subgraph) will have the same set of 
vertices.
+
+               # Example:
+               # Assume there are two grouping columns g1 and g2
+               # g1 values are 0 and 1. g2 values are 5 and 6
+               if grouping_cols is not None:
+
+                       distinct_grp_table = unique_string(desp='grp')
+                       plpy.execute(""" DROP TABLE IF EXISTS 
{distinct_grp_table} """.
+                               format(**locals()))
+                       plpy.execute( """ CREATE TEMP TABLE 
{distinct_grp_table} AS
+                               SELECT DISTINCT {grouping_cols} FROM 
{edge_table} """.
+                               format(**locals()))
+                       subq = unique_string(desp='subquery')
+
+                       checkg_ds_sub = 
_check_groups(distinct_grp_table,subq,glist)
+                       grp_d_comma = _grp_from_table(distinct_grp_table,glist) 
+","
+
+                       plpy.execute(
+                               """ INSERT INTO {out_table}
+                               SELECT {grp_d_comma} {vertex_id} AS {vertex_id},
+                                       {init_w} AS {weight}, NULL::INT AS 
parent
+                               FROM {distinct_grp_table} INNER JOIN
+                                       (
+                                       SELECT {src} AS {vertex_id} {comma_grp}
+                                       FROM {edge_table}
+                                       UNION
+                                       SELECT {dest} AS {vertex_id} {comma_grp}
+                                       FROM {edge_table}
+                                       ) {subq} ON ({checkg_ds_sub})
+                               WHERE {vertex_id} IS NOT NULL
+                               """.format(**locals()))
+
+                       plpy.execute(
+                               """ INSERT INTO {oldupdate}
+                                       SELECT {source_vertex}, 0, 
{source_vertex},
+                                       {grouping_cols}
+                                       FROM {distinct_grp_table}
+                               """.format(**locals()))
+
+                       # The maximum number of vertices for any group.
+                       # Used for determining negative cycles.
+                       v_cnt = plpy.execute(
+                               """ SELECT max(count) as max FROM (
+                                               SELECT count({vertex_id}) AS 
count
+                                               FROM {out_table}
+                                               GROUP BY {grouping_cols}) x
+                               """.format(**locals()))[0]['max']
+                       plpy.execute("DROP TABLE IF EXISTS 
{0}".format(distinct_grp_table))
+               else:
+                       plpy.execute(
+                               """ INSERT INTO {out_table}
+                               SELECT {vertex_id} AS {vertex_id},
+                                       {init_w} AS {weight},
+                                       NULL AS parent
+                               FROM {vertex_table}
+                               WHERE {vertex_id} IS NOT NULL
+                                """.format(**locals()))
+
+                       # The source can be reached with 0 cost and it has 
itself as the
+                       # parent.
+                       plpy.execute(
+                               """ INSERT INTO {oldupdate}
+                                       
VALUES({source_vertex},0,{source_vertex})
+                               """.format(**locals()))
+
+                       v_cnt = plpy.execute(
+                               """ SELECT count(*) FROM {vertex_table}
+                               WHERE {vertex_id} IS NOT NULL
+                               """.format(**locals()))[0]['count']
 
-               v_cnt = plpy.execute(
-                       """SELECT count(*) FROM {vertex_table}
-                       WHERE {vertex_id} IS NOT 
NULL""".format(**locals()))[0]['count']
                for i in range(0,v_cnt+1):
 
-                       # Apply the updates calculated in the last iteration
-                       sql = m4_ifdef(<!__HAWQ__!>,
-                               <!"""
+                       # Apply the updates calculated in the last iteration.
+                       if is_hawq:
+                               sql = """
                                TRUNCATE TABLE {temp_table};
                                INSERT INTO {temp_table}
                                        SELECT *
                                        FROM {out_table}
-                                       WHERE {out_table}.{vertex_id} NOT IN (
-                                               SELECT {oldupdate}.id FROM 
{oldupdate})
+                                       WHERE NOT EXISTS (
+                                               SELECT 1
+                                               FROM {oldupdate} as oldupdate
+                                               WHERE {out_table}.{vertex_id} = 
oldupdate.id
+                                               {checkg_oo})
                                        UNION
-                                       SELECT * FROM {oldupdate};
+                                       SELECT {grp_comma} id, {weight}, parent 
FROM {oldupdate};
                                DROP TABLE {out_table};
                                ALTER TABLE {temp_table} RENAME TO {out_table};
-                               CREATE TABLE {temp_table} (
-                                       {vertex_id} INT, {weight} DOUBLE 
PRECISION, parent INT)
-                                       {distribution};
-                               """!>,
-                               <!"""
+                               CREATE TABLE {temp_table} AS (
+                                       SELECT * FROM {out_table} LIMIT 0)
+                                       {distribution};"""
+                               plpy.execute(sql.format(**locals()))
+                               ret = plpy.execute("SELECT id FROM {0} LIMIT 1".
+                                       format(oldupdate))
+                       else:
+                               sql = """
                                UPDATE {out_table} SET
-                               {weight}=oldupdate.val,
+                               {weight}=oldupdate.{weight},
                                parent=oldupdate.parent
                                FROM
                                {oldupdate} AS oldupdate
                                WHERE
-                               {out_table}.{vertex_id}=oldupdate.id
-                               """!>)
-                       plpy.execute(sql.format(**locals()))
+                               {out_table}.{vertex_id}=oldupdate.id AND
+                               {out_table}.{weight}>oldupdate.{weight} 
{checkg_oo}
+                               """
+                               ret = plpy.execute(sql.format(**locals()))
 
+                       if ret.nrows() == 0:
+                               break
 
                        plpy.execute("TRUNCATE TABLE {0}".format(newupdate))
 
@@ -194,105 +355,237 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, 
edge_table,
                        # for comparison.
 
                        # Once we have a list of edges and values (stores as 
'message'),
-                       # we check if these values are lower than the existing 
shortest path
-                       # values.
+                       # we check if these values are lower than the existing 
shortest
+                       # path values.
 
                        sql = (""" INSERT INTO {newupdate}
-                               SELECT DISTINCT ON (message.id) message.id AS 
id,
-                                       message.val AS val,
-                                       message.parent AS parent
+                               SELECT DISTINCT ON (message.id {comma_grp})
+                                       message.id AS id,
+                                       message.{weight} AS {weight},
+                                       message.parent AS parent {comma_grp_m}
                                FROM {out_table} AS out_table INNER JOIN
                                        (
-                                               SELECT edge_table.{dest} AS id, 
x.val AS val,
-                                                       oldupdate.id AS parent
+                                       SELECT {edge_table}.{dest} AS id, 
x.{weight} AS {weight},
+                                               oldupdate.id AS parent 
{comma_grp_e}
+                                       FROM {oldupdate} AS oldupdate INNER JOIN
+                                               {edge_table}  ON
+                                                       ({edge_table}.{src} = 
oldupdate.id {checkg_eo})
+                                               INNER JOIN
+                                               (
+                                               SELECT {edge_table}.{dest} AS 
id,
+                                                       min(oldupdate.{weight} +
+                                                               
{edge_table}.{weight}) AS {weight} {comma_grp_e}
                                                FROM {oldupdate} AS oldupdate 
INNER JOIN
-                                                       {edge_table} AS 
edge_table ON
-                                                       (edge_table.{src} = 
oldupdate.id) INNER JOIN
-                                                       (
-                                                               SELECT 
edge_table.{dest} AS id,
-                                                                       
min(oldupdate.val + edge_table.{weight})
-                                                                       AS val
-                                                               FROM 
{oldupdate} AS oldupdate INNER JOIN
-                                                                       
{edge_table} AS edge_table ON
-                                                                       
(edge_table.{src}=oldupdate.id)
-                                                               GROUP BY 
edge_table.{dest}
-                                                       ) x ON 
(edge_table.{dest} = x.id)
-                                               WHERE ABS(oldupdate.val + 
edge_table.{weight} - x.val)
-                                                       < {EPSILON}
-                                       ) AS message ON (message.id = 
out_table.{vertex_id})
-                               WHERE message.val<out_table.{weight}
+                                                       {edge_table}  ON
+                                                       
({edge_table}.{src}=oldupdate.id {checkg_eo})
+                                               GROUP BY {edge_table}.{dest} 
{comma_grp_e}
+                                               ) x
+                                               ON ({edge_table}.{dest} = x.id 
{checkg_ex} )
+                                       WHERE ABS(oldupdate.{weight} + 
{edge_table}.{weight}
+                                                               - x.{weight}) < 
{EPSILON}
+                                       ) message
+                                       ON (message.id = out_table.{vertex_id} 
{checkg_om})
+                               WHERE message.{weight}<out_table.{weight}
                                """.format(**locals()))
 
-                       # If there are no updates, SSSP is finalized
-                       ret = plpy.execute(sql)
-                       if ret.nrows() == 0:
-                               break
+                       plpy.execute(sql)
 
-                       # Swap the update tables for the next iteration
+                       # Swap the update tables for the next iteration.
                        tmp = oldupdate
                        oldupdate = newupdate
                        newupdate = tmp
 
-               # Bellman-Ford should converge in |V|-1 iterations.
+               plpy.execute("DROP TABLE IF EXISTS {0}".format(newupdate))
+               # The algorithm should converge in less than |V| iterations.
+               # Otherwise there is a negative cycle in the graph.
                if i == v_cnt:
-                       plpy.execute("DROP TABLE IF EXISTS 
{out_table}".format(**locals()))
-                       plpy.error("Graph SSSP: Detected a negative cycle in 
the graph.")
-
-               m4_ifdef(<!__HAWQ__!>,
-                       plpy.execute("DROP TABLE {temp_table} 
".format(**locals())), <!''!>)
+                       if grouping_cols is None:
+                               plpy.execute("DROP TABLE IF EXISTS {0},{1},{2}".
+                                       format(out_table, out_table+"_summary", 
oldupdate))
+                               if is_hawq:
+                                       plpy.execute("DROP TABLE IF EXISTS 
{0}".format(temp_table))
+                               plpy.error("Graph SSSP: Detected a negative 
cycle in the graph.")
+
+                       # It is possible that not all groups has negative 
cycles.
+                       else:
+
+                               # gsql is the string created by collating 
grouping columns.
+                               # By looking at the oldupdate table we can see 
which groups
+                               # are in a negative cycle.
+
+                               negs = plpy.execute(
+                                       """ SELECT array_agg(DISTINCT 
({grouping_cols})) AS grp
+                                               FROM {oldupdate}
+                                       """.format(**locals()))[0]['grp']
+
+                               # Delete the groups with negative cycles from 
the output table.
+                               sql_del = """ DELETE FROM {out_table}
+                                       USING {oldupdate} AS oldupdate
+                                       WHERE {checkg_oo_sub}"""
+                               if is_hawq:
+                                       sql_del = """
+                                               TRUNCATE TABLE {temp_table};
+                                               INSERT INTO {temp_table}
+                                                       SELECT *
+                                                       FROM {out_table}
+                                                       WHERE NOT EXISTS(
+                                                               SELECT 1
+                                                               FROM 
{oldupdate} as oldupdate
+                                                               WHERE 
{checkg_oo_sub}
+                                                               );
+                                               DROP TABLE {out_table};
+                                               ALTER TABLE {temp_table} RENAME 
TO {out_table};"""
+
+                               plpy.execute(sql_del.format(**locals()))
+
+                               # If every group has a negative cycle,
+                               # drop the output table as well.
+                               if table_is_empty(out_table):
+                                       plpy.execute("DROP TABLE IF EXISTS 
{0},{1}".
+                                               
format(out_table,out_table+"_summary"))
+
+                               plpy.warning(
+                                       """Graph SSSP: Detected a negative 
cycle in the """ +
+                                       """sub-graphs of following groups: 
{0}.""".
+                                       format(str(negs)[1:-1]))
+
+               plpy.execute("DROP TABLE IF EXISTS {0}".format(oldupdate))
+               if is_hawq:
+                       plpy.execute("DROP TABLE IF EXISTS {temp_table} ".
+                               format(**locals()))
 
        return None
 
-def graph_sssp_get_path(schema_madlib, sssp_table, dest_vertex, **kwargs):
+def graph_sssp_get_path(schema_madlib, sssp_table, dest_vertex, path_table,
+       **kwargs):
        """
-       Helper function that can be used to get the shortest path for a vertex
+    Helper function that can be used to get the shortest path for a vertex
     Args:
-       @param source_table     Name of the table that contains the SSSP output.
-        @param out_table       The vertex that will be the destination of the
-                                       desired path.
+        @param sssp_table   Name of the table that contains the SSSP output.
+        @param dest_vertex  The vertex that will be the destination of the
+                            desired path.
+        @param path_table   Name of the output table that contains the path.
        """
+       with MinWarning("warning"):
+               _validate_get_path(sssp_table, dest_vertex, path_table)
 
-       validate_get_path(sssp_table, dest_vertex)
-       cur = dest_vertex
-       cols = get_cols(sssp_table)
-       id = cols[0]
-       ret = [dest_vertex]
-       plan_name = unique_string(desp='plan')
-
-       # Follow the 'parent' chain until you reach the source.
-       # We don't need to know what the source is since it is the only vertex 
with
-       # itself as its parent
-       plpy.execute(""" PREPARE {plan_name} (int) AS
-               SELECT parent FROM {sssp_table} WHERE {id} = $1 LIMIT 1
-               """.format(**locals()))
-       sql = "EXECUTE {plan_name} ({cur})"
-       parent = plpy.execute(sql.format(**locals()))
+               temp1_name = unique_string(desp='temp1')
+               temp2_name = unique_string(desp='temp2')
 
-       if parent.nrows() == 0:
-               plpy.error(
-                       "Graph SSSP: Vertex {0} is not present in the sssp 
table {1}".
-                       format(dest_vertex,sssp_table))
-
-       while 1:
-               parent = parent[0]['parent']
-               if parent == cur:
-                       ret.reverse()
-                       return ret
-               else:
-                       ret.append(parent)
-                       cur = parent
-               parent = plpy.execute(sql.format(**locals()))
+               select_grps = ""
+               check_grps_t1 = ""
+               check_grps_t2 = ""
+               check_grps_pt1 = ""
+               check_grps_pt2 = ""
+               checkg_po = ""
+               grp_comma = ""
+               tmp = ""
+
+               summary = plpy.execute("SELECT * FROM 
{0}_summary".format(sssp_table))
+               vertex_id = summary[0]['vertex_id']
+               source_vertex = summary[0]['source_vertex']
+
+               if vertex_id == "NULL":
+                       vertex_id = "id"
+
+               grouping_cols = summary[0]['grouping_cols']
+               if grouping_cols == "NULL":
+                       grouping_cols = None
+
+               if grouping_cols is not None:
+                       glist = split_quoted_delimited_str(grouping_cols)
+                       select_grps = _grp_from_table(sssp_table,glist) + " , "
+                       check_grps_t1 = " AND " + _check_groups(
+                               sssp_table,temp1_name,glist)
+                       check_grps_t2 = " AND " + _check_groups(
+                               sssp_table,temp2_name,glist)
+
+                       checkg_po = " WHERE " + _check_groups(
+                               path_table,"oldupdate",glist)
+                       grp_comma = grouping_cols + " , "
+
+               if source_vertex == dest_vertex:
+                       plpy.execute("""
+                               CREATE TABLE {path_table} AS
+                               SELECT {grp_comma} '{{{dest_vertex}}}'::INT[] 
AS path
+                               FROM {sssp_table} WHERE {vertex_id} = 
{dest_vertex}
+                               """.format(**locals()))
+                       return
+
+               plpy.execute( "DROP TABLE IF EXISTS {0},{1}".
+                       format(temp1_name,temp2_name));
+               out = plpy.execute(""" CREATE TEMP TABLE {temp1_name} AS
+                               SELECT {grp_comma} {sssp_table}.parent AS 
{vertex_id},
+                                       ARRAY[{dest_vertex}] AS path
+                               FROM {sssp_table}
+                               WHERE {vertex_id} = {dest_vertex}
+                                       AND {sssp_table}.parent IS NOT NULL
+                       """.format(**locals()))
+
+               plpy.execute("""
+                       CREATE TEMP TABLE {temp2_name} AS
+                               SELECT * FROM {temp1_name} LIMIT 0
+                       """.format(**locals()))
+
+               # Follow the 'parent' chain until you reach the source.
+               while out.nrows() > 0:
+
+                       plpy.execute("TRUNCATE TABLE 
{temp2_name}".format(**locals()))
+                       # If the vertex id is not the source vertex,
+                       # Add it to the path and move to its parent
+                       out = plpy.execute(
+                               """ INSERT INTO {temp2_name}
+                               SELECT {select_grps} {sssp_table}.parent AS 
{vertex_id},
+                                       {sssp_table}.{vertex_id} || 
{temp1_name}.path AS path
+                               FROM {sssp_table} INNER JOIN {temp1_name} ON
+                                       ({sssp_table}.{vertex_id} = 
{temp1_name}.{vertex_id}
+                                               {check_grps_t1})
+                               WHERE {source_vertex} <> 
{sssp_table}.{vertex_id}
+                               """.format(**locals()))
+
+                       tmp = temp2_name
+                       temp2_name = temp1_name
+                       temp1_name = tmp
+
+                       tmp = check_grps_t1
+                       check_grps_t1 = check_grps_t2
+                       check_grps_t2 = tmp
+
+               # Add the source vertex to the beginning of every path and
+               # add the empty arrays for the groups that don't have a path to 
reach
+               # the destination vertex
+               plpy.execute("""
+                       CREATE TABLE {path_table} AS
+                       SELECT {grp_comma} {source_vertex} || path AS path
+                       FROM {temp2_name}
+                       UNION
+                       SELECT {grp_comma} '{{}}'::INT[] AS path
+                       FROM {sssp_table}
+                       WHERE {vertex_id} = {dest_vertex}
+                               AND {sssp_table}.parent IS NULL
+                       """.format(**locals()))
+
+               out = plpy.execute("SELECT 1 FROM {0} LIMIT 
1".format(path_table))
+
+               if out.nrows() == 0:
+                       plpy.error(
+                               "Graph SSSP: Vertex {0} is not present in the 
SSSP table {1}".
+                               format(dest_vertex,sssp_table))
+
+               plpy.execute("DROP TABLE IF EXISTS {temp1_name}, {temp1_name}".
+                       format(**locals()))
 
        return None
 
-def validate_sssp(vertex_table, vertex_id, edge_table, edge_params,
-       source_vertex, out_table, **kwargs):
+
+def _validate_sssp(vertex_table, vertex_id, edge_table, edge_params,
+       source_vertex, out_table, glist, **kwargs):
 
        validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
                out_table,'SSSP')
 
        _assert(isinstance(source_vertex,int),
-               """Graph SSSP: Source vertex {source_vertex} has to be an 
integer """.
+               """Graph SSSP: Source vertex {source_vertex} has to be an 
integer.""".
                format(**locals()))
        src_exists = plpy.execute("""
                SELECT * FROM {vertex_table} WHERE {vertex_id}={source_vertex}
@@ -300,8 +593,8 @@ def validate_sssp(vertex_table, vertex_id, edge_table, 
edge_params,
 
        if src_exists.nrows() == 0:
                plpy.error(
-                       """Graph SSSP: Source vertex {source_vertex} is not 
present in the
-                       vertex table {vertex_table} """.format(**locals()))
+                       """Graph SSSP: Source vertex {source_vertex} is not 
present in the vertex table {vertex_table}.""".
+                       format(**locals()))
 
        vt_error = plpy.execute(
                """ SELECT {vertex_id}
@@ -312,12 +605,20 @@ def validate_sssp(vertex_table, vertex_id, edge_table, 
edge_params,
 
        if vt_error.nrows() != 0:
                plpy.error(
-                       """Graph SSSP: Source vertex table {vertex_table}
-                       contains duplicate vertex id's """.format(**locals()))
+                       """Graph SSSP: Source vertex table {vertex_table} 
contains duplicate vertex id's.""".
+                       format(**locals()))
+
+       _assert(not table_exists(out_table+"_summary"),
+               "Graph SSSP: Output summary table already exists!")
+
+       if glist is not None:
+               _assert(columns_exist_in_table(edge_table, glist),
+                       """Graph SSSP: Not all columns from {glist} are present 
in edge table ({edge_table}).""".
+                       format(**locals()))
 
        return None
 
-def validate_get_path(sssp_table, dest_vertex, **kwargs):
+def _validate_get_path(sssp_table, dest_vertex, path_table, **kwargs):
 
        _assert(sssp_table and sssp_table.strip().lower() not in ('null', ''),
                "Graph SSSP: Invalid SSSP table name!")
@@ -326,21 +627,31 @@ def validate_get_path(sssp_table, dest_vertex, **kwargs):
        _assert(not table_is_empty(sssp_table),
                "Graph SSSP: SSSP table ({0}) is empty!".format(sssp_table))
 
+       summary = sssp_table+"_summary"
+       _assert(table_exists(summary),
+               "Graph SSSP: SSSP summary table ({0}) is 
missing!".format(summary))
+       _assert(not table_is_empty(summary),
+               "Graph SSSP: SSSP summary table ({0}) is 
empty!".format(summary))
+
+       _assert(not table_exists(path_table),
+               "Graph SSSP: Output path table already exists!")
+
+       return None
 
 def graph_sssp_help(schema_madlib, message, **kwargs):
-    """
-    Help function for graph_sssp and graph_sssp_get_path
+       """
+       Help function for graph_sssp and graph_sssp_get_path
 
-    Args:
-        @param schema_madlib
-        @param message: string, Help message string
-        @param kwargs
+       Args:
+               @param schema_madlib
+               @param message: string, Help message string
+               @param kwargs
 
-    Returns:
-        String. Help/usage information
-    """
-    if not message:
-        help_string = """
+       Returns:
+           String. Help/usage information
+       """
+       if not message:
+               help_string = """
 -----------------------------------------------------------------------
                             SUMMARY
 -----------------------------------------------------------------------
@@ -352,41 +663,120 @@ weights of its constituent edges is minimized.
 For more details on function usage:
     SELECT {schema_madlib}.graph_sssp('usage')
             """
-    elif message in ['usage', 'help', '?']:
-        help_string = """
+       elif message.lower() in ['usage', 'help', '?']:
+               help_string = """
 {graph_usage}
 
 To retrieve the path for a specific vertex:
 
  SELECT {schema_madlib}.graph_sssp_get_path(
     sssp_table TEXT, -- Name of the table that contains the SSSP output.
-    dest_vertex        INT   -- The vertex that will be the destination of the
-                 -- desired path.
+    dest_vertex        INT,  -- The vertex that will be the destination of the
+                      -- desired path.
+    path_table  TEXT  -- Name of the output table that contains the path.
 );
 
 ----------------------------------------------------------------------------
                             OUTPUT
 ----------------------------------------------------------------------------
-The output table ('out_table' above) will contain a row for every vertex from
-vertex_table and have the following columns:
-
-vertex_id      : The id for the destination. Will use the input parameter
-               (vertex_id) for column naming.
-weight                 : The total weight of the shortest path from the source 
vertex
-               to this particular vertex. Will use the input parameter (weight)
-               for column naming.
-parent                 : The parent of this vertex in the shortest path from 
source.
-               Will use "parent" for column naming.
-
-The graph_sssp_get_path function will return an INT array that contains the
-shortest path from the initial source vertex to the desired destination vertex.
+The output of SSSP ('out_table' above) contains a row for every vertex of
+every group and have the following columns (in addition to the grouping
+columns):
+  - vertex_id : The id for the destination. Will use the input parameter
+                'vertex_id' for column naming.
+  - weight    : The total weight of the shortest path from the source vertex
+              to this particular vertex.
+              Will use the input parameter 'weight' for column naming.
+  - parent    : The parent of this vertex in the shortest path from source.
+              Will use 'parent' for column naming.
+
+The output of graph_sssp_get_path ('path_table' above) contains a row for
+every group and has the following columns:
+  - grouping_cols : The grouping columns given in the creation of the SSSP
+                  table. If there are no grouping columns, these columns
+                  will not exist and the table will have a single row.
+  - path (ARRAY)  : The shortest path from the source vertex (as specified
+                  in the SSSP execution) to the destination vertex.
+"""
+       elif message.lower() in ("example", "examples"):
+               help_string = """
+----------------------------------------------------------------------------
+                                EXAMPLES
+----------------------------------------------------------------------------
+-- Create a graph, represented as vertex and edge tables.
+DROP TABLE IF EXISTS vertex,edge,out,out_summary,out_path;
+CREATE TABLE vertex(
+        id INTEGER
+        );
+CREATE TABLE edge(
+        src INTEGER,
+        dest INTEGER,
+        weight DOUBLE PRECISION
+);
+
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6),
+(7)
+;
+INSERT INTO edge VALUES
+(0, 1, 1),
+(0, 2, 1),
+(0, 4, 10),
+(1, 2, 2),
+(1, 3, 10),
+(2, 3, 1),
+(2, 5, 1),
+(2, 6, 3),
+(3, 0, 1),
+(4, 0, -2),
+(5, 6, 1),
+(6, 7, 1)
+;
+
+-- Compute the SSSP:
+DROP TABLE IF EXISTS pagerank_out;
+SELECT madlib.graph_sssp(
+       'vertex',                            -- Vertex table
+       'id',                                -- Vertix id column
+       'edge',                              -- Edge table
+       'src=src, dest=dest, weight=weight', -- Comma delimted string of edge 
arguments
+        0,                                  -- The source vertex
+       'out'                                -- Output table of SSSP
+);
+-- View the SSSP costs for every vertex:
+SELECT * FROM out ORDER BY id;
+
+-- View the actual shortest path for a vertex:
+SELECT graph_sssp_get_path('out',5,'out_path');
+SELECT * FROM out_path;
+
+-- Create a graph with 2 groups:
+DROP TABLE IF EXISTS edge_gr;
+CREATE TABLE edge_gr AS
+(
+  SELECT *, 0 AS grp FROM edge
+  UNION
+  SELECT *, 1 AS grp FROM edge WHERE src < 6 AND dest < 6
+);
+INSERT INTO edge_gr VALUES
+(4,5,-20,1);
+
+-- Find SSSP for all groups:
+DROP TABLE IF EXISTS out_gr, out_gr_summary;
+SELECT graph_sssp('vertex',NULL,'edge_gr',NULL,0,'out_gr','grp');
 """
-    else:
-        help_string = "No such option. Use {schema_madlib}.graph_sssp()"
+       else:
+               help_string = "No such option. Use {schema_madlib}.graph_sssp()"
 
-    return help_string.format(schema_madlib=schema_madlib,
-       graph_usage=get_graph_usage(schema_madlib, 'graph_sssp',
+       return help_string.format(schema_madlib=schema_madlib,
+               graph_usage=get_graph_usage(schema_madlib, 'graph_sssp',
     """source_vertex INT,  -- The source vertex id for the algorithm to start.
-    out_table     TEXT  -- Name of the table to store the result of SSSP."""))
+    out_table     TEXT, -- Name of the table to store the result of SSSP.
+    grouping_cols TEXT  -- The list of grouping columns."""))
 # ---------------------------------------------------------------------
-

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8faf6226/src/ports/postgres/modules/graph/sssp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/sssp.sql_in 
b/src/ports/postgres/modules/graph/sssp.sql_in
index 7f89823..be433dc 100644
--- a/src/ports/postgres/modules/graph/sssp.sql_in
+++ b/src/ports/postgres/modules/graph/sssp.sql_in
@@ -55,7 +55,8 @@ graph_sssp( vertex_table,
             edge_table,
             edge_args,
             source_vertex,
-            out_table
+            out_table,
+            grouping_cols
           )
 </pre>
 
@@ -89,12 +90,18 @@ exist in the 'vertex_id' column of 'vertex_table'.</dd>
 
 <dt>out_table</dt>
 <dd>TEXT. Name of the table to store the result of SSSP.
-It will contain a row for every vertex from 'vertex_table' and have
-the following columns:
+It contains a row for every vertex of every group and have
+the following columns (in addition to the grouping columns):
   - vertex_id : The id for the destination. Will use the input parameter 
'vertex_id' for column naming.
   - weight : The total weight of the shortest path from the source vertex to 
this particular vertex.
-  Will use the input parameter (weight) for column naming.
-  - parent : The parent of this vertex in the shortest path from source. Will 
use 'parent' for column naming.</dd>
+  Will use the input parameter 'weight' for column naming.
+  - parent : The parent of this vertex in the shortest path from source. Will 
use 'parent' for column naming.
+
+A summary table named <out_table>_summary is also created. This is an internal 
table that keeps a record of the input parameters and is used by the path 
function described below.
+</dd>
+
+<dt>grouping_cols</dt>
+<dd>TEXT, default = NULL. List of columns used to group the input into 
discrete subgraphs. These columns must exist in the edge table. When this value 
is null, no grouping is used and a single SSSP result is generated. </dd>
 </dl>
 
 @par Path Retrieval
@@ -103,9 +110,10 @@ The path retrieval function returns the shortest path from 
the
 source vertex to a specified desination vertex.
 
 <pre class="syntax">
-graph_sssp( sssp_table,
-            dest_vertex
-          )
+graph_sssp_get_path( sssp_table,
+                     dest_vertex,
+                     path_table
+                    )
 </pre>
 
 \b Arguments
@@ -115,6 +123,14 @@ graph_sssp( sssp_table,
 
 <dt>dest_vertex</dt>
 <dd>INTEGER. The vertex that will be the destination of the desired path.</dd>
+
+<dt>path_table</dt>
+<dd>TEXT. Name of the output table that contains the path.
+It contains a row for every group and has the following columns:
+  - grouping_cols : The grouping columns given in the creation of the SSSP 
table. If there are no grouping columns, these columns will not exist and the 
table will have a single row.
+  - path (ARRAY) : The shortest path from the source vertex (as specified in 
the SSSP execution) to the destination vertex.
+</dd>
+
 </dl>
 
 @anchor notes
@@ -167,7 +183,7 @@ INSERT INTO edge VALUES
 
 -# Calculate the shortest paths from vertex 0:
 <pre class="syntax">
-DROP TABLE IF EXISTS out;
+DROP TABLE IF EXISTS out, out_summary;
 SELECT madlib.graph_sssp(
                          'vertex',      -- Vertex table
                          NULL,          -- Vertix id column (NULL means use 
default naming)
@@ -191,14 +207,16 @@ SELECT * FROM out ORDER BY id;
 (8 rows)
 </pre>
 
--# Get the shortest path to vertex 6:
+-# Get the shortest path to vertex 5:
 <pre class="syntax">
-SELECT madlib.graph_sssp_get_path('out',6) AS spath;
+DROP TABLE IF EXISTS out_path;
+SELECT madlib.graph_sssp_get_path('out',5,'out_path');
+SELECT * FROM out_path;
 </pre>
 <pre class="result">
-   spath
-\-----------
- {0,2,5,6}
+  path
+\---------
+ {0,2,5}
 </pre>
 
 -# Now let's do a similar example except using
@@ -212,10 +230,10 @@ CREATE TABLE edge_alt AS SELECT src AS e_src, dest, 
weight AS e_weight FROM edge
 
 -# Get the shortest path from vertex 1:
 <pre class="syntax">
-DROP TABLE IF EXISTS out_alt;
+DROP TABLE IF EXISTS out_alt, out_alt_summary;
 SELECT madlib.graph_sssp(
                          'vertex_alt',                  -- Vertex table
-                         'v_id',                        -- Vertix id column 
(NULL means use default naming)
+                         'v_id',                        -- Vertex id column 
(NULL means use default naming)
                          'edge_alt',                    -- Edge table
                          'src=e_src, weight=e_weight',  -- Edge arguments 
(NULL means use default naming)
                          1,                             -- Source vertex for 
path calculation
@@ -236,6 +254,65 @@ SELECT * FROM out_alt ORDER BY v_id;
 (8 rows)
 </pre>
 
+-# Create a graph with 2 groups:
+<pre class="syntax">
+DROP TABLE IF EXISTS edge_gr;
+CREATE TABLE edge_gr AS
+(
+  SELECT *, 0 AS grp FROM edge
+  UNION
+  SELECT *, 1 AS grp FROM edge WHERE src < 6 AND dest < 6
+);
+INSERT INTO edge_gr VALUES
+(4,5,-20,1);
+</pre>
+
+-# Find SSSP for all groups
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr, out_gr_summary;
+SELECT madlib.graph_sssp(
+                         'vertex',      -- Vertex table
+                         NULL,          -- Vertex id column (NULL means use 
default naming)
+                         'edge_gr',     -- Edge table
+                         NULL,          -- Edge arguments (NULL means use 
default naming)
+                         0,             -- Source vertex for path calculation
+                         'out_gr',      -- Output table of shortest paths
+                         'grp'          -- Grouping columns
+);
+SELECT * FROM out_gr ORDER BY grp,id;
+</pre>
+<pre class="result">
+ grp | id | weight | parent
+-----+----+--------+--------
+   0 |  0 |      0 |      0
+   0 |  1 |      1 |      0
+   0 |  2 |      1 |      0
+   0 |  3 |      2 |      2
+   0 |  4 |     10 |      0
+   0 |  5 |      2 |      2
+   0 |  6 |      3 |      5
+   0 |  7 |      4 |      6
+   1 |  0 |      0 |      0
+   1 |  1 |      1 |      0
+   1 |  2 |      1 |      0
+   1 |  3 |      2 |      2
+   1 |  4 |     10 |      0
+   1 |  5 |    -10 |      4
+</pre>
+
+-# Find the path to vertex 5 in every group
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr_path;
+SELECT madlib.graph_sssp_get_path('out_gr',5,'out_gr_path');
+SELECT * FROM out_gr_path ORDER BY grp;
+</pre>
+<pre class="result">
+ grp |  path
+-----+---------
+   0 | {0,2,5}
+   1 | {0,4,5}
+</pre>
+
 @anchor literature
 @par Literature
 
@@ -253,21 +330,36 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp(
     edge_table              TEXT,
     edge_args               TEXT,
     source_vertex           INT,
-    out_table               TEXT
+    out_table               TEXT,
+    grouping_cols           TEXT
 
 ) RETURNS VOID AS $$
     PythonFunction(graph, sssp, graph_sssp)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp(
+    vertex_table            TEXT,
+    vertex_id               TEXT,
+    edge_table              TEXT,
+    edge_args               TEXT,
+    source_vertex           INT,
+    out_table               TEXT
+
+) RETURNS VOID AS $$
+     SELECT MADLIB_SCHEMA.graph_sssp($1, $2, $3, $4, $5, $6, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp_get_path(
     sssp_table             TEXT,
-    dest_vertex            INT
+    dest_vertex            INT,
+    path_table             TEXT
 
-) RETURNS INT[] AS $$
+) RETURNS VOID AS $$
     PythonFunction(graph, sssp, graph_sssp_get_path)
 $$ LANGUAGE plpythonu VOLATILE
-m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
 -------------------------------------------------------------------------
 
 -- Online help

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8faf6226/src/ports/postgres/modules/graph/test/sssp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/test/sssp.sql_in 
b/src/ports/postgres/modules/graph/test/sssp.sql_in
index e2342c5..c3545c2 100644
--- a/src/ports/postgres/modules/graph/test/sssp.sql_in
+++ b/src/ports/postgres/modules/graph/test/sssp.sql_in
@@ -20,7 +20,10 @@
  *//* ----------------------------------------------------------------------- 
*/
 
 
-DROP TABLE IF EXISTS vertex,edge,out,vertex_alt,edge_alt,out_alt;
+DROP TABLE IF EXISTS vertex,edge,out,out_summary,out_path,
+       vertex_alt,edge_alt,out_alt,out_alot_summary,
+       edge_gr,out_gr,out_gr_summary,out_gr_path,
+       edge_gr2, out_gr2, out_gr2_summary;
 
 
 CREATE TABLE vertex(
@@ -30,7 +33,7 @@ CREATE TABLE vertex(
 CREATE TABLE edge(
                   src INTEGER,
                   dest INTEGER,
-                  weight INTEGER
+                  weight DOUBLE PRECISION
                 );
 
 INSERT INTO vertex VALUES
@@ -62,17 +65,69 @@ SELECT graph_sssp('vertex',NULL,'edge',NULL,0,'out');
 
 SELECT * FROM out;
 
-SELECT assert(weight = 3, 'Wrong output in graph (SSSP)') FROM out WHERE id = 
6;
-SELECT assert(parent = 5, 'Wrong parent in graph (SSSP)') FROM out WHERE id = 
6;
+SELECT assert(weight = 3, 'Wrong output in graph (SSSP)')
+       FROM out WHERE id = 6;
+SELECT assert(parent = 5, 'Wrong parent in graph (SSSP)')
+       FROM out WHERE id = 6;
 
-SELECT graph_sssp_get_path('out',6);
+SELECT graph_sssp_get_path('out',6,'out_path');
 
-CREATE TABLE vertex_alt AS SELECT id AS v_id FROM vertex;
-CREATE TABLE edge_alt AS SELECT src AS e_src, dest, weight AS e_weight FROM 
edge;
+CREATE TABLE vertex_alt AS SELECT id AS v_id
+       FROM vertex;
+CREATE TABLE edge_alt AS SELECT src AS e_src, dest, weight AS e_weight
+       FROM edge;
 
-SELECT graph_sssp('vertex_alt','v_id','edge_alt','src=e_src, 
weight=e_weight',1,'out_alt');
+SELECT graph_sssp('vertex_alt','v_id','edge_alt','src=e_src, weight=e_weight'
+       ,1,'out_alt');
 
 SELECT * FROM out_alt;
 
-SELECT assert(e_weight = 4, 'Wrong output in graph (SSSP)') FROM out_alt WHERE 
v_id = 6;
-SELECT assert(parent = 5, 'Wrong parent in graph (SSSP)') FROM out_alt WHERE 
v_id = 6;
+SELECT assert(e_weight = 4, 'Wrong output in graph (SSSP)')
+       FROM out_alt WHERE v_id = 6;
+SELECT assert(parent = 5, 'Wrong parent in graph (SSSP)')
+       FROM out_alt WHERE v_id = 6;
+
+CREATE TABLE edge_gr AS
+(      SELECT *, 0 AS grp FROM edge
+       UNION
+       SELECT *, 1 AS grp FROM edge WHERE src < 6 AND dest < 6
+       UNION
+       SELECT *, 2 AS grp FROM edge WHERE src < 6 AND dest < 6
+);
+
+INSERT INTO edge_gr VALUES
+(7,NULL,NULL,1),
+(4,0,-20,2);
+
+SELECT graph_sssp('vertex',NULL,'edge_gr',NULL,0,'out_gr','grp');
+
+SELECT assert(weight = 3, 'Wrong output in graph (SSSP)')
+       FROM out_gr WHERE id = 6 AND grp = 0;
+SELECT assert(parent = 5, 'Wrong parent in graph (SSSP)')
+       FROM out_gr WHERE id = 6 AND grp = 0;
+
+SELECT assert(weight = 2, 'Wrong output in graph (SSSP)')
+       FROM out_gr WHERE id = 5 AND grp = 1;
+SELECT assert(parent = 2, 'Wrong parent in graph (SSSP)')
+       FROM out_gr WHERE id = 5 AND grp = 1;
+
+SELECT assert(weight = 'Infinity', 'Wrong output in graph (SSSP)')
+       FROM out_gr WHERE id = 7 AND grp = 1;
+
+SELECT graph_sssp_get_path('out_gr',5,'out_gr_path');
+
+CREATE TABLE edge_gr2 AS
+(      SELECT *, 0 AS grp1, 0 AS grp2 FROM edge
+       UNION
+       SELECT *, 1 AS grp1, 0 AS grp2 FROM edge WHERE src < 6 AND dest < 6
+       UNION
+       SELECT *, 1 AS grp1, 1 AS grp2 FROM edge WHERE src < 6 AND dest < 6
+);
+
+SELECT graph_sssp('vertex',NULL,'edge_gr2',NULL,0,'out_gr2','grp1,grp2');
+
+
+SELECT assert(weight = 3, 'Wrong output in graph (SSSP)')
+       FROM out_gr2 WHERE id = 6 AND grp1 = 0 AND grp2 = 0;
+SELECT assert(parent = 5, 'Wrong parent in graph (SSSP)')
+       FROM out_gr2 WHERE id = 6 AND grp1 = 0 AND grp2 = 0;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8faf6226/src/ports/postgres/modules/pca/pca.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/pca/pca.py_in 
b/src/ports/postgres/modules/pca/pca.py_in
index 196c558..680e9f6 100644
--- a/src/ports/postgres/modules/pca/pca.py_in
+++ b/src/ports/postgres/modules/pca/pca.py_in
@@ -144,16 +144,16 @@ def pca_wrap(schema_madlib, source_table, pc_table, 
row_id,
         )
         """.format(pc_table=pc_table, 
grouping_cols_clause=grouping_cols_clause))
     pc_table_mean = add_postfix(pc_table, "_mean")
+    plpy.execute("DROP TABLE IF EXISTS {0}".format(pc_table_mean))
     plpy.execute("""
-        DROP TABLE IF EXISTS {pc_table_mean};
         CREATE TABLE {pc_table_mean} (
             column_mean     double precision[]
             {grouping_cols_clause}
         )
         """.format(pc_table_mean=pc_table_mean, 
grouping_cols_clause=grouping_cols_clause))
     if result_summary_table:
+        plpy.execute("DROP TABLE IF EXISTS {0}".format(result_summary_table))
         plpy.execute("""
-                DROP TABLE IF EXISTS {0};
                 CREATE TABLE {0} (
                 rows_used               INTEGER,
                 "exec_time (ms)"        numeric,
@@ -947,7 +947,7 @@ SELECT {schema_madlib}.pca_train( 'mat',
           'id',
           3
     );
-    
+
 SELECT * FROM result_table ORDER BY row_id;
 
 DROP TABLE IF EXISTS mat_group;

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8faf6226/src/ports/postgres/modules/stats/test/chi2_test.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/stats/test/chi2_test.sql_in 
b/src/ports/postgres/modules/stats/test/chi2_test.sql_in
index c49d996..62648a0 100644
--- a/src/ports/postgres/modules/stats/test/chi2_test.sql_in
+++ b/src/ports/postgres/modules/stats/test/chi2_test.sql_in
@@ -58,7 +58,7 @@ CREATE TABLE chi2_independence_est_1 AS
 SELECT (chi2_gof_test(observed, expected, deg_freedom)).*
 FROM (
     SELECT
-        observed,
+        id_x,id_y,observed,
         sum(observed) OVER (PARTITION BY id_x)::DOUBLE PRECISION
             * sum(observed) OVER (PARTITION BY id_y) AS expected
     FROM chi2_test_friendly_unpivoted

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/8faf6226/src/ports/postgres/modules/validation/internal/cross_validation.py_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/validation/internal/cross_validation.py_in 
b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
index c1b2561..11cde2f 100644
--- a/src/ports/postgres/modules/validation/internal/cross_validation.py_in
+++ b/src/ports/postgres/modules/validation/internal/cross_validation.py_in
@@ -200,9 +200,9 @@ def _cv_copy_data(rel_origin, dependent_varname,
     """
     """
     target_col, features_col = 'y', 'x'
+    plpy.execute("drop table if exists {0}".format(rel_copied))
     plpy.execute("""
         select setseed(0.5);
-        drop table if exists {rel_copied};
         create temp table {rel_copied} as
             select
                 row_number() over (order by random()) as {random_id},
@@ -233,15 +233,15 @@ def _cv_split_data(rel_source, col_data, col_id, row_num,
     # which corresponds to rows outside of [start_row, end_row).
     # Extract the validation part of data,
     # which corresponds to rows inside of [start_row, end_row).
+    plpy.execute("drop view if exists {rel_train}".format(**kwargs))
     plpy.execute("""
-        drop view if exists {rel_train};
         create temp view {rel_train} as
             select {col_id}, {col_string} from {rel_source}
             where {col_id} < {start_row}
                  or {col_id} >= {end_row}
         """.format(**kwargs))
+    plpy.execute("drop view if exists {rel_valid}".format(**kwargs))
     plpy.execute("""
-        drop view if exists {rel_valid};
         create temp view {rel_valid} as
             select {col_id}, {col_string} from {rel_source}
             where {col_id} >= {start_row}

incubator-madlib git commit: Mulltiple: Add grouping support for SSSP and support GPDB5

Reply via email to