incubator-madlib git commit: Graph: Add APSP with grouping

okislal Fri, 26 May 2017 15:31:48 -0700

Repository: incubator-madlib
Updated Branches:
  refs/heads/master e3c98a094 -> 755c7b70d



Graph: Add APSP with grouping

JIRA: MADLIB-1099

- Add all pairs shortest path algorithm with grouping support.
This algorithm provides the minimum cost (sum of edge weights) paths for
every vertex pair in the graph.
- Add the get path algorithm to find the actual path for any vertex pair
(with grouping support)
- Minor changes in SSSP and graph utilities.

Closes #136


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/755c7b70
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/755c7b70
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/755c7b70

Branch: refs/heads/master
Commit: 755c7b70d3bfc44a15b4d3d9f4fa634b24723a5b
Parents: e3c98a0
Author: Orhan Kislal <okis...@pivotal.io>
Authored: Fri May 26 15:29:57 2017 -0700
Committer: Orhan Kislal <okis...@pivotal.io>
Committed: Fri May 26 15:29:57 2017 -0700

----------------------------------------------------------------------
 doc/mainpage.dox.in                             |   4 +
 src/ports/postgres/modules/graph/apsp.py_in     | 832 +++++++++++++++++++
 src/ports/postgres/modules/graph/apsp.sql_in    | 430 ++++++++++
 .../postgres/modules/graph/graph_utils.py_in    |  23 +
 src/ports/postgres/modules/graph/sssp.py_in     |  50 +-
 src/ports/postgres/modules/graph/sssp.sql_in    |   8 +-
 .../postgres/modules/graph/test/apsp.sql_in     |  99 +++
 7 files changed, 1405 insertions(+), 41 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/755c7b70/doc/mainpage.dox.in
----------------------------------------------------------------------
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index c94260b..8b894b1 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -125,6 +125,10 @@ complete matrix stored as a distributed table.
         @ingroup grp_datatrans
 @defgroup grp_graph Graph
 @{Contains graph algorithms. @}
+
+    @defgroup grp_apsp All Pairs Shortest Path
+    @ingroup grp_graph
+
     @defgroup grp_pagerank PageRank
     @ingroup grp_graph
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/755c7b70/src/ports/postgres/modules/graph/apsp.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/apsp.py_in 
b/src/ports/postgres/modules/graph/apsp.py_in
new file mode 100644
index 0000000..298f57c
--- /dev/null
+++ b/src/ports/postgres/modules/graph/apsp.py_in
@@ -0,0 +1,832 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# All Pairs Shortest Path
+
+# Please refer to the apsp.sql_in file for the documentation
+
+"""
+@file apsp.py_in
+
+@namespace graph
+"""
+
+
+import plpy
+from graph_utils import validate_graph_coding
+from graph_utils import get_graph_usage
+from graph_utils import _grp_from_table
+from graph_utils import _check_groups
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.utilities import split_quoted_delimited_str
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import table_is_empty
+from utilities.validate_args import get_expr_type
+
+m4_changequote(`<!', `!>')
+
+def graph_apsp(schema_madlib, vertex_table, vertex_id, edge_table,
+               edge_args, out_table, grouping_cols, **kwargs):
+       """
+    All Pairs shortest path function for graphs using the matrix
+    multiplication based algorithm [1].
+    [1] 
http://users.cecs.anu.edu.au/~Alistair.Rendell/Teaching/apac_comp3600/module4/all_pairs_shortest_paths.xhtml
+    Args:
+        @param vertex_table     Name of the table that contains the vertex 
data.
+        @param vertex_id        Name of the column containing the vertex ids.
+        @param edge_table       Name of the table that contains the edge data.
+        @param edge_args        A comma-delimited string containing multiple
+                                                       named arguments of the 
form "name=value".
+        @param out_table           Name of the table to store the result of 
APSP.
+        @param grouping_cols   The list of grouping columns.
+
+    """
+
+       with MinWarning("warning"):
+
+               INT_MAX = 2147483647
+               INFINITY = "'Infinity'"
+               EPSILON = 0.000001
+
+               params_types = {'src': str, 'dest': str, 'weight': str}
+               default_args = {'src': 'src', 'dest': 'dest', 'weight': 
'weight'}
+               edge_params = extract_keyvalue_params(edge_args,
+                                            params_types,
+                                            default_args)
+
+               # Prepare the input for recording in the summary table
+               if vertex_id is None:
+                       v_st= "NULL"
+                       vertex_id = "id"
+               else:
+                       v_st = vertex_id
+               if edge_args is None:
+                       e_st = "NULL"
+               else:
+                       e_st = edge_args
+               if grouping_cols is None:
+                       g_st = "NULL"
+                       glist = None
+               else:
+                       g_st = grouping_cols
+                       glist = split_quoted_delimited_str(grouping_cols)
+
+               src = edge_params["src"]
+               dest = edge_params["dest"]
+               weight = edge_params["weight"]
+
+               distribution = m4_ifdef(<!__POSTGRESQL__!>, <!''!>,
+                       <!"DISTRIBUTED BY ({0})".format(src)!>)
+
+               is_hawq = m4_ifdef(<!__HAWQ__!>, <!True!>, <!False!>)
+
+               _validate_apsp(vertex_table, vertex_id, edge_table,
+                       edge_params, out_table, glist)
+
+               out_table_1 = unique_string(desp='out_table_1')
+               out_table_2 = unique_string(desp='out_table_2')
+               tmp_view = unique_string(desp='tmp_view')
+               v1 = unique_string(desp='v1')
+               v2 = unique_string(desp='v2')
+               message = unique_string(desp='message')
+
+               # Initialize grouping related variables
+               comma_grp = ""
+               comma_grp_e = ""
+               comma_grp_m = ""
+               grp_comma = ""
+               grp_v1_comma = ""
+               grp_o1_comma = ""
+               grp_o_comma = ""
+               checkg_eo = ""
+               checkg_eout = ""
+               checkg_ex = ""
+               checkg_om = ""
+               checkg_o1v_sub = ""
+               checkg_ov_sub = ""
+               checkg_ot = ""
+               checkg_o1t = ""
+               checkg_vv = ""
+               checkg_o2v = ""
+               checkg_oy = ""
+               checkg_vv_sub = "TRUE"
+               grp_by = ""
+
+               if grouping_cols is not None:
+
+                       # We use actual table names in some cases and aliases 
in others
+                       # In some cases, we swap the table names so use of an 
alias is
+                       # necessary. In other cases, they are used to simplify 
debugging.
+
+                       comma_grp = " , " + grouping_cols
+                       comma_grp_e = " , " + _grp_from_table("edge",glist)
+                       comma_grp_m = " , " + _grp_from_table(message,glist)
+                       grp_comma = grouping_cols + " , "
+                       grp_v1_comma = _grp_from_table("v1",glist) + " , "
+                       grp_o1_comma = _grp_from_table(out_table_1,glist) + " , 
"
+                       grp_o_comma = _grp_from_table("out",glist) + " , "
+
+                       checkg_eo = " AND " + 
_check_groups(edge_table,out_table,glist)
+                       checkg_eout = " AND " + 
_check_groups("edge","out",glist)
+                       checkg_ex = " AND " + _check_groups("edge","x",glist)
+                       checkg_om = " AND " + 
_check_groups(out_table,message,glist)
+                       checkg_o1v_sub = _check_groups("out",tmp_view,glist)
+                       checkg_ov_sub = _check_groups(out_table,tmp_view,glist)
+                       checkg_ot = " AND " + 
_check_groups(out_table,tmp_view,glist)
+                       checkg_o1t = " AND " + _check_groups("out","t",glist)
+                       checkg_vv = " AND " + _check_groups("v1","v2",glist)
+                       checkg_o2v = " AND " + 
_check_groups(out_table_2,"v2",glist)
+                       checkg_oy = " AND " + _check_groups("out","y",glist)
+                       checkg_vv_sub = _check_groups("v1","v2",glist)
+                       grp_by = " GROUP BY " + grouping_cols
+
+               w_type = get_expr_type(weight,edge_table).lower()
+               init_w = INT_MAX
+               if w_type in ['real','double precision','float8']:
+                       init_w = INFINITY
+
+               # We keep a summary table to keep track of the parameters used 
for this
+               # APSP run. This table is used in the path finding function to 
eliminate
+               # the need for repetition.
+               plpy.execute( """ CREATE TABLE {out_table}_summary  (
+                       vertex_table            TEXT,
+                       vertex_id               TEXT,
+                       edge_table              TEXT,
+                       edge_args               TEXT,
+                       out_table               TEXT,
+                       grouping_cols           TEXT)
+                       """.format(**locals()))
+               plpy.execute( """ INSERT INTO {out_table}_summary VALUES
+                       ('{vertex_table}', '{v_st}', '{edge_table}', '{e_st}',
+                       '{out_table}', '{g_st}') """.format(**locals()))
+
+               plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))
+
+               # Find all of the vertices involved with a given group
+               plpy.execute(""" CREATE VIEW {tmp_view} AS
+                       SELECT {src} AS {vertex_id} {comma_grp}
+                       FROM {edge_table} WHERE {src} IS NOT NULL
+                       UNION
+                       SELECT {dest} AS {vertex_id} {comma_grp}
+                       FROM {edge_table} WHERE {dest} IS NOT NULL
+                       """.format(**locals()))
+
+               # Don't use the unnecessary rows of the output table during 
joins.
+               ot_sql = """ SELECT * FROM {out_table}
+                       WHERE {weight} != {init_w} AND {src} != {dest} """
+
+               # HAWQ does not support UPDATE so the initialization has to 
differ.
+               if is_hawq:
+
+                       plpy.execute(" DROP TABLE IF EXISTS {0},{1}".format(
+                               out_table_1,out_table_2))
+                       # Create 2 identical tables to swap at every iteration.
+                       plpy.execute(""" CREATE TABLE {out_table_1} AS
+                               SELECT {grp_comma} {src},{dest},{weight}, 
NULL::INT AS parent
+                               FROM {edge_table} LIMIT 0 {distribution}
+                               """.format(**locals()))
+                       plpy.execute(""" CREATE TABLE {out_table_2} AS
+                               SELECT * FROM {out_table_1} LIMIT 0 
{distribution}
+                               """.format(**locals()))
+
+                       # The source can be reached with 0 cost and next is 
itself.
+                       plpy.execute(""" INSERT INTO {out_table_2}
+                               SELECT {grp_comma} {vertex_id} AS {src}, 
{vertex_id} AS {dest},
+                                       0 AS {weight}, {vertex_id} AS parent
+                               FROM {tmp_view} """.format(**locals()))
+                       # Distance = 1: every edge means there is a path from 
src to dest
+                       plpy.execute(""" INSERT INTO {out_table_2}
+                               SELECT {grp_comma} {src}, {dest}, {weight}, 
{dest} AS parent
+                               FROM {edge_table} """.format(**locals()))
+
+                       # Fill the rest of the possible pairs with infinite 
initial weights
+                       fill_sql = """ INSERT INTO {out_table_1}
+                               SELECT {grp_v1_comma}
+                                       v1.{vertex_id} AS {src}, v2.{vertex_id} 
AS {dest},
+                                       {init_w} AS {weight}, NULL::INT AS 
parent
+                               FROM {tmp_view} v1, {tmp_view} v2
+                               WHERE NOT EXISTS
+                                       (SELECT 1 FROM {out_table_2}
+                                               WHERE v1.{vertex_id} = {src} AND
+                                                       v2.{vertex_id} = {dest}
+                                                       {checkg_vv} 
{checkg_o2v})
+                                       {checkg_vv}
+                               UNION
+                               SELECT * FROM {out_table_2}
+                               """.format(**locals())
+                       plpy.execute(fill_sql)
+
+                       ot_sql1 = ot_sql.format(out_table = out_table_1, init_w 
= init_w,
+                               weight = weight, src = src, dest = dest)
+                       ot_sql2 = ot_sql.format(out_table = out_table_2, init_w 
= init_w,
+                               weight = weight, src = src, dest = dest)
+
+               # PostgreSQL & GPDB initialization
+               else:
+
+                       plpy.execute( """ CREATE TABLE {out_table} AS
+                               (SELECT {grp_comma} {src}, {dest}, {weight},
+                                       {src} AS parent FROM {edge_table} LIMIT 
0)
+                               {distribution} """.format(**locals()))
+
+                       plpy.execute( """ INSERT INTO {out_table}
+                               SELECT {grp_v1_comma}
+                                       v1.{vertex_id} AS {src}, v2.{vertex_id} 
AS {dest},
+                                       {init_w} AS {weight}, NULL::INT AS 
parent
+                               FROM
+                                       {tmp_view} AS v1 INNER JOIN
+                                       {tmp_view} AS v2 ON ({checkg_vv_sub})
+                                       """.format(**locals()))
+                       plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))
+
+                       # GPDB and HAWQ have distributed by clauses to help 
them with indexing.
+                       # For Postgres we add the indices manually.
+                       sql_index = m4_ifdef(<!__POSTGRESQL__!>,
+                               <!""" CREATE INDEX ON {out_table} ({src});
+                               """.format(**locals())!>, <!''!>)
+                       plpy.execute(sql_index)
+
+                       # The source can be reached with 0 cost and next is 
itself.
+                       plpy.execute(
+                               """ UPDATE {out_table} SET
+                               {weight} = 0, parent = {vertex_id}
+                               FROM {vertex_table}
+                               WHERE {out_table}.{src} = {vertex_id}
+                               AND {out_table}.{dest} = {vertex_id}
+                               """.format(**locals()))
+
+                       # Distance = 1: every edge means there is a path from 
src to dest
+                       plpy.execute(
+                               """ UPDATE {out_table} SET
+                               {weight} = {edge_table}.{weight}, parent = 
{edge_table}.{dest}
+                               FROM {edge_table}
+                               WHERE {out_table}.{src} = {edge_table}.{src}
+                               AND {out_table}.{dest} = {edge_table}.{dest}
+                               AND {out_table}.{weight} > 
{edge_table}.{weight} {checkg_eo}
+                               """.format(**locals()))
+
+                       ot_sql1 = ot_sql.format(**locals())
+                       out_table_1 = out_table
+
+               # Find the maximum number of iterations to try
+               # If not done by v_cnt iterations, there is a negative cycle.
+               v_cnt = plpy.execute(
+                       """ SELECT max(count) as max FROM (
+                                       SELECT count({src}) AS count
+                                       FROM {out_table_1}
+                                       {grp_by}) x
+                       """.format(**locals()))[0]['max']
+
+               if v_cnt < 2:
+                       plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))
+                       plpy.execute("DROP TABLE IF EXISTS {0},{1}".
+                                       format(out_table, out_table+"_summary"))
+                       if is_hawq:
+                               plpy.execute("DROP TABLE IF EXISTS 
{0},{1}".format(
+                                       out_table_1,out_table_2))
+                       if grouping_cols:
+                               plpy.error(("Graph APSP: {0} has less than 2 
vertices in "+
+                                       "every group.").format(edge_table))
+                       else:
+                               plpy.error("Graph APSP: {0} has less than 2 
vertices.".format(
+                                       vertex_table))
+
+               for i in range(0,v_cnt+1):
+
+                       """
+                       Create a view that will be used to update the output 
table
+
+                       The implementation is based on the matrix 
multiplication idea.
+                       The initial output table consists of 3 sets of vertex 
pairs.
+                       1) for every vervex v: v -> v, weight 0, parent v
+                       2) for every edge v1,v2,w: v1 -> v2, weight w, parent v2
+                       3) for every other vertex pair v1,v2: v1 -> v2, weight 
'Inf',
+                               parent NULL
+
+                       The algorithm "relaxes" the paths: finds alternate 
paths with less
+                       weights
+                       At every step, we look at every combination of 
non-infinite
+                       existing paths and edges to see if we can relax a path.
+
+                       Assume the graph is a chain: 1->2->3->...
+                       The initial output table will have a finite weighted 
path for 1->2
+                       and infinite for the rest. At ith iteration, the output 
table will
+                       have 1->i path and will relax 1->i+1 path from infinite 
to a
+                       finite value (weight of 1->i path + weight of i->i+1 
edge) and
+                       assign i as the parent.
+
+                       Since using '=' with floats is dangerous we use an 
epsilon value
+                       for comparison.
+                       """
+
+                       plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))
+                       update_sql = """ CREATE VIEW {tmp_view} AS
+                               SELECT DISTINCT ON ({grp_o_comma} y.{src}, 
y.{dest})
+                                       {grp_o_comma} y.{src}, 
y.{dest},y.{weight}, y.parent
+                               FROM  {out_table_1} AS out
+                               INNER JOIN
+                                       (SELECT x.{src}, x.{dest}, x.{weight},
+                                               out.{dest} as parent 
{comma_grp_e}
+                                       FROM
+                                               ({ot_sql1}) AS out
+                                       INNER JOIN
+                                               {edge_table} AS edge
+                                       ON (out.{dest} = edge.{src} 
{checkg_eout})
+                                       INNER JOIN
+                                               (SELECT out.{src}, edge.{dest},
+                                                       
min(out.{weight}+edge.{weight}) AS {weight}
+                                                       {comma_grp_e}
+                                               FROM
+                                                       ({ot_sql1}) AS out,
+                                                       {edge_table} AS edge
+                                               WHERE out.{dest} = edge.{src} 
{checkg_eout}
+                                               GROUP BY out.{src},edge.{dest} 
{comma_grp_e}) x
+                                       ON (x.{src} = out.{src} AND x.{dest} = 
edge.{dest} {checkg_ex})
+                                       WHERE ABS(out.{weight}+edge.{weight} - 
x.{weight})
+                                               < {EPSILON}) y
+                               ON (y.{src} = out.{src} AND y.{dest} = 
out.{dest} {checkg_oy})
+                               WHERE y.{weight} < out.{weight}
+                               """.format(**locals())
+                       plpy.execute(update_sql)
+
+                       # HAWQ employs alternating tables and has to be handled 
separately
+                       if is_hawq:
+
+                               # Stop if therea re no more updates
+                               if table_is_empty(tmp_view):
+
+                                       plpy.execute("DROP VIEW 
{0}".format(tmp_view))
+                                       plpy.execute("ALTER TABLE {0} RENAME TO 
{1}".format(
+                                               out_table_1,out_table))
+                                       break
+
+                               # The new output table will still have the old 
values for
+                               # every vertex pair that does not appear on the 
update view.
+
+                               plpy.execute("TRUNCATE TABLE 
{0}".format(out_table_2))
+                               fill_sql = """ INSERT INTO {out_table_2}
+                                       SELECT * FROM {out_table_1} AS out 
WHERE NOT EXISTS
+                                               (SELECT 1 FROM {tmp_view} AS t
+                                               WHERE t.{src} = out.{src} AND
+                                                       t.{dest} = out.{dest} 
{checkg_o1t})
+                                       UNION
+                                       SELECT * FROM 
{tmp_view}""".format(**locals())
+                               plpy.execute(fill_sql)
+
+                               # Swap the table names and the sql command we 
use for filtering
+                               tmpname = out_table_1
+                               out_table_1 = out_table_2
+                               out_table_2 = tmpname
+
+                               tmpname = ot_sql1
+                               ot_sql1 = ot_sql2
+                               ot_sql2 = tmpname
+
+                       else:
+
+                               updates = plpy.execute(
+                                       """     UPDATE {out_table}
+                                               SET {weight} = 
{tmp_view}.{weight},
+                                                       parent = 
{tmp_view}.parent
+                                               FROM {tmp_view}
+                                               WHERE {tmp_view}.{src} = 
{out_table}.{src} AND
+                                                       {tmp_view}.{dest} = 
{out_table}.{dest} {checkg_ot}
+                                       """.format(**locals()))
+                               if updates.nrows() == 0:
+                                       break
+
+               # The algorithm should have reached a break command by this 
point.
+               # This check handles the existence of a negative cycle.
+               if i == v_cnt:
+
+                       # If there are no groups, clean up and give error.
+                       if grouping_cols is None:
+                               plpy.execute("DROP VIEW IF EXISTS 
{0}".format(tmp_view))
+                               plpy.execute("DROP TABLE IF EXISTS {0},{1}".
+                                       format(out_table, out_table+"_summary"))
+                               if is_hawq:
+                                       plpy.execute("DROP TABLE IF EXISTS 
{0},{1}".format(
+                                               out_table_1,out_table_2))
+                               plpy.error("Graph APSP: Detected a negative 
cycle in the graph.")
+
+                       # It is possible that not all groups has negative 
cycles.
+                       else:
+                               # negs is the string created by collating 
grouping columns.
+                               # By looking at the update view, we can see 
which groups
+                               # are in a negative cycle.
+
+                               negs = plpy.execute(
+                                       """ SELECT array_agg(DISTINCT 
({grouping_cols})) AS grp
+                                               FROM {tmp_view}
+                                       """.format(**locals()))[0]['grp']
+
+                               # Delete the groups with negative cycles from 
the output table.
+
+                               # HAWQ doesn't support DELETE so we have to 
copy the valid results.
+                               if is_hawq:
+                                       sql_del = """ CREATE TABLE {out_table} 
AS
+                                               SELECT *
+                                               FROM {out_table_1} AS out
+                                               WHERE NOT EXISTS(
+                                                       SELECT 1
+                                                       FROM {tmp_view}
+                                                       WHERE {checkg_o1v_sub}
+                                                       );"""
+                                       plpy.execute(sql_del.format(**locals()))
+                                       plpy.execute("DROP VIEW IF EXISTS 
{0}".format(tmp_view))
+                                       plpy.execute("DROP TABLE IF EXISTS 
{0},{1}".format(
+                                               out_table_1,out_table_2))
+                               else:
+                                       sql_del = """ DELETE FROM {out_table}
+                                               USING {tmp_view}
+                                               WHERE {checkg_ov_sub}"""
+                                       plpy.execute(sql_del.format(**locals()))
+
+                               plpy.execute("DROP VIEW IF EXISTS 
{0}".format(tmp_view))
+
+                               # If every group has a negative cycle,
+                               # drop the output table as well.
+                               if table_is_empty(out_table):
+                                       plpy.execute("DROP TABLE IF EXISTS 
{0},{1}".
+                                               
format(out_table,out_table+"_summary"))
+                               plpy.warning(
+                                       """Graph APSP: Detected a negative 
cycle in the """ +
+                                       """sub-graphs of following groups: 
{0}.""".
+                                       format(str(negs)[1:-1]))
+
+               else:
+                       plpy.execute("DROP VIEW IF EXISTS {0}".format(tmp_view))
+                       if is_hawq:
+                               plpy.execute("DROP TABLE IF EXISTS 
{0}".format(out_table_2))
+
+       return None
+
+def graph_apsp_get_path(schema_madlib, apsp_table,
+       source_vertex, dest_vertex, path_table, **kwargs):
+       """
+       Helper function that can be used to get the shortest path between any 2
+       vertices
+    Args:
+       @param apsp_table               Name of the table that contains the APSP
+                                                       output.
+        @param source_vertex   The vertex that will be the source of the
+                                               desired path.
+        @param dest_vertex             The vertex that will be the destination 
of the
+                                               desired path.
+       """
+
+       with MinWarning("warning"):
+               _validate_get_path(apsp_table, source_vertex, dest_vertex, 
path_table)
+
+               temp1_name = unique_string(desp='temp1')
+               temp2_name = unique_string(desp='temp2')
+
+               summary = plpy.execute("SELECT * FROM 
{0}_summary".format(apsp_table))
+               vertex_id = summary[0]['vertex_id']
+               edge_args = summary[0]['edge_args']
+               grouping_cols = summary[0]['grouping_cols']
+
+               params_types = {'src': str, 'dest': str, 'weight': str}
+               default_args = {'src': 'src', 'dest': 'dest', 'weight': 
'weight'}
+               edge_params = extract_keyvalue_params(edge_args,
+                                            params_types,
+                                            default_args)
+
+               src = edge_params["src"]
+               dest = edge_params["dest"]
+               weight = edge_params["weight"]
+
+               if vertex_id == "NULL":
+                       vertex_id = "id"
+
+               if grouping_cols == "NULL":
+                       grouping_cols = None
+
+               select_grps = ""
+               check_grps_t1 = ""
+               check_grps_t2 = ""
+               grp_comma = ""
+               tmp = ""
+
+               if grouping_cols is not None:
+                       glist = split_quoted_delimited_str(grouping_cols)
+                       select_grps = _grp_from_table(apsp_table,glist) + " , "
+                       check_grps_t1 = " AND " + _check_groups(
+                               apsp_table,temp1_name,glist)
+                       check_grps_t2 = " AND " + _check_groups(
+                               apsp_table,temp2_name,glist)
+
+                       grp_comma = grouping_cols + " , "
+
+               # If the source and destination is the same vertex.
+               # There is no need to check the paths for any group.
+               if source_vertex == dest_vertex:
+                       plpy.execute("""
+                               CREATE TABLE {path_table} AS
+                               SELECT {grp_comma} '{{{dest_vertex}}}'::INT[] 
AS path
+                               FROM {apsp_table} WHERE {src} = {source_vertex} 
AND
+                                       {dest} = {dest_vertex}
+                               """.format(**locals()))
+                       return
+
+               plpy.execute( "DROP TABLE IF EXISTS {0},{1}".
+                       format(temp1_name,temp2_name));
+
+               # Initialize the temporary tables
+               out = plpy.execute(""" CREATE TEMP TABLE {temp1_name} AS
+                               SELECT {grp_comma} {apsp_table}.parent AS 
{vertex_id},
+                                       ARRAY[{dest_vertex}] AS path
+                               FROM {apsp_table}
+                               WHERE {src} = {source_vertex} AND {dest} = 
{dest_vertex}
+                                       AND {apsp_table}.parent IS NOT NULL
+                       """.format(**locals()))
+
+               plpy.execute("""
+                       CREATE TEMP TABLE {temp2_name} AS
+                               SELECT * FROM {temp1_name} LIMIT 0
+                       """.format(**locals()))
+
+               # Follow the 'parent' chain until you reach the case where the 
parent
+               # is the same as destination. This means it is the last vertex 
before
+               # the source.
+               while out.nrows() > 0:
+
+                       plpy.execute("TRUNCATE TABLE 
{temp2_name}".format(**locals()))
+                       # If the parent id is not the same as dest,
+                       # that means we have to follow the chain:
+                       #       Add it to the path and move to its parent
+                       out = plpy.execute(
+                               """ INSERT INTO {temp2_name}
+                               SELECT {select_grps} {apsp_table}.parent AS 
{vertex_id},
+                                       {apsp_table}.{dest} || 
{temp1_name}.path AS path
+                               FROM {apsp_table} INNER JOIN {temp1_name} ON
+                                       ({apsp_table}.{dest} = 
{temp1_name}.{vertex_id}
+                                               {check_grps_t1})
+                               WHERE {src} = {source_vertex} AND
+                                       {apsp_table}.parent <> 
{apsp_table}.{dest}
+                               """.format(**locals()))
+
+                       tmp = temp2_name
+                       temp2_name = temp1_name
+                       temp1_name = tmp
+
+                       tmp = check_grps_t1
+                       check_grps_t1 = check_grps_t2
+                       check_grps_t2 = tmp
+
+               # We have to consider 3 cases.
+               # 1) The path has more than 2 vertices:
+               #       Add the current parent and the source vertex
+               # 2) The path has exactly 2 vertices (an edge between src and 
dest is
+               # the shortest path).
+               #       Add the source vertex
+               # 3) The path has 0 vertices (unreachable)
+               #       Add an empty array.
+
+               # Path with 1 vertex (src == dest) has been handled before
+               plpy.execute("""
+                       CREATE TABLE {path_table} AS
+                       SELECT {grp_comma} {source_vertex} || ({vertex_id} || 
path) AS path
+                       FROM {temp2_name}
+                       WHERE {vertex_id} <> {dest_vertex}
+                       UNION
+                       SELECT {grp_comma} {source_vertex} || path AS path
+                       FROM {temp2_name}
+                       WHERE {vertex_id} = {dest_vertex}
+                       UNION
+                       SELECT {grp_comma} '{{}}'::INT[] AS path
+                       FROM {apsp_table}
+                       WHERE {src} = {source_vertex} AND {dest} = {dest_vertex}
+                               AND {apsp_table}.parent IS NULL
+                       """.format(**locals()))
+
+               out = plpy.execute("SELECT 1 FROM {0} LIMIT 
1".format(path_table))
+
+               if out.nrows() == 0:
+                       plpy.error( ("Graph APSP: Vertex {0} and/or {1} is not 
present"+
+                               " in the APSP table {1}").format(
+                               source_vertex,dest_vertex,apsp_table))
+
+               plpy.execute("DROP TABLE IF EXISTS {temp1_name}, {temp1_name}".
+                       format(**locals()))
+
+       return None
+
+def _validate_apsp(vertex_table, vertex_id, edge_table, edge_params,
+       out_table, glist, **kwargs):
+
+       validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
+               out_table,'APSP')
+
+       vt_error = plpy.execute(
+               """ SELECT {vertex_id}
+                       FROM {vertex_table}
+                       WHERE {vertex_id} IS NOT NULL
+                       GROUP BY {vertex_id}
+                       HAVING count(*) > 1 """.format(**locals()))
+
+       if vt_error.nrows() != 0:
+               plpy.error(
+                       """Graph APSP: Source vertex table {vertex_table} 
contains duplicate vertex id's.""".
+                       format(**locals()))
+
+       _assert(not table_exists(out_table+"_summary"),
+               "Graph APSP: Output summary table already exists!")
+
+       if glist is not None:
+               _assert(columns_exist_in_table(edge_table, glist),
+                       """Graph APSP: Not all columns from {glist} are present 
in edge table ({edge_table}).""".
+                       format(**locals()))
+
+def _validate_get_path(apsp_table, source_vertex, dest_vertex,
+       path_table, **kwargs):
+
+       _assert(apsp_table and apsp_table.strip().lower() not in ('null', ''),
+               "Graph APSP: Invalid APSP table name!")
+       _assert(table_exists(apsp_table),
+               "Graph APSP: APSP table ({0}) is missing!".format(apsp_table))
+       _assert(not table_is_empty(apsp_table),
+               "Graph APSP: APSP table ({0}) is empty!".format(apsp_table))
+
+       summary = apsp_table+"_summary"
+       _assert(table_exists(summary),
+               "Graph APSP: APSP summary table ({0}) is 
missing!".format(summary))
+       _assert(not table_is_empty(summary),
+               "Graph APSP: APSP summary table ({0}) is 
empty!".format(summary))
+
+       _assert(not table_exists(path_table),
+               "Graph APSP: Output path table already exists!")
+
+       return None
+
+def graph_apsp_help(schema_madlib, message, **kwargs):
+       """
+       Help function for graph_apsp and graph_apsp_get_path
+
+       Args:
+               @param schema_madlib
+               @param message: string, Help message string
+               @param kwargs
+
+       Returns:
+           String. Help/usage information
+       """
+       if not message:
+               help_string = """
+-----------------------------------------------------------------------
+                            SUMMARY
+-----------------------------------------------------------------------
+
+Given a graph, all pairs shortest path (APSP) algorithm finds a path for
+every vertex pair such that the sum of the weights of its constituent
+edges is minimized.
+
+For more details on function usage:
+    SELECT {schema_madlib}.graph_apsp('usage')
+            """
+       elif message.lower() in ['usage', 'help', '?']:
+               help_string = """
+Given a graph, all pairs shortest path (apsp) algorithm finds a path for
+every vertex pair such that the sum of the weights of its constituent
+edges is minimized.
+
+{graph_usage}
+
+To retrieve the path for a specific vertex pair:
+
+ SELECT {schema_madlib}.graph_apsp_get_path(
+    apsp_table  TEXT, -- Name of the table that contains the apsp output.
+    source_vertex INT, -- The vertex that will be the source of the
+                       -- desired path.
+    dest_vertex          INT, -- The vertex that will be the destination of the
+                       -- desired path.
+    path_table   TEXT  -- Name of the output table that contains the path.
+);
+
+----------------------------------------------------------------------------
+                            OUTPUT
+----------------------------------------------------------------------------
+The output of apsp ('out_table' above) contains a row for every vertex of
+every group and has the following columns (in addition to the grouping
+columns):
+  - source_vertex : The id for the source vertex. Will use the input edge
+                    column 'src' for column naming.
+  - dest_vertex   : The id for the destination vertex. Will use the input edge
+                    column 'dest' for column naming.
+  - weight        : The total weight of the shortest path from the source
+                    vertex to this particular vertex.
+                    Will use the input parameter 'weight' for column naming.
+  - parent        : The parent of the destination vertex in the shortest path
+                    from source. praent will be equal to dest_vertex is there
+                    are no more intermediary vertices. Will use 'parent' for
+                    column naming.
+
+The output of graph_apsp_get_path ('path_table' above) contains a row for
+every group and has the following columns:
+  - grouping_cols : The grouping columns given in the creation of the apsp
+                  table. If there are no grouping columns, these columns
+                  will not exist and the table will have a single row.
+  - path (ARRAY)  : The shortest path from the source vertex to the
+                  destination vertex.
+"""
+       elif message.lower() in ("example", "examples"):
+               help_string = """
+----------------------------------------------------------------------------
+                                EXAMPLES
+----------------------------------------------------------------------------
+-- Create a graph, represented as vertex and edge tables.
+DROP TABLE IF EXISTS vertex,edge,out,out_summary,out_path;
+CREATE TABLE vertex(
+        id INTEGER
+        );
+CREATE TABLE edge(
+        src INTEGER,
+        dest INTEGER,
+        weight DOUBLE PRECISION
+);
+
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6),
+(7)
+;
+INSERT INTO edge VALUES
+(0, 1, 1),
+(0, 2, 1),
+(0, 4, 10),
+(1, 2, 2),
+(1, 3, 10),
+(2, 3, 1),
+(2, 5, 1),
+(2, 6, 3),
+(3, 0, 1),
+(4, 0, -2),
+(5, 6, 1),
+(6, 7, 1)
+;
+
+-- Compute the apsp:
+DROP TABLE IF EXISTS out;
+SELECT madlib.graph_apsp(
+       'vertex',                            -- Vertex table
+       'id',                                -- Vertix id column
+       'edge',                              -- Edge table
+       'src=src, dest=dest, weight=weight', -- Comma delimited string of edge 
arguments
+       'out'                                -- Output table of apsp
+);
+-- View the apsp costs for every vertex:
+SELECT * FROM out ORDER BY src, dest;
+
+-- View the actual shortest path for a vertex:
+SELECT graph_apsp_get_path('out',0, 5,'out_path');
+SELECT * FROM out_path;
+
+-- Create a graph with 2 groups:
+DROP TABLE IF EXISTS edge_gr;
+CREATE TABLE edge_gr AS
+(
+  SELECT *, 0 AS grp FROM edge
+  UNION
+  SELECT *, 1 AS grp FROM edge WHERE src < 6 AND dest < 6
+);
+INSERT INTO edge_gr VALUES
+(4,5,-20,1);
+
+-- Find apsp for all groups:
+DROP TABLE IF EXISTS out_gr, out_gr_summary;
+SELECT graph_apsp('vertex',NULL,'edge_gr',NULL,'out_gr','grp');
+"""
+       else:
+               help_string = "No such option. Use {schema_madlib}.graph_apsp()"
+
+       return help_string.format(schema_madlib=schema_madlib,
+               graph_usage=get_graph_usage(schema_madlib, 'graph_apsp',
+    """out_table     TEXT, -- Name of the table to store the result of apsp.
+    grouping_cols TEXT  -- The list of grouping columns."""))
+# ---------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/755c7b70/src/ports/postgres/modules/graph/apsp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/apsp.sql_in 
b/src/ports/postgres/modules/graph/apsp.sql_in
new file mode 100644
index 0000000..41fe3b6
--- /dev/null
+++ b/src/ports/postgres/modules/graph/apsp.sql_in
@@ -0,0 +1,430 @@
+/* ----------------------------------------------------------------------- 
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *
+ * @file apsp.sql_in
+ *
+ * @brief SQL functions for graph analytics
+ * @date Nov 2016
+ *
+ * @sa Provides all pairs shortest path algorithm.
+ *
+ *//* ----------------------------------------------------------------------- 
*/
+m4_include(`SQLCommon.m4')
+
+/**
+@addtogroup grp_apsp
+
+<div class="toc"><b>Contents</b>
+<ul>
+<li><a href="#apsp">APSP</a></li>
+<li><a href="#notes">Notes</a></li>
+<li><a href="#examples">Examples</a></li>
+<li><a href="#literature">Literature</a></li>
+</ul>
+</div>
+
+@brief Finds the shortest paths between every vertex pair in a given graph.
+
+The all pairs shortest paths (APSP) algorithm finds the lengths (summed 
weights) of the shortest paths between all pairs of vertices such that the sum 
of the weights of the path edges is minimized.
+
+@anchor apsp
+@par APSP
+<pre class="syntax">
+graph_apsp( vertex_table,
+            vertex_id,
+            edge_table,
+            edge_args,
+            out_table,
+            grouping_cols
+          )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>vertex_table</dt>
+<dd>TEXT. Name of the table containing the vertex data for the graph. Must
+contain the column specified in the 'vertex_id' parameter below.</dd>
+
+<dt>vertex_id</dt>
+<dd>TEXT, default = 'id'. Name of the column in 'vertex_table' containing
+vertex ids.  The vertex ids are of type INTEGER with no duplicates.
+They do not need to be contiguous.</dd>
+
+<dt>edge_table</dt>
+<dd>TEXT. Name of the table containing the edge data. The edge table must
+contain columns for source vertex, destination vertex and edge weight.
+Column naming convention is described below in the 'edge_args' parameter.</dd>
+
+<dt>edge_args</dt>
+<dd>TEXT. A comma-delimited string containing multiple named arguments of
+the form "name=value". The following parameters are supported for
+this string argument:
+  - src (INTEGER): Name of the column containing the source vertex ids in the
+  edge table. Default column name is 'src'.
+  - dest (INTEGER): Name of the column containing the destination vertex ids
+  in the edge table. Default column name is 'dest'.
+  - weight (FLOAT8): Name of the column containing the edge weights in the
+  edge table. Default column name is 'weight'.</dd>
+
+<dt>out_table</dt>
+<dd>TEXT. Name of the table to store the result of APSP.
+It contains a row for every vertex of every group and have
+the following columns (in addition to the grouping columns):
+  - source_vertex : The id for the source vertex. Will use the input edge
+  column 'src' for column naming.
+  - dest_vertex   : The id for the destination vertex. Will use the input
+  edge column 'dest' for column naming.
+  - weight : The total weight of the shortest path from the source vertex to
+  this particular vertex. Will use the input parameter 'weight' for column
+  naming.
+  - parent : The parent of the destination vertex in the shortest path from
+  source. praent will be equal to dest_vertex is there are no more
+  intermediary vertices. Will use 'parent' for column naming.
+
+A summary table named <out_table>_summary is also created. This is an internal
+table that keeps a record of the input parameters and is used by the path
+function described below.
+</dd>
+
+<dt>grouping_cols</dt>
+<dd>TEXT, default = NULL. List of columns used to group the input into discrete
+subgraphs. These columns must exist in the edge table. When this value is null,
+no grouping is used and a single APSP result is generated. </dd>
+</dl>
+
+@par Path Retrieval
+
+The path retrieval function returns the shortest path from the
+source vertex to a specified desination vertex.
+
+<pre class="syntax">
+graph_apsp_get_path( apsp_table,
+                     source_vertex,
+                     dest_vertex,
+                     path_table
+                    )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+<dt>apsp_table</dt>
+<dd>TEXT. Name of the table that contains the APSP output.</dd>
+
+<dt>source_vertex</dt>
+<dd>INTEGER. The vertex that will be the source of the desired path.</dd>
+
+<dt>dest_vertex</dt>
+<dd>INTEGER. The vertex that will be the destination of the desired path.</dd>
+
+<dt>path_table</dt>
+<dd>TEXT. Name of the output table that contains the path.
+It contains a row for every group and has the following columns:
+  - grouping_cols : The grouping columns given in the creation of the APSP
+  table. If there are no grouping columns, these columns will not exist and the
+  table will have a single row.
+  - path (ARRAY) : The shortest path from the source vertex to the destination
+  vertex.
+</dd>
+
+</dl>
+
+@anchor notes
+@par Notes
+
+Graphs with negative edges are supported but graphs with negative cycles are 
not.
+
+The implementation is analogous to a the matrix multiplication procedure.
+Please refer to the design documents, [1] and [2] for more details.
+
+Also see the Grail project [3] for more background on graph analytics 
processing
+in relational databases.
+
+@anchor examples
+@examp
+
+-# Create vertex and edge tables to represent the graph:
+<pre class="syntax">
+DROP TABLE IF EXISTS vertex, edge;
+CREATE TABLE vertex(
+        id INTEGER
+        );
+CREATE TABLE edge(
+        src INTEGER,
+        dest INTEGER,
+        weight FLOAT8
+        );
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6),
+(7);
+INSERT INTO edge VALUES
+(0, 1, 1.0),
+(0, 2, 1.0),
+(0, 4, 10.0),
+(1, 2, 2.0),
+(1, 3, 10.0),
+(2, 3, 1.0),
+(2, 5, 1.0),
+(2, 6, 3.0),
+(3, 0, 1.0),
+(4, 0, -2.0),
+(5, 6, 1.0),
+(6, 7, 1.0);
+</pre>
+
+-# Calculate the shortest paths:
+<pre class="syntax">
+DROP TABLE IF EXISTS out, out_summary;
+SELECT madlib.graph_apsp(
+                         'vertex',      -- Vertex table
+                         NULL,          -- Vertix id column (NULL means use 
default naming)
+                         'edge',        -- Edge table
+                         NULL,          -- Edge arguments (NULL means use 
default naming)
+                         'out');        -- Output table of shortest paths
+SELECT * FROM out ORDER BY src,dest;
+</pre>
+<pre class="result">
+ src | dest |  weight  | parent
+-----+------+----------+--------
+   0 |    0 |        0 |      0
+   0 |    1 |        1 |      1
+   0 |    2 |        1 |      2
+   0 |    3 |        2 |      2
+   0 |    4 |       10 |      4
+   0 |    5 |        2 |      2
+   0 |    6 |        3 |      5
+   0 |    7 |        4 |      6
+   1 |    0 |        4 |      3
+   1 |    1 |        0 |      1
+   1 |    2 |        2 |      2
+   1 |    3 |        3 |      2
+   1 |    4 |       14 |      0
+   1 |    5 |        3 |      2
+   1 |    6 |        4 |      5
+   1 |    7 |        5 |      6
+   .
+   .
+   .
+(8 rows)
+</pre>
+
+-# Get the shortest path from vertex 0 to vertex 5:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_path;
+SELECT madlib.graph_apsp_get_path('out',0,5,'out_path');
+SELECT * FROM out_path;
+</pre>
+<pre class="result">
+  path
+\---------
+ {0,2,5}
+</pre>
+
+-# Now let's do a similar example except using
+different column names in the tables (i.e., not the defaults).
+Create the vertex and edge tables:
+<pre class="syntax">
+DROP TABLE IF EXISTS vertex_alt, edge_alt;
+CREATE TABLE vertex_alt AS SELECT id AS v_id FROM vertex;
+CREATE TABLE edge_alt AS SELECT src AS e_src, dest, weight AS e_weight FROM 
edge;
+</pre>
+
+-# Get the shortest path from vertex 1:
+<pre class="syntax">
+DROP TABLE IF EXISTS out_alt, out_alt_summary;
+SELECT madlib.graph_apsp(
+                         'vertex_alt',                  -- Vertex table
+                         'v_id',                        -- Vertex id column 
(NULL means use default naming)
+                         'edge_alt',                    -- Edge table
+                         'src=e_src, weight=e_weight',  -- Edge arguments 
(NULL means use default naming)
+                         'out_alt');                    -- Output table of 
shortest paths
+SELECT * FROM out_alt ORDER BY e_src, dest;
+</pre>
+<pre class="result">
+ e_src | dest | e_weight | parent
+-------+------+----------+--------
+     0 |    0 |        0 |      0
+     0 |    1 |        1 |      1
+     0 |    2 |        1 |      2
+     0 |    3 |        2 |      2
+     0 |    4 |       10 |      4
+     0 |    5 |        2 |      2
+     0 |    6 |        3 |      5
+     0 |    7 |        4 |      6
+     1 |    0 |        4 |      3
+     1 |    1 |        0 |      1
+     1 |    2 |        2 |      2
+     1 |    3 |        3 |      2
+     1 |    4 |       14 |      0
+     1 |    5 |        3 |      2
+     1 |    6 |        4 |      5
+     1 |    7 |        5 |      6
+     .
+     .
+     .
+(8 rows)
+</pre>
+
+-# Create a graph with 2 groups:
+<pre class="syntax">
+DROP TABLE IF EXISTS edge_gr;
+CREATE TABLE edge_gr AS
+(
+  SELECT *, 0 AS grp FROM edge
+  UNION
+  SELECT *, 1 AS grp FROM edge WHERE src < 6 AND dest < 6
+);
+INSERT INTO edge_gr VALUES
+(4,5,-20,1);
+</pre>
+
+-# Find APSP for all groups
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr, out_gr_summary;
+SELECT madlib.graph_apsp(
+                         'vertex',      -- Vertex table
+                         NULL,          -- Vertex id column (NULL means use 
default naming)
+                         'edge_gr',     -- Edge table
+                         NULL,          -- Edge arguments (NULL means use 
default naming)
+                         'out_gr',      -- Output table of shortest paths
+                         'grp'          -- Grouping columns
+);
+SELECT * FROM out_gr WHERE src < 2 ORDER BY grp,src,dest;
+</pre>
+<pre class="result">
+ grp | src | dest | weight | parent
+-----+-----+------+--------+--------
+   0 |   0 |    0 |      0 |      0
+   0 |   0 |    1 |      1 |      1
+   0 |   0 |    2 |      1 |      2
+   0 |   0 |    3 |      2 |      2
+   0 |   0 |    4 |     10 |      4
+   0 |   0 |    5 |      2 |      2
+   0 |   0 |    6 |      4 |      2
+   0 |   0 |    7 |      5 |      6
+   0 |   1 |    0 |      4 |      3
+   0 |   1 |    1 |      0 |      1
+   0 |   1 |    2 |      2 |      2
+   0 |   1 |    3 |      3 |      2
+   0 |   1 |    4 |     14 |      0
+   0 |   1 |    5 |      3 |      2
+   0 |   1 |    6 |      4 |      5
+   0 |   1 |    7 |      5 |      6
+   1 |   0 |    0 |      0 |      0
+   1 |   0 |    1 |      1 |      1
+   1 |   0 |    2 |      1 |      2
+   1 |   0 |    3 |      2 |      2
+   1 |   0 |    4 |     10 |      4
+   1 |   0 |    5 |    -10 |      4
+   1 |   1 |    0 |      4 |      3
+   1 |   1 |    1 |      0 |      1
+   1 |   1 |    2 |      2 |      2
+   1 |   1 |    3 |      3 |      2
+   1 |   1 |    4 |     14 |      0
+   1 |   1 |    5 |     -6 |      4
+</pre>
+
+-# Find the path from vertex 0 to vertex 5 in every group
+<pre class="syntax">
+DROP TABLE IF EXISTS out_gr_path;
+SELECT madlib.graph_apsp_get_path('out_gr',0,5,'out_gr_path');
+SELECT * FROM out_gr_path ORDER BY grp;
+</pre>
+<pre class="result">
+ grp |  path
+-----+---------
+   0 | {0,2,5}
+   1 | {0,4,5}
+</pre>
+
+@anchor literature
+@par Literature
+
+[1] http://www.columbia.edu/~cs2035/courses/ieor6614.S11/apsp.pdf
+
+[2] 
http://users.cecs.anu.edu.au/~Alistair.Rendell/Teaching/apac_comp3600/module4/all_pairs_shortest_paths.xhtml
+
+[3] The case against specialized graph analytics engines, J. Fan, G. Soosai 
Raj,
+and J. M. Patel. CIDR 2015. 
http://cidrdb.org/cidr2015/Papers/CIDR15_Paper20.pdf
+*/
+
+-------------------------------------------------------------------------
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp(
+    vertex_table            TEXT,
+    vertex_id               TEXT,
+    edge_table              TEXT,
+    edge_args               TEXT,
+    out_table               TEXT,
+    grouping_cols           TEXT
+
+) RETURNS VOID AS $$
+    PythonFunction(graph, apsp, graph_apsp)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp(
+    vertex_table            TEXT,
+    vertex_id               TEXT,
+    edge_table              TEXT,
+    edge_args               TEXT,
+    out_table               TEXT
+
+) RETURNS VOID AS $$
+     SELECT MADLIB_SCHEMA.graph_apsp($1, $2, $3, $4, $5, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp_get_path(
+    apsp_table             TEXT,
+    source_vertex          INT,
+       dest_vertex            INT,
+       path_table             TEXT
+
+) RETURNS VOID AS $$
+    PythonFunction(graph, apsp, graph_apsp_get_path)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `MODIFIES SQL DATA', `');
+-------------------------------------------------------------------------------
+
+-- Online help
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp(
+    message VARCHAR
+) RETURNS VARCHAR AS $$
+    PythonFunction(graph, apsp, graph_apsp_help)
+$$ LANGUAGE plpythonu IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+
+--------------------------------------------------------------------------------
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_apsp()
+RETURNS VARCHAR AS $$
+    SELECT MADLIB_SCHEMA.graph_apsp('');
+$$ LANGUAGE sql IMMUTABLE
+m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
+--------------------------------------------------------------------------------
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/755c7b70/src/ports/postgres/modules/graph/graph_utils.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/graph_utils.py_in 
b/src/ports/postgres/modules/graph/graph_utils.py_in
index 25f70a5..77378d6 100644
--- a/src/ports/postgres/modules/graph/graph_utils.py_in
+++ b/src/ports/postgres/modules/graph/graph_utils.py_in
@@ -38,6 +38,29 @@ from utilities.validate_args import table_exists
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_is_empty
 
+def _check_groups(tbl1, tbl2, grp_list):
+
+       """
+       Helper function for joining tables with groups.
+       Args:
+               @param tbl1       Name of the first table
+               @param tbl2       Name of the second table
+               @param grp_list   The list of grouping columns
+       """
+
+       return ' AND '.join([" {tbl1}.{i} = {tbl2}.{i} ".format(**locals())
+               for i in grp_list])
+
+def _grp_from_table(tbl, grp_list):
+
+       """
+       Helper function for selecting grouping columns of a table
+       Args:
+               @param tbl        Name of the table
+               @param grp_list   The list of grouping columns
+       """
+       return ' , '.join([" {tbl}.{i} ".format(**locals())
+               for i in grp_list])
 
 def validate_graph_coding(vertex_table, vertex_id, edge_table, edge_params,
        out_table, func_name, **kwargs):

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/755c7b70/src/ports/postgres/modules/graph/sssp.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/sssp.py_in 
b/src/ports/postgres/modules/graph/sssp.py_in
index a8339a8..933623e 100644
--- a/src/ports/postgres/modules/graph/sssp.py_in
+++ b/src/ports/postgres/modules/graph/sssp.py_in
@@ -28,46 +28,22 @@
 """
 
 import plpy
-from graph_utils import *
+from graph_utils import validate_graph_coding
+from graph_utils import get_graph_usage
+from graph_utils import _grp_from_table
+from graph_utils import _check_groups
 from utilities.control import MinWarning
 from utilities.utilities import _assert
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import unique_string
-from utilities.utilities import _string_to_array
 from utilities.utilities import split_quoted_delimited_str
 from utilities.validate_args import table_exists
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import table_is_empty
-from utilities.validate_args import get_cols_and_types
 from utilities.validate_args import get_expr_type
 
 m4_changequote(`<!', `!>')
 
-
-def _check_groups(tbl1, tbl2, grp_list):
-
-       """
-       Helper function for joining tables with groups.
-       Args:
-               @param tbl1       Name of the first table
-               @param tbl2       Name of the second table
-               @param grp_list   The list of grouping columns
-       """
-
-       return ' AND '.join([" {tbl1}.{i} = {tbl2}.{i} ".format(**locals())
-               for i in grp_list])
-
-def _grp_from_table(tbl, grp_list):
-
-       """
-       Helper function for selecting grouping columns of a table
-       Args:
-               @param tbl        Name of the table
-               @param grp_list   The list of grouping columns
-       """
-       return ' , '.join([" {tbl}.{i} ".format(**locals())
-               for i in grp_list])
-
 def graph_sssp(schema_madlib, vertex_table, vertex_id, edge_table,
                edge_args, source_vertex, out_table, grouping_cols, **kwargs):
 
@@ -163,7 +139,7 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, 
edge_table,
 
                w_type = get_expr_type(weight,edge_table).lower()
                init_w = INT_MAX
-               if w_type in ['double precision','float8']:
+               if w_type in ['real','double precision','float8']:
                        init_w = INFINITY
 
                # We keep a table of every vertex, the minimum cost to that 
destination
@@ -354,7 +330,7 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, 
edge_table,
                        # associated min values. Using these values, we 
identify which edge
                        # is selected.
 
-                       # Since using '='' with floats is dangerous we use an 
epsilon value
+                       # Since using '=' with floats is dangerous we use an 
epsilon value
                        # for comparison.
 
                        # Once we have a list of edges and values (stores as 
'message'),
@@ -412,7 +388,7 @@ def graph_sssp(schema_madlib, vertex_table, vertex_id, 
edge_table,
                        # It is possible that not all groups has negative 
cycles.
                        else:
 
-                               # grp is the string created by collating 
grouping columns.
+                               # negs is the string created by collating 
grouping columns.
                                # By looking at the oldupdate table we can see 
which groups
                                # are in a negative cycle.
 
@@ -470,6 +446,7 @@ def graph_sssp_get_path(schema_madlib, sssp_table, 
dest_vertex, path_table,
         @param dest_vertex  The vertex that will be the destination of the
                             desired path.
         @param path_table   Name of the output table that contains the path.
+
        """
        with MinWarning("warning"):
                _validate_get_path(sssp_table, dest_vertex, path_table)
@@ -480,9 +457,6 @@ def graph_sssp_get_path(schema_madlib, sssp_table, 
dest_vertex, path_table,
                select_grps = ""
                check_grps_t1 = ""
                check_grps_t2 = ""
-               check_grps_pt1 = ""
-               check_grps_pt2 = ""
-               checkg_po = ""
                grp_comma = ""
                tmp = ""
 
@@ -505,8 +479,6 @@ def graph_sssp_get_path(schema_madlib, sssp_table, 
dest_vertex, path_table,
                        check_grps_t2 = " AND " + _check_groups(
                                sssp_table,temp2_name,glist)
 
-                       checkg_po = " WHERE " + _check_groups(
-                               path_table,"oldupdate",glist)
                        grp_comma = grouping_cols + " , "
 
                if source_vertex == dest_vertex:
@@ -662,7 +634,7 @@ def graph_sssp_help(schema_madlib, message, **kwargs):
 -----------------------------------------------------------------------
 
 Given a graph and a source vertex, single source shortest path (SSSP)
-algorithm finds a path for every vertex such that the the sum of the
+algorithm finds a path for every vertex such that the sum of the
 weights of its constituent edges is minimized.
 
 For more details on function usage:
@@ -670,6 +642,10 @@ For more details on function usage:
             """
        elif message.lower() in ['usage', 'help', '?']:
                help_string = """
+Given a graph and a source vertex, single source shortest path (SSSP)
+algorithm finds a path for every vertex such that the sum of the
+weights of its constituent edges is minimized.
+
 {graph_usage}
 
 To retrieve the path for a specific vertex:

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/755c7b70/src/ports/postgres/modules/graph/sssp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/sssp.sql_in 
b/src/ports/postgres/modules/graph/sssp.sql_in
index be433dc..fb0cdba 100644
--- a/src/ports/postgres/modules/graph/sssp.sql_in
+++ b/src/ports/postgres/modules/graph/sssp.sql_in
@@ -18,12 +18,12 @@
  * under the License.
  *
  *
- * @file graph.sql_in
+ * @file sssp.sql_in
  *
  * @brief SQL functions for graph analytics
  * @date Nov 2016
  *
- * @sa Provides various graph algorithms.
+ * @sa Provides single source shortest path algorithm.
  *
  *//* ----------------------------------------------------------------------- 
*/
 m4_include(`SQLCommon.m4')
@@ -370,11 +370,11 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp(
 $$ LANGUAGE plpythonu IMMUTABLE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
 
---------------------------------------------------------------------------------
+-------------------------------------------------------------------------------
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.graph_sssp()
 RETURNS VARCHAR AS $$
     SELECT MADLIB_SCHEMA.graph_sssp('');
 $$ LANGUAGE sql IMMUTABLE
 m4_ifdef(`\_\_HAS_FUNCTION_PROPERTIES\_\_', `CONTAINS SQL', `');
---------------------------------------------------------------------------------
+-------------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/755c7b70/src/ports/postgres/modules/graph/test/apsp.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/graph/test/apsp.sql_in 
b/src/ports/postgres/modules/graph/test/apsp.sql_in
new file mode 100644
index 0000000..a7fa3af
--- /dev/null
+++ b/src/ports/postgres/modules/graph/test/apsp.sql_in
@@ -0,0 +1,99 @@
+/* ----------------------------------------------------------------------- 
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* ----------------------------------------------------------------------- 
*/
+
+DROP TABLE IF EXISTS vertex,edge,out,out_summary,out_path,
+       vertex_alt,edge_alt,out_alt,out_alot_summary,
+       edge_gr,out_gr,out_gr_summary,out_gr_path,
+       edge_gr2, out_gr2, out_gr2_summary;
+
+
+CREATE TABLE vertex(
+                  id INTEGER
+                );
+
+CREATE TABLE edge(
+                  src INTEGER,
+                  dest INTEGER,
+                  weight DOUBLE PRECISION
+                );
+
+INSERT INTO vertex VALUES
+(0),
+(1),
+(2),
+(3),
+(4),
+(5),
+(6),
+(7)
+;
+INSERT INTO edge VALUES
+(0, 1, 1),
+(0, 2, 1),
+(0, 4, 10),
+(1, 2, 2),
+(1, 3, 10),
+(2, 3, 1),
+(2, 5, 1),
+(2, 6, 3),
+(3, 0, 1),
+(4, 0, -2),
+(5, 6, 1),
+(6, 7, 1)
+;
+
+
+SELECT graph_apsp('vertex',NULL,'edge',NULL,'out');
+
+SELECT * FROM out ORDER BY src,dest;
+
+SELECT assert(weight = 4, 'Wrong output in graph (APSP)')
+FROM out WHERE src = 0 AND dest = 7;
+SELECT assert(parent = 6, 'Wrong parent in graph (APSP)')
+FROM out WHERE src = 0 AND dest = 7;
+
+SELECT assert(weight = 11, 'Wrong output in graph (APSP)')
+FROM out WHERE src = 3 AND dest = 4;
+SELECT assert(parent = 0, 'Wrong parent in graph (APSP)')
+FROM out WHERE src = 3 AND dest = 4;
+-- Path
+SELECT graph_apsp_get_path('out',0,7,'out_path');
+
+-- Grouping
+CREATE TABLE edge_gr AS
+(SELECT *, 0 AS grp FROM edge
+UNION
+SELECT *, 1 AS grp FROM edge WHERE src < 6 AND dest < 6
+UNION
+SELECT *, 2 AS grp FROM edge WHERE src < 6 AND dest < 6
+);
+
+INSERT INTO edge_gr VALUES
+(7,NULL,NULL,1),
+(4,0,-20,2);
+
+SELECT graph_apsp('vertex',NULL,'edge_gr',NULL,'out_gr','grp');
+
+SELECT assert(weight = 2, 'Wrong output in graph (APSP)')
+       FROM out_gr WHERE src = 0 AND dest = 5 AND grp = 1;
+SELECT assert(parent = 2, 'Wrong output in graph (APSP)')
+       FROM out_gr WHERE src = 0 AND dest = 5 AND grp = 1;
+

incubator-madlib git commit: Graph: Add APSP with grouping

Reply via email to