http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/PageRank-v1.ipynb ---------------------------------------------------------------------- diff --git a/community-artifacts/PageRank-v1.ipynb b/community-artifacts/PageRank-v1.ipynb deleted file mode 100644 index 32b1caf..0000000 --- a/community-artifacts/PageRank-v1.ipynb +++ /dev/null @@ -1,774 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PageRank\n", - "The PageRank algorithm produces a probability distribution representing the likelihood that a person randomly traversing a graph will arrive at any particular vertex. PageRank was added in MADlib 1.11." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", - " \"You should import from traitlets.config instead.\", ShimWarning)\n", - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", - " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" - ] - } - ], - "source": [ - "%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "u'Connected: fmcquillan@madlib'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Greenplum 4.3.10.0\n", - "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", - " \n", - "# PostgreSQL local\n", - "%sql postgresql://fmcquillan@localhost:5432/madlib\n", - "\n", - "# Greenplum 4.2.3.0\n", - "#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>version</th>\n", - " </tr>\n", - " <tr>\n", - " <td>MADlib version: 1.11-dev, git revision: rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(u'MADlib version: 1.11-dev, git revision: rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%sql select madlib.version();\n", - "#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Create vertex and edge tables" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "Done.\n", - "7 rows affected.\n", - "22 rows affected.\n", - "22 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>src</th>\n", - " <th>dest</th>\n", - " <th>user_id</th>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>2</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>4</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>4</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>3</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>3</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>5</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>3</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>3</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>6</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>3</td>\n", - " <td>0</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>3</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>4</td>\n", - " <td>0</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>4</td>\n", - " <td>0</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>5</td>\n", - " <td>6</td>\n", - " <td>2</td>\n", - " </tr>\n", - " <tr>\n", - " <td>5</td>\n", - " <td>6</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>6</td>\n", - " <td>3</td>\n", - " <td>1</td>\n", - " </tr>\n", - " <tr>\n", - " <td>6</td>\n", - " <td>3</td>\n", - " <td>2</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(0, 1, 1),\n", - " (0, 2, 2),\n", - " (0, 2, 1),\n", - " (0, 4, 2),\n", - " (0, 4, 1),\n", - " (0, 1, 2),\n", - " (1, 3, 1),\n", - " (1, 3, 2),\n", - " (1, 2, 2),\n", - " (1, 2, 1),\n", - " (2, 5, 1),\n", - " (2, 3, 1),\n", - " (2, 3, 2),\n", - " (2, 6, 1),\n", - " (3, 0, 2),\n", - " (3, 0, 1),\n", - " (4, 0, 1),\n", - " (4, 0, 2),\n", - " (5, 6, 2),\n", - " (5, 6, 1),\n", - " (6, 3, 1),\n", - " (6, 3, 2)]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql \n", - "DROP TABLE IF EXISTS vertex, edge;\n", - "\n", - "CREATE TABLE vertex(\n", - " id INTEGER\n", - " );\n", - "\n", - "CREATE TABLE edge(\n", - " src INTEGER,\n", - " dest INTEGER,\n", - " user_id INTEGER\n", - " );\n", - "\n", - "INSERT INTO vertex VALUES\n", - "(0),\n", - "(1),\n", - "(2),\n", - "(3),\n", - "(4),\n", - "(5),\n", - "(6);\n", - "\n", - "INSERT INTO edge VALUES\n", - "(0, 1, 1),\n", - "(0, 2, 1),\n", - "(0, 4, 1),\n", - "(1, 2, 1),\n", - "(1, 3, 1),\n", - "(2, 3, 1),\n", - "(2, 5, 1),\n", - "(2, 6, 1),\n", - "(3, 0, 1),\n", - "(4, 0, 1),\n", - "(5, 6, 1),\n", - "(6, 3, 1),\n", - "(0, 1, 2),\n", - "(0, 2, 2),\n", - "(0, 4, 2),\n", - "(1, 2, 2),\n", - "(1, 3, 2),\n", - "(2, 3, 2),\n", - "(3, 0, 2),\n", - "(4, 0, 2),\n", - "(5, 6, 2),\n", - "(6, 3, 2);\n", - "\n", - "SELECT * from edge ORDER BY src;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 2. Calculate the PageRank" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "1 rows affected.\n", - "7 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>id</th>\n", - " <th>pagerank</th>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>0.287537493412</td>\n", - " </tr>\n", - " <tr>\n", - " <td>3</td>\n", - " <td>0.210169889019</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>0.146626834541</td>\n", - " </tr>\n", - " <tr>\n", - " <td>4</td>\n", - " <td>0.102896143842</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>0.102896143842</td>\n", - " </tr>\n", - " <tr>\n", - " <td>6</td>\n", - " <td>0.0972863776889</td>\n", - " </tr>\n", - " <tr>\n", - " <td>5</td>\n", - " <td>0.0525871176569</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(0, 0.287537493411842),\n", - " (3, 0.210169889018553),\n", - " (2, 0.146626834540625),\n", - " (4, 0.10289614384217),\n", - " (1, 0.10289614384217),\n", - " (6, 0.097286377688878),\n", - " (5, 0.0525871176569217)]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql\n", - "DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n", - "\n", - "SELECT madlib.pagerank(\n", - " 'vertex', -- Vertex table\n", - " 'id', -- Vertix id column\n", - " 'edge', -- Edge table\n", - " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", - " 'pagerank_out'); -- Output table of PageRank\n", - "\n", - "SELECT * FROM pagerank_out ORDER BY pagerank DESC;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look at the summary table:" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>__iterations__</th>\n", - " </tr>\n", - " <tr>\n", - " <td>16</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(16,)]" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql\n", - "SELECT * FROM pagerank_out_summary;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now run PageRank with a damping factor of 0.5 which results in different final values:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "1 rows affected.\n", - "7 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>id</th>\n", - " <th>pagerank</th>\n", - " </tr>\n", - " <tr>\n", - " <td>0</td>\n", - " <td>0.225477161441</td>\n", - " </tr>\n", - " <tr>\n", - " <td>3</td>\n", - " <td>0.199090328587</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>0.136261327206</td>\n", - " </tr>\n", - " <tr>\n", - " <td>6</td>\n", - " <td>0.132691559968</td>\n", - " </tr>\n", - " <tr>\n", - " <td>4</td>\n", - " <td>0.10900929141</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>0.10900929141</td>\n", - " </tr>\n", - " <tr>\n", - " <td>5</td>\n", - " <td>0.0884610399788</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(0, 0.225477161441199),\n", - " (3, 0.199090328586664),\n", - " (2, 0.136261327206477),\n", - " (6, 0.132691559968224),\n", - " (4, 0.109009291409508),\n", - " (1, 0.109009291409508),\n", - " (5, 0.0884610399788161)]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql\n", - "DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n", - "SELECT madlib.pagerank(\n", - " 'vertex', -- Vertex table\n", - " 'id', -- Vertix id column\n", - " 'edge', -- Edge table\n", - " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", - " 'pagerank_out', -- Output table of PageRank\n", - " 0.5); -- Damping factor\n", - "SELECT * FROM pagerank_out ORDER BY pagerank DESC;" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. Grouping" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now compute the PageRank distribution separately for each user using the grouping feature:" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "1 rows affected.\n", - "14 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>user_id</th>\n", - " <th>id</th>\n", - " <th>pagerank</th>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>0</td>\n", - " <td>0.278254883886</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>3</td>\n", - " <td>0.201881146671</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>2</td>\n", - " <td>0.142881123461</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>6</td>\n", - " <td>0.114536378321</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>4</td>\n", - " <td>0.100267456154</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>1</td>\n", - " <td>0.100267456154</td>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>5</td>\n", - " <td>0.0619115553529</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>0</td>\n", - " <td>0.318546250042</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>3</td>\n", - " <td>0.237866867733</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>2</td>\n", - " <td>0.159148764894</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>1</td>\n", - " <td>0.11168334438</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>4</td>\n", - " <td>0.11168334438</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>6</td>\n", - " <td>0.0396428571429</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>5</td>\n", - " <td>0.0214285714286</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(1, 0, 0.278254883885528),\n", - " (1, 3, 0.201881146670752),\n", - " (1, 2, 0.142881123460599),\n", - " (1, 6, 0.114536378321472),\n", - " (1, 4, 0.10026745615438),\n", - " (1, 1, 0.10026745615438),\n", - " (1, 5, 0.0619115553528898),\n", - " (2, 0, 0.318546250041731),\n", - " (2, 3, 0.237866867733431),\n", - " (2, 2, 0.159148764893974),\n", - " (2, 1, 0.111683344379718),\n", - " (2, 4, 0.111683344379718),\n", - " (2, 6, 0.0396428571428571),\n", - " (2, 5, 0.0214285714285714)]" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql\n", - "DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n", - "\n", - "SELECT madlib.pagerank(\n", - " 'vertex', -- Vertex table\n", - " 'id', -- Vertix id column\n", - " 'edge', -- Edge table\n", - " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", - " 'pagerank_out', -- Output table of PageRank\n", - " NULL, -- Default damping factor (0.85)\n", - " NULL, -- Default max iters (100)\n", - " 0.00000001, -- Threshold\n", - " 'user_id'); -- Grouping column name\n", - "\n", - "SELECT * FROM pagerank_out ORDER BY user_id, pagerank DESC;" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2 rows affected.\n" - ] - }, - { - "data": { - "text/html": [ - "<table>\n", - " <tr>\n", - " <th>user_id</th>\n", - " <th>__iterations__</th>\n", - " </tr>\n", - " <tr>\n", - " <td>1</td>\n", - " <td>27</td>\n", - " </tr>\n", - " <tr>\n", - " <td>2</td>\n", - " <td>31</td>\n", - " </tr>\n", - "</table>" - ], - "text/plain": [ - "[(1, 27), (2, 31)]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "%%sql\n", - "SELECT * FROM pagerank_out_summary ORDER BY user_id;" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -}
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/PageRank-v2.ipynb ---------------------------------------------------------------------- diff --git a/community-artifacts/PageRank-v2.ipynb b/community-artifacts/PageRank-v2.ipynb new file mode 100644 index 0000000..eea174f --- /dev/null +++ b/community-artifacts/PageRank-v2.ipynb @@ -0,0 +1,889 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PageRank\n", + "The PageRank algorithm produces a probability distribution representing the likelihood that a person randomly traversing a graph will arrive at any particular vertex. PageRank was added in MADlib 1.11.\n", + "\n", + "We also implement personalized PageRank, in which a notion of importance provides personalization to a query. This was added in 1.14" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] + } + ], + "source": [ + "%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Greenplum Database 5.4.0 on GCP (demo machine)\n", + "%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", + " \n", + "# PostgreSQL local\n", + "#%sql postgresql://fmcquillan@localhost:5432/madlib\n", + "\n", + "# Greenplum Database 4.3.10.0\n", + "#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>version</th>\n", + " </tr>\n", + " <tr>\n", + " <td>MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%sql select madlib.version();\n", + "#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Create vertex and edge tables" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "Done.\n", + "7 rows affected.\n", + "22 rows affected.\n", + "22 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>src</th>\n", + " <th>dest</th>\n", + " <th>user_id</th>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>4</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>4</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>5</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>6</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>6</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>3</td>\n", + " <td>2</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(0, 1, 1),\n", + " (0, 2, 2),\n", + " (0, 2, 1),\n", + " (0, 4, 2),\n", + " (0, 4, 1),\n", + " (0, 1, 2),\n", + " (1, 3, 1),\n", + " (1, 3, 2),\n", + " (1, 2, 2),\n", + " (1, 2, 1),\n", + " (2, 5, 1),\n", + " (2, 3, 1),\n", + " (2, 3, 2),\n", + " (2, 6, 1),\n", + " (3, 0, 2),\n", + " (3, 0, 1),\n", + " (4, 0, 1),\n", + " (4, 0, 2),\n", + " (5, 6, 2),\n", + " (5, 6, 1),\n", + " (6, 3, 1),\n", + " (6, 3, 2)]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql \n", + "DROP TABLE IF EXISTS vertex, edge;\n", + "\n", + "CREATE TABLE vertex(\n", + " id INTEGER\n", + " );\n", + "\n", + "CREATE TABLE edge(\n", + " src INTEGER,\n", + " dest INTEGER,\n", + " user_id INTEGER\n", + " );\n", + "\n", + "INSERT INTO vertex VALUES\n", + "(0),\n", + "(1),\n", + "(2),\n", + "(3),\n", + "(4),\n", + "(5),\n", + "(6);\n", + "\n", + "INSERT INTO edge VALUES\n", + "(0, 1, 1),\n", + "(0, 2, 1),\n", + "(0, 4, 1),\n", + "(1, 2, 1),\n", + "(1, 3, 1),\n", + "(2, 3, 1),\n", + "(2, 5, 1),\n", + "(2, 6, 1),\n", + "(3, 0, 1),\n", + "(4, 0, 1),\n", + "(5, 6, 1),\n", + "(6, 3, 1),\n", + "(0, 1, 2),\n", + "(0, 2, 2),\n", + "(0, 4, 2),\n", + "(1, 2, 2),\n", + "(1, 3, 2),\n", + "(2, 3, 2),\n", + "(3, 0, 2),\n", + "(4, 0, 2),\n", + "(5, 6, 2),\n", + "(6, 3, 2);\n", + "\n", + "SELECT * from edge ORDER BY src;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Calculate PageRank" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "7 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>pagerank</th>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>0.287518161212</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0.210171199451</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0.146637377532</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0.102910437211</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0.102910437211</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0.0972746644343</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0.0525777229482</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(0, 0.287518161212111),\n", + " (3, 0.210171199451415),\n", + " (2, 0.146637377532288),\n", + " (4, 0.102910437211324),\n", + " (1, 0.102910437211324),\n", + " (6, 0.0972746644343418),\n", + " (5, 0.0525777229481976)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n", + "\n", + "SELECT madlib.pagerank(\n", + " 'vertex', -- Vertex table\n", + " 'id', -- Vertix id column\n", + " 'edge', -- Edge table\n", + " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", + " 'pagerank_out'); -- Output table of PageRank\n", + "\n", + "SELECT * FROM pagerank_out ORDER BY pagerank DESC;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Look at the summary table:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>__iterations__</th>\n", + " </tr>\n", + " <tr>\n", + " <td>12</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(12,)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT * FROM pagerank_out_summary;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now run PageRank with a damping factor of 0.5 which results in different final values:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Calculate PageRank with optional params" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "7 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>pagerank</th>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>0.225477430556</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0.199105076058</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0.136259748402</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0.132687846189</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0.109006420855</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0.109006420855</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0.088457057085</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(0, 0.225477430555642),\n", + " (3, 0.199105076058278),\n", + " (2, 0.136259748401728),\n", + " (6, 0.132687846188875),\n", + " (4, 0.109006420855422),\n", + " (1, 0.109006420855422),\n", + " (5, 0.0884570570850217)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n", + "SELECT madlib.pagerank(\n", + " 'vertex', -- Vertex table\n", + " 'id', -- Vertix id column\n", + " 'edge', -- Edge table\n", + " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", + " 'pagerank_out', -- Output table of PageRank\n", + " 0.5); -- Damping factor\n", + "SELECT * FROM pagerank_out ORDER BY pagerank DESC;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Grouping\n", + "Now compute the PageRank distribution separately for each user using the grouping feature:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "14 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>user_id</th>\n", + " <th>id</th>\n", + " <th>pagerank</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0.278254883886</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>3</td>\n", + " <td>0.201881146671</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>0.142881123461</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>6</td>\n", + " <td>0.114536378321</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>4</td>\n", + " <td>0.100267456154</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0.100267456154</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " <td>0.0619115553529</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0</td>\n", + " <td>0.318546250042</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>0.237866867733</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>0.159148764894</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>1</td>\n", + " <td>0.11168334438</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>4</td>\n", + " <td>0.11168334438</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>6</td>\n", + " <td>0.0396428571429</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>5</td>\n", + " <td>0.0214285714286</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 0, 0.278254883885528),\n", + " (1, 3, 0.201881146670752),\n", + " (1, 2, 0.142881123460599),\n", + " (1, 6, 0.114536378321472),\n", + " (1, 4, 0.10026745615438),\n", + " (1, 1, 0.10026745615438),\n", + " (1, 5, 0.0619115553528898),\n", + " (2, 0, 0.318546250041731),\n", + " (2, 3, 0.237866867733431),\n", + " (2, 2, 0.159148764893974),\n", + " (2, 1, 0.111683344379718),\n", + " (2, 4, 0.111683344379718),\n", + " (2, 6, 0.0396428571428571),\n", + " (2, 5, 0.0214285714285714)]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n", + "\n", + "SELECT madlib.pagerank(\n", + " 'vertex', -- Vertex table\n", + " 'id', -- Vertix id column\n", + " 'edge', -- Edge table\n", + " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", + " 'pagerank_out', -- Output table of PageRank\n", + " NULL, -- Default damping factor (0.85)\n", + " NULL, -- Default max iters (100)\n", + " 0.00000001, -- Threshold\n", + " 'user_id'); -- Grouping column name\n", + "\n", + "SELECT * FROM pagerank_out ORDER BY user_id, pagerank DESC;" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>user_id</th>\n", + " <th>__iterations__</th>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>27</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>31</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(1, 27), (2, 31)]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT * FROM pagerank_out_summary ORDER BY user_id;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 5. Personalized PageRank\n", + "Here we specify {2,4} as the personalization vertices. This parameter could be specified as ARRAY[2,4] as well:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "1 rows affected.\n", + "7 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>id</th>\n", + " <th>pagerank</th>\n", + " </tr>\n", + " <tr>\n", + " <td>0</td>\n", + " <td>0.282616480981</td>\n", + " </tr>\n", + " <tr>\n", + " <td>2</td>\n", + " <td>0.189069710497</td>\n", + " </tr>\n", + " <tr>\n", + " <td>3</td>\n", + " <td>0.177501646133</td>\n", + " </tr>\n", + " <tr>\n", + " <td>4</td>\n", + " <td>0.15505560795</td>\n", + " </tr>\n", + " <tr>\n", + " <td>1</td>\n", + " <td>0.0800556079496</td>\n", + " </tr>\n", + " <tr>\n", + " <td>6</td>\n", + " <td>0.0743076577868</td>\n", + " </tr>\n", + " <tr>\n", + " <td>5</td>\n", + " <td>0.0401701653568</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(0, 0.282616480980651),\n", + " (2, 0.18906971049691),\n", + " (3, 0.177501646133459),\n", + " (4, 0.155055607949638),\n", + " (1, 0.0800556079496381),\n", + " (6, 0.0743076577867786),\n", + " (5, 0.0401701653567887)]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS pagerank_out, pagerank_out_summary;\n", + "SELECT madlib.pagerank(\n", + " 'vertex', -- Vertex table\n", + " 'id', -- Vertix id column\n", + " 'edge', -- Edge table\n", + " 'src=src, dest=dest', -- Comma delimted string of edge arguments\n", + " 'pagerank_out', -- Output table of PageRank\n", + " NULL, -- Default damping factor (0.85)\n", + " NULL, -- Default max iters (100)\n", + " NULL, -- Default Threshold\n", + " NULL, -- No Grouping\n", + " '{2,4}'); -- Personalization vertices\n", + "SELECT * FROM pagerank_out ORDER BY pagerank DESC;" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] + }, + { + "data": { + "text/html": [ + "<table>\n", + " <tr>\n", + " <th>__iterations__</th>\n", + " </tr>\n", + " <tr>\n", + " <td>37</td>\n", + " </tr>\n", + "</table>" + ], + "text/plain": [ + "[(37,)]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%sql\n", + "SELECT * FROM pagerank_out_summary;" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}