madlib git commit: MLP: Set min messages to error for predict
Repository: madlib Updated Branches: refs/heads/master 4cced1b13 -> 0e1161c0d MLP: Set min messages to error for predict Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/0e1161c0 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/0e1161c0 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/0e1161c0 Branch: refs/heads/master Commit: 0e1161c0d4de0a81a318016f81ad661e114e34ec Parents: 4cced1b Author: Rahul Iyer Authored: Mon Apr 23 13:40:10 2018 -0700 Committer: Rahul Iyer Committed: Mon Apr 23 13:40:10 2018 -0700 -- src/ports/postgres/modules/convex/mlp_igd.py_in | 2 ++ 1 file changed, 2 insertions(+) -- http://git-wip-us.apache.org/repos/asf/madlib/blob/0e1161c0/src/ports/postgres/modules/convex/mlp_igd.py_in -- diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in index 8e3bccf..2cfa12f 100644 --- a/src/ports/postgres/modules/convex/mlp_igd.py_in +++ b/src/ports/postgres/modules/convex/mlp_igd.py_in @@ -877,6 +877,8 @@ def _get_minibatch_param_from_mlp_model_summary(summary_dict, param, return None + +@MinWarning("error") def mlp_predict(schema_madlib, model_table, data_table, id_col_name, output_table, pred_type='response', **kwargs): """ Score new observations using a trained neural network
[03/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/mlp-mnist-v2.ipynb -- diff --git a/community-artifacts/mlp-mnist-v2.ipynb b/community-artifacts/mlp-mnist-v2.ipynb new file mode 100644 index 000..3c1ad14 --- /dev/null +++ b/community-artifacts/mlp-mnist-v2.ipynb @@ -0,0 +1,1154 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Neural networks\n", +"\n", +"Multilayer perceptron (MLP) using the well known MNIST data set.\n", +"\n", +"Updated to include mini-batching which was added in the 1.14 release.\n", +"\n", +"# Intro" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "data": { + "image/jpeg": "/9j/4R5fRXhpZgAATU0AKggABwESAAMBAAEAAAEaAAUBYgEbAAUB\nagEoAAMBAAIAAAExAAIccgEyAAIUjodpAAQBpNAACvyA\nAAAnEAAK/IAAACcQQWRvYmUgUGhvdG9zaG9wIENTNSBXaW5kb3dzADIwMTU6MDc6MjQgMTA6NTk6\nNTEAA6ABAAMBAAEAAKACAAQBAAACoKADAAQBAAABcwAGAQMAAwAA\nAAEABgAAARoABQEAAAEeARsABQEAAAEmASgAAwEAAgAAAgEABAEAAAEuAgIA\nBAEAAB0pAEgBSAH/2P/tAAxBZG9iZV9DTQAB/+4ADkFkb2JlAGSA\nAf/bAIQADAgICAkIDAkJDBELCgsRFQ8MDA8VGBMTFRMTGBEMDAwMDAwRDAwMDAwMDAwMDAwM\nDAwMDAwMDAwMDAwMDAwMDAENCwsNDg0QDg4QFA4ODhQUDg4ODhQRDAwMDAwREQwMDAwMDBEMDAwM\nDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwM/8AAEQgAWACgAwEiAAIRAQMRAf/dAAQACv/EAT8AAAEF\nAQEBAQEBAAMAAQIEBQYHCAkKCwEAAQUBAQEBAQEAAQACAwQFBgcICQoLEAAB\nBAEDAgQCBQcGCAUDDDMBAAIRAwQhEjEFQVFhEyJxgTIGFJGhsUIjJBVSwWIzNHKC0UMHJZJT8OHx\nY3M1FqKygyZEk1RkRcKjdDYX0lXiZfKzhMPTdePzRieUpIW0lcTU5PSltcXV5fVWZnaGlqa2xtbm\n9jdHV2d3h5ent8fX5/cRAAICAQIEBAMEBQYHBwYF NQEAAhEDITESBEFRYXEiEwUygZEUobFCI8FS\n0fAzJGLhcoKSQ1MVY3M08SUGFqKygwcmNcLSRJNUoxdkRVU2dGXi8rOEw9N14/NGlKSFtJXE1OT0\npbXF1eX1VmZ2hpamtsbW5vYnN0dXZ3eHl6e3x//aAAwDAQACEQMRAD8A8+yL677Bjsa4VU+1pr5c\nZ997qHex2/8AkvqVmrEDunZFjr2OqqurbVZqNjnh+/1G7fUZQ7+bf/w3p2s/PR+o9Lz+kVMuusdm\nY1h9mVSG2UknX0vXs3Pqs/kW1/8AFb0OjM9YNu2FhYSygXuL91h/wTXN9KnZs/nv0P8AwX+EVwRH\nERI1OvlP8v0WrxXEGGsL+Yfi6n1b+qNudU578dz6GkObbYfTZPZ9su2V4j/of4Sy/wDwXpLQ+tXS\nOidDbVl0Xs6hmOALNAcdriZda9u57MjZ7fRxv5j/AE3s/QrAOX1nIoi8bq2GHAlpx/3dwZLfsX/W\n/wBH/wAUhWU+ptPUdlTK2D1Qx732j6XpsrbvuZ6j2t9m5VZ8vmMwYnhiP3gYxr+t+jxf3nRx83y8\ncUoyjxEg7cMpX/e+fh/2f/VHP9PJ6hkvfY51j3AudY4+0azue7/BsRsu+uPtNW71AQL9hibOPVtd\n9JzLo3Naz9H/ADnvVhl7La/s2JS19FmjaCSywfvW2tZ77/67LX/9bUR07Ira66k1uraD6raSCA3u\n17bW73s/66rAgRHT1dZS31aEpgy19P7sfBosyxbIbtxbXab2NAYf65hz6/625W+k5WS3Os6dkOc6\nvPY7FtredwBcZoewas3svbW6t6rkV2GWFjmNGrbWw5o8fUq3OsZ/L/z1ZxaDe0Vh4FteuPaTIkcU\nG5m5vu/wPqemm2QQbutfCX9X/CZBAT9OwP8AzfH/AAXO22Y73V3s3ODi19TuxB2O/lMf7UW+trqz\nbS42AfSB+nWB/pGt+kz/AId ns/4pdD9b+gHBycfMqE09Rory2gCNrntHrV/9ublzM2UvFlbi17T7\nSNCFBDNGVgai6/rNjLy04Ue4v+qUYrLQCSId2lELQGyHNDDoDrz3Uox8ru3HyI+Fdh/6nHf/AOAf\n8Qp112Ue8si9mlbXabf+HP8A6K/z/wDBqQD7O7AT337Mr7NlDGma3NeQA0agDbE/y91aLVjV5YL2\nAG0iLGAR/wChDG/+fa0G5jWUVusJBcBHftvc7/PuR6qMmusCtjqrLoDCdH7eXP8A3m7voMTrAJMv\nlA1WxgZUIfMTo3cetwz2VUVB9bQC+x2jWNI29/5KNlYFLb6rWPDgAfc7RrgP39v57VY6b0rqTG2W\nZ9fpYEB92Y7QV/ubv9I6z6Laq/0j0LO6sy+jKq6QQPQaLG3Fv6Z1YivJZW1381Tsd6/s9/6L9Ips\nOXBkxmQPFqTED+r+7+8s5jl8+HJwyiYmgDekfV++hf0NptORfdXTV/g6LPa6I3fzbvd/nIb8zpWP\nXNFbr36tftbtAP8AJts3PZ/Yas3EzH7ybALHO/Odq6f3nOSNFlgL2PLGEk+72Nn+S523d/ZQOSNf\nq4Cz39Ulvtm6nIkDt6Ypm9YbUf1WivHIduFhm2wT/wAJb/5FA+1W+u251jrTIIJ4Gu72fu+5RdTc\n0x6lTncwXt/79tU62vYSL2+m0CZI0I/kOHtUfFM6GwB4cMf+9ZOGI1Gv1uX/AHz/AP/Q5Tp+bfhd\nQNFDt+Na4NdU8B9dlbiN1N1T5rtb+Yl9Yei04edQ2uxuLhWVCzFYSS5u91nrV/n3Xena3+d/0foq\n5iYWFTe1vrnLvYQ5lTKHtqY8wd5teTZbWzd7WVM+mq31nysPL6kxl15NePWzGZ6bdzy6ou9d3q2P\npZ7rbH/yFp5Yj27IF8VCzXp/d/uubCROb02Bwni0/wAU/wB5yrc/ErtF1Hvyg0NdkWNhriNN9dbC\n51Vm3/ WtTr6jk10NfXY5jQTuLX7AS76Xs9rfcosxeie5nqZFV4gtN7Q2l2vua5+P9otr9v0H7LP+\ntqX2BmVWa6wxkRse29trASf8I2WWVb/9a1XByEmqvX0w/a2CMdCwfOf7GVGRg5FzXiaskNg2D3F3\n9lv538v1N607KMT0baKrWnMsbNo8h+839/8A8+f8YsNmBkY9pY7HvfYzuGnaD/J27t3tVlvSC8Nf\nYy0thpLnVlsal7/e76Wytjv7akhOfCYmAJOh/R/lJbMR6TqOhH6X8otW3Cdj2OBvre6r6fpl7XVu\n/wBGfVrZ/nfzX/CKz07GnIrscfQfIhzSNjp/lsOyh/8AX/RP/wCCTMz8k+y5hlsltpbuewk7thkO\nd6as/YnmoZEso3fzdm4sZaZ1Yxz9/p2/29n/ABagnhE4nhvby4Wzgz+3IGVb+fE+kk9Mz/q2K7nl\n2fgsdFThHtmYdW4ek7a1eddV+r2RWW5Lsd+NjWFw9dw2tLgRuZTjndfb6e737G+/9J+4rGH1HMbf\nXiZRsqJikWD81p09LLriux1Tf33s/Rfy6lW64d/Ucm6x9hvLx7gXBhaGtb+lDHOv/M2+p9BV+X5I\nwuc5cW0K+T5dpzv9Nu81z8cg9vGOEEyy6+vh498eLh4f1fF6nOpq6I2zZZ9otaP5zIY5lZYP324l\njLPV2/ufaP0v/BqfUS/CvOLfGTjPa1+JazQmp38zbjv9zms/0mO/ez1vU/wqDk5mdG0trfXyWFu+\nQPzv0+65zW/1lp9KnqeIMWzF9O7FJtwDXvAe4/pLMT3+vt9X+ep/m6vUVqIs8EdD/d08v0uL/Ccy\nZI9ctY9fV/zv0eFp5NIOTQ3HLbW4VMbZAO8Tb7mE+/8ASWN/fVn6vY9Z6gcnqdr8XDxh6+Vbtmxw\nnayqhr/5zJybXbGfufpL/wCbpeh19LynMyLL22V22kMHqV7LPcd9n6K1zf8AO9VXX9N zMTCrxBi2\nZFYd9ozi8gMBINePW6zcfTtopc+2yqv+bfk/ziWXAcg2MRLXv/U4aX4OaGGYIIkYn6/vcTf+sX15\nd1ikYNdBqopG3GoZ7obG0b3aufZs/PXJY9jsPKrue4M2E7gDLy1w2WAbPou2O/PVi+uut+y7
[15/15] madlib-site git commit: jupyter notebooks for 1.14 release
jupyter notebooks for 1.14 release Project: http://git-wip-us.apache.org/repos/asf/madlib-site/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib-site/commit/3f849b9e Tree: http://git-wip-us.apache.org/repos/asf/madlib-site/tree/3f849b9e Diff: http://git-wip-us.apache.org/repos/asf/madlib-site/diff/3f849b9e Branch: refs/heads/notebook-updates-1dot14 Commit: 3f849b9e496063ab70adf139fece33c2e32583eb Parents: 6c103d3 Author: Frank McQuillan Authored: Mon Apr 23 14:56:06 2018 -0700 Committer: Frank McQuillan Committed: Mon Apr 23 14:56:06 2018 -0700 -- community-artifacts/Balanced-sampling-v1.ipynb | 3706 ++ community-artifacts/Decision-trees-v1.ipynb | 1590 ++ ...coding-categorical-variables-1dot10-v1.ipynb | 2748 --- .../Encoding-categorical-variables-v2.ipynb | 4026 +++ community-artifacts/LDA-v1.ipynb| 2034 community-artifacts/MLP.ipynb | 514 -- .../Minibatch-preprocessor-v1.ipynb | 1330 + community-artifacts/PageRank-v1.ipynb | 774 --- community-artifacts/PageRank-v2.ipynb | 889 community-artifacts/Random-forest-v1.ipynb | 2899 +++ community-artifacts/Summary-v1.ipynb| 1026 community-artifacts/Summary-v2.ipynb| 1017 community-artifacts/Term-frequency-v1.ipynb | 1062 community-artifacts/kNN-v2.ipynb| 751 --- community-artifacts/kNN-v3.ipynb| 857 community-artifacts/mlp-mnist-v2.ipynb | 1154 + community-artifacts/mlp-v2.ipynb| 3755 -- community-artifacts/mlp-v3.ipynb| 4584 ++ images/neural-net-head.jpg | Bin 0 -> 326157 bytes 19 files changed, 25148 insertions(+), 9568 deletions(-) --
[05/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Summary-v1.ipynb -- diff --git a/community-artifacts/Summary-v1.ipynb b/community-artifacts/Summary-v1.ipynb deleted file mode 100644 index 57c3611..000 --- a/community-artifacts/Summary-v1.ipynb +++ /dev/null @@ -1,1026 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. To reload it, use:\n", - " %reload_ext sql\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: fmcquillan@madlib'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum 4.3.10.0\n", -"# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", -"\n", -"# PostgreSQL local\n", -"%sql postgresql://fmcquillan@localhost:5432/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.12, git revision: unknown, cmake configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.12, git revision: unknown, cmake configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang',)]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 1. On-line help" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "summary\n", - "\n", - "\n", - "'summary' is a generic function used to produce summary statisticsof any data table. The function invokes particular 'methods' fromthe MADlib library to provide an overview of the data.---For an overview on usage, run:SELECT madlib.summary('usage'); ---For an example, run:SELECT madlib.summary('example')\n", - "\n", - "" - ], - "text/plain": [ - "[(u\"\\n'summary' is a generic function used to produce summary statistics\\nof any data table. The function invokes particular 'methods' from\\nthe MADlib library to provide an overview of the data.\\n---\\nFor an overview on usage, run:\\nSELECT madlib.summary('usage');\\n ---\\nFor an example, run:\\nSELECT madlib.summary('example')\\n\",)]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql SELECT * FROM madlib.summary();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 2. Load data\n", -"Using part of the popular iris data set from https://archive.ics.uci.edu/ml/datasets/iris"; - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "30 rows affected.\n", - "30 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id\n", - "sepal_length\n", - "sepal_width\n", - "petal_length\n", - "petal_width\n", - "class_name\n", - "\n", - "\n", - "1\n", - "5.1\n", - "3.5\n", - "1.4\n", - "0.2\n", - "Iris-setosa\n", - "\n", - "\n", -
[02/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/mlp-v2.ipynb -- diff --git a/community-artifacts/mlp-v2.ipynb b/community-artifacts/mlp-v2.ipynb deleted file mode 100644 index 145b3e2..000 --- a/community-artifacts/mlp-v2.ipynb +++ /dev/null @@ -1,3755 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Multilayer Perceptron" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": { -"scrolled": true - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. To reload it, use:\n", - " %reload_ext sql\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: gpdbchina@madlib'" - ] - }, - "execution_count": 118, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum 4.3.10.0\n", -"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", -"\n", -"# PostgreSQL local\n", -"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"\n", -"# Greenplum 4.2.3.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 119, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 2.0-dev, git revision: rel/v1.12-9-gf790a61, cmake configuration time: Tue Sep 19 17:56:02 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 2.0-dev, git revision: rel/v1.12-9-gf790a61, cmake configuration time: Tue Sep 19 17:56:02 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 119, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 1. Create input table for classification" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "52 rows affected.\n", - "52 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id\n", - "attributes\n", - "class_text\n", - "class\n", - "state\n", - "\n", - "\n", - "1\n", - "[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "2\n", - "[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "3\n", - "[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "4\n", - "[Decimal('4.4'), Decimal('3.0'), Decimal('1.3'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "5\n", - "[Decimal('5.1'), Decimal('3.4'), Decimal('1.5'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "6\n", - "[Decimal('5.0'), Decimal('3.5'), Decimal('1.3'), Decimal('0.3')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "7\n", - "[Decimal('4.5'), Decimal('2.3'), Decimal('1.3'), Decimal('0.3')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "8\n", - "
[10/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/LDA-v1.ipynb -- diff --git a/community-artifacts/LDA-v1.ipynb b/community-artifacts/LDA-v1.ipynb new file mode 100644 index 000..19a199c --- /dev/null +++ b/community-artifacts/LDA-v1.ipynb @@ -0,0 +1,2034 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Latent Dirichlet Allocation \n", +"\n", +"Latent Dirichlet Allocation (LDA) is a generative probabilistic model for natural texts. It is used in problems such as automated topic discovery, collaborative filtering, and document classification.\n", +"\n", +"In addition to an implementation of LDA, this MADlib module also provides a number of additional helper functions to interpret results of the LDA output." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-15-g7ffad03, cmake configuration time: Wed Feb 21 01:33:31 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-15-g7ffad03, cmake configuration time: Wed Feb 21 01:33:31 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Prepare documents\n", +"The examples below are short strings extracted from various Wikipedia documents. First we create a document table with one document per row:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "4 rows affected.\n", + "4 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "docid\n", + "contents\n", + "\n", + "\n", + "0\n", + "Statistical topic models are a class of Bayesian latent variable models, originally developed for analyzing the semantic content of large document corpora.\n", + "\n", + "\n", + "1\n", + "By the late 1960s, the balance between pitching and hitting had swung in favor of the pitchers. In 1968 Carl Yastrzemski won the American League batting title with an average of just .301, the lowest in history.\n", + "\n", + "\n", + "2\n", + "Machine learning is closely related to and often overlaps with computational statistics; a discipline that also specializes in prediction-making. It has strong ties to mathematica
[13/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Decision-trees-v1.ipynb -- diff --git a/community-artifacts/Decision-trees-v1.ipynb b/community-artifacts/Decision-trees-v1.ipynb new file mode 100644 index 000..e97b943 --- /dev/null +++ b/community-artifacts/Decision-trees-v1.ipynb @@ -0,0 +1,1590 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Decision trees\n", +"\n", +"A decision tree is a supervised learning method that can be used for classification and regression. It consists of a structure in which internal nodes represent tests on attributes, and the branches from nodes represent the result of those tests. Each leaf node is a class label and the paths from root to leaf nodes define the set of classification or regression rules." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: fmcquillan@madlib'" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"#%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Decision tree classification examples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data\n", +"Data set related to whether to play golf or not." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "14 rows affected.\n", + "14 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "OUTLOOK\n", + "temperature\n", + "humidity\n", + "Temp_Humidity\n", + "clouds_airquality\n", + "windy\n", + "class\n", + "observation_weight\n", + "\n", + "\n", + "1\n", + "sunny\n", + "85.0\n", + "85.0\n", + "[85.0, 85.0]\n", + "[u'none', u'unhealthy']\n", + "False\n", + "Don't Play\n", + "5.0\n", + "\n", + "\n", + "2\n", + "sunny\n", + "80.0\n", + "90.0\n", + "[80.0, 90.0]\n", + "[u'none', u'moderate']\n", + "True\n", + "Don't Play\n", + "5.0\n", + "\n", + "\n", + "3\n", + "overcast\n", + "83.0\n", + "78.0\n", + "[83.0, 78.0]\n", + "[u'low', u'moderate']\n", + "False\n", + "Play\n", + "1.5\n", + "\n", + "\n", + "4\n", + "rain\n", + "70.0\n", + "96.0\n", + "[70.0, 96.0]\n", + "[u'low', u'moderate']\n", + "False\n", + "Play\n", + "1.0\n", + "\n", + "\n", + "5\n", + "rain\n", + "68.0\n", + "80.0\n", + "[68.0, 80.0]\n", + "[u'medium', u'good']\n", + "False\n", + "Play\n", + "1.0\n", + "\n", + "\n", + "6\n", + "rain\n", + "65.0\n", + "70.0\n", + "[65.0, 70.0]\n", + "[u'low', u'unhealthy']\n", + "True\n", + "Don't Play\n", + "1.0\n", + "\n", + "\n", + "7\n", + "overcast\n", + "64.0\n", + "65.0\n", +
[14/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Balanced-sampling-v1.ipynb -- diff --git a/community-artifacts/Balanced-sampling-v1.ipynb b/community-artifacts/Balanced-sampling-v1.ipynb new file mode 100644 index 000..5f6ec23 --- /dev/null +++ b/community-artifacts/Balanced-sampling-v1.ipynb @@ -0,0 +1,3706 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Balanced sampling\n", +"\n", +"This module offers a number of re-sampling techniques including under-sampling majority classes, over-sampling minority classes, and combinations of the two.\n", +"\n", +"Balanced sampling was added in MADlib 1.14." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data\n", +"Based in part on the flags data set from https://archive.ics.uci.edu/ml/datasets/Flags"; + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "22 rows affected.\n", + "22 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "name\n", + "landmass\n", + "zone\n", + "area\n", + "population\n", + "language\n", + "colours\n", + "mainhue\n", + "\n", + "\n", + "1\n", + "Argentina\n", + "2\n", + "3\n", + "2777\n", + "28\n", + "2\n", + "2\n", + "blue\n", + "\n", + "\n", + "2\n", + "Australia\n", + "6\n", + "2\n", + "7690\n", + "15\n", + "1\n", + "3\n", + "blue\n", + "\n", + "\n", + "8\n", + "Greece\n", + "3\n", + "1\n", + "132\n", + "10\n", + "6\n", + "2\n", + "blue\n", + "\n", + "\n", + "9\n", + "Guatemala\n", + "1\n", + "4\n", + "109\n", + "8\n", + "2\n", + "2\n", + "blue\n", + "\n", + "\n", + "17\n", + "Sweden\n", + "3\n", + "1\n", + "450\n", + "8\n", + "
[08/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Minibatch-preprocessor-v1.ipynb -- diff --git a/community-artifacts/Minibatch-preprocessor-v1.ipynb b/community-artifacts/Minibatch-preprocessor-v1.ipynb new file mode 100644 index 000..fe03a27 --- /dev/null +++ b/community-artifacts/Minibatch-preprocessor-v1.ipynb @@ -0,0 +1,1330 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Mini-batch preprocessor\n", +"\n", +"The mini-batch preprocessor is a utility that prepares input data for use by models that support mini-batch as an optimization option. (This is currently only the case for Neural Networks.) It is effectively a packing operation that builds arrays of dependent and independent variables from the source data table.\n", +"\n", +"The mini-batch preprocessor was added in MADlib 1.14." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data\n", +"Based on the well known iris dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "52 rows affected.\n", + "52 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "attributes\n", + "class_text\n", + "class\n", + "state\n", + "\n", + "\n", + "1\n", + "[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "2\n", + "[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "3\n", + "[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", +
[04/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Term-frequency-v1.ipynb -- diff --git a/community-artifacts/Term-frequency-v1.ipynb b/community-artifacts/Term-frequency-v1.ipynb new file mode 100644 index 000..99a0cd0 --- /dev/null +++ b/community-artifacts/Term-frequency-v1.ipynb @@ -0,0 +1,1062 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Term Frequency\n", +"Term frequency computes the number of times that a word or term occurs in a document. Term frequency is often used as part of a larger text processing pipeline, which may include operations such as stemming, stop word removal and topic modelling." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: fmcquillan@madlib'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum 4.3.10.0\n", +"# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", +"\n", +"# PostgreSQL local\n", +"%sql postgresql://fmcquillan@localhost:5432/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.13, git revision: unknown, cmake configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.13, git revision: unknown, cmake configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang',)]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Prepare documents\n", +"First we create a document table with one document per row:" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "4 rows affected.\n", + "4 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "docid\n", + "contents\n", + "\n", + "\n", + "0\n", + "I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.\n", + "\n", + "\n", + "1\n", + "Chinchillas and kittens are cute.\n", + "\n", + "\n", + "2\n", + "My sister adopted two kittens yesterday.\n", + "\n", + "\n", + "3\n", + "Look at this cute hamster munching on a piece of broccoli.\n", + "\n", + "" + ], + "text/plain": [ + "[(0, u'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),\n", + " (1, u'Chinchillas and kittens are cute.'),\n", + " (2, u'My sister adopted two kittens yesterday.'),\n", + " (3, u'Look at this cute hamster munching on a piece of broccoli.')]" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%%sql\n", +"DROP TABLE IF EXISTS documents;\n", +"CREATE TABLE documents(docid INT4, contents TEXT);\n", +"\n", +"INSERT INTO documents VALUES\n", +"(0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),\n", +"(1, 'Chinchillas and kittens are cute.'),\n", +"(2, 'My sister adopted two kittens yesterday.'),\n", +"(3, 'Look at this cute hamster munching on a piece of broccoli.');\n", +"\n", +"SELECT * from documents ORDER BY docid;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"You can
[06/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Random-forest-v1.ipynb -- diff --git a/community-artifacts/Random-forest-v1.ipynb b/community-artifacts/Random-forest-v1.ipynb new file mode 100644 index 000..bac8363 --- /dev/null +++ b/community-artifacts/Random-forest-v1.ipynb @@ -0,0 +1,2899 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Random forest\n", +"\n", +"Random forests build an ensemble of classifiers, each of which is a tree model constructed using bootstrapped samples from the input data. The results of these models are then combined to yield a single prediction, which, at the expense of some loss in interpretation, have been found to be highly accurate.\n", +"\n", +"Please also refer to the decision tree user documentation for information relevant to the implementation of random forests in MADlib." + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-40-ga1360f3, cmake configuration time: Wed Mar 28 18:16:08 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-40-ga1360f3, cmake configuration time: Wed Mar 28 18:16:08 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Random forest classification examples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data\n", +"Data set related to whether to play golf or not." + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "14 rows affected.\n", + "14 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "OUTLOOK\n", + "temperature\n", + "humidity\n", + "Temp_Humidity\n", + "clouds_airquality\n", + "windy\n", + "class\n", + "\n", + "\n", + "1\n", + "sunny\n", + "85.0\n", + "85.0\n", + "[85.0, 85.0]\n", + "[u'none', u'unhealthy']\n", + "False\n", + "Don't Play\n", + "\n", + "\n", + "2\n", + "sunny\n", + "80.0\n", + "90.0\n", + "[80.0, 90.0]\n", + "[u'none', u'moderate']\n", + "True\n", + "Don't Play\n", + "\n", + "\n", + "3\n", + "overcast\n", + "83.0\n", + "78.0\n", + "[83.0, 78.0]\n", + "[u'low', u'moderate']\n", + "False\n", + "Play\n", + "\n", + "\n", + "4\n", + "rain\n", +
[11/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Encoding-categorical-variables-v2.ipynb -- diff --git a/community-artifacts/Encoding-categorical-variables-v2.ipynb b/community-artifacts/Encoding-categorical-variables-v2.ipynb new file mode 100644 index 000..5e4cb6f --- /dev/null +++ b/community-artifacts/Encoding-categorical-variables-v2.ipynb @@ -0,0 +1,4026 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Encoding categorical variables\n", +"This is the new module that replaces create_indicator_variables() which was deprecated as of MADlib v1.10" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-21-g3af2d70, cmake configuration time: Mon Feb 26 18:00:54 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-21-g3af2d70, cmake configuration time: Mon Feb 26 18:00:54 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"## 1. Load data set\n", +"Use a subset of the abalone dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "20 rows affected.\n", + "20 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "sex\n", + "length\n", + "diameter\n", + "height\n", + "rings\n", + "\n", + "\n", + "1\n", + "M\n", + "0.455\n", + "0.365\n", + "0.095\n", + "15\n", + "\n", + "\n", + "2\n", + "M\n", + "0.35\n", + "0.265\n", + "0.09\n", + "7\n", + "\n", + "\n", + "3\n", + "F\n", + "0.53\n", + "0.42\n", + "0.135\n", + "9\n", + "\n", + "\n", + "4\n", + "M\n", + "0.44\n", + "0.365\n", + "0.125\n", + "10\n", + "\n", + "\n", + "5\n", + "I\n", + "0.33\n", + "0.255\n", + "0.08\n", + "7\n", + "\n", + "\n", +
[12/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb -- diff --git a/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb b/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb deleted file mode 100644 index 409de20..000 --- a/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb +++ /dev/null @@ -1,2748 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Encoding categorical variables (MADlib v1.10+)\n", -"This is the new module that replaces create_indicator_variables() which has been deprecated as of MADlib v1.10" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. To reload it, use:\n", - " %reload_ext sql\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: gpdbchina@madlib'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql postgresql://gpdbchina@10.194.10.68:55000/madlib\n", -"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"#%sql postgresql://gpadmin@54.197.30.46:10432/gpadmin" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.10.0-dev, git revision: rel/v1.9.1-47-g2d5a5ed, cmake configuration time: Tue Feb 7 19:45:19 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.10.0-dev, git revision: rel/v1.9.1-47-g2d5a5ed, cmake configuration time: Tue Feb 7 19:45:19 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"## 1. Load data set\n", -"Use a subset of the abalone dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "20 rows affected.\n", - "20 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id\n", - "sex\n", - "length\n", - "diameter\n", - "height\n", - "rings\n", - "\n", - "\n", - "1\n", - "M\n", - "0.455\n", - "0.365\n", - "0.095\n", - "15\n", - "\n", - "\n", - "2\n", - "M\n", - "0.35\n", - "0.265\n", - "0.09\n", - "7\n", - "\n", - "\n", - "3\n", - "F\n", - "0.53\n", - "0.42\n", - "0.135\n", - "9\n", - "\n", - "\n", - "4\n", - "M\n", - "0.44\n", - "0.365\n", - "0.125\n", - "10\n", - "\n", - "\n", - "5\n", - "I\n", - "0.33\n", - "0.255\n", - "0.08\n", - "7\n", - "\n", - "\n", - "6\n", - "I\n", - "0.425\n", - "0.3\n", - "0.095\n", - "8\n", - "\n", - "\n", - "7\n", - "F\n", - "0.53\n", - "0.415\n", - "0.15\n", - "20\n", - "\n", - "\n", - "8\n", - "F\n", - "
[07/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/PageRank-v1.ipynb -- diff --git a/community-artifacts/PageRank-v1.ipynb b/community-artifacts/PageRank-v1.ipynb deleted file mode 100644 index 32b1caf..000 --- a/community-artifacts/PageRank-v1.ipynb +++ /dev/null @@ -1,774 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# PageRank\n", -"The PageRank algorithm produces a probability distribution representing the likelihood that a person randomly traversing a graph will arrive at any particular vertex. PageRank was added in MADlib 1.11." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", - " \"You should import from traitlets.config instead.\", ShimWarning)\n", - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", - " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: fmcquillan@madlib'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum 4.3.10.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", -"\n", -"# PostgreSQL local\n", -"%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"\n", -"# Greenplum 4.2.3.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.11-dev, git revision: rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.11-dev, git revision: rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 1. Create vertex and edge tables" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "Done.\n", - "7 rows affected.\n", - "22 rows affected.\n", - "22 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "src\n", - "dest\n", - "user_id\n", - "\n", - "\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "0\n", - "2\n", - "2\n", - "\n", - "\n", - "0\n", - "2\n", - "1\n", - "\n", - "\n", - "0\n", - "4\n", - "2\n", - "\n", - "\n", - "0\n", - "4\n", - "1\n", - "\n", - "\n", - "0\n", - "1\n", - "2\n", - "\n", - "\n", - "1\n", - "3\n", - "1\n", - "\n", - "\n", - "1\n", - "3\n", - "2\n", - "\n", - "\n", - "1\n", - "2\n", -
[09/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/MLP.ipynb -- diff --git a/community-artifacts/MLP.ipynb b/community-artifacts/MLP.ipynb deleted file mode 100644 index dcd0cdb..000 --- a/community-artifacts/MLP.ipynb +++ /dev/null @@ -1,514 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"- This demo uses the popular MNIST dataset, which consists of 70,000 hand written digits and is used for \n", -"classification.\n", -"\n", -"## Current best accuracy on postgres\n", -"\n", -"### train_accuracy\n", -"\n", -"- 99.64%\n", -"\n", -"### test_accuracy\n", -"\n", -"- 96.79%\n", -"\n", -"### Parameters\n", -"- Hidden layers: [200,200], tanh activation, n_iterations=10, learning_rate_init=0.001, learning_rate_policy=constant, lambda=0.0001, tolerance=0" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: csloan@postgres'" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%load_ext sql\n", -"%sql postgresql://csloan@localhost:5432/postgres" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "DROP TABLE\n", - "CREATE TABLE\n", - "COPY 6\n", - "DROP TABLE\n", - "CREATE TABLE\n", - "COPY 1\n" - ] -} - ], - "source": [ -"%%bash\n", -"# Note that these datasets are available from https://github.com/apache/incubator-madlib-site\n";, -"gunzip -c ../data/mnist_train.sql.gz > ../data/mnist_train.sql\n", -"gunzip -c ../data/mnist_test.sql.gz > ../data/mnist_test.sql\n", -"psql -f ../data/mnist_train.sql\n", -"psql -f ../data/mnist_test.sql" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ -{ - "data": { - "image/png": "iVBORw0KGgoNSUhEUgAAA9YAAAGDCAIAAABBVx+IAABFLklEQVR42u3df3AU953n/7ElukHM\nGCOBsSQ2TFivZZ8FxsjEsXyxoZxS4iOK76I9fw/OW18l32/B9+vvslvl2dqUtZVjvHVfcanyuGqB\nS1Wou0TZJOZqK3IdYflWzMYb4hxKiKzlh0UBwpYH2xLIFgIxg2BaI/Pt6Y7aTXfPaDQaTXd/5vn4\ng5GE0OfT/Zn59IuP3vPpO28BKKE7AwBKiAgOEMEBIjgAIjgAAABABAcA\nAABABAcAAACI4AARHAARHCCCAwAAACCCA0ARpYb7jmT3zihnCADK55IQz3FJ6DmX\nIIIDQHEk+mJ/ld2+gRSnCADKxcUj0RyXhB8VdkkgggMlRQQHgGykx1/5xVG7V5plzg0AlItw\n+8/sV4Jf/fTPVxHBAWB+MngoJDvgxABAOXG4EIRC0px+ZOER/A4AyILpGgCAeYngAIjg\nQBlE8FsAoGFKBQCgRBEcABEcAEonFT8YjXREY7FoJNodT4nRlngHJeQwud6oB/vghW4I/wRg\nIiKCA4DrAbxv1/Z9oW07o5HIS+1S146951L+b0u8gxJymFxv1IN98EI3hH8CMBERwQHA/QTe3334\ncl1TXWa3cDncXHPx4P55u4F9ydoS76CEHCbXG/VgH7zQDeGfAExERHAAcF8iHlek4B/u0SBJNVIy\nPqz4vS3xDkrIYXK9UQ/2wQvdEP4JwEREBAcA9ykp80pJZgUlkUz4vS3xDkrIYXK9UQ/2wQvdEP4J\nwEREBAcA90m33bA+M4nLkuT3tsQ7KCGHyfVGPdgHL3RD+CcAExERHADcFwrXSkoyof/GUlGSSjBc\nF/J7W+IdlJDD5HqjHuyDF7oh/BOAiYgIDgDukxu3t tQM92g7WKXiR+K1LVsaZb+3Jd5BCTlMrjfq\nwT54oRvCPwGYiIjgAOCFDN4U2d2e3NfRGYvt6gpseyWyRvZ/W+IdlJDD5HqjHuyDF7oh/BOAiYgI\nDgBeEGpo69wT64hEorFoW4MsRlviHZSQw+R6ox7sgxe6IfwTgImICA4AAAD4DBEcIIID\nRHAARHCACA4AAACACA4QwQEiOAAiOEAEBwAAAEAEBwAAAIjg\nAIjgABEcIIIDIIIDgEel4gejkY5oLBaNRLvjKTHaEu+ghBwm1xv1YB+80A3hnwBM\nRERwAHA9gPft2r4vtG1nNBJ5qV3q2rH3XMr/bYl3UEIOk+uNerAPXuiG8E8AJiIiOAC4n8D7uw9f\nrmuqk9WP5XBzzcWD+wdSfm9LvIMScphcb9SDffBCN4R/AjAREcEBwH2JeFyRgiFJ+0SSaqRkfFjx\ne1viHZSQw+R6ox7sgxe6IfwTgImICA4A7lNS5pWSzApKIpnwe1viHZSQw+R6ox7sgxe6IfwTgImI\nCA4A7pNk2fRZZhKXJcnvbYl3UEIOk+uNerAPXuiG8E8AJiIiOAC4LxSulZRkQv+NpaIklWC4LuT3\ntsQ7KCGHyfVGPdgHL3RD+CcAExERHADcJzdubakZ7tF2sErFj8RrW7Y0yn5vS7yDEnKYXG/Ug33w\nQjeEfwIwERHBAcALGbwpsrs9ua+jMxbb1RXY9kpkjez/tsQ7KCGHyfVGPdgHL3RD+CcAExERHAC8\nINTQ1rkn1hGJRGPRtgZZjLbEOyghh8n1Rj3YBy90Q/gnABMRERwAAADwGSI4QAQHiOAA\niOAAERwARCCtWLF48dIVSyVOBQDASejepeqF4o/uLuxCQQQHADtlZOT69SsjVxROBQDA\nSeLSFfVC8eHVwi4URPBy8alm amoqrVE0qVTqpubGjRsTmuvXryc1Cc01zfj4+FWbcZNrJuq/Uv+5\n+nPUn6b+WPWHpzSTGrXpT21u2TBeAABAYETwcqHmWj2C6ylcTcP5p3A9Z88Yvo38rdJ/lBHB1Yb0\n/K22TvIGAABEcJRXBNdXwfU1aTUZ2/O3To/gRrY2Urjxpz2FJ6aZV8H1JXAjgltWwW85YbAR\nHCLQ87e+8q1mbjUfGyFbjdRXrly5fPnyqGZkZOSSZlgzNDT0geaCif6VDz/88KOPPtL/HNKo33/x\n4kX1J6g/R/2BY2NjekZX29JXxNXW9SA+ZWKO44wUAAAggkMQ+iq4ynEV3FgIz7EK7liIYq9F0QtR\nLLXgllVwvTOO69+kcACY0Y/moFh9+LtCMXwAEby8IrgqWy24UYiSoxbcSOG5q1D0QhQ9fxuFKPrb\nMc214MZ/Cczhm/wNAACI4BCHnryNzK1HbTUu68HaKET55JNPPv74Y70WRa9C0UtNshWiGFUo+p96\nIYr6b9Wfo/60sbEx9YfrodxYEU9PM78107IozngBAAAiOHwv9yq4eSE8Oc1Y1bYsgZsXwvU18mw7\nEtrfjmmp/LaEb4YJAAAQwSFUBLfvCz6XQhTj6+ZClGz7glsiuGMteIBCcPhWKn4wGumIxmLRSLQ7\nnhKjLfEOSshhytb+J+/806FDv/gn9Y+3LlwvefNTYwO9v+vt/d3RI2++1f+JIvZL0jsdYCLy16gR\nwcuF/kZMNQebC1HUrGwuRFF
[01/15] madlib-site git commit: jupyter notebooks for 1.14 release
Repository: madlib-site Updated Branches: refs/heads/notebook-updates-1dot14 [created] 3f849b9e4 http://git-wip-us.apache.org/repos/asf/madlib-site/blob/3f849b9e/community-artifacts/mlp-v3.ipynb -- diff --git a/community-artifacts/mlp-v3.ipynb b/community-artifacts/mlp-v3.ipynb new file mode 100644 index 000..8c585a6 --- /dev/null +++ b/community-artifacts/mlp-v3.ipynb @@ -0,0 +1,4584 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Multilayer Perceptron\n", +"\n", +"Multilayer Perceptron (MLP) is a type of neural network that can be used for regression and classification.\n", +"\n", +"This version of the workbook includes mini-batching which was added in the 1.14 release." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { +"scrolled": true + }, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Classification without Mini-Batching\n", +"\n", +"# 1. Create input table for classification" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "52 rows affected.\n", + "52 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "attributes\n", + "class_text\n", + "class\n", + "state\n", + "\n", + "\n", + "1\n", + "[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "2\n", + "[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "3\n", + "[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "4\n", + "[Decimal('4.4'), Decimal('3.0'), Decimal('1.3')
[08/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Minibatch-preprocessor-v1.ipynb -- diff --git a/community-artifacts/Minibatch-preprocessor-v1.ipynb b/community-artifacts/Minibatch-preprocessor-v1.ipynb new file mode 100644 index 000..fe03a27 --- /dev/null +++ b/community-artifacts/Minibatch-preprocessor-v1.ipynb @@ -0,0 +1,1330 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Mini-batch preprocessor\n", +"\n", +"The mini-batch preprocessor is a utility that prepares input data for use by models that support mini-batch as an optimization option. (This is currently only the case for Neural Networks.) It is effectively a packing operation that builds arrays of dependent and independent variables from the source data table.\n", +"\n", +"The mini-batch preprocessor was added in MADlib 1.14." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data\n", +"Based on the well known iris dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "52 rows affected.\n", + "52 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "attributes\n", + "class_text\n", + "class\n", + "state\n", + "\n", + "\n", + "1\n", + "[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "2\n", + "[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "3\n", + "[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", +
[13/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Decision-trees-v1.ipynb -- diff --git a/community-artifacts/Decision-trees-v1.ipynb b/community-artifacts/Decision-trees-v1.ipynb new file mode 100644 index 000..e97b943 --- /dev/null +++ b/community-artifacts/Decision-trees-v1.ipynb @@ -0,0 +1,1590 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Decision trees\n", +"\n", +"A decision tree is a supervised learning method that can be used for classification and regression. It consists of a structure in which internal nodes represent tests on attributes, and the branches from nodes represent the result of those tests. Each leaf node is a class label and the paths from root to leaf nodes define the set of classification or regression rules." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: fmcquillan@madlib'" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"#%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Decision tree classification examples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data\n", +"Data set related to whether to play golf or not." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "14 rows affected.\n", + "14 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "OUTLOOK\n", + "temperature\n", + "humidity\n", + "Temp_Humidity\n", + "clouds_airquality\n", + "windy\n", + "class\n", + "observation_weight\n", + "\n", + "\n", + "1\n", + "sunny\n", + "85.0\n", + "85.0\n", + "[85.0, 85.0]\n", + "[u'none', u'unhealthy']\n", + "False\n", + "Don't Play\n", + "5.0\n", + "\n", + "\n", + "2\n", + "sunny\n", + "80.0\n", + "90.0\n", + "[80.0, 90.0]\n", + "[u'none', u'moderate']\n", + "True\n", + "Don't Play\n", + "5.0\n", + "\n", + "\n", + "3\n", + "overcast\n", + "83.0\n", + "78.0\n", + "[83.0, 78.0]\n", + "[u'low', u'moderate']\n", + "False\n", + "Play\n", + "1.5\n", + "\n", + "\n", + "4\n", + "rain\n", + "70.0\n", + "96.0\n", + "[70.0, 96.0]\n", + "[u'low', u'moderate']\n", + "False\n", + "Play\n", + "1.0\n", + "\n", + "\n", + "5\n", + "rain\n", + "68.0\n", + "80.0\n", + "[68.0, 80.0]\n", + "[u'medium', u'good']\n", + "False\n", + "Play\n", + "1.0\n", + "\n", + "\n", + "6\n", + "rain\n", + "65.0\n", + "70.0\n", + "[65.0, 70.0]\n", + "[u'low', u'unhealthy']\n", + "True\n", + "Don't Play\n", + "1.0\n", + "\n", + "\n", + "7\n", + "overcast\n", + "64.0\n", + "65.0\n", +
[01/15] madlib-site git commit: jupyter notebooks for 1.14 release
Repository: madlib-site Updated Branches: refs/heads/asf-site 4fe8cfb2f -> 418f361cf http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/mlp-v3.ipynb -- diff --git a/community-artifacts/mlp-v3.ipynb b/community-artifacts/mlp-v3.ipynb new file mode 100644 index 000..8c585a6 --- /dev/null +++ b/community-artifacts/mlp-v3.ipynb @@ -0,0 +1,4584 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Multilayer Perceptron\n", +"\n", +"Multilayer Perceptron (MLP) is a type of neural network that can be used for regression and classification.\n", +"\n", +"This version of the workbook includes mini-batching which was added in the 1.14 release." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { +"scrolled": true + }, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Classification without Mini-Batching\n", +"\n", +"# 1. Create input table for classification" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "52 rows affected.\n", + "52 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "attributes\n", + "class_text\n", + "class\n", + "state\n", + "\n", + "\n", + "1\n", + "[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "2\n", + "[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "3\n", + "[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')]\n", + "Iris_setosa\n", + "1\n", + "Alaska\n", + "\n", + "\n", + "4\n", + "[Decimal('4.4'), Decimal('3.0'), Decimal('1.3'), Decimal('
[05/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Summary-v1.ipynb -- diff --git a/community-artifacts/Summary-v1.ipynb b/community-artifacts/Summary-v1.ipynb deleted file mode 100644 index 57c3611..000 --- a/community-artifacts/Summary-v1.ipynb +++ /dev/null @@ -1,1026 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. To reload it, use:\n", - " %reload_ext sql\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: fmcquillan@madlib'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum 4.3.10.0\n", -"# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", -"\n", -"# PostgreSQL local\n", -"%sql postgresql://fmcquillan@localhost:5432/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.12, git revision: unknown, cmake configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.12, git revision: unknown, cmake configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang',)]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 1. On-line help" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "summary\n", - "\n", - "\n", - "'summary' is a generic function used to produce summary statisticsof any data table. The function invokes particular 'methods' fromthe MADlib library to provide an overview of the data.---For an overview on usage, run:SELECT madlib.summary('usage'); ---For an example, run:SELECT madlib.summary('example')\n", - "\n", - "" - ], - "text/plain": [ - "[(u\"\\n'summary' is a generic function used to produce summary statistics\\nof any data table. The function invokes particular 'methods' from\\nthe MADlib library to provide an overview of the data.\\n---\\nFor an overview on usage, run:\\nSELECT madlib.summary('usage');\\n ---\\nFor an example, run:\\nSELECT madlib.summary('example')\\n\",)]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql SELECT * FROM madlib.summary();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 2. Load data\n", -"Using part of the popular iris data set from https://archive.ics.uci.edu/ml/datasets/iris"; - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "30 rows affected.\n", - "30 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id\n", - "sepal_length\n", - "sepal_width\n", - "petal_length\n", - "petal_width\n", - "class_name\n", - "\n", - "\n", - "1\n", - "5.1\n", - "3.5\n", - "1.4\n", - "0.2\n", - "Iris-setosa\n", - "\n", - "\n", -
[09/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/MLP.ipynb -- diff --git a/community-artifacts/MLP.ipynb b/community-artifacts/MLP.ipynb deleted file mode 100644 index dcd0cdb..000 --- a/community-artifacts/MLP.ipynb +++ /dev/null @@ -1,514 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"- This demo uses the popular MNIST dataset, which consists of 70,000 hand written digits and is used for \n", -"classification.\n", -"\n", -"## Current best accuracy on postgres\n", -"\n", -"### train_accuracy\n", -"\n", -"- 99.64%\n", -"\n", -"### test_accuracy\n", -"\n", -"- 96.79%\n", -"\n", -"### Parameters\n", -"- Hidden layers: [200,200], tanh activation, n_iterations=10, learning_rate_init=0.001, learning_rate_policy=constant, lambda=0.0001, tolerance=0" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: csloan@postgres'" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%load_ext sql\n", -"%sql postgresql://csloan@localhost:5432/postgres" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "DROP TABLE\n", - "CREATE TABLE\n", - "COPY 6\n", - "DROP TABLE\n", - "CREATE TABLE\n", - "COPY 1\n" - ] -} - ], - "source": [ -"%%bash\n", -"# Note that these datasets are available from https://github.com/apache/incubator-madlib-site\n";, -"gunzip -c ../data/mnist_train.sql.gz > ../data/mnist_train.sql\n", -"gunzip -c ../data/mnist_test.sql.gz > ../data/mnist_test.sql\n", -"psql -f ../data/mnist_train.sql\n", -"psql -f ../data/mnist_test.sql" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ -{ - "data": { - "image/png": "iVBORw0KGgoNSUhEUgAAA9YAAAGDCAIAAABBVx+IAABFLklEQVR42u3df3AU953n/7ElukHM\nGCOBsSQ2TFivZZ8FxsjEsXyxoZxS4iOK76I9fw/OW18l32/B9+vvslvl2dqUtZVjvHVfcanyuGqB\nS1Wou0TZJOZqK3IdYflWzMYb4hxKiKzlh0UBwpYH2xLIFgIxg2BaI/Pt6Y7aTXfPaDQaTXd/5vn4\ng5GE0OfT/Zn59IuP3vPpO28BKKE7AwBKiAgOEMEBIjgAIjgAAABABAcA\nAABABAcAAACI4AARHAARHCCCAwAAACCCA0ARpYb7jmT3zihnCADK55IQz3FJ6DmX\nIIIDQHEk+mJ/ld2+gRSnCADKxcUj0RyXhB8VdkkgggMlRQQHgGykx1/5xVG7V5plzg0AlItw\n+8/sV4Jf/fTPVxHBAWB+MngoJDvgxABAOXG4EIRC0px+ZOER/A4AyILpGgCAeYngAIjg\nQBlE8FsAoGFKBQCgRBEcABEcAEonFT8YjXREY7FoJNodT4nRlngHJeQwud6oB/vghW4I/wRg\nIiKCA4DrAbxv1/Z9oW07o5HIS+1S146951L+b0u8gxJymFxv1IN98EI3hH8CMBERwQHA/QTe3334\ncl1TXWa3cDncXHPx4P55u4F9ydoS76CEHCbXG/VgH7zQDeGfAExERHAAcF8iHlek4B/u0SBJNVIy\nPqz4vS3xDkrIYXK9UQ/2wQvdEP4JwEREBAcA9ykp80pJZgUlkUz4vS3xDkrIYXK9UQ/2wQvdEP4J\nwEREBAcA90m33bA+M4nLkuT3tsQ7KCGHyfVGPdgHL3RD+CcAExERHADcFwrXSkoyof/GUlGSSjBc\nF/J7W+IdlJDD5HqjHuyDF7oh/BOAiYgIDgDukxu3t tQM92g7WKXiR+K1LVsaZb+3Jd5BCTlMrjfq\nwT54oRvCPwGYiIjgAOCFDN4U2d2e3NfRGYvt6gpseyWyRvZ/W+IdlJDD5HqjHuyDF7oh/BOAiYgI\nDgBeEGpo69wT64hEorFoW4MsRlviHZSQw+R6ox7sgxe6IfwTgImICA4AAAD4DBEcIIID\nRHAARHCACA4AAACACA4QwQEiOAAiOEAEBwAAAEAEBwAAAIjg\nAIjgABEcIIIDIIIDgEel4gejkY5oLBaNRLvjKTHaEu+ghBwm1xv1YB+80A3hnwBM\nRERwAHA9gPft2r4vtG1nNBJ5qV3q2rH3XMr/bYl3UEIOk+uNerAPXuiG8E8AJiIiOAC4n8D7uw9f\nrmuqk9WP5XBzzcWD+wdSfm9LvIMScphcb9SDffBCN4R/AjAREcEBwH2JeFyRgiFJ+0SSaqRkfFjx\ne1viHZSQw+R6ox7sgxe6IfwTgImICA4A7lNS5pWSzApKIpnwe1viHZSQw+R6ox7sgxe6IfwTgImI\nCA4A7pNk2fRZZhKXJcnvbYl3UEIOk+uNerAPXuiG8E8AJiIiOAC4LxSulZRkQv+NpaIklWC4LuT3\ntsQ7KCGHyfVGPdgHL3RD+CcAExERHADcJzdubakZ7tF2sErFj8RrW7Y0yn5vS7yDEnKYXG/Ug33w\nQjeEfwIwERHBAcALGbwpsrs9ua+jMxbb1RXY9kpkjez/tsQ7KCGHyfVGPdgHL3RD+CcAExERHAC8\nINTQ1rkn1hGJRGPRtgZZjLbEOyghh8n1Rj3YBy90Q/gnABMRERwAAADwGSI4QAQHiOAA\niOAAERwARCCtWLF48dIVSyVOBQDASejepeqF4o/uLuxCQQQHADtlZOT69SsjVxROBQDA\nSeLSFfVC8eHVwi4URPBy8alm amoqrVE0qVTqpubGjRsTmuvXryc1Cc01zfj4+FWbcZNrJuq/Uv+5\n+nPUn6b+WPWHpzSTGrXpT21u2TBeAABAYETwcqHmWj2C6ylcTcP5p3A9Z88Yvo38rdJ/lBHB1Yb0\n/K22TvIGAABEcJRXBNdXwfU1aTUZ2/O3To/gRrY2Urjxpz2FJ6aZV8H1JXAjgltWwW85YbAR\nHCLQ87e+8q1mbjUfGyFbjdRXrly5fPnyqGZkZOSSZlgzNDT0geaCif6VDz/88KOPPtL/HNKo33/x\n4kX1J6g/R/2BY2NjekZX29JXxNXW9SA+ZWKO44wUAAAggkMQ+iq4ynEV3FgIz7EK7liIYq9F0QtR\nLLXgllVwvTOO69+kcACY0Y/moFh9+LtCMXwAEby8IrgqWy24UYiSoxbcSOG5q1D0QhQ9fxuFKPrb\nMc214MZ/Cczhm/wNAACI4BCHnryNzK1HbTUu68HaKET55JNPPv74Y70WRa9C0UtNshWiGFUo+p96\nIYr6b9Wfo/60sbEx9YfrodxYEU9PM78107IozngBAAAiOHwv9yq4eSE8Oc1Y1bYsgZsXwvU18mw7\nEtrfjmmp/LaEb4YJAAAQwSFUBLfvCz6XQhTj6+ZClGz7glsiuGMteIBCcPhWKn4wGumIxmLRSLQ7\nnhKjLfEOSshhytb+J+/806FDv/gn9Y+3LlwvefNTYwO9v+vt/d3RI2++1f+JIvZL0jsdYCLy16gR\nwcuF/kZMNQebC1HUrGwuRFF
[02/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/mlp-v2.ipynb -- diff --git a/community-artifacts/mlp-v2.ipynb b/community-artifacts/mlp-v2.ipynb deleted file mode 100644 index 145b3e2..000 --- a/community-artifacts/mlp-v2.ipynb +++ /dev/null @@ -1,3755 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Multilayer Perceptron" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": { -"scrolled": true - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. To reload it, use:\n", - " %reload_ext sql\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": {}, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: gpdbchina@madlib'" - ] - }, - "execution_count": 118, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum 4.3.10.0\n", -"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", -"\n", -"# PostgreSQL local\n", -"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"\n", -"# Greenplum 4.2.3.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 119, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 2.0-dev, git revision: rel/v1.12-9-gf790a61, cmake configuration time: Tue Sep 19 17:56:02 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 2.0-dev, git revision: rel/v1.12-9-gf790a61, cmake configuration time: Tue Sep 19 17:56:02 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 119, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 1. Create input table for classification" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "52 rows affected.\n", - "52 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id\n", - "attributes\n", - "class_text\n", - "class\n", - "state\n", - "\n", - "\n", - "1\n", - "[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "2\n", - "[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "3\n", - "[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), Decimal('0.1')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "4\n", - "[Decimal('4.4'), Decimal('3.0'), Decimal('1.3'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "5\n", - "[Decimal('5.1'), Decimal('3.4'), Decimal('1.5'), Decimal('0.2')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "6\n", - "[Decimal('5.0'), Decimal('3.5'), Decimal('1.3'), Decimal('0.3')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "7\n", - "[Decimal('4.5'), Decimal('2.3'), Decimal('1.3'), Decimal('0.3')]\n", - "Iris_setosa\n", - "1\n", - "Alaska\n", - "\n", - "\n", - "8\n", - "
[07/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/PageRank-v1.ipynb -- diff --git a/community-artifacts/PageRank-v1.ipynb b/community-artifacts/PageRank-v1.ipynb deleted file mode 100644 index 32b1caf..000 --- a/community-artifacts/PageRank-v1.ipynb +++ /dev/null @@ -1,774 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# PageRank\n", -"The PageRank algorithm produces a probability distribution representing the likelihood that a person randomly traversing a graph will arrive at any particular vertex. PageRank was added in MADlib 1.11." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", - " \"You should import from traitlets.config instead.\", ShimWarning)\n", - "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", - " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: fmcquillan@madlib'" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"# Greenplum 4.3.10.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", -"\n", -"# PostgreSQL local\n", -"%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"\n", -"# Greenplum 4.2.3.0\n", -"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.11-dev, git revision: rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.11-dev, git revision: rc/v1.9alpha-rc1-138-gcc5ce09, cmake configuration time: Tue Apr 11 20:47:30 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# 1. Create vertex and edge tables" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "Done.\n", - "7 rows affected.\n", - "22 rows affected.\n", - "22 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "src\n", - "dest\n", - "user_id\n", - "\n", - "\n", - "0\n", - "1\n", - "1\n", - "\n", - "\n", - "0\n", - "2\n", - "2\n", - "\n", - "\n", - "0\n", - "2\n", - "1\n", - "\n", - "\n", - "0\n", - "4\n", - "2\n", - "\n", - "\n", - "0\n", - "4\n", - "1\n", - "\n", - "\n", - "0\n", - "1\n", - "2\n", - "\n", - "\n", - "1\n", - "3\n", - "1\n", - "\n", - "\n", - "1\n", - "3\n", - "2\n", - "\n", - "\n", - "1\n", - "2\n", -
[10/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/LDA-v1.ipynb -- diff --git a/community-artifacts/LDA-v1.ipynb b/community-artifacts/LDA-v1.ipynb new file mode 100644 index 000..19a199c --- /dev/null +++ b/community-artifacts/LDA-v1.ipynb @@ -0,0 +1,2034 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Latent Dirichlet Allocation \n", +"\n", +"Latent Dirichlet Allocation (LDA) is a generative probabilistic model for natural texts. It is used in problems such as automated topic discovery, collaborative filtering, and document classification.\n", +"\n", +"In addition to an implementation of LDA, this MADlib module also provides a number of additional helper functions to interpret results of the LDA output." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-15-g7ffad03, cmake configuration time: Wed Feb 21 01:33:31 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-15-g7ffad03, cmake configuration time: Wed Feb 21 01:33:31 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Prepare documents\n", +"The examples below are short strings extracted from various Wikipedia documents. First we create a document table with one document per row:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "4 rows affected.\n", + "4 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "docid\n", + "contents\n", + "\n", + "\n", + "0\n", + "Statistical topic models are a class of Bayesian latent variable models, originally developed for analyzing the semantic content of large document corpora.\n", + "\n", + "\n", + "1\n", + "By the late 1960s, the balance between pitching and hitting had swung in favor of the pitchers. In 1968 Carl Yastrzemski won the American League batting title with an average of just .301, the lowest in history.\n", + "\n", + "\n", + "2\n", + "Machine learning is closely related to and often overlaps with computational statistics; a discipline that also specializes in prediction-making. It has strong ties to mathematica
[03/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/mlp-mnist-v2.ipynb -- diff --git a/community-artifacts/mlp-mnist-v2.ipynb b/community-artifacts/mlp-mnist-v2.ipynb new file mode 100644 index 000..3c1ad14 --- /dev/null +++ b/community-artifacts/mlp-mnist-v2.ipynb @@ -0,0 +1,1154 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Neural networks\n", +"\n", +"Multilayer perceptron (MLP) using the well known MNIST data set.\n", +"\n", +"Updated to include mini-batching which was added in the 1.14 release.\n", +"\n", +"# Intro" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "data": { + "image/jpeg": "/9j/4R5fRXhpZgAATU0AKggABwESAAMBAAEAAAEaAAUBYgEbAAUB\nagEoAAMBAAIAAAExAAIccgEyAAIUjodpAAQBpNAACvyA\nAAAnEAAK/IAAACcQQWRvYmUgUGhvdG9zaG9wIENTNSBXaW5kb3dzADIwMTU6MDc6MjQgMTA6NTk6\nNTEAA6ABAAMBAAEAAKACAAQBAAACoKADAAQBAAABcwAGAQMAAwAA\nAAEABgAAARoABQEAAAEeARsABQEAAAEmASgAAwEAAgAAAgEABAEAAAEuAgIA\nBAEAAB0pAEgBSAH/2P/tAAxBZG9iZV9DTQAB/+4ADkFkb2JlAGSA\nAf/bAIQADAgICAkIDAkJDBELCgsRFQ8MDA8VGBMTFRMTGBEMDAwMDAwRDAwMDAwMDAwMDAwM\nDAwMDAwMDAwMDAwMDAwMDAENCwsNDg0QDg4QFA4ODhQUDg4ODhQRDAwMDAwREQwMDAwMDBEMDAwM\nDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwM/8AAEQgAWACgAwEiAAIRAQMRAf/dAAQACv/EAT8AAAEF\nAQEBAQEBAAMAAQIEBQYHCAkKCwEAAQUBAQEBAQEAAQACAwQFBgcICQoLEAAB\nBAEDAgQCBQcGCAUDDDMBAAIRAwQhEjEFQVFhEyJxgTIGFJGhsUIjJBVSwWIzNHKC0UMHJZJT8OHx\nY3M1FqKygyZEk1RkRcKjdDYX0lXiZfKzhMPTdePzRieUpIW0lcTU5PSltcXV5fVWZnaGlqa2xtbm\n9jdHV2d3h5ent8fX5/cRAAICAQIEBAMEBQYHBwYF NQEAAhEDITESBEFRYXEiEwUygZEUobFCI8FS\n0fAzJGLhcoKSQ1MVY3M08SUGFqKygwcmNcLSRJNUoxdkRVU2dGXi8rOEw9N14/NGlKSFtJXE1OT0\npbXF1eX1VmZ2hpamtsbW5vYnN0dXZ3eHl6e3x//aAAwDAQACEQMRAD8A8+yL677Bjsa4VU+1pr5c\nZ997qHex2/8AkvqVmrEDunZFjr2OqqurbVZqNjnh+/1G7fUZQ7+bf/w3p2s/PR+o9Lz+kVMuusdm\nY1h9mVSG2UknX0vXs3Pqs/kW1/8AFb0OjM9YNu2FhYSygXuL91h/wTXN9KnZs/nv0P8AwX+EVwRH\nERI1OvlP8v0WrxXEGGsL+Yfi6n1b+qNudU578dz6GkObbYfTZPZ9su2V4j/of4Sy/wDwXpLQ+tXS\nOidDbVl0Xs6hmOALNAcdriZda9u57MjZ7fRxv5j/AE3s/QrAOX1nIoi8bq2GHAlpx/3dwZLfsX/W\n/wBH/wAUhWU+ptPUdlTK2D1Qx732j6XpsrbvuZ6j2t9m5VZ8vmMwYnhiP3gYxr+t+jxf3nRx83y8\ncUoyjxEg7cMpX/e+fh/2f/VHP9PJ6hkvfY51j3AudY4+0azue7/BsRsu+uPtNW71AQL9hibOPVtd\n9JzLo3Naz9H/ADnvVhl7La/s2JS19FmjaCSywfvW2tZ77/67LX/9bUR07Ira66k1uraD6raSCA3u\n17bW73s/66rAgRHT1dZS31aEpgy19P7sfBosyxbIbtxbXab2NAYf65hz6/625W+k5WS3Os6dkOc6\nvPY7FtredwBcZoewas3svbW6t6rkV2GWFjmNGrbWw5o8fUq3OsZ/L/z1ZxaDe0Vh4FteuPaTIkcU\nG5m5vu/wPqemm2QQbutfCX9X/CZBAT9OwP8AzfH/AAXO22Y73V3s3ODi19TuxB2O/lMf7UW+trqz\nbS42AfSB+nWB/pGt+kz/AId ns/4pdD9b+gHBycfMqE09Rory2gCNrntHrV/9ublzM2UvFlbi17T7\nSNCFBDNGVgai6/rNjLy04Ue4v+qUYrLQCSId2lELQGyHNDDoDrz3Uox8ru3HyI+Fdh/6nHf/AOAf\n8Qp112Ue8si9mlbXabf+HP8A6K/z/wDBqQD7O7AT337Mr7NlDGma3NeQA0agDbE/y91aLVjV5YL2\nAG0iLGAR/wChDG/+fa0G5jWUVusJBcBHftvc7/PuR6qMmusCtjqrLoDCdH7eXP8A3m7voMTrAJMv\nlA1WxgZUIfMTo3cetwz2VUVB9bQC+x2jWNI29/5KNlYFLb6rWPDgAfc7RrgP39v57VY6b0rqTG2W\nZ9fpYEB92Y7QV/ubv9I6z6Laq/0j0LO6sy+jKq6QQPQaLG3Fv6Z1YivJZW1381Tsd6/s9/6L9Ips\nOXBkxmQPFqTED+r+7+8s5jl8+HJwyiYmgDekfV++hf0NptORfdXTV/g6LPa6I3fzbvd/nIb8zpWP\nXNFbr36tftbtAP8AJts3PZ/Yas3EzH7ybALHO/Odq6f3nOSNFlgL2PLGEk+72Nn+S523d/ZQOSNf\nq4Cz39Ulvtm6nIkDt6Ypm9YbUf1WivHIduFhm2wT/wAJb/5FA+1W+u251jrTIIJ4Gu72fu+5RdTc\n0x6lTncwXt/79tU62vYSL2+m0CZI0I/kOHtUfFM6GwB4cMf+9ZOGI1Gv1uX/AHz/AP/Q5Tp+bfhd\nQNFDt+Na4NdU8B9dlbiN1N1T5rtb+Yl9Yei04edQ2uxuLhWVCzFYSS5u91nrV/n3Xena3+d/0foq\n5iYWFTe1vrnLvYQ5lTKHtqY8wd5teTZbWzd7WVM+mq31nysPL6kxl15NePWzGZ6bdzy6ou9d3q2P\npZ7rbH/yFp5Yj27IF8VCzXp/d/uubCROb02Bwni0/wAU/wB5yrc/ErtF1Hvyg0NdkWNhriNN9dbC\n51Vm3/ WtTr6jk10NfXY5jQTuLX7AS76Xs9rfcosxeie5nqZFV4gtN7Q2l2vua5+P9otr9v0H7LP+\ntqX2BmVWa6wxkRse29trASf8I2WWVb/9a1XByEmqvX0w/a2CMdCwfOf7GVGRg5FzXiaskNg2D3F3\n9lv538v1N607KMT0baKrWnMsbNo8h+839/8A8+f8YsNmBkY9pY7HvfYzuGnaD/J27t3tVlvSC8Nf\nYy0thpLnVlsal7/e76Wytjv7akhOfCYmAJOh/R/lJbMR6TqOhH6X8otW3Cdj2OBvre6r6fpl7XVu\n/wBGfVrZ/nfzX/CKz07GnIrscfQfIhzSNjp/lsOyh/8AX/RP/wCCTMz8k+y5hlsltpbuewk7thkO\nd6as/YnmoZEso3fzdm4sZaZ1Yxz9/p2/29n/ABagnhE4nhvby4Wzgz+3IGVb+fE+kk9Mz/q2K7nl\n2fgsdFThHtmYdW4ek7a1eddV+r2RWW5Lsd+NjWFw9dw2tLgRuZTjndfb6e737G+/9J+4rGH1HMbf\nXiZRsqJikWD81p09LLriux1Tf33s/Rfy6lW64d/Ucm6x9hvLx7gXBhaGtb+lDHOv/M2+p9BV+X5I\nwuc5cW0K+T5dpzv9Nu81z8cg9vGOEEyy6+vh498eLh4f1fF6nOpq6I2zZZ9otaP5zIY5lZYP324l\njLPV2/ufaP0v/BqfUS/CvOLfGTjPa1+JazQmp38zbjv9zms/0mO/ez1vU/wqDk5mdG0trfXyWFu+\nQPzv0+65zW/1lp9KnqeIMWzF9O7FJtwDXvAe4/pLMT3+vt9X+ep/m6vUVqIs8EdD/d08v0uL/Ccy\nZI9ctY9fV/zv0eFp5NIOTQ3HLbW4VMbZAO8Tb7mE+/8ASWN/fVn6vY9Z6gcnqdr8XDxh6+Vbtmxw\nnayqhr/5zJybXbGfufpL/wCbpeh19LynMyLL22V22kMHqV7LPcd9n6K1zf8AO9VXX9N zMTCrxBi2\nZFYd9ozi8gMBINePW6zcfTtopc+2yqv+bfk/ziWXAcg2MRLXv/U4aX4OaGGYIIkYn6/vcTf+sX15\nd1ikYNdBqopG3GoZ7obG0b3aufZs/PXJY9jsPKrue4M2E7gDLy1w2WAbPou2O/PVi+uut+y7
[06/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Random-forest-v1.ipynb -- diff --git a/community-artifacts/Random-forest-v1.ipynb b/community-artifacts/Random-forest-v1.ipynb new file mode 100644 index 000..bac8363 --- /dev/null +++ b/community-artifacts/Random-forest-v1.ipynb @@ -0,0 +1,2899 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Random forest\n", +"\n", +"Random forests build an ensemble of classifiers, each of which is a tree model constructed using bootstrapped samples from the input data. The results of these models are then combined to yield a single prediction, which, at the expense of some loss in interpretation, have been found to be highly accurate.\n", +"\n", +"Please also refer to the decision tree user documentation for information relevant to the implementation of random forests in MADlib." + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-40-ga1360f3, cmake configuration time: Wed Mar 28 18:16:08 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-40-ga1360f3, cmake configuration time: Wed Mar 28 18:16:08 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Random forest classification examples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data\n", +"Data set related to whether to play golf or not." + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "14 rows affected.\n", + "14 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "OUTLOOK\n", + "temperature\n", + "humidity\n", + "Temp_Humidity\n", + "clouds_airquality\n", + "windy\n", + "class\n", + "\n", + "\n", + "1\n", + "sunny\n", + "85.0\n", + "85.0\n", + "[85.0, 85.0]\n", + "[u'none', u'unhealthy']\n", + "False\n", + "Don't Play\n", + "\n", + "\n", + "2\n", + "sunny\n", + "80.0\n", + "90.0\n", + "[80.0, 90.0]\n", + "[u'none', u'moderate']\n", + "True\n", + "Don't Play\n", + "\n", + "\n", + "3\n", + "overcast\n", + "83.0\n", + "78.0\n", + "[83.0, 78.0]\n", + "[u'low', u'moderate']\n", + "False\n", + "Play\n", + "\n", + "\n", + "4\n", + "rain\n", +
[04/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Term-frequency-v1.ipynb -- diff --git a/community-artifacts/Term-frequency-v1.ipynb b/community-artifacts/Term-frequency-v1.ipynb new file mode 100644 index 000..99a0cd0 --- /dev/null +++ b/community-artifacts/Term-frequency-v1.ipynb @@ -0,0 +1,1062 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Term Frequency\n", +"Term frequency computes the number of times that a word or term occurs in a document. Term frequency is often used as part of a larger text processing pipeline, which may include operations such as stemming, stop word removal and topic modelling." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: fmcquillan@madlib'" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum 4.3.10.0\n", +"# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n", +"\n", +"# PostgreSQL local\n", +"%sql postgresql://fmcquillan@localhost:5432/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.13, git revision: unknown, cmake configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.13, git revision: unknown, cmake configuration time: Wed Dec 20 08:02:21 UTC 2017, build type: Release, build system: Darwin-17.3.0, C compiler: Clang, C++ compiler: Clang',)]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Prepare documents\n", +"First we create a document table with one document per row:" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "4 rows affected.\n", + "4 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "docid\n", + "contents\n", + "\n", + "\n", + "0\n", + "I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.\n", + "\n", + "\n", + "1\n", + "Chinchillas and kittens are cute.\n", + "\n", + "\n", + "2\n", + "My sister adopted two kittens yesterday.\n", + "\n", + "\n", + "3\n", + "Look at this cute hamster munching on a piece of broccoli.\n", + "\n", + "" + ], + "text/plain": [ + "[(0, u'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),\n", + " (1, u'Chinchillas and kittens are cute.'),\n", + " (2, u'My sister adopted two kittens yesterday.'),\n", + " (3, u'Look at this cute hamster munching on a piece of broccoli.')]" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%%sql\n", +"DROP TABLE IF EXISTS documents;\n", +"CREATE TABLE documents(docid INT4, contents TEXT);\n", +"\n", +"INSERT INTO documents VALUES\n", +"(0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),\n", +"(1, 'Chinchillas and kittens are cute.'),\n", +"(2, 'My sister adopted two kittens yesterday.'),\n", +"(3, 'Look at this cute hamster munching on a piece of broccoli.');\n", +"\n", +"SELECT * from documents ORDER BY docid;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"You can
[11/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Encoding-categorical-variables-v2.ipynb -- diff --git a/community-artifacts/Encoding-categorical-variables-v2.ipynb b/community-artifacts/Encoding-categorical-variables-v2.ipynb new file mode 100644 index 000..5e4cb6f --- /dev/null +++ b/community-artifacts/Encoding-categorical-variables-v2.ipynb @@ -0,0 +1,4026 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Encoding categorical variables\n", +"This is the new module that replaces create_indicator_variables() which was deprecated as of MADlib v1.10" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ +{ + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated. You should import from traitlets.config instead.\n", + " \"You should import from traitlets.config instead.\", ShimWarning)\n", + "/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.\n", + " warn(\"IPython.utils.traitlets has moved to a top-level traitlets package.\")\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-21-g3af2d70, cmake configuration time: Mon Feb 26 18:00:54 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-21-g3af2d70, cmake configuration time: Mon Feb 26 18:00:54 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"## 1. Load data set\n", +"Use a subset of the abalone dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "20 rows affected.\n", + "20 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "sex\n", + "length\n", + "diameter\n", + "height\n", + "rings\n", + "\n", + "\n", + "1\n", + "M\n", + "0.455\n", + "0.365\n", + "0.095\n", + "15\n", + "\n", + "\n", + "2\n", + "M\n", + "0.35\n", + "0.265\n", + "0.09\n", + "7\n", + "\n", + "\n", + "3\n", + "F\n", + "0.53\n", + "0.42\n", + "0.135\n", + "9\n", + "\n", + "\n", + "4\n", + "M\n", + "0.44\n", + "0.365\n", + "0.125\n", + "10\n", + "\n", + "\n", + "5\n", + "I\n", + "0.33\n", + "0.255\n", + "0.08\n", + "7\n", + "\n", + "\n", +
[15/15] madlib-site git commit: jupyter notebooks for 1.14 release
jupyter notebooks for 1.14 release Project: http://git-wip-us.apache.org/repos/asf/madlib-site/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib-site/commit/418f361c Tree: http://git-wip-us.apache.org/repos/asf/madlib-site/tree/418f361c Diff: http://git-wip-us.apache.org/repos/asf/madlib-site/diff/418f361c Branch: refs/heads/asf-site Commit: 418f361cffb2d03563634047cb2a26a4c1b71caf Parents: 4fe8cfb Author: Frank McQuillan Authored: Mon Apr 23 14:56:06 2018 -0700 Committer: Frank McQuillan Committed: Mon Apr 23 16:14:50 2018 -0700 -- community-artifacts/Balanced-sampling-v1.ipynb | 3706 ++ community-artifacts/Decision-trees-v1.ipynb | 1590 ++ ...coding-categorical-variables-1dot10-v1.ipynb | 2748 --- .../Encoding-categorical-variables-v2.ipynb | 4026 +++ community-artifacts/LDA-v1.ipynb| 2034 community-artifacts/MLP.ipynb | 514 -- .../Minibatch-preprocessor-v1.ipynb | 1330 + community-artifacts/PageRank-v1.ipynb | 774 --- community-artifacts/PageRank-v2.ipynb | 889 community-artifacts/Random-forest-v1.ipynb | 2899 +++ community-artifacts/Summary-v1.ipynb| 1026 community-artifacts/Summary-v2.ipynb| 1017 community-artifacts/Term-frequency-v1.ipynb | 1062 community-artifacts/kNN-v2.ipynb| 751 --- community-artifacts/kNN-v3.ipynb| 857 community-artifacts/mlp-mnist-v2.ipynb | 1154 + community-artifacts/mlp-v2.ipynb| 3755 -- community-artifacts/mlp-v3.ipynb| 4584 ++ images/neural-net-head.jpg | Bin 0 -> 326157 bytes 19 files changed, 25148 insertions(+), 9568 deletions(-) --
[14/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Balanced-sampling-v1.ipynb -- diff --git a/community-artifacts/Balanced-sampling-v1.ipynb b/community-artifacts/Balanced-sampling-v1.ipynb new file mode 100644 index 000..5f6ec23 --- /dev/null +++ b/community-artifacts/Balanced-sampling-v1.ipynb @@ -0,0 +1,3706 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# Balanced sampling\n", +"\n", +"This module offers a number of re-sampling techniques including under-sampling majority classes, over-sampling minority classes, and combinations of the two.\n", +"\n", +"Balanced sampling was added in MADlib 1.14." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "The sql extension is already loaded. To reload it, use:\n", + " %reload_ext sql\n" + ] +} + ], + "source": [ +"%load_ext sql" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ +{ + "data": { + "text/plain": [ + "u'Connected: gpadmin@madlib'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"# Greenplum Database 5.4.0 on GCP (demo machine)\n", +"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n", +"\n", +"# PostgreSQL local\n", +"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", +"\n", +"# Greenplum Database 4.3.10.0\n", +"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "1 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "version\n", + "\n", + "\n", + "MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n", + "\n", + "" + ], + "text/plain": [ + "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-22-g0bfcaf5, cmake configuration time: Wed Mar 14 21:35:16 UTC 2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7',)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" +} + ], + "source": [ +"%sql select madlib.version();\n", +"#%sql select version();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +"# 1. Load data\n", +"Based in part on the flags data set from https://archive.ics.uci.edu/ml/datasets/Flags"; + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ +{ + "name": "stdout", + "output_type": "stream", + "text": [ + "Done.\n", + "Done.\n", + "22 rows affected.\n", + "22 rows affected.\n" + ] +}, +{ + "data": { + "text/html": [ + "\n", + "\n", + "id\n", + "name\n", + "landmass\n", + "zone\n", + "area\n", + "population\n", + "language\n", + "colours\n", + "mainhue\n", + "\n", + "\n", + "1\n", + "Argentina\n", + "2\n", + "3\n", + "2777\n", + "28\n", + "2\n", + "2\n", + "blue\n", + "\n", + "\n", + "2\n", + "Australia\n", + "6\n", + "2\n", + "7690\n", + "15\n", + "1\n", + "3\n", + "blue\n", + "\n", + "\n", + "8\n", + "Greece\n", + "3\n", + "1\n", + "132\n", + "10\n", + "6\n", + "2\n", + "blue\n", + "\n", + "\n", + "9\n", + "Guatemala\n", + "1\n", + "4\n", + "109\n", + "8\n", + "2\n", + "2\n", + "blue\n", + "\n", + "\n", + "17\n", + "Sweden\n", + "3\n", + "1\n", + "450\n", + "8\n", + "
[12/15] madlib-site git commit: jupyter notebooks for 1.14 release
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/418f361c/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb -- diff --git a/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb b/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb deleted file mode 100644 index 409de20..000 --- a/community-artifacts/Encoding-categorical-variables-1dot10-v1.ipynb +++ /dev/null @@ -1,2748 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"# Encoding categorical variables (MADlib v1.10+)\n", -"This is the new module that replaces create_indicator_variables() which has been deprecated as of MADlib v1.10" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "The sql extension is already loaded. To reload it, use:\n", - " %reload_ext sql\n" - ] -} - ], - "source": [ -"%load_ext sql" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "data": { - "text/plain": [ - "u'Connected: gpdbchina@madlib'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql postgresql://gpdbchina@10.194.10.68:55000/madlib\n", -"#%sql postgresql://fmcquillan@localhost:5432/madlib\n", -"#%sql postgresql://gpadmin@54.197.30.46:10432/gpadmin" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "1 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "version\n", - "\n", - "\n", - "MADlib version: 1.10.0-dev, git revision: rel/v1.9.1-47-g2d5a5ed, cmake configuration time: Tue Feb 7 19:45:19 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n", - "\n", - "" - ], - "text/plain": [ - "[(u'MADlib version: 1.10.0-dev, git revision: rel/v1.9.1-47-g2d5a5ed, cmake configuration time: Tue Feb 7 19:45:19 UTC 2017, build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0',)]" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" -} - ], - "source": [ -"%sql select madlib.version();\n", -"#%sql select version();" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ -"## 1. Load data set\n", -"Use a subset of the abalone dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { -"collapsed": false - }, - "outputs": [ -{ - "name": "stdout", - "output_type": "stream", - "text": [ - "Done.\n", - "Done.\n", - "20 rows affected.\n", - "20 rows affected.\n" - ] -}, -{ - "data": { - "text/html": [ - "\n", - "\n", - "id\n", - "sex\n", - "length\n", - "diameter\n", - "height\n", - "rings\n", - "\n", - "\n", - "1\n", - "M\n", - "0.455\n", - "0.365\n", - "0.095\n", - "15\n", - "\n", - "\n", - "2\n", - "M\n", - "0.35\n", - "0.265\n", - "0.09\n", - "7\n", - "\n", - "\n", - "3\n", - "F\n", - "0.53\n", - "0.42\n", - "0.135\n", - "9\n", - "\n", - "\n", - "4\n", - "M\n", - "0.44\n", - "0.365\n", - "0.125\n", - "10\n", - "\n", - "\n", - "5\n", - "I\n", - "0.33\n", - "0.255\n", - "0.08\n", - "7\n", - "\n", - "\n", - "6\n", - "I\n", - "0.425\n", - "0.3\n", - "0.095\n", - "8\n", - "\n", - "\n", - "7\n", - "F\n", - "0.53\n", - "0.415\n", - "0.15\n", - "20\n", - "\n", - "\n", - "8\n", - "F\n", - "
madlib git commit: Release 1.14: Update version numbers and support upgrading to v1.14
Repository: madlib Updated Branches: refs/heads/master 0e1161c0d -> 1c81cb102 Release 1.14: Update version numbers and support upgrading to v1.14 Update the version number to 1.14 for the release candidate. Update the changelists and other related files for upgrade. Update the upgrade_util to ensure PG 10 support. Simplify the _get_existing_uda function since it is not possible to define an aggregate without any arguments. Note that upgrade is not supported from versions prior to 1.11. Co-authored-by: Nikhil Kak Closes #266 Project: http://git-wip-us.apache.org/repos/asf/madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/1c81cb10 Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/1c81cb10 Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/1c81cb10 Branch: refs/heads/master Commit: 1c81cb1027decfa5634f8bab902e664c2b720abd Parents: 0e1161c Author: Orhan Kislal Authored: Mon Apr 23 16:59:11 2018 -0700 Committer: Orhan Kislal Committed: Mon Apr 23 16:59:11 2018 -0700 -- deploy/postflight.sh | 2 +- doc/mainpage.dox.in | 1 + pom.xml | 2 +- src/config/Version.yml| 2 +- src/madpack/changelist_1.12_1.13.yaml | 14 ++--- src/madpack/changelist_1.13_1.14.yaml | 99 ++ src/madpack/template_changelist.yaml | 58 + src/madpack/upgrade_util.py | 57 ++--- 8 files changed, 203 insertions(+), 32 deletions(-) -- http://git-wip-us.apache.org/repos/asf/madlib/blob/1c81cb10/deploy/postflight.sh -- diff --git a/deploy/postflight.sh b/deploy/postflight.sh index 8e108c9..87c6099 100755 --- a/deploy/postflight.sh +++ b/deploy/postflight.sh @@ -2,7 +2,7 @@ # $0 - Script Path, $1 - Package Path, $2 - Target Location, and $3 - Target Volume -MADLIB_VERSION=1.14-dev +MADLIB_VERSION=1.14 find $2/usr/local/madlib/bin -type d -exec cp -RPf {} $2/usr/local/madlib/old_bin \; 2>/dev/null find $2/usr/local/madlib/bin -depth -type d -exec rm -r {} \; 2>/dev/null http://git-wip-us.apache.org/repos/asf/madlib/blob/1c81cb10/doc/mainpage.dox.in -- diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index b93f23c..384d21b 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -17,6 +17,7 @@ Useful links: https://mail-archives.apache.org/mod_mbox/madlib-user/";>User mailing list https://mail-archives.apache.org/mod_mbox/madlib-dev/";>Dev mailing list User documentation for earlier releases: +v1.14, v1.13, v1.12, v1.11, http://git-wip-us.apache.org/repos/asf/madlib/blob/1c81cb10/pom.xml -- diff --git a/pom.xml b/pom.xml index fb8e6aa..5a3ba8f 100644 --- a/pom.xml +++ b/pom.xml @@ -22,7 +22,7 @@ org.apache.madlib madlib - 1.14-dev + 1.14 pom http://git-wip-us.apache.org/repos/asf/madlib/blob/1c81cb10/src/config/Version.yml -- diff --git a/src/config/Version.yml b/src/config/Version.yml index cc18d2a..2ec314d 100644 --- a/src/config/Version.yml +++ b/src/config/Version.yml @@ -1 +1 @@ -version: 1.14-dev +version: 1.14 http://git-wip-us.apache.org/repos/asf/madlib/blob/1c81cb10/src/madpack/changelist_1.12_1.13.yaml -- diff --git a/src/madpack/changelist_1.12_1.13.yaml b/src/madpack/changelist_1.12_1.13.yaml index 5c44c74..0e6c3df 100644 --- a/src/madpack/changelist_1.12_1.13.yaml +++ b/src/madpack/changelist_1.12_1.13.yaml @@ -59,16 +59,16 @@ udf: argument: character varying, character varying, character varying, character varying - __knn_validate_src: rettype: integer -argument: varchar, varchar, varchar, varchar, varchar, varchar, varchar, varchar, integer +argument: character varying, character varying, character varying, character varying, character varying, character varying, character varying, character varying, integer - knn: -rettype: varchar -argument: varchar, varchar, varchar, varchar, varchar, varchar, varchar, varchar, integer +rettype: character varying +argument: character varying, character varying, character varying, character varying, character varying, character varying, character varying, character varying, integer - knn: -rettype: varchar -argument: varchar, varchar, varchar, varchar, varchar, varchar, varchar, varchar +rettype: character varying +argument: character varying, character varying, character varying, character varying, character varying, character varying, chara
[madlib-site] Git Push Summary
Repository: madlib-site Updated Branches: refs/heads/notebook-updates-1dot14 [deleted] 3f849b9e4