[13/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Elastic-net-v3.ipynb
--
diff --git a/community-artifacts/Elastic-net-v3.ipynb 
b/community-artifacts/Elastic-net-v3.ipynb
new file mode 100644
index 000..7592fe6
--- /dev/null
+++ b/community-artifacts/Elastic-net-v3.ipynb
@@ -0,0 +1,2049 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Elastic net (MADlib v1.10+)\n",
+"Demonstrates elastic net, including these updates:\n",
+"- in MADlib 1.10: grouping and cross validation introduced \n",
+"- in MADlib 1.13: report negative root mean squared error instead of the 
negative mean squared error"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.15-dev, git revision: 
rc/1.14-rc1-23-gabafa66, cmake configuration time: Wed Jul 11 00:36:05 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-23-gabafa66, 
cmake configuration time: Wed Jul 11 00:36:05 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"## 1.  Create data set\n",
+"House prices and characteristics."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "27 rows affected.\n",
+  "27 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "tax\n",
+   "bedroom\n",
+   "bath\n",
+   "price\n",
+   "size\n",
+   "lot\n",
+   "zipcode\n",
+   "\n",
+   "\n",
+   "1\n",
+   "590\n",
+   "2\n",
+   "1.0\n",
+   "5\n",
+   "770\n",
+   "22100\n",
+   "94301\n",
+   "\n",
+   "\n",
+   "2\n",
+   "1050\n",
+   "3\n",
+   "2.0\n",
+   "85000\n",
+   "1410\n",
+   "12000\n",
+   "94301\n",
+   "\n",
+   "\n",
+   "3\n",
+   "20\n",
+   "3\n",
+   "1.0\n",
+   "22500\n",
+   "1060\n",
+   "3500\n",
+   "94301\n",
+   "\n",
+   "\n",
+   "4\n",
+   "870\n",
+   "2\n",
+   "2.0\n",
+   "9\n",
+   "1300\n",
+   "17500\n",
+   "94301\n",
+   "\n",
+   "\n",
+   "5\n",
+   "1320\n",
+   "3\n",
+   "2.0\n",
+   "133000\n",
+   "1500\n",
+   "3\n",
+   "94301\n",
+   "\n",
+   "\n",
+   "6\n",
+   "1350\n",
+   "2\n",
+   "1.0\n",
+   "90500\n",
+   "820\n",
+   "25700\n",
+   "94301\n",
+   

[09/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Novelty-detection-demo-1.ipynb
--
diff --git a/community-artifacts/Novelty-detection-demo-1.ipynb 
b/community-artifacts/Novelty-detection-demo-1.ipynb
deleted file mode 100755
index 563bda4..000
--- a/community-artifacts/Novelty-detection-demo-1.ipynb
+++ /dev/null
@@ -1,478 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Novelty detection using 1-class SVM\n",
-"\n",
-"Classifies new data as similar or different to the training set.  This 
method is an unsupervised method that builds a decision boundary between the 
data and origin in kernel space and can be used as a novelty detector."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "The sql extension is already loaded. To reload it, use:\n",
-  "  %reload_ext sql\n"
- ]
-}
-   ],
-   "source": [
-"# Setup\n",
-"%load_ext sql\n",
-"# %sql postgresql://gpdbchina@10.194.10.68:55000/madlib\n",
-"%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"%matplotlib inline\n",
-"\n",
-"import pandas as pd\n",
-"import numpy as np\n",
-"import matplotlib.pyplot as plt\n",
-"import matplotlib.font_manager"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {
-"collapsed": false
-   },
-   "outputs": [
-{
- "data": {
-  "image/png": 
"iVBORw0KGgoNSUhEUgAAAW8AAAD7CAYAAAClvBX1BHNCSVQICAgIfAhkiAlwSFlz\nAAALEgAACxIB0t1+/AAAHdNJREFUeJzt3W9wXNWZ5/HvkWWRTrADso2J48SAnRk2hICMijJFaqVN\n0mqGqWhG0hvCwDSwi3Zq+WOsNiiOCHFheRUnESQwM8WYsEhhirCVYTUjZid9LZKSqkSF7LA2lJeB\nAHaGTUIYYpydGOiJsHX2xbndarW69cfq7tu3+/ep6qL76va9R23z+PRznnOOsdYiIiLhUhd0A0RE\nZOkUvEVEQkjBW0QkhBS8RURCSMFbRCSEFLxFREKovlw3MsaoJlFE5DRYa03usbL2vK21gT6++tWv\nBt6GSnnos9Bnoc8iHJ9FIUqbiIiEkIK3iEgI1VTwbm1tDboJFUOfxQx9FjP0Wcyo9M/CzJdTKeqN\njLHlupeISLUwxmCDHrAUEZHiUPAWEQkhBW8RkRBS8BYRCSEFbxGREFLwFhEJIQVvEZEQUvAWEQkh\nBW8RkRAKffD2PI+utja62trwPC/o5oiIlEWop8d7nke8o4N9qRQAvZEIwyMjxGKxot5HRCQohabH\nhzp4d7W10T42Rtx/PQyMRqM8eeBAUe8jIhKUkq9tYoypM8YcNMaMFuuaIiKSXzG3QdsO/BOwuojX\nnFd3IkF8chKy0yaJRLluLyISmKL0vI0xG4Grge8U43qLFYvFGB4ZYTQaZTQaVb5bQk2D77IURcl5\nG2O+D+wFPgwkrLXtec7Ret4iBWjwXQoplPNedtrEGPOHwL9Ya583xrQCc26Stnv37szz1tbWit+p\nQqRc9g8Osi+Vygy+k0qxf3BQwbsGjY+PMz4+vuB5y+55G2P+K3AdcBKIAKuA/2Gt/dOc89TzFilA\nlVNSSFlKBY0xLShtIrJkSptIISVLm4jI8qUH3/cPD
 
gIwnEgocMu8Qj1JR0Sk2mkDYhGRKqLgLSIS\nQgreIiIhpOAtIhJCCt4iIiGk4C0iUkKlWrNGpYIiIiVSjMlXVbkZg4hIJSvGsgeq8xYRqSKaHi8i\nUiKl3DBGaRMRkRLyPC+zZk33aaxZo5y3iEgIKectIlJFFLxFpCy0R2dxKW0iIiWnzSZOn3LeIhIY\nbfN2+pTzFhE8z6OtrYu2ti6lLkJOdd4iNcLzPDo64qRS+wCYnIwzMjJcltRFKeuda5XSJiI1oq2t\ni7GxdshKXkSjoxw48GRZ7r/ceudapQ2IRSQvz/MYHNwPQCLRXbKgGovFFLCLSMFbpEYkEt1MTsbT\nmQsikV5aWm4LLJUiy6O0iUgNye1lDw7uDzSVIgtT2kRE5qQu0oFcwkelgiI1Zu/evaxZs4U1a7aw\nYcMqIpFeXOX1MJFIL4lEd9BNlEVQ2kSkhuzdu5e77/468IB/5Hbi8Q7eeOMEUNoBSzk9mmEpIqxZ\ns4Xjx79Cdo67sXEPb7/9WpDNknlohqWILJpmYlY+BW8JBa1IN7/FBtuenhuB20nnuOF2/9jsa3V0\nxBkba2dsrJ2Ojrg+80pkrS3Lw91KZOmSyaRdH4nYIbBDYNdHIjaZTAbdrIqRTCZtJLLewpCFIRuJ\nrC/4+SSTSbt58ydtff05dtWqj9v+/v4550Sjnf61rP8YstFoZ+b9TU0ttrFxs21qulJ/DmXgx865\nMTXfwVI8FLzldHVGo3ZoJpLYIbCd0WjQzaoY8wXbbPMF+WQyaaPRThuNdtqmppY511u16mM2Ho9b\nYxotbLOQsLDaGnO2bWpqURAvoULBW3XeIjVi164BfyalG6xMpWbqvLNnWTY03MGKFTs4dSr9zp2c\nOJFiePgJ4FbgYuAOoB5r7+PQIfd+zcwsL+W8peJ1JxJuFTpclrY3EqG7BlekK5TXTiS689Zq33DD\nDaxcuZ6VK9cTjUZ54YX/k74S
 
0AU8xNGjr3LttbeQSp0PnAucy9TURk6deh94CBgF/hr4C+BDwLP+\neWcCv+c/d4FfE37KLF93vBQPlDaRZUgmk7YzGrWd0WhNfkVfKK+dnfZIJpM2Ho9bWJ053z2/0sKZ\nFtbmHE/4z8+ysM5/vm1O6sQd+4iF9VnvX28hWTBVI8uHct4i4TVfXru/v982Nm62kci5NhJZZxsb\nN1tjPphzfsIPzp/KE5Q3+wE4O2Anc4L82f41zi0Q1FfnHfyU5SsUvJU2EQmx9IzJ48e/Qir1NVIp\ny/Hjf4y19cBTWWc+A3wLl+rItQ64BvhnXKrEA2K43PhOXH77s8AQkG+i3a+Bm5mYOFicX0oWZdkD\nlsaYjcB3gfXANPCwtfaB+d8lIkuRbznXRGKYa6+9BTfVPZ519qh/bAcuDw7wsv/f7qxzD+MC8hrg\nJHAecCVwLXAO8BbwH4CngQNAJ/ATYHvWvdK59jeBnxXjV5VFKkbP+yTQY629CLgCuMUYc2ERrisi\nvlgsxsiIW641Gh1dVGVHQ0MD9fV3UV9/F5//fLM/qPkmcB2uauQRYBD4MvABXOB+BNgAvAPcCEwC\n/wnYBPwt8CX/9Xagx7/Wm6e1oJVmcS5P0dc2Mcb8LfCgtfaHOcdtse8lUuvmLjS1E9ezfpj+/rsA\nuO++RwH4whc+k1mA6ujRoxw5cgfM2s/9YeCnwDf9Y7244PwM8Aug3z/fA3ZTV/cSH/rQarZsuYCB\ngV1LKhPM3U8zEulVqWEBZVnP2xhzHnAp7ruViJRYX18fAPfdt4dU6l3AUF//fc45ZyOPPvo4R478\nM/BpAIaHn2Dz5n/HBRdcwOrVH8650mHgVVzgzk7BfA2XPpn2z/H8n+9jehpOnNjJiy++sOR2Dw7u\nz1tzruC9eEUbsDTGnAn8DbDdWvtOsa4rIrPlphv6+vp4++3XeO+9X9HXt513332XI0dW+4G7Afgz\n/3EGR45
 

[03/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/mlp-mnist-v2.ipynb
--
diff --git a/community-artifacts/mlp-mnist-v2.ipynb 
b/community-artifacts/mlp-mnist-v2.ipynb
deleted file mode 100644
index 3c1ad14..000
--- a/community-artifacts/mlp-mnist-v2.ipynb
+++ /dev/null
@@ -1,1154 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Neural networks\n",
-"\n",
-"Multilayer perceptron (MLP) using the well known MNIST data set.\n",
-"\n",
-"Updated to include mini-batching which was added in the 1.14 release.\n",
-"\n",
-"# Intro"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "image/jpeg": 
"/9j/4R5fRXhpZgAATU0AKggABwESAAMBAAEAAAEaAAUBYgEbAAUB\nagEoAAMBAAIAAAExAAIccgEyAAIUjodpAAQBpNAACvyA\nAAAnEAAK/IAAACcQQWRvYmUgUGhvdG9zaG9wIENTNSBXaW5kb3dzADIwMTU6MDc6MjQgMTA6NTk6\nNTEAA6ABAAMBAAEAAKACAAQBAAACoKADAAQBAAABcwAGAQMAAwAA\nAAEABgAAARoABQEAAAEeARsABQEAAAEmASgAAwEAAgAAAgEABAEAAAEuAgIA\nBAEAAB0pAEgBSAH/2P/tAAxBZG9iZV9DTQAB/+4ADkFkb2JlAGSA\nAf/bAIQADAgICAkIDAkJDBELCgsRFQ8MDA8VGBMTFRMTGBEMDAwMDAwRDAwMDAwMDAwMDAwM\nDAwMDAwMDAwMDAwMDAwMDAENCwsNDg0QDg4QFA4ODhQUDg4ODhQRDAwMDAwREQwMDAwMDBEMDAwM\nDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwM/8AAEQgAWACgAwEiAAIRAQMRAf/dAAQACv/EAT8AAAEF\nAQEBAQEBAAMAAQIEBQYHCAkKCwEAAQUBAQEBAQEAAQACAwQFBgcICQoLEAAB\nBAEDAgQCBQcGCAUDDDMBAAIRAwQhEjEFQVFhEyJxgTIGFJGhsUIjJBVSwWIzNHKC0UMHJZJT8OHx\nY3M1FqKygyZEk1RkRcKjdDYX0lXiZfKzhMPTdePzRieUpIW0lcTU5PSltcXV5fVWZnaGlqa2xtbm\n9jdHV2d3h5ent8fX5/cRAAICAQIEBAMEBQYHBwYF
 
NQEAAhEDITESBEFRYXEiEwUygZEUobFCI8FS\n0fAzJGLhcoKSQ1MVY3M08SUGFqKygwcmNcLSRJNUoxdkRVU2dGXi8rOEw9N14/NGlKSFtJXE1OT0\npbXF1eX1VmZ2hpamtsbW5vYnN0dXZ3eHl6e3x//aAAwDAQACEQMRAD8A8+yL677Bjsa4VU+1pr5c\nZ997qHex2/8AkvqVmrEDunZFjr2OqqurbVZqNjnh+/1G7fUZQ7+bf/w3p2s/PR+o9Lz+kVMuusdm\nY1h9mVSG2UknX0vXs3Pqs/kW1/8AFb0OjM9YNu2FhYSygXuL91h/wTXN9KnZs/nv0P8AwX+EVwRH\nERI1OvlP8v0WrxXEGGsL+Yfi6n1b+qNudU578dz6GkObbYfTZPZ9su2V4j/of4Sy/wDwXpLQ+tXS\nOidDbVl0Xs6hmOALNAcdriZda9u57MjZ7fRxv5j/AE3s/QrAOX1nIoi8bq2GHAlpx/3dwZLfsX/W\n/wBH/wAUhWU+ptPUdlTK2D1Qx732j6XpsrbvuZ6j2t9m5VZ8vmMwYnhiP3gYxr+t+jxf3nRx83y8\ncUoyjxEg7cMpX/e+fh/2f/VHP9PJ6hkvfY51j3AudY4+0azue7/BsRsu+uPtNW71AQL9hibOPVtd\n9JzLo3Naz9H/ADnvVhl7La/s2JS19FmjaCSywfvW2tZ77/67LX/9bUR07Ira66k1uraD6raSCA3u\n17bW73s/66rAgRHT1dZS31aEpgy19P7sfBosyxbIbtxbXab2NAYf65hz6/625W+k5WS3Os6dkOc6\nvPY7FtredwBcZoewas3svbW6t6rkV2GWFjmNGrbWw5o8fUq3OsZ/L/z1ZxaDe0Vh4FteuPaTIkcU\nG5m5vu/wPqemm2QQbutfCX9X/CZBAT9OwP8AzfH/AAXO22Y73V3s3ODi19TuxB2O/lMf7UW+trqz\nbS42AfSB+nWB/pGt+kz/AId
 
ns/4pdD9b+gHBycfMqE09Rory2gCNrntHrV/9ublzM2UvFlbi17T7\nSNCFBDNGVgai6/rNjLy04Ue4v+qUYrLQCSId2lELQGyHNDDoDrz3Uox8ru3HyI+Fdh/6nHf/AOAf\n8Qp112Ue8si9mlbXabf+HP8A6K/z/wDBqQD7O7AT337Mr7NlDGma3NeQA0agDbE/y91aLVjV5YL2\nAG0iLGAR/wChDG/+fa0G5jWUVusJBcBHftvc7/PuR6qMmusCtjqrLoDCdH7eXP8A3m7voMTrAJMv\nlA1WxgZUIfMTo3cetwz2VUVB9bQC+x2jWNI29/5KNlYFLb6rWPDgAfc7RrgP39v57VY6b0rqTG2W\nZ9fpYEB92Y7QV/ubv9I6z6Laq/0j0LO6sy+jKq6QQPQaLG3Fv6Z1YivJZW1381Tsd6/s9/6L9Ips\nOXBkxmQPFqTED+r+7+8s5jl8+HJwyiYmgDekfV++hf0NptORfdXTV/g6LPa6I3fzbvd/nIb8zpWP\nXNFbr36tftbtAP8AJts3PZ/Yas3EzH7ybALHO/Odq6f3nOSNFlgL2PLGEk+72Nn+S523d/ZQOSNf\nq4Cz39Ulvtm6nIkDt6Ypm9YbUf1WivHIduFhm2wT/wAJb/5FA+1W+u251jrTIIJ4Gu72fu+5RdTc\n0x6lTncwXt/79tU62vYSL2+m0CZI0I/kOHtUfFM6GwB4cMf+9ZOGI1Gv1uX/AHz/AP/Q5Tp+bfhd\nQNFDt+Na4NdU8B9dlbiN1N1T5rtb+Yl9Yei04edQ2uxuLhWVCzFYSS5u91nrV/n3Xena3+d/0foq\n5iYWFTe1vrnLvYQ5lTKHtqY8wd5teTZbWzd7WVM+mq31nysPL6kxl15NePWzGZ6bdzy6ou9d3q2P\npZ7rbH/yFp5Yj27IF8VCzXp/d/uubCROb02Bwni0/wAU/wB5yrc/ErtF1Hvyg0NdkWNhriNN9dbC\n51Vm3/
 
WtTr6jk10NfXY5jQTuLX7AS76Xs9rfcosxeie5nqZFV4gtN7Q2l2vua5+P9otr9v0H7LP+\ntqX2BmVWa6wxkRse29trASf8I2WWVb/9a1XByEmqvX0w/a2CMdCwfOf7GVGRg5FzXiaskNg2D3F3\n9lv538v1N607KMT0baKrWnMsbNo8h+839/8A8+f8YsNmBkY9pY7HvfYzuGnaD/J27t3tVlvSC8Nf\nYy0thpLnVlsal7/e76Wytjv7akhOfCYmAJOh/R/lJbMR6TqOhH6X8otW3Cdj2OBvre6r6fpl7XVu\n/wBGfVrZ/nfzX/CKz07GnIrscfQfIhzSNjp/lsOyh/8AX/RP/wCCTMz8k+y5hlsltpbuewk7thkO\nd6as/YnmoZEso3fzdm4sZaZ1Yxz9/p2/29n/ABagnhE4nhvby4Wzgz+3IGVb+fE+kk9Mz/q2K7nl\n2fgsdFThHtmYdW4ek7a1eddV+r2RWW5Lsd+NjWFw9dw2tLgRuZTjndfb6e737G+/9J+4rGH1HMbf\nXiZRsqJikWD81p09LLriux1Tf33s/Rfy6lW64d/Ucm6x9hvLx7gXBhaGtb+lDHOv/M2+p9BV+X5I\nwuc5cW0K+T5dpzv9Nu81z8cg9vGOEEyy6+vh498eLh4f1fF6nOpq6I2zZZ9otaP5zIY5lZYP324l\njLPV2/ufaP0v/BqfUS/CvOLfGTjPa1+JazQmp38zbjv9zms/0mO/ez1vU/wqDk5mdG0trfXyWFu+\nQPzv0+65zW/1lp9KnqeIMWzF9O7FJtwDXvAe4/pLMT3+vt9X+ep/m6vUVqIs8EdD/d08v0uL/Ccy\nZI9ctY9fV/zv0eFp5NIOTQ3HLbW4VMbZAO8Tb7mE+/8ASWN/fVn6vY9Z6gcnqdr8XDxh6+Vbtmxw\nnayqhr/5zJybXbGfufpL/wCbpeh19LynMyLL22V22kMHqV7LPcd9n6K1zf8AO9VXX9N
 

[14/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Elastic-net-v2.ipynb
--
diff --git a/community-artifacts/Elastic-net-v2.ipynb 
b/community-artifacts/Elastic-net-v2.ipynb
deleted file mode 100644
index b6082f0..000
--- a/community-artifacts/Elastic-net-v2.ipynb
+++ /dev/null
@@ -1,2078 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Elastic net (MADlib v1.10+)\n",
-"Demonstrates elastic net, including these updates:\n",
-"- in MADlib 1.10: grouping and cross validation which were introduced \n",
-"- in MADlib 1.13: report negative root mean squared error instead of the 
negative mean squared error"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stderr",
- "output_type": "stream",
- "text": [
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
-  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
-  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: gpdbchina@madlib'"
-  ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum 4.3.10.0\n",
-"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"\n",
-"# Greenplum 4.2.3.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.13-dev, git revision: 
rel/v1.12-42-gedc93f5, cmake configuration time: Fri Dec  8 18:28:18 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.13-dev, git revision: rel/v1.12-42-gedc93f5, 
cmake configuration time: Fri Dec  8 18:28:18 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
-  ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"## 1.  Create data set\n",
-"House prices and characteristics."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Done.\n",
-  "Done.\n",
-  "27 rows affected.\n",
-  "27 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "id\n",
-   "tax\n",
-   "bedroom\n",
-   "bath\n",
-   "price\n",
-   "size\n",
-   "lot\n",
-   "zipcode\n",
-   "\n",
-   "\n",
-   "1\n",
-   "590\n",
-   "2\n",
-   "1.0\n",
-   "5\n",
-   "770\n",
-   "22100\n",
-   "94301\n",
-   "\n",
-   "\n",
-   "2\n",
-   "1050\n",
-   "3\n",
-   "2.0\n",
-   "85000\n",
-   "1410\n",
-   "12000\n",
-   "94301\n",
-   "\n",
-   "\n",
-   "3\n",
-   "20\n",
-   "3\n",
-   "1.0\n",
-   "22500\n",
-   "1060\n",
-   "3500\n",
-   "94301\n",
-   "\n",
-   "\n",
-   "4\n",
-   "870\n",
-   "2\n",
-   "2.0\n",
-   "9\n",

[04/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Stratified-sampling-v2.ipynb
--
diff --git a/community-artifacts/Stratified-sampling-v2.ipynb 
b/community-artifacts/Stratified-sampling-v2.ipynb
new file mode 100644
index 000..daa417b
--- /dev/null
+++ b/community-artifacts/Stratified-sampling-v2.ipynb
@@ -0,0 +1,672 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Stratified sampling\n",
+"Stratified sampling is a method for sampling subpopulations (strata) 
independently. It is commonly used to reduce sampling error by ensuring that 
subgroups are adequately represented in the sample.\n",
+"\n",
+"Stratified sampling was added in MADlib 1.12."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+"scrolled": true
+   },
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpdbchina@madlib'"
+  ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"#%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.12-dev, git revision: 
rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, 
cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
+  ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1.  Create input table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "25 rows affected.\n",
+  "25 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id1\n",
+   "id2\n",
+   "gr1\n",
+   "gr2\n",
+   "\n",
+   "\n",
+   "1\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "2\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "3\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "4\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "5\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "6\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "7\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "8\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "9\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "9\n",
+   "0\n",
+   "1\n",
+   "1\n",
+   "\n",
+   "\n",
+   "9\n",
+   "0\n",
+   "  

[01/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
Repository: madlib-site
Updated Branches:
  refs/heads/asf-site 5fa1ac070 -> acd339f65


http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/stratified-sampling-v1.ipynb
--
diff --git a/community-artifacts/stratified-sampling-v1.ipynb 
b/community-artifacts/stratified-sampling-v1.ipynb
deleted file mode 100644
index 75e02fd..000
--- a/community-artifacts/stratified-sampling-v1.ipynb
+++ /dev/null
@@ -1,672 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Stratified sampling\n",
-"Stratified sampling is a method for sampling subpopulations (strata) 
independently. It is commonly used to reduce sampling error by ensuring that 
subgroups are adequately represented in the sample.\n",
-"\n",
-"Stratified sampling was added in MADlib 1.12."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-"scrolled": true
-   },
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "The sql extension is already loaded. To reload it, use:\n",
-  "  %reload_ext sql\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: gpdbchina@madlib'"
-  ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum 4.3.10.0\n",
-"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"\n",
-"# Greenplum 4.2.3.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.12-dev, git revision: 
rel/v1.11-23-gfdf7b6d, cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.12-dev, git revision: rel/v1.11-23-gfdf7b6d, 
cmake configuration time: Wed Jun 28 18:06:35 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
-  ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# 1.  Create input table"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Done.\n",
-  "Done.\n",
-  "25 rows affected.\n",
-  "25 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "id1\n",
-   "id2\n",
-   "gr1\n",
-   "gr2\n",
-   "\n",
-   "\n",
-   "1\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-   "2\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-   "3\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-   "4\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-   "5\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-   "6\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-   "7\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-   "8\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-   "9\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-   "9\n",
-   "0\n",
-   "1\n",
-   "1\n",
-   "\n",
-   "\n",
-

[06/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/SVM-novelty-detection-v2.ipynb
--
diff --git a/community-artifacts/SVM-novelty-detection-v2.ipynb 
b/community-artifacts/SVM-novelty-detection-v2.ipynb
new file mode 100755
index 000..678d7c9
--- /dev/null
+++ b/community-artifacts/SVM-novelty-detection-v2.ipynb
@@ -0,0 +1,511 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Novelty detection using 1-class SVM\n",
+"\n",
+"Classifies new data as similar or different to the training set.  This 
method is an unsupervised method that builds a decision boundary between the 
data and origin in kernel space and can be used as a novelty detector."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+"collapsed": true
+   },
+   "outputs": [],
+   "source": [
+"# Setup\n",
+"%matplotlib inline\n",
+"\n",
+"import pandas as pd\n",
+"import numpy as np\n",
+"import matplotlib.pyplot as plt\n",
+"import matplotlib.font_manager"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "image/png": 
"iVBORw0KGgoNSUhEUgAAAW8AAAD7CAYAAAClvBX1BHNCSVQICAgIfAhkiAlwSFlz\nAAALEgAACxIB0t1+/AAAHfZJREFUeJzt3X9wXOV97/H3IwulS2yMZTmG4OCACOMCHiOby3DHnWsN\nYXcZOlUq6x9CSJUfjSZznZofx0RQU6IEcYkTtvnVtB6RTKzAMPQmvmrVznSPlXbEHTE3vQk2lDhQ\niIcyIQZSYXJBwyay0XP/eM6uVqtdayWt9uzZ/bxmdtgfZ88+LObjZ7/Pj2OstYiISLQ0hd0AERFZ\nPIW3iEgEKbxFRCJI4S0iEkEKbxGRCFJ4i4hEUHO1PsgYozmJIiJLYK01hc9VtedtrQ319oUvfCH0\nNtTKTd+Fvgt9F9H4LkpR2UREJIIU3iIiEdRQ4d3Z2Rl2E2qGvotZ+i5m6buYVevfhTlbTaWiH2SM\nrdZniYjUC2MMNuwBSxERqQyFt4hIBCm8RUQiSOEtIhJBCm+RMvm+T08iQU8ige/7YTdHGpxmm4iU\nwfd9eru7OZDJANAfizE8MkIymQy5ZVLvNNtEZBmGUikOZDL0Ar3AgUyGoVQq7GZFin65VFbVNqYS\nkcZV+Muld2JCv1yWSeEtUoY+z6N3YgLyyyaeF3KroiP/lwsAwS8XhffSKbxFypBMJhkeGcmVSoY9\nT8EjodKApTQ83/dzodynUF4RGvBdulIDlgpvaWgKlerRX5JLs+LhbYxpAn4KvGKt7SryusJbak5P\nIkHX2FiuFjsMjMbjHD5yJMxmieRUY6rgbcDPK3g+EREpoSLhbYzZBNwEfKcS5xOplj7Pc6USXK+7\nPxajT7NIJAIqUjYxxvwAeABYC3gqm0iUqBYrtaxU2WTZUwWNMX8IvG6tfdoY0wnM+5CsgYGB3P3O\nzs6av1KFNIZkMqnAlpoxPj7O+Pj4gsctu+dtjPkfw
 
K3AGSAGrAH+l7X2TwqOU89bRGSRqjJV0Biz\nC5VNREQqRhtTiYjUES3SERGpYep5i4jUEYV3hWnPYhGpBoV3BWX3yegaG3NLrru7FeAiDWqlO3IK\n7wrS1VYam351SVY1OnLaz1ukAnSlGMlXjYtPKLwrSFdbaVy6UoxUm8K7gnS1FRGB6nTkNM9bpAJ0\nUQcpVKkNz3QlHZEVpt0JZSUovEVEIkgrLEVE6ojCW0QkghTeIiIRpPAWEYkghbeISAQpvEVEIkjh\nLSISQQpvkQainQ/rhxbpiDQILeGPJq2wFGlwPYmE21s6eDwMjMbjHD5yJMxmyQK0wlJEpI4ovEUa\nRJ/nuVIJrtfdH4vRt8htSn3fJ5HoIZHoUc08ZCqbiDSQ5ex86Ps+3d29ZDIHAIjF+hkZGVbNfIWp\n5i0i8/i+Tyo1BIDn9Z01iBOJHsbGuiCvah6Pj3LkyOGVb2gDKxXeupKOSAPJD+tdu7bzwAPfyvWk\nJyZ61ZOOEIW3SIMoLHuMjd0BfI1sTzqTgVRqqGR4e14fExO92St7EYv143nDVWi5FKMBS5EG4Ps+\nt9yyh0zmEuCC4PbeRZ0jmUwyMuJKJfH4KCMjLri3b/8D1q+/jO3bOzWIWUWqeYvUucIeN9wFnAY+\njZt38hCw+AFI3/fp6rqZ6enm3DlaWu5idPQRlV4qSDVvkQaVSg0Fwd2b9+xBXODGgQFaW/+Txx4r\nHtylBjVTqSGmp7cAn82de3r67KUXqRyFt0hDmgr+mQReY8eO0ZLBnd9rL2dQc3LyDRKJHsANij7x\nxFFg4dkssjgqm4hE0GKm+LnyxseZnv5q8Mw+4G3gQ8Bqmpuf4x//8W8B12uenHyDt976T95883ec\nPn2at9/+L0D2/91LiMdf4siRw0XLJs3NHk1N7zI9/XXgWeBh4JuA5oUvVamyCdbaqtzcR4nIcqXT\naRuLbbRwyMIhG4tttOl0+qzH
 

[07/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Random-forest-v2.ipynb
--
diff --git a/community-artifacts/Random-forest-v2.ipynb 
b/community-artifacts/Random-forest-v2.ipynb
new file mode 100644
index 000..87605b7
--- /dev/null
+++ b/community-artifacts/Random-forest-v2.ipynb
@@ -0,0 +1,3082 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Random forest\n",
+"\n",
+"Random forests build an ensemble of classifiers, each of which is a tree 
model constructed using bootstrapped samples from the input data. The results 
of these models are then combined to yield a single prediction, which, at the 
expense of some loss in interpretation, have been found to be highly 
accurate.\n",
+"\n",
+"Please also refer to the decision tree user documentation for information 
relevant to the implementation of random forests in MADlib.\n",
+"\n",
+"This notebook includes impurity importance which was added in 1.15."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.15-dev, git revision: 
rc/1.14-rc1-45-g3ab7554, cmake configuration time: Wed Aug  1 18:34:10 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-45-g3ab7554, 
cmake configuration time: Wed Aug  1 18:34:10 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Random forest classification examples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1. Load data\n",
+"Data set related to whether to play golf or not."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "14 rows affected.\n",
+  "14 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "OUTLOOK\n",
+   "temperature\n",
+   "humidity\n",
+   "Temp_Humidity\n",
+   "clouds_airquality\n",
+   "windy\n",
+   "class\n",
+   "\n",
+   "\n",
+   "1\n",
+   "sunny\n",
+   "85.0\n",
+   "85.0\n",
+   "[85.0, 85.0]\n",
+   "[u'none', u'unhealthy']\n",
+   "False\n",
+   "Don't Play\n",
+   "\n",
+   "\n",
+   "2\n",
+   "sunny\n",
+   "80.0\n",
+   "90.0\n",
+   "[80.0, 90.0]\n",
+   "[u'none', 

[10/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/MLP-v4.ipynb
--
diff --git a/community-artifacts/MLP-v4.ipynb b/community-artifacts/MLP-v4.ipynb
new file mode 100644
index 000..a6b62d6
--- /dev/null
+++ b/community-artifacts/MLP-v4.ipynb
@@ -0,0 +1,4588 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Multilayer Perceptron\n",
+"\n",
+"Multilayer Perceptron (MLP) is a type of neural network that can be used 
for regression and classification.\n",
+"\n",
+"This version of the workbook includes mini-batching added in 1.14 and 
momentum added in 1.15"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+"scrolled": true
+   },
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.15-dev, git revision: 
rc/1.14-rc1-23-g5c4331d, cmake configuration time: Thu Jul  5 17:46:06 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-23-g5c4331d, 
cmake configuration time: Thu Jul  5 17:46:06 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Classification without Mini-Batching\n",
+"\n",
+"# 1.  Create input table for classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "52 rows affected.\n",
+  "52 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "attributes\n",
+   "class_text\n",
+   "class\n",
+   "state\n",
+   "\n",
+   "\n",
+   "1\n",
+   "[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), 
Decimal('0.2')]\n",
+   "Iris_setosa\n",
+   "1\n",
+   "Alaska\n",
+   "\n",
+   "\n",
+   "2\n",
+   "[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), 
Decimal('0.2')]\n",
+   "Iris_setosa\n",
+   "1\n",
+   "Alaska\n",
+   "\n",
+   "\n",
+   "3\n",
+   "[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), 
Decimal('0.1')]\n",
+   "Iris_setosa\n",
+   "1\n",
+   "Alaska\n",
+   "\n",
+   "\n",
+   "4\n",
+   "[Decimal('4.4'), Decimal('3.0'), Decimal('1.3'), 
Decimal('0.2')]\n",
+   "Iris_setosa\n",
+   "1\n",
+   "

[05/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/SVM-v1.ipynb
--
diff --git a/community-artifacts/SVM-v1.ipynb b/community-artifacts/SVM-v1.ipynb
new file mode 100644
index 000..405710d
--- /dev/null
+++ b/community-artifacts/SVM-v1.ipynb
@@ -0,0 +1,2806 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Support Vector Machines\n",
+"Support Vector Machines (SVMs) are models for regression and 
classification tasks. SVM models have two particularly desirable features: 
robustness in the presence of noisy data and applicability to a variety of data 
configurations. At its core, a linear SVM model is a hyperplane separating two 
distinct classes of data (in the case of classification problems), in such a 
way that the distance between the hyperplane and the nearest training data 
point (called the margin) is maximized. Vectors that lie on this margin are 
called support vectors. With the support vectors fixed, perturbations of 
vectors beyond the margin will not affect the model; this contributes to the 
model’s robustness. By substituting a kernel function for the usual inner 
product, one can approximate a large variety of decision boundaries in addition 
to linear hyperplanes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "The sql extension is already loaded. To reload it, use:\n",
+  "  %reload_ext sql\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum Database 4.3.10.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.15-dev, git revision: 
rc/1.14-rc1-25-gda13eb7, cmake configuration time: Tue Jul 10 21:37:52 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-25-gda13eb7, 
cmake configuration time: Tue Jul 10 21:37:52 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+"collapsed": true
+   },
+   "source": [
+"# Classification\n",
+"# 1. Create input data set"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "15 rows affected.\n",
+  "15 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "tax\n",
+   "bedroom\n",
+   "bath\n",
+   "price\n",
+   "size\n",
+   "lot\n",
+   "\n",
+   "\n",
+   "1\n",
+   "590\n",
+   "2\n",
+   "1.0\n",
+   "5\n",
+   "770\n",
+   "22100\n",
+   "\n",
+   "\n",
+   "2\n",
+   "1050\n",
+   "3\n",
+   "2.0\n",
+   "85000\n",
+   "1410\n",
+   "12000\n",
+   "\n",
+   "\n",
+   "3\n",
+   "20\n",
+   "3\n",
+   "1.0\n",
+   "22500\n",
+   "1060\n",
+   "3500\n",
+   "\n",
+   "\n",
+   "4\n",
+   "870\n",
+   "2\n",
+   "2.0\n",
+   "   

[02/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/mlp-v3.ipynb
--
diff --git a/community-artifacts/mlp-v3.ipynb b/community-artifacts/mlp-v3.ipynb
deleted file mode 100644
index 8c585a6..000
--- a/community-artifacts/mlp-v3.ipynb
+++ /dev/null
@@ -1,4584 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Multilayer Perceptron\n",
-"\n",
-"Multilayer Perceptron (MLP) is a type of neural network that can be used 
for regression and classification.\n",
-"\n",
-"This version of the workbook includes mini-batching which was added in 
the 1.14 release."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-"scrolled": true
-   },
-   "outputs": [
-{
- "name": "stderr",
- "output_type": "stream",
- "text": [
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
-  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
-  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: gpadmin@madlib'"
-  ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
-"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"\n",
-"# Greenplum Database 4.3.10.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.14-dev, git revision: 
rc/1.13-rc1-66-g4cced1b, cmake configuration time: Mon Apr 23 16:26:17 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.14-dev, git revision: rc/1.13-rc1-66-g4cced1b, 
cmake configuration time: Mon Apr 23 16:26:17 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
-  ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Classification without Mini-Batching\n",
-"\n",
-"# 1.  Create input table for classification"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Done.\n",
-  "Done.\n",
-  "52 rows affected.\n",
-  "52 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "id\n",
-   "attributes\n",
-   "class_text\n",
-   "class\n",
-   "state\n",
-   "\n",
-   "\n",
-   "1\n",
-   "[Decimal('5.0'), Decimal('3.2'), Decimal('1.2'), 
Decimal('0.2')]\n",
-   "Iris_setosa\n",
-   "1\n",
-   "Alaska\n",
-   "\n",
-   "\n",
-   "2\n",
-   "[Decimal('5.5'), Decimal('3.5'), Decimal('1.3'), 
Decimal('0.2')]\n",
-   "Iris_setosa\n",
-   "1\n",
-   "Alaska\n",
-   "\n",
-   "\n",
-   "3\n",
-   "[Decimal('4.9'), Decimal('3.1'), Decimal('1.5'), 
Decimal('0.1')]\n",
-   "Iris_setosa\n",
-   "1\n",
-   "Alaska\n",
-   "\n",
-   "\n",
-   "4\n",
-   "[Decimal('4.4'), Decimal('3.0'), Decimal('1.3'), 
Decimal('0.2')]\n",
-   "Iris_setosa\n",
-   "1\n",
-   "

[16/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Decision-trees-v1.ipynb
--
diff --git a/community-artifacts/Decision-trees-v1.ipynb 
b/community-artifacts/Decision-trees-v1.ipynb
deleted file mode 100644
index 02a60ef..000
--- a/community-artifacts/Decision-trees-v1.ipynb
+++ /dev/null
@@ -1,3051 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Decision trees\n",
-"\n",
-"A decision tree is a supervised learning method that can be used for 
classification and regression. It consists of a structure in which internal 
nodes represent tests on attributes, and the branches from nodes represent the 
result of those tests. Each leaf node is a class label and the paths from root 
to leaf nodes define the set of classification or regression rules."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stderr",
- "output_type": "stream",
- "text": [
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
-  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
-  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
-  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
- ]
-}
-   ],
-   "source": [
-"%load_ext sql"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [
-{
- "data": {
-  "text/plain": [
-   "u'Connected: gpadmin@madlib'"
-  ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
-"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
-"\n",
-"# PostgreSQL local\n",
-"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
-"\n",
-"# Greenplum Database 4.3.10.0\n",
-"#%sql postgresql://gpdbchina@10.194.10.68:61000/madlib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "1 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "version\n",
-   "\n",
-   "\n",
-   "MADlib version: 1.14, git revision: 
rc/1.13-rc1-68-g1c81cb1, cmake configuration time: Tue Apr 24 15:54:15 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
-   "\n",
-   ""
-  ],
-  "text/plain": [
-   "[(u'MADlib version: 1.14, git revision: rc/1.13-rc1-68-g1c81cb1, cmake 
configuration time: Tue Apr 24 15:54:15 UTC 2018, build type: release, build 
system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ compiler: 
g++ 4.4.7',)]"
-  ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
-}
-   ],
-   "source": [
-"%sql select madlib.version();\n",
-"#%sql select version();"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# Decision tree classification examples"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-"# 1. Load data\n",
-"Data set related to whether to play golf or not."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-{
- "name": "stdout",
- "output_type": "stream",
- "text": [
-  "Done.\n",
-  "Done.\n",
-  "14 rows affected.\n",
-  "14 rows affected.\n"
- ]
-},
-{
- "data": {
-  "text/html": [
-   "\n",
-   "\n",
-   "id\n",
-   "OUTLOOK\n",
-   "temperature\n",
-   "humidity\n",
-   "Temp_Humidity\n",
-   "clouds_airquality\n",
-   "windy\n",
-   "class\n",
-   "observation_weight\n",
-   "\n",
-   "\n",
-   "1\n",
-   "sunny\n",
-   "85.0\n",
-   "85.0\n",
-   "[85.0, 85.0]\n",
-   "[u'none', u'unhealthy']\n",
-   "False\n",
-   "Don't Play\n",
-   "5.0\n",
-   "\n",
-   "\n",
-   "2\n",
-   "sunny\n",
-   "80.0\n",
-   "90.0\n",
-   "[80.0, 90.0]\n",
-   "[u'none', 

[15/18] madlib-site git commit: update jupyter notebooks for 1dot15

2018-08-01 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/acd339f6/community-artifacts/Decision-trees-v2.ipynb
--
diff --git a/community-artifacts/Decision-trees-v2.ipynb 
b/community-artifacts/Decision-trees-v2.ipynb
new file mode 100644
index 000..5b55b03
--- /dev/null
+++ b/community-artifacts/Decision-trees-v2.ipynb
@@ -0,0 +1,3208 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Decision trees\n",
+"\n",
+"A decision tree is a supervised learning method that can be used for 
classification and regression. It consists of a structure in which internal 
nodes represent tests on attributes, and the branches from nodes represent the 
result of those tests. Each leaf node is a class label and the paths from root 
to leaf nodes define the set of classification or regression rules.\n",
+"\n",
+"This notebook includes impurity importance which was added in 1.15."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpadmin@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum Database 5.4.0 on GCP (demo machine)\n",
+"%sql postgresql://gpadmin@35.184.253.255:5432/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.15-dev, git revision: 
rc/1.14-rc1-45-g3ab7554, cmake configuration time: Wed Aug  1 18:34:10 UTC 
2018, build type: release, build system: Linux-2.6.32-696.20.1.el6.x86_64, C 
compiler: gcc 4.4.7, C++ compiler: g++ 4.4.7\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.15-dev, git revision: rc/1.14-rc1-45-g3ab7554, 
cmake configuration time: Wed Aug  1 18:34:10 UTC 2018, build type: release, 
build system: Linux-2.6.32-696.20.1.el6.x86_64, C compiler: gcc 4.4.7, C++ 
compiler: g++ 4.4.7',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Decision tree classification examples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1. Load data\n",
+"Data set related to whether to play golf or not."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "14 rows affected.\n",
+  "14 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "OUTLOOK\n",
+   "temperature\n",
+   "humidity\n",
+   "Temp_Humidity\n",
+   "clouds_airquality\n",
+   "windy\n",
+   "class\n",
+   "observation_weight\n",
+   "\n",
+   "\n",
+   "1\n",
+   "sunny\n",
+   "85.0\n",
+   "85.0\n",
+   "[85.0, 85.0]\n",
+   "[u'none', u'unhealthy']\n",
+   "False\n",
+   "Don't Play\n",
+   "5.0\n",
+   "\n",
+   "\n",
+   "2\n",
+   "sunny\n",
+   "80.0\n",
+   "90.0\n",
+   "[80.0, 90.0]\n",
+   "[u'none', u'moderate']\n",
+   "   

[1/2] madlib git commit: DT/RF: Add function to report importance scores

2018-08-01 Thread riyer
Repository: madlib
Updated Branches:
  refs/heads/master e2534e44e -> 186390f7c


DT/RF: Add function to report importance scores

JIRA: MADLIB-925

This commit adds a new MADlib function (get_var_importance) to report the
importance scores in decision tree and random forest by unnesting the
importance values along with corresponding features.

Closes #295

Co-authored-by: Rahul Iyer 
Co-authored-by: Jingyi Mei 
Co-authored-by: Orhan Kislal 


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/1aac377f
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/1aac377f
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/1aac377f

Branch: refs/heads/master
Commit: 1aac377f68d20290374c004a3a8bb2da82ab1fcc
Parents: e2534e4
Author: Nandish Jayaram 
Authored: Tue Jul 3 12:22:07 2018 -0700
Committer: Rahul Iyer 
Committed: Wed Aug 1 12:58:22 2018 -0700

--
 .../recursive_partitioning/decision_tree.cpp|  11 +-
 .../recursive_partitioning/decision_tree.hpp|   2 +-
 .../recursive_partitioning/random_forest.cpp|  15 ++
 .../recursive_partitioning/random_forest.hpp|   1 +
 .../recursive_partitioning/decision_tree.py_in  |  10 +-
 .../recursive_partitioning/decision_tree.sql_in | 102 +++---
 .../recursive_partitioning/random_forest.py_in  | 187 ++-
 .../recursive_partitioning/random_forest.sql_in | 168 +
 .../test/decision_tree.ic.sql_in|   3 +-
 .../test/decision_tree.sql_in   |  46 -
 .../test/random_forest.sql_in   |  20 +-
 .../test/unit_tests/plpy_mock.py_in |  43 +
 .../test/unit_tests/test_random_forest.py_in| 173 +
 13 files changed, 697 insertions(+), 84 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/madlib/blob/1aac377f/src/modules/recursive_partitioning/decision_tree.cpp
--
diff --git a/src/modules/recursive_partitioning/decision_tree.cpp 
b/src/modules/recursive_partitioning/decision_tree.cpp
index d249946..0a7f7a5 100644
--- a/src/modules/recursive_partitioning/decision_tree.cpp
+++ b/src/modules/recursive_partitioning/decision_tree.cpp
@@ -488,7 +488,7 @@ print_decision_tree::run(AnyType ){
 }
 
 AnyType
-get_variable_importance::run(AnyType ){
+compute_variable_importance::run(AnyType ){
 Tree dt = args[0].getAs();
 const int n_cat_features = args[1].getAs();
 const int n_con_features = args[2].getAs();
@@ -497,19 +497,12 @@ get_variable_importance::run(AnyType ){
 ColumnVector con_var_importance = ColumnVector::Zero(n_con_features);
 dt.computeVariableImportance(cat_var_importance, con_var_importance);
 
-// Variable importance is scaled to represent a percentage. Even though
-// the importance values are split between categorical and continuous, the
-// percentages are relative to the combined set.
ColumnVector combined_var_imp(n_cat_features + n_con_features);
combined_var_imp << cat_var_importance, con_var_importance;
-
-// Avoid divide by zero by adding a small number
-double total_var_imp = combined_var_imp.sum();
-double VAR_IMP_EPSILON = 1e-6;
-combined_var_imp *=  (100.0 / (total_var_imp + VAR_IMP_EPSILON));
 return combined_var_imp;
 }
 
+
 AnyType
 display_text_tree::run(AnyType ){
 Tree dt = args[0].getAs();

http://git-wip-us.apache.org/repos/asf/madlib/blob/1aac377f/src/modules/recursive_partitioning/decision_tree.hpp
--
diff --git a/src/modules/recursive_partitioning/decision_tree.hpp 
b/src/modules/recursive_partitioning/decision_tree.hpp
index ae62bfa..8cb6703 100644
--- a/src/modules/recursive_partitioning/decision_tree.hpp
+++ b/src/modules/recursive_partitioning/decision_tree.hpp
@@ -14,7 +14,7 @@ DECLARE_UDF(recursive_partitioning, 
compute_surr_stats_transition)
 DECLARE_UDF(recursive_partitioning, dt_surr_apply)
 
 DECLARE_UDF(recursive_partitioning, print_decision_tree)
-DECLARE_UDF(recursive_partitioning, get_variable_importance)
+DECLARE_UDF(recursive_partitioning, compute_variable_importance)
 DECLARE_UDF(recursive_partitioning, predict_dt_response)
 DECLARE_UDF(recursive_partitioning, predict_dt_prob)
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/1aac377f/src/modules/recursive_partitioning/random_forest.cpp
--
diff --git a/src/modules/recursive_partitioning/random_forest.cpp 
b/src/modules/recursive_partitioning/random_forest.cpp
index 70ebbaa..a12f095 100644
--- a/src/modules/recursive_partitioning/random_forest.cpp
+++ b/src/modules/recursive_partitioning/random_forest.cpp
@@ -204,6 +204,21 @@ rf_con_imp_score::run(AnyType ) {
 // 

[2/2] madlib git commit: DT/RF: Fix user doc examples

2018-08-01 Thread riyer
DT/RF: Fix user doc examples


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/186390f7
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/186390f7
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/186390f7

Branch: refs/heads/master
Commit: 186390f7c2af5ad886a4d5b77d0792b68cd3414d
Parents: 1aac377
Author: Frank McQuillan 
Authored: Wed Aug 1 12:49:10 2018 -0700
Committer: Rahul Iyer 
Committed: Wed Aug 1 12:58:44 2018 -0700

--
 .../recursive_partitioning/decision_tree.sql_in | 16 ++--
 .../recursive_partitioning/random_forest.sql_in | 12 +++-
 2 files changed, 17 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/madlib/blob/186390f7/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
--
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in 
b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index 469f1b2..5926152 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -284,14 +284,17 @@ tree_train(
   impurity_var_importance
   DOUBLE PRECISION[]. Impurity importance of each variable.
   The order of the variables is the same as
-  that of 'independent_varnames' column in the summary table (see below).
+  that of the 'independent_varnames' column in the summary table (see 
below).
 
   The impurity importance of any feature is the decrease in impurity by a
   node containing the feature as a primary split, summed over the whole
   tree. If surrogates are used, then the importance value includes the
   impurity decrease scaled by the adjusted surrogate agreement.
-  Reported importance values are normalized to sum to 100 across
-  all variables.
+  Importance values are displayed as raw values as per the 
'split_criterion'
+  parameter.
+  To see importance values normalized to sum to 100 across
+  all variables, use the importance display helper function 
+  described later on this page. 
   Please refer to [1] for more information on variable importance.
   
   
@@ -727,7 +730,7 @@ independent_var_types   | text, boolean, double 
precision
 n_folds | 0
 null_proxy  |
 
-View the impurity importance table using the helper function:
+View the normalized impurity importance table using the helper function:
 
 \\x off
 DROP TABLE IF EXISTS imp_output;
@@ -1111,10 +1114,11 @@ which shows ordering of levels of categorical variables 
'vs' and 'cyl':
 SELECT pruning_cp, cat_levels_in_text, cat_n_levels, impurity_var_importance, 
tree_depth FROM train_output;
 
 
+-[ RECORD 1 
]---+
 pruning_cp  | 0
 cat_levels_in_text  | {0,1,4,6,8}
 cat_n_levels| {2,3}
-impurity_var_importance | 
{0,51.8593201959496,10.976977929129,5.31897402755374,31.8447278473677}
+impurity_var_importance | 
{0,22.6309172500675,4.79024943310651,2.321153,13.8967382920111}
 tree_depth  | 4
 
 View the summary table:
@@ -1147,7 +1151,7 @@ independent_var_types   | integer, integer, double 
precision, double precisi
 n_folds | 0
 null_proxy  |
 
-View the impurity importance table using the helper function:
+View the normalized impurity importance table using the helper function:
 
 \\x off
 DROP TABLE IF EXISTS imp_output;

http://git-wip-us.apache.org/repos/asf/madlib/blob/186390f7/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
--
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in 
b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
index 39b6f5d..5b5a0f0 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
@@ -164,7 +164,9 @@ forest_train(training_table_name,
 Due to nature of permutation, the importance value can end up being
 negative if the number of levels for a categorical variable is small and is
 unbalanced. In such a scenario, the importance values are shifted to ensure
-that the lowest importance value is 0.
+that the lowest importance value is 0.  To see importance values 
normalized 
+to sum to 100 across all variables, use the importance display helper 
function 
+described later on this page. 
 
   
 
@@ -758,7 +760,7 @@ the variables in 'independent_varnames'

madlib git commit: DT/RF: Don't eliminate single-level cat variable

2018-08-01 Thread riyer
Repository: madlib
Updated Branches:
  refs/heads/master 20f95b33b -> e2534e44e


DT/RF: Don't eliminate single-level cat variable

JIRA: MADLIB-1258

When DT/RF is run with grouping, a subset of the groups could eliminate
a categorical variable leading to multiple issues downstream, including
invalid importance values and incorrect prediction.

This commit keeps all categorical variables (even those that contain just
one level). The accumulator state would use additional space during
tree_train for this categorical variable, even though the variable is
never consumed by the tree. This inefficiency is still preferred since
it yields clean code and error-free prediction/importance reporting.

Additional changes:
- get_expr_type (validate_args.py) has been updated to return type for
multiple expressions at the same time. This prevents calling a separate
query for each expression, thus saving time.
- Cat features are not stored per tree (in the grouping case) anymore
since the features are now consistent across trees.

Closes #301

Co-authored-by: Nandish Jayaram 


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/e2534e44
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/e2534e44
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/e2534e44

Branch: refs/heads/master
Commit: e2534e44ea36aedec843a3a7c48236d0e1104e2c
Parents: 20f95b3
Author: Rahul Iyer 
Authored: Thu Jul 26 12:17:58 2018 -0700
Committer: Rahul Iyer 
Committed: Wed Aug 1 12:51:13 2018 -0700

--
 src/modules/recursive_partitioning/DT_impl.hpp  |  91 
 .../recursive_partitioning/decision_tree.cpp|  21 +-
 .../recursive_partitioning/decision_tree.py_in  | 217 +--
 .../recursive_partitioning/random_forest.py_in  | 120 +-
 .../test/decision_tree.sql_in   |  83 +++
 .../test/random_forest.sql_in   |  46 ++--
 .../modules/utilities/validate_args.py_in   |  49 +++--
 7 files changed, 319 insertions(+), 308 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/madlib/blob/e2534e44/src/modules/recursive_partitioning/DT_impl.hpp
--
diff --git a/src/modules/recursive_partitioning/DT_impl.hpp 
b/src/modules/recursive_partitioning/DT_impl.hpp
index 69bdc88..75e4ce4 100644
--- a/src/modules/recursive_partitioning/DT_impl.hpp
+++ b/src/modules/recursive_partitioning/DT_impl.hpp
@@ -518,6 +518,7 @@ DecisionTree::expand(const Accumulator ,
 double gain = impurityGain(
 state.cat_stats.row(stats_i).
 segment(fv_index, sps * 2), sps);
+
 if (gain > max_impurity_gain){
 max_impurity_gain = gain;
 max_feat = f;
@@ -665,21 +666,29 @@ DecisionTree::pickSurrogates(
 
 // 1. Compute the max count and corresponding split threshold for
 // each categorical and continuous feature
+
 ColumnVector cat_max_thres = ColumnVector::Zero(n_cats);
 ColumnVector cat_max_count = ColumnVector::Zero(n_cats);
 IntegerVector cat_max_is_reverse = IntegerVector::Zero(n_cats);
 Index prev_cum_levels = 0;
 for (Index each_cat=0; each_cat < n_cats; each_cat++){
 Index n_levels = state.cat_levels_cumsum(each_cat) - 
prev_cum_levels;
-Index max_label;
-(cat_stats_counts.row(stats_i).segment(
-prev_cum_levels * 2, n_levels * 2)).maxCoeff(_label);
-cat_max_thres(each_cat) = static_cast(max_label / 2);
-cat_max_count(each_cat) =
-cat_stats_counts(stats_i, prev_cum_levels*2 + 
max_label);
-// every odd col is for reverse, hence i % 2 == 1 for reverse 
index i
-cat_max_is_reverse(each_cat) = (max_label % 2 == 1) ? 1 : 0;
-prev_cum_levels = state.cat_levels_cumsum(each_cat);
+if (n_levels > 0){
+Index max_label;
+(cat_stats_counts.row(stats_i).segment(
+prev_cum_levels * 2, n_levels * 
2)).maxCoeff(_label);
+
+// For each split, there are two stats =>
+//  max_label / 2 gives the split index. A floor
+// operation is unnecessary since the threshold will yield
+// the same results for n and n+0.5.
+cat_max_thres(each_cat) = static_cast(max_label / 
2);
+cat_max_count(each_cat) =
+cat_stats_counts(stats_i, prev_cum_levels*2 + 
max_label);
+// every odd col is for reverse, hence i % 2 == 1 for 
reverse index i
+ 

[2/2] madlib git commit: Utilities: Add module transform_vec_cols for column-vector conversion

2018-08-01 Thread okislal
Utilities: Add module transform_vec_cols for column-vector conversion

JIRA: MADLIB-1240

This commit adds a new SQL function called vec2cols and refactors the
current function cols2vec, providing greater integration between the two
modules. We now have a single Python file with separate classes for each
feature. We also have unified unit-tests and dev-check/install-check
tests.

The vec2cols function enables users to split up a single column into
multiple columns, given that the input column contains array entries.
For example, if the input column contained ARRAY[1, 2, 3] in one of its
rows, the output table will contain 3 different columns, one for each
element of the array.

Co-authored-by: Nandish Jayaram 
Co-authored-by: Rahul Iyer 
Co-authored-by: Nikhil Kak 
Co-authored-by: Orhan Kislal 
Co-authored-by: Frank McQuillan 

Closes #291


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/20f95b33
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/20f95b33
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/20f95b33

Branch: refs/heads/master
Commit: 20f95b33bcbd05b154a566c81958091c66258858
Parents: a0cfcf8
Author: Arvind Sridhar 
Authored: Wed Aug 1 11:22:27 2018 -0700
Committer: Orhan Kislal 
Committed: Wed Aug 1 11:22:27 2018 -0700

--
 doc/mainpage.dox.in |   1 +
 .../postgres/modules/internal/db_utils.py_in|   9 +
 .../postgres/modules/utilities/cols2vec.py_in   | 128 -
 .../postgres/modules/utilities/cols2vec.sql_in  | 345 ++---
 .../modules/utilities/test/cols2vec.sql_in  |  91 
 .../utilities/test/transform_vec_cols.ic.sql_in |  68 +++
 .../utilities/test/transform_vec_cols.sql_in| 470 ++
 .../unit_tests/test_transform_vec_cols.py_in| 226 +
 .../modules/utilities/transform_vec_cols.py_in  | 496 +++
 .../postgres/modules/utilities/utilities.py_in  |  14 +-
 .../postgres/modules/utilities/vec2cols.sql_in  | 348 +
 11 files changed, 1908 insertions(+), 288 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/doc/mainpage.dox.in
--
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index 8f97491..d174ab7 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -276,6 +276,7 @@ complete matrix stored as a distributed table.
 @defgroup grp_minibatch_preprocessing Mini-Batch Preprocessor
 @defgroup grp_pmml PMML Export
 @defgroup grp_text_utilities Term Frequency
+@defgroup grp_vec2cols Vector to Columns
 @}
 
 @defgroup grp_early_stage Early Stage Development

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/internal/db_utils.py_in
--
diff --git a/src/ports/postgres/modules/internal/db_utils.py_in 
b/src/ports/postgres/modules/internal/db_utils.py_in
index c75babf..45477ef 100644
--- a/src/ports/postgres/modules/internal/db_utils.py_in
+++ b/src/ports/postgres/modules/internal/db_utils.py_in
@@ -79,3 +79,12 @@ def quote_literal(input_str):
 return "{qd}{input_str}{qd}".format(qd=QUOTE_DELIMITER,
 input_str=input_str)
 # 
--
+
+def is_col_1d_array(source_table, col_name):
+query = """
+SELECT array_upper({0}, 2) IS NULL AS n_y
+FROM {1}
+LIMIT 1
+""".format(col_name, source_table)
+result = plpy.execute(query)
+return result[0]["n_y"]

http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/cols2vec.py_in
--
diff --git a/src/ports/postgres/modules/utilities/cols2vec.py_in 
b/src/ports/postgres/modules/utilities/cols2vec.py_in
deleted file mode 100644
index 4f2b1c9..000
--- a/src/ports/postgres/modules/utilities/cols2vec.py_in
+++ /dev/null
@@ -1,128 +0,0 @@
-# coding=utf-8
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the

[1/2] madlib git commit: Utilities: Add module transform_vec_cols for column-vector conversion

2018-08-01 Thread okislal
Repository: madlib
Updated Branches:
  refs/heads/master a0cfcf8f7 -> 20f95b33b


http://git-wip-us.apache.org/repos/asf/madlib/blob/20f95b33/src/ports/postgres/modules/utilities/vec2cols.sql_in
--
diff --git a/src/ports/postgres/modules/utilities/vec2cols.sql_in 
b/src/ports/postgres/modules/utilities/vec2cols.sql_in
new file mode 100644
index 000..989074c
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/vec2cols.sql_in
@@ -0,0 +1,348 @@
+/* --- */
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * @file vec2cols.sql_in
+ *
+ * @brief SQL functions for converting a vector column into multiple columns
+ * @date May 2016
+ *
+ */
+/* --- */
+
+m4_include(`SQLCommon.m4')
+
+/**
+@addtogroup grp_vec2cols
+
+@brief Converts a feature array in a single column of an output table into 
multiple columns.
+
+Contents
+
+Syntax
+Usage
+Examples
+
+
+
+@about
+Converts a feature array in a single column into multiple columns.
+This process can be used to reverse the function cols2vec.
+
+Given a table with a column of type array, this function will create an output
+table that splits this array into multiple columns, one per array element.
+It includes the option to name the new feature columns, and to include 
+columns from the original table in the output.
+
+@anchor vec2cols_usage
+@usage
+
+
+vec2cols(
+source_table,
+output_table,
+vector_col,
+feature_names,
+cols_to_output
+)
+
+
+\b Arguments
+
+source_table
+TEXT. Name of the table containing the source data.
+
+output_table
+TEXT. Name of the generated table containing the output. If a table with 
the
+same name already exists, an error will be returned.
+
+vector_col
+TEXT. Name of the column containing the feature array.  
+Must be a one-dimensional array.
+
+feature_names (optional)
+TEXT[]. Array of names associated with the feature array.  
+Note that this array exists in the
+summary table created by the function 'cols2vec'.  
+If the 'feature_names' array is not specified,
+column names will be automatically generated in 
+the form 'f1, f2, ..., fn'.
+
+cols_to_output (optional)
+TEXT, default NULL. Comma-separated string of column names 
+from the source table to keep in the
+output table, in addition to the feature columns.  
+To keep all columns from the source table, use '*'.
+Note: total number of columns in a table cannot exceed the 
+PostgreSQL limits.
+
+
+
+Output table
+
+The output table produced by the vec2cols function contains the following 
columns:
+
+  
+<...>
+Columns from source table, depending on which ones are kept (if 
any).
+
+  
+  
+feature columns
+Columns for each of the features in 'vector_col'.  Column type 
+will depend on the feature array type in the source table.  Column 
+naming will depend on whether the parameter 'feature_names' is used.
+  
+
+
+
+@anchor vec2cols_example
+@par Examples
+-#  Load sample data:
+
+DROP TABLE IF EXISTS golf CASCADE;
+CREATE TABLE golf (
+id integer NOT NULL,
+"OUTLOOK" text,
+temperature double precision,
+humidity double precision,
+"Temp_Humidity" double precision[],
+clouds_airquality text[],
+windy boolean,
+class text,
+observation_weight double precision
+);
+INSERT INTO golf VALUES
+(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t 
Play', 5.0),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t 
Play', 5.0),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 
'Play', 1.5),
+(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 
1.0),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 
'Don''t