[6/7] madlib-site git commit: add new workbooks for 1dot13

2017-12-08 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/Elastic-net-v2.ipynb
--
diff --git a/community-artifacts/Elastic-net-v2.ipynb 
b/community-artifacts/Elastic-net-v2.ipynb
new file mode 100644
index 0000000..b6082f0
--- /dev/null
+++ b/community-artifacts/Elastic-net-v2.ipynb
@@ -0,0 +1,2078 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Elastic net (MADlib v1.10+)\n",
+"Demonstrates elastic net, including these updates:\n",
+"- in MADlib 1.10: grouping and cross validation which were introduced \n",
+"- in MADlib 1.13: report negative root mean squared error instead of the 
negative mean squared error"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpdbchina@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum 4.3.10.0\n",
+"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum 4.2.3.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.13-dev, git revision: 
rel/v1.12-42-gedc93f5, cmake configuration time: Fri Dec  8 18:28:18 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.13-dev, git revision: rel/v1.12-42-gedc93f5, 
cmake configuration time: Fri Dec  8 18:28:18 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"## 1.  Create data set\n",
+"House prices and characteristics."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "27 rows affected.\n",
+  "27 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "tax\n",
+   "bedroom\n",
+   "bath\n",
+   "price\n",
+   "size\n",
+   "lot\n",
+   "zipcode\n",
+   "\n",
+   "\n",
+   "1\n",
+   "590\n",
+   "2\n",
+   "1.0\n",
+   "5\n",
+   "770\n",
+   "22100\n",
+   "94301\n",
+   "\n",
+   "\n",
+   "2\n",
+   "1050\n",
+   "3\n",
+   "2.0\n",
+   "85000\n",
+   "1410\n",
+   "12000\n",
+   "94301\n",
+   "\n",
+   "\n",
+   "3\n",
+   "20\n",
+   "3\n",
+   "1.0\n",
+   "22500\n",
+   "1060\n",
+   "3500\n",
+   "94301\n",
+   "\n",
+   "\n",
+   "4\n",
+   "870\n",
+   "2\n",
+   "2.0\n",
+   "9\n",
+   

[3/7] madlib-site git commit: add new workbooks for 1dot13

2017-12-08 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/kNN-v2.ipynb
--
diff --git a/community-artifacts/kNN-v2.ipynb b/community-artifacts/kNN-v2.ipynb
new file mode 100644
index 0000000..5b74e48
--- /dev/null
+++ b/community-artifacts/kNN-v2.ipynb
@@ -0,0 +1,751 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# k-Nearest Neighbors\n",
+"Finds k nearest data points to a given data point and outputs majority 
vote value of output classes in case of classification, and average value of 
target values in case of regression. KNN was first added in MADlib 1.10 and the 
interface was updated in 1.13."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: gpdbchina@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum 4.3.10.0\n",
+"%sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"#%sql postgresql://fmcquillan@localhost:5432/madlib\n",
+"\n",
+"# Greenplum 4.2.3.0\n",
+"#%sql postgresql://gpdbchina@10.194.10.68:55000/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.13-dev, git revision: 
rel/v1.12-41-g4aa0732, cmake configuration time: Tue Dec  5 20:44:49 UTC 2017, 
build type: Release, build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C 
compiler: gcc 4.4.0, C++ compiler: g++ 4.4.0\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.13-dev, git revision: rel/v1.12-41-g4aa0732, 
cmake configuration time: Tue Dec  5 20:44:49 UTC 2017, build type: Release, 
build system: Linux-2.6.18-238.27.1.el5.hotfix.bz516490, C compiler: gcc 4.4.0, 
C++ compiler: g++ 4.4.0',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1.  Load data for classification"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "9 rows affected.\n",
+  "9 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "data\n",
+   "label\n",
+   "\n",
+   "\n",
+   "1\n",
+   "[1, 1]\n",
+   "1\n",
+   "\n",
+   "\n",
+   "2\n",
+   "[2, 2]\n",
+   "1\n",
+   "\n",
+   "\n",
+   "3\n",
+   "[3, 3]\n",
+   "1\n",
+   "\n",
+   "\n",
+   "4\n",
+   "[4, 4]\n",
+   "1\n",
+   "\n",
+   "\n",
+   "5\n",
+   "[4, 5]\n",
+   "1\n",
+   "\n",
+   "\n",
+   "6\n",
+   "[20, 50]\n",
+   "0\n",
+   "\n",
+   "\n",
+   "7\n",
+   "[10, 31]\n",
+   "0\n",
+   "\n",
+   "\n",
+   "8\n",
+   "[81, 13]\n",
+   "0\n",
+   "\n",
+   "\n",
+   "9\n",
+   "[1, 111]\n",
+   "0\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   

[4/7] madlib-site git commit: add new workbooks for 1dot13

2017-12-08 Thread fmcquillan
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/95826612/community-artifacts/Logistic-regression-v1.ipynb
--
diff --git a/community-artifacts/Logistic-regression-v1.ipynb 
b/community-artifacts/Logistic-regression-v1.ipynb
new file mode 100644
index 0000000..226049d
--- /dev/null
+++ b/community-artifacts/Logistic-regression-v1.ipynb
@@ -0,0 +1,892 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# Logistic regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/config.py:13: 
ShimWarning: The `IPython.config` package has been deprecated. You should 
import from traitlets.config instead.\n",
+  "  \"You should import from traitlets.config instead.\", ShimWarning)\n",
+  
"/Users/fmcquillan/anaconda/lib/python2.7/site-packages/IPython/utils/traitlets.py:5:
 UserWarning: IPython.utils.traitlets has moved to a top-level traitlets 
package.\n",
+  "  warn(\"IPython.utils.traitlets has moved to a top-level traitlets 
package.\")\n"
+ ]
+}
+   ],
+   "source": [
+"%load_ext sql"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+{
+ "data": {
+  "text/plain": [
+   "u'Connected: fmcquillan@madlib'"
+  ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"# Greenplum 4.3.10.0\n",
+"# %sql postgresql://gpdbchina@10.194.10.68:61000/madlib\n",
+"\n",
+"# PostgreSQL local\n",
+"%sql postgresql://fmcquillan@localhost:5432/madlib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "1 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "version\n",
+   "\n",
+   "\n",
+   "MADlib version: 1.12, git revision: unknown, cmake 
configuration time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build 
system: Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang\n",
+   "\n",
+   ""
+  ],
+  "text/plain": [
+   "[(u'MADlib version: 1.12, git revision: unknown, cmake configuration 
time: Wed Aug 23 23:07:18 UTC 2017, build type: Release, build system: 
Darwin-16.7.0, C compiler: Clang, C++ compiler: Clang',)]"
+  ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+}
+   ],
+   "source": [
+"%sql select madlib.version();\n",
+"#%sql select version();"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+"# 1. Load data\n",
+"This data set is related to predicting a second heart attack given 
treatment and health factors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+{
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+  "Done.\n",
+  "Done.\n",
+  "20 rows affected.\n",
+  "20 rows affected.\n"
+ ]
+},
+{
+ "data": {
+  "text/html": [
+   "\n",
+   "\n",
+   "id\n",
+   "second_attack\n",
+   "treatment\n",
+   "trait_anxiety\n",
+   "\n",
+   "\n",
+   "1\n",
+   "1\n",
+   "1\n",
+   "70\n",
+   "\n",
+   "\n",
+   "2\n",
+   "1\n",
+   "1\n",
+   "80\n",
+   "\n",
+   "\n",
+   "3\n",
+   "1\n",
+   "1\n",
+   "50\n",
+   "\n",
+   "\n",
+   "4\n",
+   "1\n",
+   "0\n",
+   "60\n",
+   "\n",
+   "\n",
+   "5\n",
+   "1\n",
+   "0\n",
+   "40\n",
+   "\n",
+   "\n",
+   "6\n",
+   "1\n",
+   "0\n",
+   "65\n",
+   "\n",
+   "\n",
+   "7\n",
+   "1\n",
+   "0\n",
+   "75\n",
+   "\n",
+   "\n",
+   "8\n",
+   "1\n",
+   "0\n",
+   "80\n",
+   "\n",
+   "\n",
+   "9\n",
+   "1\n",
+   "0\n",
+   "70\n",
+   "\n",
+   "\n",
+   "10\n",
+   "1\n",
+   "0\n",
+   "60\n",
+   "\n",
+   "\n",
+   "11\n",

madlib git commit: Doc: Add grouping predict e.g. for lin_reg

2017-12-08 Thread njayaram
Repository: madlib
Updated Branches:
  refs/heads/master edc93f529 -> 2658b3343


Doc: Add grouping predict e.g. for lin_reg

Add example for prediction with grouping support in the linear
regression module.

Closes #209


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/2658b334
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/2658b334
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/2658b334

Branch: refs/heads/master
Commit: 2658b33434d0e7f7ce248fed64f6ab8c5329
Parents: edc93f5
Author: Frank McQuillan 
Authored: Wed Dec 6 11:04:27 2017 -0800
Committer: Nandish Jayaram 
Committed: Fri Dec 8 11:28:19 2017 -0800

--
 .../postgres/modules/regress/linear.sql_in  | 38 
 1 file changed, 38 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/madlib/blob/2658b334/src/ports/postgres/modules/regress/linear.sql_in
--
diff --git a/src/ports/postgres/modules/regress/linear.sql_in 
b/src/ports/postgres/modules/regress/linear.sql_in
index e1484db..68c9020 100644
--- a/src/ports/postgres/modules/regress/linear.sql_in
+++ b/src/ports/postgres/modules/regress/linear.sql_in
@@ -356,6 +356,44 @@ Result:
 (15 rows)
 
 
+-# Compare predicted price with actual with grouping. 
+It means a different model is used depending on the number of bedrooms.
+
+\\x OFF
+SELECT houses.*,
+   madlib.linregr_predict( m.coef,
+  ARRAY[1,tax,bath,size]
+ ) as predict,
+price -
+  madlib.linregr_predict( m.coef,
+ ARRAY[1,tax,bath,size] 
+) as residual
+FROM houses, houses_linregr_bedroom m
+WHERE houses.bedroom = m.bedroom
+ORDER BY id;
+
+Result:
+
+ id | tax  | bedroom | bath | price  | size |  lot  |     predict      |     residual
+----+------+---------+------+--------+------+-------+------------------+-------------------
+  1 |  590 |       2 |    1 |  50000 |  770 | 22100 | 43223.5393423978 |  6776.46065760222
+  2 | 1050 |       3 |    2 |  85000 | 1410 | 12000 | 111527.609949684 |  -26527.609949684
+  3 |   20 |       3 |    1 |  22500 | 1060 |  3500 | 20187.9052986341 |  2312.09470136587
+  4 |  870 |       2 |    2 |  90000 | 1300 | 17500 | 99354.9203362612 | -9354.92033626116
+  5 | 1320 |       3 |    2 | 133000 | 1500 | 30000 | 124508.080626412 |  8491.91937358756
+  6 | 1350 |       2 |    1 |  90500 |  820 | 25700 | 96640.8258367579 |  -6140.8258367579
+  7 | 2790 |       3 |  2.5 | 260000 | 2130 | 25000 | 224650.799707327 |  35349.2002926733
+  8 |  680 |       2 |    1 | 142500 | 1170 | 22000 | 138458.174652714 |  4041.82534728572
+  9 | 1840 |       3 |    2 | 160000 | 1500 | 19000 | 138650.335313722 |  21349.6646862777
+ 10 | 3680 |       4 |    2 | 240000 | 2790 | 20000 |           240000 |                 0
+ 11 | 1660 |       3 |    1 |  87000 | 1030 | 17500 | 62911.2752186594 |  24088.7247813406
+ 12 | 1620 |       3 |    2 | 118600 | 1250 | 20000 | 117007.693446414 |  1592.30655358579
+ 13 | 3100 |       3 |    2 | 140000 | 1760 | 38000 | 189203.861766403 | -49203.8617664034
+ 14 | 2070 |       2 |    3 | 148000 | 1550 | 14000 | 143322.539831869 |  4677.46016813093
+ 15 |  650 |       3 |  1.5 |  65000 | 1450 | 12000 | 82452.4386727394 | -17452.4386727394
+(15 rows)
+
+
 
 @anchor notes
 @par Note