http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__mfvsketch.html
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__mfvsketch.html 
b/docs/v1.15.1/group__grp__mfvsketch.html
new file mode 100644
index 0000000..aa76e83
--- /dev/null
+++ b/docs/v1.15.1/group__grp__mfvsketch.html
@@ -0,0 +1,183 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.14"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: MFV (Most Frequent Values)</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(initResizable);
+/* @license-end */</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(function() { init_search(); });
+/* @license-end */
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" async 
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.15.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.14 -->
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+/* @license-end */
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+$(document).ready(function(){initNavTree('group__grp__mfvsketch.html','');});
+/* @license-end */
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">MFV (Most Frequent Values)<div class="ingroups"><a 
class="el" href="group__grp__stats.html">Statistics</a> &raquo; <a class="el" 
href="group__grp__desc__stats.html">Descriptive Statistics</a> &raquo; <a 
class="el" href="group__grp__sketches.html">Cardinality 
Estimators</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li>
+<a href="#syntax">Syntax</a> </li>
+<li>
+<a href="#examples">Examples</a> </li>
+<li>
+<a href="#literature">Literature</a> </li>
+<li>
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>MFVSketch: Most Frequent Values variant of CountMin sketch, 
implemented as a UDA.</p>
+<p>Produces an n-bucket histogram for a column where each bucket counts one of 
the most frequent values in the column. The output is an array of doubles 
{value, count} in descending order of frequency; counts are approximated via 
CountMin sketches. Ties are handled arbitrarily.</p>
+<p><a class="anchor" id="syntax"></a> The MFV frequent-value UDA comes in two 
different versions:</p><ul>
+<li>a faithful implementation that preserves the approximation guarantees of 
Cormode/Muthukrishnan, <pre class="syntax">
+mfvsketch_top_histogram( col_name,
+                         n )
+</pre></li>
+<li>and a "quick and dirty" version that can do parallel aggregation in 
Greenplum at the expense of missing some of the most frequent values. <pre 
class="syntax">
+mfvsketch_quick_histogram( col_name,
+                           n )
+</pre></li>
+</ul>
+<p>In PostgreSQL the two UDAs are identical. In Greenplum, the quick version 
should produce good results unless the number of values requested is small, or 
the distribution is flat.</p>
+<dl class="section note"><dt>Note</dt><dd>This is a <a 
href="https://www.postgresql.org/docs/current/static/xaggr.html";>User Defined 
Aggregate</a> which returns the results when used in a query. Use "CREATE TABLE 
AS", with the UDA as a subquery, if the results are to be stored. This is unlike 
the usual MADlib stored procedure interface which places the results in a table 
instead of returning them.</dd></dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>Generate some data. <pre class="example">
+CREATE TABLE data(class INT, a1 INT);
+INSERT INTO data SELECT 1,1 FROM generate_series(1,10000);
+INSERT INTO data SELECT 1,2 FROM generate_series(1,15000);
+INSERT INTO data SELECT 1,3 FROM generate_series(1,10000);
+INSERT INTO data SELECT 2,5 FROM generate_series(1,1000);
+INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);
+</pre></li>
+<li>Produce a histogram of 5 bins and return the most frequent value and 
associated count in each bin. <pre class="example">
+SELECT mfvsketch_top_histogram( a1, 5 )
+FROM data;
+</pre> Result: <pre class="result">
+                mfvsketch_top_histogram
+&#160;-------------------------------------------------------------
+[0:4][0:1]={{2,15000},{1,10000},{3,10000},{5,1000},{6,1000}}
+(1 row)
+</pre></li>
+</ol>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd>This method is not usually called an MFV sketch in 
the literature; it is a natural extension of the CountMin sketch.</dd></dl>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="sketch_8sql__in.html" title="SQL functions for 
sketch-based approximations of descriptive statistics. ">sketch.sql_in</a> 
documenting the SQL functions.</p>
+<p>Module <a class="el" href="group__grp__countmin.html">CountMin 
(Cormode-Muthukrishnan)</a>. </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__minibatch__preprocessing.html
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__minibatch__preprocessing.html 
b/docs/v1.15.1/group__grp__minibatch__preprocessing.html
new file mode 100644
index 0000000..0fa58d5
--- /dev/null
+++ b/docs/v1.15.1/group__grp__minibatch__preprocessing.html
@@ -0,0 +1,462 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.14"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Mini-Batch Preprocessor</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(initResizable);
+/* @license-end */</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(function() { init_search(); });
+/* @license-end */
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" async 
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.15.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.14 -->
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+/* @license-end */
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+$(document).ready(function(){initNavTree('group__grp__minibatch__preprocessing.html','');});
+/* @license-end */
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Mini-Batch Preprocessor<div class="ingroups"><a class="el" 
href="group__grp__other__functions.html">Utilities</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b><ul>
+<li class="level1">
+<a href="#minibatch_preprocessor">Mini-Batch Preprocessor</a> </li>
+<li class="level1">
+<a href="#example">Examples</a> </li>
+<li class="level1">
+<a href="#literature">Literature</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>The mini-batch preprocessor is a utility that prepares input data for 
use by models that support mini-batch as an optimization option. (This is 
currently only the case for <a href="group__grp__nn.html">Neural Networks</a>.) 
It is effectively a packing operation that builds arrays of dependent and 
independent variables from the source data table.</p>
+<p>The advantage of using mini-batching is that it can perform better than 
stochastic gradient descent (default MADlib optimizer) because it uses more 
than one training example at a time, typically resulting in faster and smoother 
convergence [1].</p>
+<p><a class="anchor" id="minibatch_preprocessor"></a></p><dl class="section 
user"><dt>Mini-Batch Preprocessor</dt><dd>The mini-batch preprocessor has the 
following format:</dd></dl>
+<pre class="syntax">
+minibatch_preprocessor( source_table,
+                        output_table,
+                        dependent_varname,
+                        independent_varname,
+                        grouping_col,
+                        buffer_size,
+                        one_hot_encode_int_dep_var
+                        )
+</pre><p><b>Arguments</b> </p><dl class="arglist">
+<dt>source_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing input data. Can also 
be a view. </p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. Name of the output table from the preprocessor 
which will be used as input to algorithms that support mini-batching. Note that 
the arrays packed into the output table are randomized and normalized, so they 
will not match up in an obvious way with the rows in the source table. </p>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_varname </dt>
+<dd><p class="startdd">TEXT. Name of the dependent variable column. </p>
+<p class="enddd"></p>
+</dd>
+<dt>independent_varname </dt>
+<dd>TEXT. Column name or expression list to evaluate for the independent 
variable. Please note that independent variables are cast to double precision 
by the preprocessor, so categorical variables should be one-hot or dummy 
encoded as appropriate. See <a 
href="group__grp__encode__categorical.html">Encoding Categorical Variables</a> 
for more details on this. <dl class="section note"><dt>Note</dt><dd>Supported 
expressions for independent variables include:<ul>
+<li>‘ARRAY[x1,x2,x3]’, where x1, x2, and x3 are columns in the source 
table containing scalar values.</li>
+<li>Single column in the source table containing an array like ARRAY[1,2,3] or 
{1,2,3}. </li>
+</ul>
+</dd>
+<dd>
+The following forms are not currently supported:<ul>
+<li>‘x1,x2,x3’, where x1,x2,x3 are columns in source table with scalar 
values</li>
+<li>ARRAY[x1,x2] where x1 is scalar and x2 is array</li>
+<li>ARRAY[x1,x2] where both x1 and x2 are arrays</li>
+<li>ARRAY[x1] where x1 is array </li>
+</ul>
+</dd></dl>
+</dd>
+<dt>grouping_col (optional) </dt>
+<dd>TEXT, default: NULL. An expression list used to group the input dataset 
into discrete groups, which runs the preprocessing separately for each group. 
When this value is NULL, no grouping is used and a single preprocessor step is 
run for the whole data set. <dl class="section note"><dt>Note</dt><dd>If you 
plan to use grouping in model training, then you must set up the groups in the 
preprocessor exactly as you want to use them in training. </dd></dl>
+</dd>
+<dt>buffer_size (optional) </dt>
+<dd><p class="startdd">INTEGER, default: computed. Buffer size is the number 
of rows from the source table that are packed into one row of the preprocessor 
output table. The default value is computed considering size of the source 
table, number of independent variables, number of groups, and number of 
segments in the database cluster. For larger data sets, the computed buffer 
size will typically be a value in the millions. </p>
+<p class="enddd"></p>
+</dd>
+<dt>one_hot_encode_int_dep_var (optional) </dt>
+<dd><p class="startdd">BOOLEAN. default: FALSE. Flag to one-hot encode 
dependent variables that are scalar integers. This parameter is ignored if the 
dependent variable is not a scalar integer.</p>
+<dl class="section note"><dt>Note</dt><dd>The mini-batch preprocessor 
automatically encodes dependent variables that are boolean and character types 
such as text, char and varchar. However, scalar integers are a special case 
because they can be used in both classification and regression problems, so you 
must tell the mini-batch preprocessor whether you want to encode them or not. 
In the case that you have already encoded the dependent variable yourself, you 
can ignore this parameter. Also, if you want to encode float values for some 
reason, cast them to text first.  </dd></dl>
+</dd>
+</dl>
+<p><b>Output tables</b> <br />
+ The output table produced by the mini-batch preprocessor contains the 
following columns: </p><table class="output">
+<tr>
+<th>__id__ </th><td>INTEGER. Unique id for packed table.   </td></tr>
+<tr>
+<th>dependent_varname </th><td>FLOAT8[]. Packed array of dependent variables. 
If the dependent variable in the source table is categorical, the preprocessor 
will one-hot encode it.   </td></tr>
+<tr>
+<th>independent_varname </th><td>FLOAT8[]. Packed array of independent 
variables.   </td></tr>
+<tr>
+<th>grouping_cols </th><td>TEXT. Name of grouping columns.   </td></tr>
+</table>
+<p>A summary table named &lt;output_table&gt;_summary is also created, which 
has the following columns: </p><table class="output">
+<tr>
+<th>source_table </th><td>Name of the source table.  </td></tr>
+<tr>
+<th>output_table </th><td>Name of output table generated by preprocessor.  
</td></tr>
+<tr>
+<th>dependent_varname </th><td>Dependent variable from the source table.  
</td></tr>
+<tr>
+<th>independent_varname </th><td>Independent variable from the source table.  
</td></tr>
+<tr>
+<th>buffer_size </th><td>Buffer size used in preprocessing step.  </td></tr>
+<tr>
+<th>class_values </th><td>Class values (i.e., levels) of the dependent 
variable if categorical. If the dependent variable is not categorical, this 
will be NULL.  </td></tr>
+<tr>
+<th>num_rows_processed </th><td>The total number of rows that were used in the 
preprocessing operation.  </td></tr>
+<tr>
+<th>num_missing_rows_skipped </th><td>The total number of rows that were 
skipped because of NULL values in either the dependent or independent 
variables.  </td></tr>
+<tr>
+<th>grouping_col </th><td>Comma separated list of grouping column names if 
grouping is used. If no grouping, will be NULL.  </td></tr>
+</table>
+<p>A standardization table named &lt;output_table&gt;_standardization is also 
created. This is needed by the models that will use the preprocessed data so is 
likely not of much interest to users. It has the following columns: </p><table 
class="output">
+<tr>
+<th>grouping columns </th><td>If 'grouping_col' is specified, a column for 
each grouping column is created.  </td></tr>
+<tr>
+<th>mean </th><td>Mean of independent variables.  </td></tr>
+<tr>
+<th>std </th><td>Population standard deviation of independent variables.  
</td></tr>
+</table>
+<p><a class="anchor" id="example"></a></p><dl class="section 
user"><dt>Examples</dt><dd><ol type="1">
+<li>Create an input data set based on the well known iris data set: <pre 
class="example">
+DROP TABLE IF EXISTS iris_data;
+CREATE TABLE iris_data(
+    id serial,
+    attributes numeric[],
+    class_text varchar,
+    class integer,
+    state varchar
+);
+INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
+(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
+(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
+(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
+(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
+(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
+(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
+(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
+(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
+(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
+(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
+(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
+(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
+(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
+(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
+(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
+(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
+(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
+(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
+(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
+(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
+(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
+(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
+(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
+(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
+(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
+(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
+(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
+(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
+(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
+(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
+(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
+(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
+(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
+(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
+(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
+(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
+(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
+(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
+(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
+(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
+(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
+</pre></li>
+<li>Run the preprocessor: <pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, 
iris_data_packed_standardization;
+SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
+                                     'iris_data_packed',  -- Output table
+                                     'class_text',        -- Dependent variable
+                                     'attributes'         -- Independent 
variables
+                                     );
+</pre> For small datasets like in this example, buffer size is mainly 
determined by the number of segments in the database. This example is run on a 
Greenplum database with 2 segments, so there are 2 rows with a buffer size of 
26. For PostgreSQL, there would be only one row with a buffer size of 52 since 
it is a single node database. For larger data sets, other factors go into 
computing buffer size besides number of segments. Also, note that the 
dependent variable has been one-hot encoded since it is categorical. Here is a 
sample of the packed output table: <pre class="example">
+\x on
+SELECT * FROM iris_data_packed;
+</pre> <pre class="result">
+-[ RECORD 1 ]-------+-------------------------------------
+__id__              | 0
+dependent_varname   | 
{{1,0},{0,1},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{1,0},{0,1}}
+independent_varname | 
{{-0.767560815504508,0.806649237861967,-1.07515071152907,-1.18456909732025},{-0.0995580974152422,0.00385956572525086,1.03989986852812,1.17758048907675},...
+...
+-[ RECORD 2 ]-------+-------------------------------------
+__id__              | 1
+dependent_varname   | 
{{1,0},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{0,1}}
+independent_varname | 
{{0.568444620674023,2.01083374606704,-1.28665576953479,-1.18456909732025},{-1.76956489263841,0.405254401793609,-1.21615408353289,-1.18456909732025},...
+...
+</pre> Review the output summary table: <pre class="example">
+SELECT * FROM iris_data_packed_summary;
+</pre> <pre class="result">
+-[ RECORD 1 ]------------+------------------------------
+source_table             | iris_data
+output_table             | iris_data_packed
+dependent_varname        | class_text
+independent_varname      | attributes
+buffer_size              | 26
+class_values             | {Iris_setosa,Iris_versicolor}
+num_rows_processed       | 52
+num_missing_rows_skipped | 0
+grouping_cols            |
+</pre> Review the output standardization table: <pre class="example">
+SELECT * FROM iris_data_packed_standardization;
+</pre> <pre class="result">
+-[ RECORD 1 ]------------------------------------------------------
+mean | {5.45961538462,2.99807692308,3.025,0.851923076923}
+std  | {0.598799958695,0.498262513686,1.41840579525,0.550346179381}
+</pre></li>
+<li>Generally the default buffer size will work well, but if you have occasion 
to change it: <pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, 
iris_data_packed_standardization;
+SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
+                                     'iris_data_packed',  -- Output table
+                                     'class_text',        -- Dependent variable
+                                     'attributes',        -- Independent 
variables
+                                     NULL,                -- Grouping
+                                     10                   -- Buffer size
+                                     );
+</pre> Review the output summary table: <pre class="example">
+SELECT * FROM iris_data_packed_summary;
+</pre> <pre class="result">
+-[ RECORD 1 ]------------+------------------------------
+source_table             | iris_data
+output_table             | iris_data_packed
+dependent_varname        | class_text
+independent_varname      | attributes
+buffer_size              | 10
+class_values             | {Iris_setosa,Iris_versicolor}
+num_rows_processed       | 52
+num_missing_rows_skipped | 0
+grouping_cols            |
+</pre></li>
+<li>Run the preprocessor with grouping by state: <pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, 
iris_data_packed_standardization;
+SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
+                                     'iris_data_packed',  -- Output table
+                                     'class_text',        -- Dependent variable
+                                     'attributes',        -- Independent 
variables
+                                     'state'              -- Grouping
+                                     );
+</pre> Review the output table: <pre class="example">
+SELECT * FROM iris_data_packed ORDER BY state, __id__;
+</pre> <pre class="result">
+-[ RECORD 1 ]-------+-------------------------------------
+__id__              | 0
+state               | Alaska
+dependent_varname   | 
{{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1}}
+independent_varname | {{0.306242850830503,-0.977074857057813,0.680489757142278 
...
+...
+-[ RECORD 2 ]-------+-------------------------------------
+__id__              | 1
+state               | Alaska
+dependent_varname   | 
{{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0}}
+independent_varname | {{1.10129640587123,-0.126074175104234,1.2524188915498 ...
+...
+-[ RECORD 3 ]-------+-------------------------------------
+__id__              | 2
+state               | Alaska
+dependent_varname   | {{1,0}}
+independent_varname | {{-0.647821415218373,1.15042684782613,-1.17827992968215 
...
+...
+-[ RECORD 4 ]-------+-------------------------------------
+__id__              | 0
+state               | Tennessee
+dependent_varname   | 
{{1,0},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{1,0},{0,1}}
+independent_varname | {{0.32912603663053,2.59625206429212,-1.12079945083087 ...
+...
+-[ RECORD 5 ]-------+-------------------------------------
+__id__              | 1
+state               | Tennessee
+dependent_varname   | 
{{0,1},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1}}
+independent_varname | {{0.865744574615085,-0.267261241912424,0.970244300719264 
...
+...
+</pre> Review the output summary table: <pre class="example">
+SELECT * FROM iris_data_packed_summary;
+</pre> <pre class="result">
+-[ RECORD 1 ]------------+------------------------------
+source_table             | iris_data
+output_table             | iris_data_packed
+dependent_varname        | class_text
+independent_varname      | attributes
+buffer_size              | 13
+class_values             | {Iris_setosa,Iris_versicolor}
+num_rows_processed       | 52
+num_missing_rows_skipped | 0
+grouping_cols            | state
+</pre> Review the output standardization table: <pre class="example">
+SELECT * FROM iris_data_packed_standardization;
+</pre> <pre class="result">
+-[ RECORD 1 
]-------------------------------------------------------------------
+state | Alaska
+mean  | {5.40740740740741,2.95925925925926,2.94814814814815,0.833333333333333}
+std   | 
{0.628888452645665,0.470034875978888,1.39877469405147,0.536103914747325}
+-[ RECORD 2 
]-------------------------------------------------------------------
+state | Tennessee
+mean  | {5.516,3.04,3.108,0.872}
+std   | {0.55905634778617,0.523832034148353,1.43469021046357,0.564637937088893}
+</pre></li>
+<li>If the dependent variable is a scalar integer, and you have not already 
encoded it, you can ask the preprocessor to encode it for you: <pre 
class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, 
iris_data_packed_standardization;
+SELECT madlib.minibatch_preprocessor('iris_data',         -- Source table
+                                     'iris_data_packed',  -- Output table
+                                     'class',             -- Integer dependent 
variable
+                                     'attributes',        -- Independent 
variables
+                                     NULL,                -- Grouping
+                                     NULL,                -- Buffer size
+                                     TRUE                 -- Encode scalar int 
dependent variable
+                                     );
+</pre> Review the output summary table: <pre class="example">
+SELECT * FROM iris_data_packed_summary;
+</pre> <pre class="result">
+-[ RECORD 1 ]------------+-----------------
+source_table             | iris_data
+output_table             | iris_data_packed
+dependent_varname        | class
+independent_varname      | attributes
+dependent_vartype        | integer
+buffer_size              | 26
+class_values             | {1,2}
+num_rows_processed       | 52
+num_missing_rows_skipped | 0
+grouping_cols            |
+</pre></li>
+</ol>
+</dd></dl>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>[1] "Neural Networks for Machine Learning", Lectures 6a and 6b on 
mini-batch gradient descent, Geoffrey Hinton with Nitish Srivastava and Kevin 
Swersky, <a 
href="http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf";>http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p><a class="el" href="minibatch__preprocessing_8sql__in.html" title="TODO. 
">minibatch_preprocessing.sql_in</a></p>
+<p><a href="group__grp__nn.html"><b>Neural Networks</b></a> </p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__mlogreg.html
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__mlogreg.html 
b/docs/v1.15.1/group__grp__mlogreg.html
new file mode 100644
index 0000000..7dbbd4d
--- /dev/null
+++ b/docs/v1.15.1/group__grp__mlogreg.html
@@ -0,0 +1,423 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.14"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Multinomial Logistic Regression</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(initResizable);
+/* @license-end */</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(function() { init_search(); });
+/* @license-end */
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" async 
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.15.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.14 -->
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+/* @license-end */
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+$(document).ready(function(){initNavTree('group__grp__mlogreg.html','');});
+/* @license-end */
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Multinomial Logistic Regression<div class="ingroups"><a 
class="el" href="group__grp__deprecated.html">Deprecated 
Modules</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<dl class="section warning"><dt>Warning</dt><dd><em> This is an old 
implementation of multinomial logistic regression. Replacement of this function 
is available as the Multinomial regression module <a class="el" 
href="group__grp__multinom.html">Multinomial Regression</a></em></dd></dl>
+<div class="toc"><b>Contents</b> <ul>
+<li class="level1">
+<a href="#train">Training Function</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Function</a> </li>
+<li class="level1">
+<a href="#examples">Examples</a> </li>
+<li class="level1">
+<a href="#background">Technical Background</a> </li>
+<li class="level1">
+<a href="#literature">Literature</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>Multinomial logistic regression is a widely used regression analysis 
tool that models the outcomes of categorical dependent random variables. The 
model assumes that the conditional mean of the dependent categorical variables 
is the logistic function of an affine combination of independent variables. 
Multinomial logistic regression finds the vector of coefficients that maximizes 
the likelihood of the observations.</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training 
Function</dt><dd>The multinomial logistic regression training function has the 
following syntax: <pre class="syntax">
+mlogregr_train(source_table,
+               output_table,
+               dependent_varname,
+               independent_varname,
+               ref_category,
+               optimizer_params
+              )
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>source_table </dt>
+<dd><p class="startdd">TEXT. The name of the table containing the input 
data.</p>
+<p></p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. The name of the generated table containing the 
output model. The output table produced by the multinomial logistic regression 
training function contains the following columns: </p><table class="output">
+<tr>
+<th>category </th><td>INTEGER. The category. Categories are encoded as 
integers with values from {0, 1, 2,..., <em>numCategories</em> &ndash; 1}  
</td></tr>
+<tr>
+<th>ref_category </th><td>INTEGER. The reference category. Categories are 
encoded as integers with values from {0, 1, 2,..., <em>numCategories</em> 
&ndash; 1}  </td></tr>
+<tr>
+<th>coef </th><td>FLOAT8[]. An array of coefficients, \( \boldsymbol c \).   
</td></tr>
+<tr>
+<th>log_likelihood </th><td>FLOAT8. The log-likelihood, \( l(\boldsymbol c) 
\).  </td></tr>
+<tr>
+<th>std_err </th><td>FLOAT8[]. An array of the standard errors.  </td></tr>
+<tr>
+<th>z_stats </th><td>FLOAT8[]. An array of the Wald z-statistics.  </td></tr>
+<tr>
+<th>p_values </th><td>FLOAT8[]. An array of the Wald p-values.  </td></tr>
+<tr>
+<th>odds_ratios </th><td>FLOAT8[]. An array of the odds ratios.  </td></tr>
+<tr>
+<th>condition_no </th><td>FLOAT8. The condition number of the matrix, computed 
using the coefficients of the iteration immediately preceding convergence.  
</td></tr>
+<tr>
+<th>num_iterations </th><td>INTEGER. The number of iterations executed before 
the algorithm completed.  </td></tr>
+</table>
+<p>A summary table named &lt;out_table&gt;_summary is also created at the same 
time, and it contains the following columns:</p>
+<table class="output">
+<tr>
+<th>source_table </th><td>The data source table name.  </td></tr>
+<tr>
+<th>out_table </th><td>The output table name.  </td></tr>
+<tr>
+<th>dependent_varname </th><td>The dependent variable.  </td></tr>
+<tr>
+<th>independent_varname </th><td>The independent variables.  </td></tr>
+<tr>
+<th>optimizer_params </th><td>The optimizer parameters. It is a copy of the 
optimizer_params in the training function's arguments.  </td></tr>
+<tr>
+<th>ref_category </th><td>An integer, the value of reference category used.  
</td></tr>
+<tr>
+<th>num_rows_processed </th><td>INTEGER. The number of rows actually 
processed, which is equal to the total number of rows in the source table minus 
the number of skipped rows.  </td></tr>
+<tr>
+<th>num_missing_rows_skipped </th><td>INTEGER. The number of rows skipped 
during the training. A row will be skipped if the ind_col is NULL or contains 
NULL values.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_varname </dt>
+<dd><p class="startdd">TEXT. The name of the column containing the dependent 
variable.</p>
+<p class="enddd"></p>
+</dd>
+<dt>independent_varname </dt>
+<dd><p class="startdd">TEXT. Expression list to evaluate for the independent 
variables. An intercept variable is not assumed. The number of independent 
variables cannot exceed 65535.</p>
+<p class="enddd"></p>
+</dd>
+<dt>ref_category (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 0. The reference category ranges from 
[0, <em>numCategories</em> &ndash; 1].</p>
+<p class="enddd"></p>
+</dd>
+<dt>optimizer_params (optional) </dt>
+<dd>VARCHAR, default: NULL, which uses the default values of optimizer 
parameters. It should be a string that contains pairs of 'key=value' separated 
by commas. Supported parameters with their default values: max_iter=20, 
optimizer='irls', precision=1e-4. Currently, only 'irls' and 'newton' are 
allowed for 'optimizer'.  </dd>
+</dl>
+</dd></dl>
+<dl class="section note"><dt>Note</dt><dd>Table names can be optionally schema 
qualified and table and column names should follow the same case-sensitivity 
and quoting rules as in the database.</dd></dl>
+<p><a class="anchor" id="predict"></a></p><dl class="section 
user"><dt>Prediction Function</dt><dd>The prediction function is provided to 
estimate the conditional mean given a new predictor. It has the following 
syntax: <pre class="syntax">
+mlogregr_predict(
+    model_table,
+    new_data_table,
+    id_col_name,
+    output_table,
+    type)
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>model_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the multilogistic 
model. This should be the output table returned from 
<em>mlogregr_train</em>.</p>
+<p class="enddd"></p>
+</dd>
+<dt>new_data_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing prediction data. 
This table is expected to contain the same features that were used during 
training. The table should also contain <em>id_col_name</em> used for 
identifying each row.</p>
+<p class="enddd"></p>
+</dd>
+<dt>id_col_name </dt>
+<dd><p class="startdd">TEXT. Name of the column containing id information in 
the source data. This is a mandatory argument and is used for correlating 
prediction table rows with the source. The values of this column are expected 
to be unique for each tuple. </p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. Name of the table to output prediction results 
to. If this table already exists then an error is returned. This output table 
contains the <em>id_col_name</em> column giving the 'id' for each 
prediction.</p>
+<p>If <em>type</em> = 'response', then the table has a single additional 
column with the prediction value of the response. The type of this column 
depends on the type of the response variable used during training.</p>
+<p>If <em>type</em> = 'prob', then the table has multiple additional columns, 
one for each possible category. The columns are labeled as 
'estimated_prob_<em>category_value</em>', where <em>category_value</em> 
represents the values of categories (0 to K-1).</p>
+<p class="enddd"></p>
+</dd>
+<dt>type </dt>
+<dd><p class="startdd">TEXT, optional, default: 'response'.</p>
+<p>When <em>type</em> = 'prob', the probabilities of each category (including 
the reference category) is given.</p>
+<p class="enddd">When <em>type</em> = 'response', a single output is provided 
which represents the prediction category for each tuple. This represents the 
category with the highest probability.  </p>
+</dd>
+</dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>Create the training data table. <pre class="example">
+DROP TABLE IF EXISTS test3;
+CREATE TABLE test3 (
+    feat1 INTEGER,
+    feat2 INTEGER,
+    cat INTEGER
+);
+INSERT INTO test3(feat1, feat2, cat) VALUES
+(1,35,1),
+(2,33,0),
+(3,39,1),
+(1,37,1),
+(2,31,1),
+(3,36,0),
+(2,36,1),
+(2,31,1),
+(2,41,1),
+(2,37,1),
+(1,44,1),
+(3,33,2),
+(1,31,1),
+(2,44,1),
+(1,35,1),
+(1,44,0),
+(1,46,0),
+(2,46,1),
+(2,46,2),
+(3,49,1),
+(2,39,0),
+(2,44,1),
+(1,47,1),
+(1,44,1),
+(1,37,2),
+(3,38,2),
+(1,49,0),
+(2,44,0),
+(3,61,2),
+(1,65,2),
+(3,67,1),
+(3,65,2),
+(1,65,2),
+(2,67,2),
+(1,65,2),
+(1,62,2),
+(3,52,2),
+(3,63,2),
+(2,59,2),
+(3,65,2),
+(2,59,0),
+(3,67,2),
+(3,67,2),
+(3,60,2),
+(3,67,2),
+(3,62,2),
+(2,54,2),
+(3,65,2),
+(3,62,2),
+(2,59,2),
+(3,60,2),
+(3,63,2),
+(3,65,2),
+(2,63,1),
+(2,67,2),
+(2,65,2),
+(2,62,2);
+</pre></li>
+<li>Run the multilogistic regression function. <pre class="example">
+DROP TABLE IF EXISTS test3_output;
+DROP TABLE IF EXISTS test3_output_summary;
+SELECT madlib.mlogregr_train('test3',
+                             'test3_output',
+                             'cat',
+                             'ARRAY[1, feat1, feat2]',
+                             0,
+                             'max_iter=20, optimizer=irls, precision=0.0001'
+                             );
+</pre></li>
+<li>View the result: <pre class="example">
+-- Set extended display on for easier reading of output
+\x on
+SELECT * FROM test3_output;
+</pre> Results: <pre class="result">
+-[ RECORD 1 ]--+------------------------------------------------------------
+category       | 1
+ref_category   | 0
+coef           | {1.45474045211601,0.0849956182104023,-0.0172383499601956}
+loglikelihood  | -39.14759930999
+std_err        | {2.13085072854143,0.585021661344715,0.0431487356292144}
+z_stats        | {0.682704063982831,0.145286275409074,-0.39950996729842}
+p_values       | {0.494793861210936,0.884484850387893,0.689517480964129}
+odd_ratios     | {4.28337158128448,1.08871229617973,0.982909380301134}
+condition_no   | 280069.034217586
+num_iterations | 5
+-[ RECORD 2 ]--+------------------------------------------------------------
+category       | 2
+ref_category   | 0
+coef           | {-7.12908167688326,0.87648787696783,0.127886153027713}
+loglikelihood  | -39.14759930999
+std_err        | {2.52104008297868,0.639575886323862,0.0445757462972303}
+z_stats        | {-2.82783352990566,1.37042045472615,2.86896269049475}
+p_values       | {0.00468641692252239,0.170555690550421,0.00411820373218956}
+odd_ratios     | {0.000801455044349486,2.40244718187161,1.13642361694409}
+condition_no   | 280069.034217586
+num_iterations | 5
+</pre></li>
+<li>View all parameters used during the training <pre class="example">
+\x on
+SELECT * FROM test3_output_summary;
+</pre> Results: <pre class="result">
+-[ RECORD 1 ]------------+--------------------------------------------------
+method                   | mlogregr
+source_table             | test3
+out_table                | test3_output
+dependent_varname        | cat
+independent_varname      | ARRAY[1, feat1, feat2]
+optimizer_params         | max_iter=20, optimizer=irls, precision=0.0001
+ref_category             | 0
+num_categories           | 3
+num_rows_processed       | 57
+num_missing_rows_skipped | 0
+variance_covariance      | 
{{4.54052482732554,3.01080140927409,-0.551901021610841,-0.380754019900586,-0.0784151362989211,-0.0510014701718268},{3.01080140927409,6.35564309998514,-0.351902272617974,-0.766730342510818,-0.051877550252329,-0.0954432017695571},{-0.551901021610841,-0.351902272617974,0.34225034424253,0.231740815080827,-0.00117521831508331,-0.00114043921343171},{-0.380754019900586,-0.766730342510818,0.231740815080827,0.409057314366954,-0.000556498286025567,-0.000404735750986327},{-0.0784151362989211,-0.051877550252329,-0.00117521831508331,-0.000556498286025569,0.00186181338639984,0.00121080293928445},{-0.0510014701718268,-0.0954432017695571,-0.00114043921343171,-0.000404735750986325,0.00121080293928446,0.00198699715795504}}
+coef                     | 
{{1.45474045211601,0.0849956182104023,-0.0172383499601956},{-7.12908167688326,0.87648787696783,0.127886153027713}}
+</pre></li>
+</ol>
+<p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd>Multinomial logistic regression models 
the outcomes of categorical dependent random variables (denoted \( Y \in \{ 
0,1,2 \ldots k \} \)). The model assumes that the conditional mean of the 
dependent categorical variables is the logistic function of an affine 
combination of independent variables (usually denoted \( \boldsymbol x \)). 
That is, <p class="formulaDsp">
+\[ E[Y \mid \boldsymbol x] = \sigma(\boldsymbol c^T \boldsymbol x) \]
+</p>
+ for some unknown vector of coefficients \( \boldsymbol c \) and where \( 
\sigma(x) = \frac{1}{1 + \exp(-x)} \) is the logistic function. Multinomial 
logistic regression finds the vector of coefficients \( \boldsymbol c \) that 
maximizes the likelihood of the observations.</dd></dl>
+<p>Let</p><ul>
+<li>\( \boldsymbol y \in \{ 0,1 \}^{n \times k} \) denote the matrix of 
observed dependent variables, with \( n \) rows and \( k \) columns, containing 
the observed values of the dependent variable,</li>
+<li>\( X \in \mathbf R^{n \times k} \) denote the design matrix with \( k \) 
columns and \( n \) rows, containing all observed vectors of independent 
variables \( \boldsymbol x_i \) as rows.</li>
+</ul>
+<p>By definition, </p><p class="formulaDsp">
+\[ P[Y = y_i | \boldsymbol x_i] = \sigma((-1)^{y_i} \cdot \boldsymbol c^T 
\boldsymbol x_i) \,. \]
+</p>
+<p> Maximizing the likelihood \( \prod_{i=1}^n \Pr(Y = y_i \mid \boldsymbol 
x_i) \) is equivalent to maximizing the log-likelihood \( \sum_{i=1}^n \log 
\Pr(Y = y_i \mid \boldsymbol x_i) \), which simplifies to </p><p 
class="formulaDsp">
+\[ l(\boldsymbol c) = -\sum_{i=1}^n \log(1 + \exp((-1)^{y_i} \cdot \boldsymbol 
c^T \boldsymbol x_i)) \,. \]
+</p>
+<p> The Hessian of this objective is \( H = -X^T A X \) where \( A = 
\text{diag}(a_1, \dots, a_n) \) is the diagonal matrix with \( a_i = 
\sigma(\boldsymbol c^T \boldsymbol x) \cdot \sigma(-\boldsymbol c^T \boldsymbol 
x) \,. \) Since \( H \) is non-positive definite, \( l(\boldsymbol c) \) is 
convex. There are many techniques for solving convex optimization problems. 
Currently, logistic regression in MADlib can use:</p><ul>
+<li>Iteratively Reweighted Least Squares</li>
+</ul>
+<p>We estimate the standard error for coefficient \( i \) as </p><p 
class="formulaDsp">
+\[ \mathit{se}(c_i) = \sqrt{\left( (X^T A X)^{-1} \right)_{ii}} \,. \]
+</p>
+<p> The Wald z-statistic is </p><p class="formulaDsp">
+\[ z_i = \frac{c_i}{\mathit{se}(c_i)} \,. \]
+</p>
+<p>The Wald \( p \)-value for coefficient \( i \) gives the probability (under 
the assumptions inherent in the Wald test) of seeing a value at least as 
extreme as the one observed, provided that the null hypothesis ( \( c_i = 0 \)) 
is true. Letting \( F \) denote the cumulative density function of a standard 
normal distribution, the Wald \( p \)-value for coefficient \( i \) is 
therefore </p><p class="formulaDsp">
+\[ p_i = \Pr(|Z| \geq |z_i|) = 2 \cdot (1 - F( |z_i| )) \]
+</p>
+<p> where \( Z \) is a standard normally distributed random variable.</p>
+<p>The odds ratio for coefficient \( i \) is estimated as \( \exp(c_i) \).</p>
+<p>The condition number is computed as \( \kappa(X^T A X) \) during the 
iteration immediately <em>preceding</em> convergence (i.e., \( A \) is computed 
using the coefficients of the previous iteration). A large condition number 
(say, more than 1000) indicates the presence of significant 
multicollinearity.</p>
+<p>The multinomial logistic regression uses a default reference category of 
zero, and the regression coefficients in the output are in the order described 
below. For a problem with \( K \) dependent variables \( (1, ..., K) \) and \( 
J \) categories \( (0, ..., J-1) \), let \( {m_{k,j}} \) denote the coefficient 
for dependent variable \( k \) and category \( j \). The output is \( {m_{k_1, 
j_0}, m_{k_1, j_1} \ldots m_{k_1, j_{J-1}}, m_{k_2, j_0}, m_{k_2, j_1}, \ldots 
m_{k_2, j_{J-1}} \ldots m_{k_K, j_{J-1}}} \). The order is NOT CONSISTENT with 
the multinomial regression marginal effect calculation with function 
<em>marginal_mlogregr</em>. This is deliberate because the interfaces of all 
multinomial regressions (robust, clustered, ...) will be moved to match that 
used in marginal.</p>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>A collection of nice write-ups, with valuable pointers into further 
literature:</p>
+<p>[1] Annette J. Dobson: An Introduction to Generalized Linear Models, Second 
Edition. Nov 2001</p>
+<p>[2] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 
November 2009, <a 
href="http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf";>http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf</a></p>
+<p>[3] Scott A. Czepiel: Maximum Likelihood Estimation of Logistic Regression 
Models: Theory and Implementation, Retrieved Jul 12 2012, <a 
href="http://czep.net/stat/mlelr.pdf";>http://czep.net/stat/mlelr.pdf</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="multilogistic_8sql__in.html" title="SQL functions 
for multinomial logistic regression. ">multilogistic.sql_in</a> documenting the 
multinomial logistic regression functions</p>
+<p><a class="el" href="group__grp__logreg.html">Logistic Regression</a></p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__multinom.html
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__multinom.html 
b/docs/v1.15.1/group__grp__multinom.html
new file mode 100644
index 0000000..9081d96
--- /dev/null
+++ b/docs/v1.15.1/group__grp__multinom.html
@@ -0,0 +1,494 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.14"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Multinomial Regression</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(initResizable);
+/* @license-end */</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(function() { init_search(); });
+/* @license-end */
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" async 
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.15.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.14 -->
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+/* @license-end */
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+$(document).ready(function(){initNavTree('group__grp__multinom.html','');});
+/* @license-end */
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Multinomial Regression<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a> &raquo; <a class="el" 
href="group__grp__regml.html">Regression Models</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li class="level1">
+<a href="#train">Training Function</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Function</a> </li>
+<li class="level1">
+<a href="#examples">Examples</a> </li>
+<li class="level1">
+<a href="#background">Technical Background</a> </li>
+<li class="level1">
+<a href="#literature">Literature</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>In statistics, multinomial regression is a classification method that 
generalizes binomial regression to multiclass problems, i.e. with more than two 
possible discrete outcomes. That is, it is a model that is used to predict the 
probabilities of the different possible outcomes of a categorically distributed 
dependent variable, given a set of independent variables (which may be 
real-valued, binary-valued, categorical-valued, etc.).</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training 
Function</dt><dd>The multinomial regression training function has the following 
syntax: <pre class="syntax">
+multinom(source_table,
+         model_table,
+         dependent_varname,
+         independent_varname,
+         ref_category,
+         link_func,
+         grouping_col,
+         optim_params,
+         verbose
+        )
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>source_table </dt>
+<dd><p class="startdd">VARCHAR. Name of the table containing the training 
data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>model_table </dt>
+<dd><p class="startdd">VARCHAR. Name of the generated table containing the 
model.</p>
+<p>The model table produced by multinom() contains the following columns:</p>
+<table class="output">
+<tr>
+<th>&lt;...&gt; </th><td><p class="starttd">Grouping columns, if provided in 
input. This could be multiple columns depending on the 
<code>grouping_col</code> input. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>category </th><td><p class="starttd">VARCHAR. String representation of 
category value. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>coef </th><td><p class="starttd">FLOAT8[]. Vector of the coefficients in 
linear predictor. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>log_likelihood </th><td><p class="starttd">FLOAT8. The log-likelihood \( 
l(\boldsymbol \beta) \). The value will be the same across categories within 
the same group. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>std_err </th><td><p class="starttd">FLOAT8[]. Vector of the standard 
errors of the coefficients. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>z_stats </th><td><p class="starttd">FLOAT8[]. Vector of the z-statistics 
of the coefficients. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>p_values </th><td><p class="starttd">FLOAT8[]. Vector of the p-values of 
the coefficients. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_rows_processed </th><td><p class="starttd">BIGINT. Number of rows 
processed. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_rows_skipped </th><td><p class="starttd">BIGINT. Number of rows 
skipped due to missing values or failures. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_iterations </th><td>INTEGER. Number of iterations actually completed. 
This would be different from the <code>max_iter</code> optimizer parameter if a 
<code>tolerance</code> parameter is provided and the algorithm converges before 
all iterations are completed.  </td></tr>
+</table>
+<p>A summary table named &lt;model_table&gt;_summary is also created at the 
same time, which has the following columns: </p><table class="output">
+<tr>
+<th>method </th><td><p class="starttd">VARCHAR. String that describes the 
model: 'multinom'. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>source_table </th><td><p class="starttd">VARCHAR. Data source table name. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>model_table </th><td><p class="starttd">VARCHAR. Model table name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_varname </th><td><p class="starttd">VARCHAR. Expression for 
dependent variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>independent_varname </th><td><p class="starttd">VARCHAR. Expression for 
independent variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>ref_category </th><td><p class="starttd">VARCHAR. String representation of 
reference category. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>link_func </th><td><p class="starttd">VARCHAR. String that contains link 
function parameters: only 'logit' is implemented now. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>grouping_col </th><td><p class="starttd">VARCHAR. String representation of 
grouping columns. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>optimizer_params </th><td><p class="starttd">VARCHAR. String that contains 
optimizer parameters, and has the form of 'optimizer=..., max_iter=..., 
tolerance=...'. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_all_groups </th><td><p class="starttd">INTEGER. Number of groups in 
glm training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_failed_groups </th><td><p class="starttd">INTEGER. Number of failed 
groups in glm training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_processed </th><td><p class="starttd">BIGINT. Total number of 
rows processed in all groups. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_skipped </th><td><p class="starttd">BIGINT. Total number of 
rows skipped in all groups due to missing values or failures. </p>
+<p class="endtd"></p>
+</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_varname </dt>
+<dd><p class="startdd">VARCHAR. Name of the dependent variable column.</p>
+<p class="enddd"></p>
+</dd>
+<dt>independent_varname </dt>
+<dd><p class="startdd">VARCHAR. Expression list to evaluate for the 
independent variables. An intercept variable is not assumed. It is common to 
provide an explicit intercept term by including a single constant 
<code>1</code> term in the independent variable list.</p>
+<p class="enddd"></p>
+</dd>
+<dt>link_func (optional) </dt>
+<dd><p class="startdd">VARCHAR, default: 'logit'. Parameters for link 
function. Currently, we support logit. </p>
+<p class="enddd"></p>
+</dd>
+<dt>ref_category (optional) </dt>
+<dd><p class="startdd">VARCHAR, default: '0'. Parameters to specify the 
reference category. </p>
+<p class="enddd"></p>
+</dd>
+<dt>grouping_col (optional) </dt>
+<dd><p class="startdd">VARCHAR, default: NULL. An expression list used to 
group the input dataset into discrete groups, running one regression per group. 
Similar to the SQL "GROUP BY" clause. When this value is NULL, no grouping is 
used and a single model is generated.</p>
+<p class="enddd"></p>
+</dd>
+<dt>optim_params (optional) </dt>
+<dd><p class="startdd">VARCHAR, default: 
'max_iter=100,optimizer=irls,tolerance=1e-6'. Parameters for optimizer. 
Currently, we support tolerance=[tolerance for relative error between 
log-likelihoods], max_iter=[maximum iterations to run], optimizer=irls.</p>
+<p class="enddd"></p>
+</dd>
+<dt>verbose (optional) </dt>
+<dd>BOOLEAN, default: FALSE. Provides verbose output of the results of 
training. </dd>
+</dl>
+<dl class="section note"><dt>Note</dt><dd>For p-values, we just return the 
computation result directly. Other statistical packages, like 'R', produce the 
same result, but on printing the result to screen, another format function is 
used and any p-value that is smaller than the machine epsilon (the smallest 
positive floating-point number 'x' such that '1 + x != 1') will be printed on 
screen as "&lt; xxx" (xxx is the value of the machine epsilon). Although the 
result may look different, they are in fact the same. </dd></dl>
+<p><a class="anchor" id="predict"></a></p><dl class="section 
user"><dt>Prediction Function</dt><dd>Multinomial regression prediction 
function has the following format: <pre class="syntax">
+multinom_predict(model_table,
+                 predict_table_input,
+                 output_table,
+                 predict_type,
+                 verbose,
+                 id_column
+                )
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>model_table </dt>
+<dd><p class="startdd">TEXT. Name of the generated table containing the model, 
which is the output table from multinom().</p>
+<p class="enddd"></p>
+</dd>
+<dt>predict_table_input </dt>
+<dd><p class="startdd">TEXT. The name of the table containing the data to 
predict on. The table must contain id column as the primary key.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. Name of the generated table containing the 
predicted values.</p>
+<p>The model table produced by multinom_predict contains the following 
columns:</p>
+<table class="output">
+<tr>
+<th>id </th><td><p class="starttd">SERIAL. Column to identify the predicted 
value. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>category </th><td><p class="starttd">TEXT. Available if predict_type 
= 'response'. Column contains the predicted categories. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>category_value </th><td>FLOAT8. The predicted probability for the specific 
category_value.  </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>predict_type </dt>
+<dd>TEXT. Either 'response' or 'probability'. Using 'response' will give the 
predicted category with the largest probability. Using 'probability' will give 
the predicted probabilities for all categories. </dd>
+<dt>verbose </dt>
+<dd><p class="startdd">BOOLEAN. Control whether verbose is displayed. The 
default is FALSE. </p>
+<p class="enddd"></p>
+</dd>
+<dt>id_column </dt>
+<dd>TEXT. The name of the id column in the input table. </dd>
+</dl>
+</dd></dl>
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>Create the training data table. <pre class="example">
+DROP TABLE IF EXISTS test3;
+CREATE TABLE test3 (
+    feat1 INTEGER,
+    feat2 INTEGER,
+    cat INTEGER
+);
+INSERT INTO test3(feat1, feat2, cat) VALUES
+(1,35,1),
+(2,33,0),
+(3,39,1),
+(1,37,1),
+(2,31,1),
+(3,36,0),
+(2,36,1),
+(2,31,1),
+(2,41,1),
+(2,37,1),
+(1,44,1),
+(3,33,2),
+(1,31,1),
+(2,44,1),
+(1,35,1),
+(1,44,0),
+(1,46,0),
+(2,46,1),
+(2,46,2),
+(3,49,1),
+(2,39,0),
+(2,44,1),
+(1,47,1),
+(1,44,1),
+(1,37,2),
+(3,38,2),
+(1,49,0),
+(2,44,0),
+(3,61,2),
+(1,65,2),
+(3,67,1),
+(3,65,2),
+(1,65,2),
+(2,67,2),
+(1,65,2),
+(1,62,2),
+(3,52,2),
+(3,63,2),
+(2,59,2),
+(3,65,2),
+(2,59,0),
+(3,67,2),
+(3,67,2),
+(3,60,2),
+(3,67,2),
+(3,62,2),
+(2,54,2),
+(3,65,2),
+(3,62,2),
+(2,59,2),
+(3,60,2),
+(3,63,2),
+(3,65,2),
+(2,63,1),
+(2,67,2),
+(2,65,2),
+(2,62,2);
+</pre></li>
+<li>Run the multinomial regression function. <pre class="example">
+DROP TABLE IF EXISTS test3_output;
+DROP TABLE IF EXISTS test3_output_summary;
+SELECT madlib.multinom('test3',
+                       'test3_output',
+                       'cat',
+                       'ARRAY[1, feat1, feat2]',
+                       '0',
+                       'logit'
+                       );
+</pre></li>
+<li>View the regression results. <pre class="example">
+-- Set extended display on for easier reading of output
+\x on
+SELECT * FROM test3_output;
+</pre></li>
+</ol>
+<p>Result: </p><pre class="result">
+-[ RECORD 1 
]------+------------------------------------------------------------
+category           | 1
+coef               | {1.45474045165731,0.084995618282504,-0.0172383499512136}
+log_likelihood     | -39.1475993094045
+std_err            | {2.13085878785549,0.585023211942952,0.0431489262260687}
+z_stats            | {0.682701481650677,0.145285890452484,-0.399508202380224}
+p_values           | {0.494795493298706,0.884485154314181,0.689518781152604}
+num_rows_processed | 57
+num_rows_skipped   | 0
+iteration          | 6
+-[ RECORD 2 
]------+------------------------------------------------------------
+category           | 2
+coef               | {-7.1290816775109,0.876487877074751,0.127886153038661}
+log_likelihood     | -39.1475993094045
+std_err            | {2.52105418324135,0.639578886139654,0.0445760103748678}
+z_stats            | {-2.82781771407425,1.37041402721253,2.86894569440347}
+p_values           | 
{0.00468664844488755,0.170557695812408,0.00411842502754068}
+num_rows_processed | 57
+num_rows_skipped   | 0
+iteration          | 6
+</pre><ol type="1" start="4">
+<li>Predicting dependent variable using multinomial model. (This example uses 
the original data table to perform the prediction. Typically a different test 
dataset with the same features as the original training dataset would be used 
for prediction.)</li>
+</ol>
+<pre class="example">
+\x off
+-- Add the id column for prediction function
+ALTER TABLE test3 ADD COLUMN id SERIAL;
+-- Predict probabilities for all categories using the original data
+SELECT madlib.multinom_predict('test3_output','test3', 'test3_prd_prob', 
'probability');
+-- Display the predicted value
+SELECT * FROM test3_prd_prob;
+</pre><p><a class="anchor" id="background"></a></p><dl class="section 
user"><dt>Technical Background</dt><dd>When link = 'logit', multinomial 
logistic regression models the outcomes of categorical dependent random 
variables (denoted \( Y \in \{ 0,1,2 \ldots k \} \)). The model assumes that 
the conditional mean of the dependent categorical variables is the logistic 
function of an affine combination of independent variables (usually denoted \( 
\boldsymbol x \)). That is, <p class="formulaDsp">
+\[ E[Y \mid \boldsymbol x] = \sigma(\boldsymbol c^T \boldsymbol x) \]
+</p>
+ for some unknown vector of coefficients \( \boldsymbol c \) and where \( 
\sigma(x) = \frac{1}{1 + \exp(-x)} \) is the logistic function. Multinomial 
logistic regression finds the vector of coefficients \( \boldsymbol c \) that 
maximizes the likelihood of the observations.</dd></dl>
+<p>Let</p><ul>
+<li>\( \boldsymbol y \in \{ 0,1 \}^{n \times k} \) denote the vector of 
observed dependent variables, with \( n \) rows and \( k \) columns, containing 
the observed values of the dependent variable,</li>
+<li>\( X \in \mathbf R^{n \times k} \) denote the design matrix with \( k \) 
columns and \( n \) rows, containing all observed vectors of independent 
variables \( \boldsymbol x_i \) as rows.</li>
+</ul>
+<p>By definition, </p><p class="formulaDsp">
+\[ P[Y = y_i | \boldsymbol x_i] = \sigma((-1)^{y_i} \cdot \boldsymbol c^T 
\boldsymbol x_i) \,. \]
+</p>
+<p> Maximizing the likelihood \( \prod_{i=1}^n \Pr(Y = y_i \mid \boldsymbol 
x_i) \) is equivalent to maximizing the log-likelihood \( \sum_{i=1}^n \log 
\Pr(Y = y_i \mid \boldsymbol x_i) \), which simplifies to </p><p 
class="formulaDsp">
+\[ l(\boldsymbol c) = -\sum_{i=1}^n \log(1 + \exp((-1)^{y_i} \cdot \boldsymbol 
c^T \boldsymbol x_i)) \,. \]
+</p>
+<p> The Hessian of this objective is \( H = -X^T A X \) where \( A = 
\text{diag}(a_1, \dots, a_n) \) is the diagonal matrix with \( a_i = 
\sigma(\boldsymbol c^T \boldsymbol x) \cdot \sigma(-\boldsymbol c^T \boldsymbol 
x) \,. \) Since \( H \) is non-positive definite, \( l(\boldsymbol c) \) is 
convex. There are many techniques for solving convex optimization problems. 
Currently, logistic regression in MADlib can use:</p><ul>
+<li>Iteratively Reweighted Least Squares</li>
+</ul>
+<p>We estimate the standard error for coefficient \( i \) as </p><p 
class="formulaDsp">
+\[ \mathit{se}(c_i) = \left( (X^T A X)^{-1} \right)_{ii} \,. \]
+</p>
+<p> The Wald z-statistic is </p><p class="formulaDsp">
+\[ z_i = \frac{c_i}{\mathit{se}(c_i)} \,. \]
+</p>
+<p>The Wald \( p \)-value for coefficient \( i \) gives the probability (under 
the assumptions inherent in the Wald test) of seeing a value at least as 
extreme as the one observed, provided that the null hypothesis ( \( c_i = 0 \)) 
is true. Letting \( F \) denote the cumulative density function of a standard 
normal distribution, the Wald \( p \)-value for coefficient \( i \) is 
therefore </p><p class="formulaDsp">
+\[ p_i = \Pr(|Z| \geq |z_i|) = 2 \cdot (1 - F( |z_i| )) \]
+</p>
+<p> where \( Z \) is a standard normally distributed random variable.</p>
+<p>The odds ratio for coefficient \( i \) is estimated as \( \exp(c_i) \).</p>
+<p>The condition number is computed as \( \kappa(X^T A X) \) during the 
iteration immediately <em>preceding</em> convergence (i.e., \( A \) is computed 
using the coefficients of the previous iteration). A large condition number 
(say, more than 1000) indicates the presence of significant 
multicollinearity.</p>
+<p>The multinomial logistic regression uses a default reference category of 
zero, and the regression coefficients in the output are in the order described 
below. For a problem with \( K \) dependent variables \( (1, ..., K) \) and \( 
J \) categories \( (0, ..., J-1) \), let \( {m_{k,j}} \) denote the coefficient 
for dependent variable \( k \) and category \( j \). The output is \( {m_{k_1, 
j_0}, m_{k_1, j_1} \ldots m_{k_1, j_{J-1}}, m_{k_2, j_0}, m_{k_2, j_1}, \ldots 
m_{k_2, j_{J-1}} \ldots m_{k_K, j_{J-1}}} \). The order is NOT CONSISTENT with 
the multinomial regression marginal effect calculation with function 
<em>marginal_mlogregr</em>. This is deliberate because the interfaces of all 
multinomial regressions (robust, clustered, ...) will be moved to match that 
used in marginal.</p>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd></dd></dl>
+<p>A collection of nice write-ups, with valuable pointers into further 
literature:</p>
+<p>[1] Annette J. Dobson: An Introduction to Generalized Linear Models, Second 
Edition. Nov 2001</p>
+<p>[2] Cosma Shalizi: Statistics 36-350: Data Mining, Lecture Notes, 18 
November 2009, <a 
href="http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf";>http://www.stat.cmu.edu/~cshalizi/350/lectures/26/lecture-26.pdf</a></p>
+<p>[3] Scott A. Czepiel: Maximum Likelihood Estimation of Logistic Regression 
Models: Theory and Implementation, Retrieved Jul 12 2012, <a 
href="http://czep.net/stat/mlelr.pdf";>http://czep.net/stat/mlelr.pdf</a></p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" href="multiresponseglm_8sql__in.html" title="SQL 
functions for multinomial regression. ">multiresponseglm.sql_in</a> documenting 
the multinomial regression functions</p>
+<p><a class="el" href="group__grp__logreg.html">Logistic Regression</a></p>
+<p><a class="el" href="group__grp__ordinal.html">Ordinal Regression</a></p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
+  </ul>
+</div>
+</body>
+</html>

Reply via email to