http://git-wip-us.apache.org/repos/asf/madlib-site/blob/e283664c/docs/v1.14/group__grp__datatrans.html
----------------------------------------------------------------------
diff --git a/docs/v1.14/group__grp__datatrans.html 
b/docs/v1.14/group__grp__datatrans.html
new file mode 100644
index 0000000..5a7cbce
--- /dev/null
+++ b/docs/v1.14/group__grp__datatrans.html
@@ -0,0 +1,154 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Data Types and Transformations</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" 
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__datatrans.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Data Types and Transformations</div>  </div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
Description</h2>
+<p>Data types and transformation operations </p>
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__arraysmatrix"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" href="group__grp__arraysmatrix.html">Arrays and 
Matrices</a></td></tr>
+<tr class="memdesc:group__grp__arraysmatrix"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Mathematical operations for 
arrays and matrices. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__encode__categorical"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" 
href="group__grp__encode__categorical.html">Encoding Categorical 
Variables</a></td></tr>
+<tr class="memdesc:group__grp__encode__categorical"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Provides functions to 
encode categorical variables. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__path"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__path.html">Path</a></td></tr>
+<tr class="memdesc:group__grp__path"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Path Functions. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__pivot"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__pivot.html">Pivot</a></td></tr>
+<tr class="memdesc:group__grp__pivot"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Provides pivoting functions helpful for data preparation 
before modeling. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__sessionize"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" 
href="group__grp__sessionize.html">Sessionize</a></td></tr>
+<tr class="memdesc:group__grp__sessionize"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Sessionize. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__stemmer"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__stemmer.html">Stemming</a></td></tr>
+<tr class="memdesc:group__grp__stemmer"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Provides porter stemmer operations supporting other MADlib 
modules. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:11 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/e283664c/docs/v1.14/group__grp__datatrans.js
----------------------------------------------------------------------
diff --git a/docs/v1.14/group__grp__datatrans.js 
b/docs/v1.14/group__grp__datatrans.js
new file mode 100644
index 0000000..4900455
--- /dev/null
+++ b/docs/v1.14/group__grp__datatrans.js
@@ -0,0 +1,9 @@
+var group__grp__datatrans =
+[
+    [ "Arrays and Matrices", "group__grp__arraysmatrix.html", 
"group__grp__arraysmatrix" ],
+    [ "Encoding Categorical Variables", 
"group__grp__encode__categorical.html", null ],
+    [ "Path", "group__grp__path.html", null ],
+    [ "Pivot", "group__grp__pivot.html", null ],
+    [ "Sessionize", "group__grp__sessionize.html", null ],
+    [ "Stemming", "group__grp__stemmer.html", null ]
+];
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/e283664c/docs/v1.14/group__grp__decision__tree.html
----------------------------------------------------------------------
diff --git a/docs/v1.14/group__grp__decision__tree.html 
b/docs/v1.14/group__grp__decision__tree.html
new file mode 100644
index 0000000..4903d8a
--- /dev/null
+++ b/docs/v1.14/group__grp__decision__tree.html
@@ -0,0 +1,1084 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.13"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Decision Tree</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+  $(document).ready(initResizable);
+</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+  $(document).ready(function() { init_search(); });
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" 
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.14</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.13 -->
+<script type="text/javascript">
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+$(document).ready(function(){initNavTree('group__grp__decision__tree.html','');});
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Decision Tree<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a> &raquo; <a class="el" 
href="group__grp__tree.html">Tree Methods</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b><ul>
+<li class="level1">
+<a href="#train">Training Function</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Function</a> </li>
+<li class="level1">
+<a href="#display">Tree Display</a> </li>
+<li class="level1">
+<a href="#examples">Examples</a> </li>
+<li class="level1">
+<a href="#literature">Literature</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>A decision tree is a supervised learning method that can be used for 
classification and regression. It consists of a structure in which internal 
nodes represent tests on attributes, and the branches from nodes represent the 
results of those tests. Each leaf node represents a class label, and the paths 
from the root to the leaf nodes define the set of classification or regression 
rules.</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training 
Function</dt><dd>We implement the decision tree using the CART algorithm 
introduced by Breiman et al. [1]. The training function has the following 
syntax: <pre class="syntax">
+tree_train(
+    training_table_name,
+    output_table_name,
+    id_col_name,
+    dependent_variable,
+    list_of_features,
+    list_of_features_to_exclude,
+    split_criterion,
+    grouping_cols,
+    weights,
+    max_depth,
+    min_split,
+    min_bucket,
+    num_splits,
+    pruning_params,
+    null_handling_params,
+    verbosity
+    )
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>training_table_name </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the training 
data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table_name </dt>
+<dd><p class="startdd">TEXT. Name of the generated table containing the model. 
If a table with the same name already exists, an error will be returned. A 
summary table named <em>&lt;output_table_name&gt;_summary</em> is also created. 
A cross-validation table <em>&lt;output_table_name&gt;_cv</em> may also be 
created. These are described later on this page. </p>
+<p class="enddd"></p>
+</dd>
+<dt>id_col_name </dt>
+<dd><p class="startdd">TEXT. Name of the column containing id information in 
the training data. This is a mandatory argument and is used for prediction and 
cross-validation. The values are expected to be unique for each row. </p>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_variable </dt>
+<dd><p class="startdd">TEXT. Name of the column that contains the output 
(response) for training. Boolean, integer and text types are considered to be 
classification outputs, while double precision values are considered to be 
regression outputs. The response variable for a classification tree can be 
multinomial, but the time and space complexity of the training function 
increases linearly as the number of response classes increases.</p>
+<p class="enddd"></p>
+</dd>
+<dt>list_of_features </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of column names or 
expressions to use as predictors. Can also be a '*' implying all columns are to 
be used as predictors (except for the ones included in the next argument that 
lists exclusions). The types of the features can be mixed: boolean, integer, 
and text columns are considered categorical and double precision columns are 
considered continuous. Categorical variables are not encoded and are used 
as-is in training.</p>
+<p>Array columns can also be included in the list, where the array is expanded 
to treat each element of the array as a feature.</p>
+<p>Note that not every combination of the levels of a categorical variable is 
checked when evaluating a split. The levels of the non-integer categorical 
variable are ordered by the entropy of the variable in predicting the response. 
The split at each node is evaluated between these ordered levels. Integer 
categorical variables, however, are simply ordered by their value. </p>
+<p class="enddd"></p>
+</dd>
+<dt>list_of_features_to_exclude </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of column names to exclude 
from the predictors list. If the <em>dependent_variable</em> is an expression 
(including cast of a column name), then this list should include the columns 
present in the <em>dependent_variable</em> expression, otherwise those columns 
will be included in the features. The names in this parameter should be 
identical to the names used in the table and quoted appropriately. </p>
+<p class="enddd"></p>
+</dd>
+<dt>split_criterion </dt>
+<dd><p class="startdd">TEXT, default = 'gini' for classification, 'mse' for 
regression. Impurity function to compute the feature to use to split a node. 
Supported criteria are 'gini', 'entropy', 'misclassification' for 
classification trees. For regression trees, split_criterion of 'mse' 
(mean-squared error) is always used, irrespective of the input for this 
argument. Refer to reference [1] for more information on impurity measures.</p>
+<p class="enddd"></p>
+</dd>
+<dt>grouping_cols (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. Comma-separated list of column 
names to group the data by. This will produce multiple decision trees, one for 
each group. </p>
+<p class="enddd"></p>
+</dd>
+<dt>weights (optional) </dt>
+<dd><p class="startdd">TEXT. Column name containing numerical weights for each 
observation. Can be any value greater than 0 (does not need to be an integer). 
This can be used to handle the case of unbalanced data sets. The weights are 
used to compute a weighted average in the output leaf node. For classification, 
the contribution of a row towards the vote of its corresponding level is 
multiplied by the weight (weighted mode). For regression, the output value of 
the row is multiplied by the weight (weighted mean).</p>
+<p class="enddd"></p>
+</dd>
+<dt>max_depth (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 7. Maximum depth of any node of the 
final tree, with the root node counted as depth 0. A deeper tree can lead to 
better prediction but will also result in longer processing time and higher 
memory usage.</p>
+<p class="enddd"></p>
+</dd>
+<dt>min_split (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 20. Minimum number of observations 
that must exist in a node for a split to be attempted. The best value for this 
parameter depends on the number of tuples in the dataset.</p>
+<p class="enddd"></p>
+</dd>
+<dt>min_bucket (optional) </dt>
+<dd><p class="startdd">INTEGER, default: min_split/3. Minimum number of 
observations in any terminal node. If only one of min_bucket or min_split is 
specified, min_split is set to min_bucket*3 or min_bucket to min_split/3, as 
appropriate.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_splits (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 20. Continuous-valued features are 
binned into discrete quantiles to compute split boundaries. This global 
parameter is used to compute the resolution of splits for continuous features. 
Higher number of bins will lead to better prediction, but will also result in 
longer processing time and higher memory usage.</p>
+<p class="enddd"></p>
+</dd>
+<dt>pruning_params (optional) </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of key-value pairs giving 
the parameters for pruning the tree. </p><table class="output">
+<tr>
+<th>cp </th><td><p class="starttd">Default: 0. Complexity parameter. A split 
on a node is attempted only if it decreases the overall lack of fit by a factor 
of 'cp', otherwise the split is pruned away. This value is used to create an 
initial tree before running cross-validation (see below).</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>n_folds </th><td><p class="starttd">Default: 0 (i.e. no cross-validation). 
Number of cross-validation folds to use to compute the best value of 
<em>cp</em>. To perform cross-validation, a positive value of <em>n_folds</em> 
(2 or more) should be specified. An additional output table 
<em>&lt;model_table&gt;_cv</em> is created containing the values of evaluated 
<em>cp</em> and the cross-validation error statistics. The tree returned in the 
output table corresponds to the <em>cp</em> with the lowest cross-validation 
error (we pick the maximum <em>cp</em> if multiple values have the same error).</p>
+<p>The list of <em>cp</em> values is automatically computed by parsing through 
the tree initially trained on the complete dataset. The tree output is a subset 
of this initial tree corresponding to the best computed <em>cp</em>.</p>
+<p class="endtd"></p>
+</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>null_handling_params (optional) </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of key-value pairs 
controlling the behavior of various features handling missing values. One of 
the following can be used if desired (not both): </p><table class="output">
+<tr>
+<th>max_surrogates </th><td>Default: 0. Number of surrogates to store for each 
node. One approach to handling NULLs is to use surrogate splits for each node. 
A surrogate variable enables you to make better use of the data by using 
another predictor variable that is associated (correlated) with the primary 
split variable. The surrogate variable comes into use when the primary 
predictor value is NULL. Surrogate rules implemented here are based on 
reference [1].  </td></tr>
+<tr>
+<th>null_as_category </th><td><p class="starttd">Default: FALSE. Whether to 
treat NULL as a valid level for categorical features. FALSE means that NULL is 
not a valid level, which is probably the most common situation.</p>
+<p>If set to TRUE, NULL values are considered a categorical value and placed 
at the end of the ordering of categorical levels. Placing at the end ensures 
that NULL is never used as a value to split a node on. One reason to make NULL 
a category is that it allows you to predict on categorical levels that were not 
in the training data by lumping them into an "other bucket."</p>
+<p class="endtd">This parameter is ignored for continuous-valued features.   
</p>
+</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>verbosity (optional) </dt>
+<dd>BOOLEAN, default: FALSE. Provides verbose output of the training result. 
</dd>
+</dl>
+</dd></dl>
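+<p>For illustration, here is a minimal sketch of a call that also passes the 
optional pruning and NULL-handling strings. The table <em>my_source</em>, the 
model name <em>my_model</em>, and the columns <em>id</em>, <em>y</em>, 
<em>x1</em> and <em>x2</em> are hypothetical placeholders: </p><pre class="example">
+DROP TABLE IF EXISTS my_model, my_model_summary;
+SELECT madlib.tree_train('my_source',       -- source table (hypothetical)
+                         'my_model',        -- output model table (hypothetical)
+                         'id',              -- id column
+                         'y',               -- response
+                         'x1, x2',          -- features
+                         NULL::text,        -- exclude columns
+                         'gini',            -- split criterion
+                         NULL::text,        -- no grouping
+                         NULL::text,        -- no weights
+                         7,                 -- max depth
+                         20,                -- min split
+                         6,                 -- min bucket
+                         20,                -- number of bins per continuous variable
+                         'cp=0.01',         -- prune splits that do not improve fit by cp
+                         'null_as_category=true'  -- treat NULL as a categorical level
+                         );
+</pre>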
+<p><b>Output</b> </p><dl class="arglist">
+</dl>
+<p>The model table produced by the training function contains the following 
columns:</p>
+<table class="output">
+<tr>
+<th>&lt;...&gt; </th><td>Grouping columns, if provided as input, with the 
same types as in the training table. This could be multiple columns depending 
on the <code>grouping_cols</code> input.  </td></tr>
+<tr>
+<th>tree </th><td>BYTEA8. Trained decision tree model stored in binary format 
(not human readable).  </td></tr>
+<tr>
+<th>cat_levels_in_text </th><td>TEXT[]. Ordered levels (values) of categorical 
variables corresponding to the categorical features in the 'list_of_features' 
argument above. Used to help interpret the trained decision tree. For example, 
if the categorical features specified are <em>weather_outlook</em> and 
<em>windy</em> in that order, then 'cat_levels_in_text' might be <em>[overcast, 
rain, sunny, False, True]</em>.  </td></tr>
+<tr>
+<th>cat_n_levels </th><td><p class="starttd">INTEGER[]. Number of levels for 
each categorical variable. Used to help interpret the trained decision tree. In 
the example from above, 'cat_n_levels' would be <em>[3, 2]</em> since there are 
3 levels for <em>weather_outlook</em> and 2 levels for <em>windy</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>tree_depth </th><td><p class="starttd">INTEGER. The maximum depth of the 
tree obtained after training (the root has depth 0). </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>pruning_cp </th><td><p class="starttd">DOUBLE PRECISION. The cost 
complexity parameter used for pruning the trained tree(s). This could be 
different from the cp value input using the <em>pruning_params</em> if 
cross-validation is used.  </p>
+<p class="endtd"></p>
+</td></tr>
+</table>
+<p>A summary table named <em>&lt;output_table_name&gt;_summary</em> is also 
created at the same time, which has the following columns: </p><table 
class="output">
+<tr>
+<th>method </th><td><p class="starttd">TEXT. 'tree_train' </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>is_classification </th><td><p class="starttd">BOOLEAN. TRUE if the 
decision trees are for classification, FALSE if for regression. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>source_table </th><td><p class="starttd">TEXT. The data source table name. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>model_table </th><td><p class="starttd">TEXT. The model table name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>id_col_name </th><td><p class="starttd">TEXT. The ID column name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_varname </th><td><p class="starttd">TEXT. The dependent 
variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>independent_varnames </th><td><p class="starttd">TEXT. The independent 
variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_features </th><td><p class="starttd">TEXT. The list of categorical 
feature names as a comma-separated string. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>con_features </th><td><p class="starttd">TEXT. The list of continuous 
feature names as a comma-separated string. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>grouping_cols </th><td><p class="starttd">TEXT. Names of grouping columns. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_all_groups </th><td><p class="starttd">INTEGER. Number of groups in 
decision tree training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_failed_groups </th><td><p class="starttd">INTEGER. Number of failed 
groups in decision tree training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_processed </th><td><p class="starttd">BIGINT. Total number of 
rows processed in all groups. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_skipped </th><td><p class="starttd">BIGINT. Total number of 
rows skipped in all groups due to missing values or failures. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_var_levels </th><td><p class="starttd">TEXT. For classification, 
the distinct levels of the dependent variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_var_type </th><td><p class="starttd">TEXT. The type of dependent 
variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>input_cp </th><td><p class="starttd">DOUBLE PRECISION. The complexity 
parameter (cp) used for pruning the trained tree(s) before cross-validation is 
run. This is the same as the cp value input using the <em>pruning_params</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>independent_var_types </th><td><p class="starttd">TEXT. A comma-separated 
string listing the types of the independent variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>n_folds </th><td><p class="starttd">BIGINT. Number of cross-validation 
folds used. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>null_proxy </th><td>TEXT. Describes how NULLs are handled. If NULL is not 
treated as a separate categorical value, this will be NULL. If NULL is 
treated as a separate categorical value, this will be set to "__NULL__". 
</td></tr>
+</table>
+<p>A cross-validation table called <em>&lt;output_table_name&gt;_cv</em> is 
created if 'n_folds' is set in the 'pruning_params'. It has the following 
columns: </p><table class="output">
+<tr>
+<th>cp </th><td><p class="starttd">DOUBLE PRECISION. Complexity parameter. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cv_error_avg </th><td><p class="starttd">DOUBLE PRECISION. Average 
cross-validation error for this value of cp. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cv_error_stdev </th><td>DOUBLE PRECISION. Standard deviation of the 
cross-validation error for this value of cp.  </td></tr>
+</table>
+<dl class="section note"><dt>Note</dt><dd><ul>
+<li>Many of the parameters are designed to be similar to the popular R package 
'rpart'. An important distinction between rpart and the MADlib function is that 
for both response and feature variables, MADlib considers integer values as 
categorical values, while rpart considers them as continuous. To use integers 
as continuous, cast them to double precision.</li>
+<li>Integer values are ordered by value for computing the split boundaries. 
Cast to TEXT if the entropy-based ordering method is desired.</li>
+<li>When cross-validation is not used (<em>n_folds</em>=0), each tree output 
is pruned by the input cost complexity (<em>cp</em>). With cross-validation, 
the input <em>cp</em> is the minimum value of all the explored values of 'cp'. 
During cross-validation, we train an initial tree using the provided 
<em>cp</em> and explore all possible sub-trees (up to a single-node tree) to 
compute the optimal sub-tree. The optimal sub-tree and the 'cp' corresponding 
to this optimal sub-tree are placed in the <em>output_table</em>, in the 
columns named <em>tree</em> and <em>pruning_cp</em> respectively.</li>
+<li>The main parameters that affect memory usage are: depth of tree 
(‘max_depth’), number of features, number of values per categorical 
feature, and number of bins for continuous features (‘num_splits’). If you 
are hitting memory limits, consider reducing one or more of these 
parameters.</li>
+</ul>
+</dd></dl>
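+<p>As a sketch of the cross-validation behavior described in the note above 
(reusing the <em>dt_golf</em> data set and argument values from the 
classification examples below), setting 'n_folds' in 'pruning_params' creates 
the additional <em>train_output_cv</em> table: </p><pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv;
+SELECT madlib.tree_train('dt_golf', 'train_output', 'id', 'class',
+                         '"OUTLOOK", temperature, windy', NULL::text,
+                         'gini', NULL::text, NULL::text,
+                         5, 3, 1, 10,
+                         'cp=0, n_folds=3'  -- cross-validate over the explored cp values
+                         );
+-- each explored cp value with its cross-validation error statistics
+SELECT * FROM train_output_cv ORDER BY cp;
+</pre>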
+<p><a class="anchor" id="predict"></a></p><dl class="section 
user"><dt>Prediction Function</dt><dd>The prediction function estimates the 
conditional mean given a new predictor. It has the following syntax: <pre 
class="syntax">
+tree_predict(tree_model,
+             new_data_table,
+             output_table,
+             type)
+</pre></dd></dl>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>tree_model </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the decision tree 
model. This should be the output table returned from <em>tree_train.</em></p>
+<p class="enddd"></p>
+</dd>
+<dt>new_data_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing prediction data. 
This table is expected to contain the same features that were used during 
training. The table should also contain <em>id_col_name</em> used for 
identifying each row.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. Name of the table to output prediction results. 
If this table already exists, an error is returned. The table contains the 
<em>id_col_name</em> column giving the 'id' for each prediction and the 
prediction columns for the dependent variable.</p>
+<p>If <em>type</em> = 'response', then the table has a single additional 
column with the prediction value of the response. The type of this column 
depends on the type of the response variable used during training.</p>
+<p>If <em>type</em> = 'prob', then the table has multiple additional columns, 
one for each possible value of the response variable. The columns are labeled 
as 'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents 
each value of the response variable.</p>
+<p class="enddd"></p>
+</dd>
+<dt>type </dt>
+<dd>TEXT, optional, default: 'response'. For regression trees, the output is 
always the predicted value of the dependent variable. For classification trees, 
the <em>type</em> variable can be 'response', giving the classification 
prediction as output, or 'prob', giving the class probabilities as output. For 
each value of the dependent variable, a column with the probabilities is added 
to the output table.  </dd>
+</dl>
+<p><a class="anchor" id="display"></a></p><dl class="section user"><dt>Tree 
Display</dt><dd>The display function outputs a graph representation of the 
decision tree. The output can either be in the popular 'dot' format that can be 
visualized using various programs including those in the GraphViz package, or 
in a simple text format. The details of the text format are output with the 
tree. <pre class="syntax">
+tree_display(tree_model, dot_format, verbosity)
+</pre></dd></dl>
+<p>An additional display function is provided to output the surrogate splits 
chosen for each internal node: </p><pre class="syntax">
+tree_surr_display(tree_model)
+</pre><p>This output contains the list of surrogate splits for each internal 
node. The nodes are sorted in ascending order by id. This is equivalent to 
viewing the tree in a breadth-first manner. For each surrogate, we output the 
surrogate split (variable and threshold) and also give the number of rows that 
were common between the primary split and the surrogate split. Finally, the 
number of rows present in the majority branch of the primary split is also 
shown. Only surrogates that perform better than this majority branch are 
included in the surrogate list. When the primary variable has a NULL value the 
surrogate variables are used in order to compute the split for that node. If 
all surrogate variables are NULL, then the majority branch is used to compute 
the split for a tuple.</p>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>tree_model </dt>
+<dd>TEXT. Name of the table containing the decision tree model. </dd>
+<dt>dot_format </dt>
+<dd>BOOLEAN, default = TRUE. If TRUE, the output is in dot format; otherwise 
it is in text format. </dd>
+<dt>verbosity </dt>
+<dd>BOOLEAN, default = FALSE. If set to TRUE, the dot format output will 
contain additional information (impurity, sample size, number of weighted rows 
for each response variable, and the classification or prediction if the tree 
was pruned at this level). </dd>
+</dl>
+<p>The output is always returned as a 'TEXT'. For the dot format, the output 
can be redirected to a file on the client side and then rendered using 
visualization programs.</p>
+<p>To export the dot format result to an external file, use the method below. 
Please note that you should use unaligned table output mode for psql (the 
'-A' flag), or else you may get an error when you try to convert the dot file 
to another format for viewing (e.g., PDF). Inside the psql client, both '\t' 
and '\o' should be used:</p>
+<pre class="example">
+&gt; # under bash
+&gt; psql -A my_database
+# -- in psql now
+# \t
+# \o test.dot -- export to a file
+# select madlib.tree_display('tree_out');
+# \o
+# \t
+</pre><p>After the dot file has been generated, use third-party plotting 
software to plot the trees in a nice format: </p><pre class="example">
+&gt; # under bash, convert the dot file into a PDF file
+&gt; dot -Tpdf test.dot &gt; test.pdf
+&gt; xpdf test.pdf&amp;
+</pre><p>Please see the examples below for more details on the contents of the 
tree output formats.</p>
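+<p>As a sketch, once a model has been trained with 'max_surrogates' greater 
than zero (as in the regression example below), the surrogate splits can be 
inspected with: </p><pre class="example">
+SELECT madlib.tree_surr_display('train_output');
+</pre>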
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd><h4>Decision Tree Classification Examples</h4>
+</dd></dl>
+<ol type="1">
+<li>Load input data set related to whether to play golf or not: <pre 
class="example">
+DROP TABLE IF EXISTS dt_golf CASCADE;
+CREATE TABLE dt_golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    "Temp_Humidity" double precision[],
+    clouds_airquality text[],
+    windy boolean,
+    class text,
+    observation_weight double precision
+);
+INSERT INTO dt_golf VALUES
+(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t 
Play', 5.0),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t 
Play', 5.0),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 
'Play', 1.5),
+(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play', 
1.0),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 
'Don''t Play', 5.0),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play', 
5.0),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play', 
1.0),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play', 
5.0),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 
'Play', 1.5),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 
'Play', 1.5),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play', 1.0);
+</pre></li>
+<li>Run the decision tree training function: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT madlib.tree_train('dt_golf',         -- source table
+                         'train_output',    -- output model table
+                         'id',              -- id column
+                         'class',           -- response
+                         '"OUTLOOK", temperature, windy',   -- features
+                         NULL::text,        -- exclude columns
+                         'gini',            -- split criterion
+                         NULL::text,        -- no grouping
+                         NULL::text,        -- no weights, all observations 
treated equally
+                         5,                 -- max depth
+                         3,                 -- min split
+                         1,                 -- min bucket
+                         10                 -- number of bins per continuous 
variable
+                         );
+</pre> View the output table (excluding the tree which is in binary format): 
<pre class="example">
+SELECT pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth FROM 
train_output;
+</pre> <pre class="result">
+ pruning_cp |        cat_levels_in_text        | cat_n_levels | tree_depth 
+------------+----------------------------------+--------------+------------
+          0 | {overcast,rain,sunny,False,True} | {3,2}        |          5
+</pre> View the summary table: <pre class="example">
+\x on
+SELECT * FROM train_output_summary;
+</pre> <pre class="result">
+-[ RECORD 1 ]---------+--------------------------------
+method                | tree_train
+is_classification     | t
+source_table          | dt_golf
+model_table           | train_output
+id_col_name           | id
+dependent_varname     | class
+independent_varnames  | "OUTLOOK", temperature, windy
+cat_features          | "OUTLOOK",windy
+con_features          | temperature
+grouping_cols         | 
+num_all_groups        | 1
+num_failed_groups     | 0
+total_rows_processed  | 14
+total_rows_skipped    | 0
+dependent_var_levels  | "Don't Play","Play"
+dependent_var_type    | text
+input_cp              | 0
+independent_var_types | text, boolean, double precision
+n_folds               | 0
+null_proxy            | 
+</pre></li>
+<li>Predict output categories. For the purpose of this example, we use the 
same data that was used for training: <pre class="example">
+\x off
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',          -- tree model
+                           'dt_golf',               -- new data table
+                           'prediction_results',    -- output table
+                           'response');             -- show response
+SELECT g.id, class, estimated_class FROM prediction_results p, 
+dt_golf g WHERE p.id = g.id ORDER BY g.id;
+</pre> <pre class="result">
+ id |   class    | estimated_class 
+----+------------+-----------------
+  1 | Don't Play | Don't Play
+  2 | Don't Play | Don't Play
+  3 | Play       | Play
+  4 | Play       | Play
+  5 | Play       | Play
+  6 | Don't Play | Don't Play
+  7 | Play       | Play
+  8 | Don't Play | Don't Play
+  9 | Play       | Play
+ 10 | Play       | Play
+ 11 | Play       | Play
+ 12 | Play       | Play
+ 13 | Play       | Play
+ 14 | Don't Play | Don't Play
+(14 rows)
+</pre> To display the probabilities associated with each value of the 
dependent variable, set the 'type' parameter to 'prob': <pre class="example">
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',          -- tree model
+                           'dt_golf',               -- new data table
+                           'prediction_results',    -- output table
+                           'prob');                 -- show probability
+SELECT g.id, class, "estimated_prob_Don't Play",  "estimated_prob_Play" 
+FROM prediction_results p, dt_golf g WHERE p.id = g.id ORDER BY g.id;
+</pre> <pre class="result">
+ id |   class    | estimated_prob_Don't Play | estimated_prob_Play 
+----+------------+---------------------------+---------------------
+  1 | Don't Play |                         1 |                   0
+  2 | Don't Play |                         1 |                   0
+  3 | Play       |                         0 |                   1
+  4 | Play       |                         0 |                   1
+  5 | Play       |                         0 |                   1
+  6 | Don't Play |                         1 |                   0
+  7 | Play       |                         0 |                   1
+  8 | Don't Play |                         1 |                   0
+  9 | Play       |                         0 |                   1
+ 10 | Play       |                         0 |                   1
+ 11 | Play       |                         0 |                   1
+ 12 | Play       |                         0 |                   1
+ 13 | Play       |                         0 |                   1
+ 14 | Don't Play |                         1 |                   0
+(14 rows)
+</pre></li>
+<li>View the tree in text format: <pre class="example">
+SELECT madlib.tree_display('train_output', FALSE);
+</pre> <pre class="result">
+&#160;-------------------------------------
+&#160;- Each node represented by 'id' inside ().
+&#160;- Each internal nodes has the split condition at the end, while each
+        leaf node has a * at the end.
+&#160;- For each internal node (i), its child nodes are indented by 1 level
+        with ids (2i+1) for True node and (2i+2) for False node.
+&#160;- Number of (weighted) rows for each response variable inside [].'
+        The response label order is given as ['"\'Don\'t Play\'"', 
'"\'Play\'"'].
+        For each leaf, the prediction is given after the '--&gt;'
+&#160;-------------------------------------
+ (0)[5 9]  "OUTLOOK" in {overcast}
+    (1)[0 4]  * --&gt; "Play"
+    (2)[5 5]  temperature &lt;= 75
+       (5)[3 5]  temperature &lt;= 65
+          (11)[1 0]  * --&gt; "Don't Play"
+          (12)[2 5]  temperature &lt;= 70
+             (25)[0 3]  * --&gt; "Play"
+             (26)[2 2]  temperature &lt;= 72
+                (53)[2 0]  * --&gt; "Don't Play"
+                (54)[0 2]  * --&gt; "Play"
+       (6)[2 0]  * --&gt; "Don't Play"
+&#160;-------------------------------------
+</pre> Here are some details on how to interpret the tree display above:<ul>
+<li>Node numbering starts at 0 for the root node and would be contiguous 
1,2...n if the tree were completely full (no pruning). Since the tree has been 
pruned, the node numbering is not contiguous.</li>
+<li>The order of values [x y] indicates the number of weighted rows that 
correspond to ["Don't play" "Play"] <em>before</em> the node test. For example, 
at (root) node 0, there are 5 rows for "Don't play" and 9 rows for "Play" in 
the raw data.</li>
+<li>If we apply the test of "OUTLOOK" being overcast, then the True (yes) 
result is leaf node 1 which is "Play". There are 0 "Don't play" rows and 4 
"Play" rows that correspond to this case (overcast). In other words, if it is 
overcast, you always play golf. If it is not overcast, you may or may not play 
golf, depending on the rest of the tree.</li>
+<li>The remaining 5 "Don't play" rows and 5 "Play" rows are then tested at 
node 2 on temperature&lt;=75. The False (no) result is leaf node 6, which is 
"Don't Play". The True (yes) result proceeds to node 5 for the test on 
temperature&lt;=65. And so on down the tree.</li>
+<li>Creating a dot format visualization of the tree, as described below, can 
help with following the decision flows.</li>
+</ul>
+</li>
+<li>Create a dot format display of the tree: <pre class="example">
+SELECT madlib.tree_display('train_output', TRUE);
+</pre> <pre class="result">
+ digraph "Classification tree for dt_golf" {
+          subgraph "cluster0"{
+          label=""
+ "g0_0" [label="\"OUTLOOK" &lt;= overcast", shape=ellipse];
+ "g0_0" -&gt; "g0_1"[label="yes"];
+ "g0_1" [label=""Play"",shape=box];
+ "g0_0" -&gt; "g0_2"[label="no"];
+ "g0_2" [label="temperature &lt;= 75", shape=ellipse];
+ "g0_2" -&gt; "g0_5"[label="yes"];
+ "g0_2" -&gt; "g0_6"[label="no"];
+ "g0_6" [label=""Don't Play"",shape=box];
+ "g0_5" [label="temperature &lt;= 65", shape=ellipse];
+ "g0_5" -&gt; "g0_11"[label="yes"];
+ "g0_11" [label=""Don't Play"",shape=box];
+ "g0_5" -&gt; "g0_12"[label="no"];
+ "g0_12" [label="temperature &lt;= 70", shape=ellipse];
+ "g0_12" -&gt; "g0_25"[label="yes"];
+ "g0_25" [label=""Play"",shape=box];
+ "g0_12" -&gt; "g0_26"[label="no"];
+ "g0_26" [label="temperature &lt;= 72", shape=ellipse];
+ "g0_26" -&gt; "g0_53"[label="yes"];
+ "g0_53" [label=""Don't Play"",shape=box];
+ "g0_26" -&gt; "g0_54"[label="no"];
+ "g0_54" [label=""Play"",shape=box];
+&#160;&#160;&#160;} //--- end of subgraph------------
+&#160;} //---end of digraph---------
+</pre> One important difference to note about the dot format above is how 
categorical variable tests are displayed:<ul>
+<li>In the text format of the tree, the node 0 test is "OUTLOOK" in 
{overcast}, but in the dot format of the tree, the same node 0 test reads 
"\"OUTLOOK" &lt;= overcast". This is because in dot format for categorical 
variables, the '&lt;=' symbol represents the location in the array 
'cat_levels_in_text' from the output table for the "OUTLOOK" levels. The array 
is ['overcast', 'rain', 'sunny', 'False', 'True'] with the first 3 entries 
corresponding to "OUTLOOK" and the last 2 entries corresponding to 'windy'. So 
the test "\"OUTLOOK" &lt;= overcast" means all "OUTLOOK" levels to the left of, 
and including, 'overcast'. In this case there are no levels to the left of 
'overcast' in the array so it is simply a test on whether it is overcast or 
not.</li>
+<li>If there was a test "\"OUTLOOK" &lt;= rain", this would include both 
'overcast' and 'rain', since 'overcast' is to the left of 'rain' in the 
array.</li>
+<li>If there was a test "windy &lt;= True", this would include both 'False' 
and 'True', since 'False' is to the left of 'True' in the array.</li>
+</ul>
+</li>
+<li>Now create a dot format display of the tree with additional information: 
<pre class="example">
+SELECT madlib.tree_display('train_output', TRUE, TRUE);
+</pre> <pre class="result">
+ digraph "Classification tree for dt_golf" {
+          subgraph "cluster0"{
+          label=""
+ "g0_0" [label="\"OUTLOOK" &lt;= overcast\n impurity = 0.459184\n samples = 
14\n value = [5 9]\n class = "Play"", shape=ellipse];
+ "g0_0" -&gt; "g0_1"[label="yes"];
+ "g0_1" [label=""Play"\n impurity = 0\n samples = 4\n value = [0 
4]",shape=box];
+ "g0_0" -&gt; "g0_2"[label="no"];
+ "g0_2" [label="temperature &lt;= 75\n impurity = 0.5\n samples = 10\n value = 
[5 5]\n class = "Don't Play"", shape=ellipse];
+ "g0_2" -&gt; "g0_5"[label="yes"];
+ "g0_2" -&gt; "g0_6"[label="no"];
+ "g0_6" [label=""Don't Play"\n impurity = 0\n samples = 2\n value = [2 
0]",shape=box];
+ "g0_5" [label="temperature &lt;= 65\n impurity = 0.46875\n samples = 8\n 
value = [3 5]\n class = "Play"", shape=ellipse];
+ "g0_5" -&gt; "g0_11"[label="yes"];
+ "g0_11" [label=""Don't Play"\n impurity = 0\n samples = 1\n value = [1 
0]",shape=box];
+ "g0_5" -&gt; "g0_12"[label="no"];
+ "g0_12" [label="temperature &lt;= 70\n impurity = 0.408163\n samples = 7\n 
value = [2 5]\n class = "Play"", shape=ellipse];
+ "g0_12" -&gt; "g0_25"[label="yes"];
+ "g0_25" [label=""Play"\n impurity = 0\n samples = 3\n value = [0 
3]",shape=box];
+ "g0_12" -&gt; "g0_26"[label="no"];
+ "g0_26" [label="temperature &lt;= 72\n impurity = 0.5\n samples = 4\n value = 
[2 2]\n class = "Don't Play"", shape=ellipse];
+ "g0_26" -&gt; "g0_53"[label="yes"];
+ "g0_53" [label=""Don't Play"\n impurity = 0\n samples = 2\n value = [2 
0]",shape=box];
+ "g0_26" -&gt; "g0_54"[label="no"];
+ "g0_54" [label=""Play"\n impurity = 0\n samples = 2\n value = [0 
2]",shape=box];
+&#160;&#160;&#160;} //--- end of subgraph------------
+&#160;} //---end of digraph---------
+</pre> The additional information in each node is: impurity, sample size, 
number of weighted rows for each response variable, and classification if the 
tree was pruned at this level. If your tree is not too big, you may wish to 
convert the dot format to PDF or another format for better visualization of the 
tree structure.</li>
+<li>Arrays of features. Categorical and continuous features can be array 
columns, in which case the array is expanded to treat each element of the array 
as a feature: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT madlib.tree_train('dt_golf',         -- source table
+                         'train_output',    -- output model table
+                         'id',              -- id column
+                         'class',           -- response
+                         '"Temp_Humidity", clouds_airquality',   -- features
+                         NULL::text,        -- exclude columns
+                         'gini',            -- split criterion
+                         NULL::text,        -- no grouping
+                         NULL::text,        -- no weights, all observations 
treated equally
+                         5,                 -- max depth
+                         3,                 -- min split
+                         1,                 -- min bucket
+                         10                 -- number of bins per continuous 
variable
+                         );
+</pre> View the output table (excluding the tree which is in binary format): 
<pre class="example">
+SELECT pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth FROM 
train_output;
+</pre> <pre class="result">
+ pruning_cp |               cat_levels_in_text               | cat_n_levels | tree_depth 
+------------+------------------------------------------------+--------------+------------
+          0 | {medium,none,high,low,unhealthy,good,moderate} | {4,3}        |          3
+</pre> The first 4 levels correspond to cloud ceiling and the next 3 levels 
correspond to air quality.</li>
+<li>Weighting observations. Use the 'weights' parameter to adjust a row's vote 
to balance the dataset. In our example, the weights are somewhat random but 
show that a different decision tree is created compared to the case where no 
weights are used: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT madlib.tree_train('dt_golf',         -- source table
+                         'train_output',    -- output model table
+                         'id',              -- id column
+                         'class',           -- response
+                         '"OUTLOOK", temperature, windy',   -- features
+                         NULL::text,        -- exclude columns
+                         'gini',            -- split criterion
+                         NULL::text,        -- no grouping
+                         'observation_weight', -- weight observations
+                         5,                 -- max depth
+                         3,                 -- min split
+                         1,                 -- min bucket
+                         10                 -- number of bins per continuous 
variable
+                         ); 
+SELECT madlib.tree_display('train_output');
+</pre> <pre class="result">
+&#160; -------------------------------------
+&#160; - Each node represented by 'id' inside ().
+&#160; - Each internal nodes has the split condition at the end, while each
+         leaf node has a * at the end.
+&#160; - For each internal node (i), its child nodes are indented by 1 level
+         with ids (2i+1) for True node and (2i+2) for False node.
+&#160; - Number of (weighted) rows for each response variable inside [].'
+         The response label order is given as ['"Don\'t Play"', '"Play"'].
+         For each leaf, the prediction is given after the '--&gt;'
+&#160; ------------------------------------- 
+ (0)[17 19]  temperature &lt;= 75
+    (1)[ 7 16]  temperature &lt;= 72
+       (3)[ 7 10]  temperature &lt;= 70
+          (7)[  1 8.5]  * --&gt; "Play"
+          (8)[  6 1.5]  "OUTLOOK" in {overcast}
+             (17)[  0 1.5]  * --&gt; "Play"
+             (18)[6 0]  * --&gt; "Don't Play"
+       (4)[0 6]  * --&gt; "Play"
+    (2)[10  3]  "OUTLOOK" in {overcast}
+       (5)[0 3]  * --&gt; "Play"
+       (6)[10  0]  * --&gt; "Don't Play"
+</pre></li>
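+<li>Deriving balancing weights. As a minimal sketch (the column name 'balance_weight' is hypothetical, not part of the data set loaded earlier), weights that make each class contribute an equal total vote can be computed in plain SQL and then passed as the 'weights' argument of tree_train: <pre class="example">
+ALTER TABLE dt_golf ADD COLUMN balance_weight DOUBLE PRECISION;
+-- each row gets 1/count(class), so every class sums to the same total weight
+UPDATE dt_golf g
+SET balance_weight = 1.0 / c.cnt
+FROM (SELECT class, count(*) AS cnt FROM dt_golf GROUP BY class) c
+WHERE g.class = c.class;
+</pre></li>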
+</ol>
+<h4>Decision Tree Regression Examples</h4>
+<ol type="1">
+<li>Load input data related to fuel consumption and 10 aspects of automobile 
design and performance for 32 automobiles (1973–74 models). Data was 
extracted from the 1974 Motor Trend US magazine. <pre class="example">
+DROP TABLE IF EXISTS mt_cars;
+CREATE TABLE mt_cars (
+    id integer NOT NULL,
+    mpg double precision,
+    cyl integer,
+    disp double precision,
+    hp integer,
+    drat double precision,
+    wt double precision,
+    qsec double precision,
+    vs integer,
+    am integer,
+    gear integer,
+    carb integer
+);
+INSERT INTO mt_cars VALUES
+(1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
+(2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
+(3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
+(4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
+(5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
+(6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
+(7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
+(8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
+(9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
+(10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
+(11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
+(12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
+(13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
+(14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
+(15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
+(16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
+(17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
+(18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
+(19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
+(20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
+(21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
+(22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
+(23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
+(24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
+(25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
+(26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
+(27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
+(28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
+(29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
+(30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
+(31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
+(32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
+</pre></li>
+<li>Train a regression decision tree with surrogates to handle the NULL feature values: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv;
+SELECT madlib.tree_train('mt_cars',         -- source table
+                         'train_output',    -- output model table
+                         'id',              -- id column
+                         'mpg',             -- dependent variable
+                         '*',               -- features
+                         'id, hp, drat, am, gear, carb',  -- exclude columns
+                         'mse',             -- split criterion
+                         NULL::text,        -- no grouping
+                         NULL::text,        -- no weights, all observations treated equally
+                         10,                -- max depth
+                         8,                 -- min split
+                         3,                 -- min bucket
+                         10,                -- number of bins per continuous variable
+                         NULL,              -- pruning parameters
+                         'max_surrogates=2' -- number of surrogates
+                         );
+</pre> View the output table (excluding the tree, which is in binary format); it shows the ordering of levels for the categorical variables 'vs' and 'cyl': <pre class="example">
+SELECT pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth FROM train_output;
+</pre> <pre class="result">
+ pruning_cp | cat_levels_in_text | cat_n_levels | tree_depth 
+------------+--------------------+--------------+------------
+          0 | {0,1,4,6,8}        | {2,3}        |          4
+</pre> View the summary table: <pre class="example">
+\x on
+SELECT * FROM train_output_summary;
+</pre> <pre class="result">
+-[ RECORD 1 ]---------+-----------------------------------------------------------------------
+method                | tree_train
+is_classification     | f
+source_table          | mt_cars
+model_table           | train_output
+id_col_name           | id
+dependent_varname     | mpg
+independent_varnames  | *
+cat_features          | vs,cyl
+con_features          | disp,qsec,wt
+grouping_cols         | 
+num_all_groups        | 1
+num_failed_groups     | 0
+total_rows_processed  | 32
+total_rows_skipped    | 0
+dependent_var_levels  | 
+dependent_var_type    | double precision
+input_cp              | 0
+independent_var_types | integer, integer, double precision, double precision, double precision
+n_folds               | 0
+null_proxy            | 
+</pre></li>
+<li>Predict regression output for the same data and compare with the original values: <pre class="example">
+\x off
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',
+                           'mt_cars',
+                           'prediction_results',
+                           'response');
+SELECT s.id, mpg, estimated_mpg, mpg-estimated_mpg AS delta
+FROM prediction_results p, mt_cars s
+WHERE s.id = p.id ORDER BY id;
+</pre> Result: <pre class="result">
+ id | mpg  |  estimated_mpg   |        delta        
+----+------+------------------+---------------------
+  1 | 18.7 |            16.84 |                1.86
+  2 |   21 | 19.7428571428571 |    1.25714285714286
+  3 | 24.4 |            22.58 |                1.82
+  4 |   21 | 19.7428571428571 |    1.25714285714286
+  5 | 17.8 | 19.7428571428571 |   -1.94285714285714
+  6 | 16.4 |            16.84 |  -0.439999999999998
+  7 | 22.8 |            22.58 |   0.219999999999999
+  8 | 17.3 |           13.325 |               3.975
+  9 | 21.4 | 19.7428571428571 |    1.65714285714286
+ 10 | 15.2 |           13.325 |               1.875
+ 11 | 18.1 | 19.7428571428571 |   -1.64285714285714
+ 12 | 32.4 | 30.0666666666667 |    2.33333333333334
+ 13 | 14.3 |            14.78 |               -0.48
+ 14 | 22.8 |            22.58 |   0.219999999999999
+ 15 | 30.4 | 30.0666666666667 |   0.333333333333336
+ 16 | 19.2 | 19.7428571428571 |  -0.542857142857141
+ 17 | 33.9 | 30.0666666666667 |    3.83333333333334
+ 18 | 15.2 |            16.84 |               -1.64
+ 19 | 10.4 |           13.325 |              -2.925
+ 20 | 27.3 | 30.0666666666667 |   -2.76666666666666
+ 21 | 10.4 |           13.325 |              -2.925
+ 22 |   26 | 30.0666666666667 |   -4.06666666666666
+ 23 | 14.7 |            16.84 |               -2.14
+ 24 | 30.4 | 30.0666666666667 |   0.333333333333336
+ 25 | 21.5 |            22.58 |               -1.08
+ 26 | 15.8 |            14.78 |                1.02
+ 27 | 15.5 |            14.78 |   0.719999999999999
+ 28 |   15 |            14.78 |   0.219999999999999
+ 29 | 13.3 |            14.78 |               -1.48
+ 30 | 19.2 |            16.84 |                2.36
+ 31 | 19.7 | 19.7428571428571 | -0.0428571428571409
+ 32 | 21.4 |            22.58 |               -1.18
+(32 rows)
+</pre></li>
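+<li>As a convenience sketch (plain SQL, not a MADlib function), the fit above can be summarized with a root-mean-square error over the training rows: <pre class="example">
+SELECT sqrt(avg((s.mpg - p.estimated_mpg)^2)) AS rmse
+FROM prediction_results p JOIN mt_cars s ON s.id = p.id;
+</pre></li>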
+<li>Display the decision tree in basic text format: <pre class="example">
+SELECT madlib.tree_display('train_output', FALSE);
+</pre> <pre class="result">
+&#160; -------------------------------------
+&#160;- Each node represented by 'id' inside ().
+&#160;- Each internal nodes has the split condition at the end, while each
+&#160;    leaf node has a * at the end.
+&#160;- For each internal node (i), its child nodes are indented by 1 level
+&#160;    with ids (2i+1) for True node and (2i+2) for False node.
+&#160;- Number of rows and average response value inside []. For a leaf node, 
this is the prediction.
+&#160;-------------------------------------
+ (0)[32, 20.0906]  cyl in {4}
+    (1)[11, 26.6636]  wt &lt;= 2.2
+       (3)[6, 30.0667]  *
+       (4)[5, 22.58]  *
+    (2)[21, 16.6476]  disp &lt;= 258
+       (5)[7, 19.7429]  *
+       (6)[14, 15.1]  qsec &lt;= 17.42
+          (13)[10, 15.81]  qsec &lt;= 16.9
+             (27)[5, 14.78]  *
+             (28)[5, 16.84]  *
+          (14)[4, 13.325]  *
+ &#160;-------------------------------------
+(1 row)
+</pre></li>
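+<li>The display function also supports the GraphViz dot format, which can be rendered as an image; passing TRUE as the second argument requests dot output: <pre class="example">
+SELECT madlib.tree_display('train_output', TRUE);
+</pre></li>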
+<li>Display the surrogate variables that are used to compute the split for 
each node when the primary variable is NULL: <pre class="example">
+SELECT madlib.tree_surr_display('train_output');
+</pre> <pre class="result">
+&#160;-------------------------------------
+       Surrogates for internal nodes
+&#160;-------------------------------------
+ (0) cyl in {4}
+      1: disp &lt;= 146.7    [common rows = 29]
+      2: vs in {1}    [common rows = 26]
+      [Majority branch = 11 ]                                          
+ (1) wt &lt;= 2.2
+      [Majority branch = 19 ]                                          
+ (2) disp &lt;= 258
+      1: cyl in {4,6}    [common rows = 19]
+      2: vs in {1}    [common rows = 18]
+      [Majority branch = 7 ]                                          
+ (6) qsec &lt;= 17.42
+      1: disp &gt; 275.8    [common rows = 11]
+      2: vs in {0}    [common rows = 10]
+      [Majority branch = 10 ]                                         
+ (13) qsec &lt;= 16.9
+      1: wt &lt;= 3.84    [common rows = 8]
+      2: disp &lt;= 360    [common rows = 7]
+      [Majority branch = 5 ]
+&#160;-------------------------------------
+(1 row)
+</pre> <dl class="section note"><dt>Note</dt><dd>The 'cyl' column in the data set has two tuples with NULL values (<em>id = 9</em> and <em>id = 18</em>). In predictions based on this tree, the surrogate splits for the <em>cyl in {4}</em> split in node 0 are used to route those two tuples. The surrogates are tried in ranked order until one whose variable is not NULL is found. In this case, both tuples have non-NULL values for <em>disp</em>, so the <em>disp &lt;= 146.7</em> split is used to make the prediction. If all the surrogate variables are NULL, the majority branch is followed.</dd></dl>
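+<p>As a quick check (a sketch reusing the prediction_results table created in the prediction step above), list the two NULL-'cyl' rows together with the 'disp' values used by the first surrogate split: </p><pre class="example">
+SELECT p.id, s.cyl, s.disp, p.estimated_mpg
+FROM prediction_results p JOIN mt_cars s ON s.id = p.id
+WHERE p.id IN (9, 18) ORDER BY p.id;
+</pre>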
+</li>
+<li>Now let's use cross validation to select the best value of the complexity 
parameter cp: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary, train_output_cv;
+SELECT madlib.tree_train('mt_cars',         -- source table
+                         'train_output',    -- output model table
+                         'id',              -- id column
+                         'mpg',             -- dependent variable
+                         '*',               -- features
+                         'id, hp, drat, am, gear, carb',  -- exclude columns
+                         'mse',             -- split criterion
+                         NULL::text,        -- no grouping
+                         NULL::text,        -- no weights, all observations treated equally
+                         10,                -- max depth
+                         8,                 -- min split
+                         3,                 -- min bucket
+                         10,                -- number of bins per continuous variable
+                         'n_folds=3'        -- pruning parameters for cross validation
+                         );
+</pre> View the output table (excluding the tree, which is in binary format). The input cp value was 0 (the default), and the best 'pruning_cp' value turns out to be 0 as well in this small example: <pre class="example">
+SELECT pruning_cp, cat_levels_in_text, cat_n_levels, tree_depth FROM train_output;
+</pre> <pre class="result">
+ pruning_cp | cat_levels_in_text | cat_n_levels | tree_depth 
+------------+--------------------+--------------+------------
+          0 | {0,1,4,6,8}        | {2,3}        |          4
+</pre> The cp values tested, with the average cross-validation error and its standard deviation, are: <pre class="example">
+SELECT * FROM train_output_cv ORDER BY cv_error_avg ASC;
+</pre> <pre class="result">
+         cp          |   cv_error_avg   | cv_error_stddev  
+---------------------+------------------+------------------
+                   0 | 4.60222321567406 | 1.14990035501294
+ 0.00942145242026098 | 4.71906243157825 | 1.21587651168567
+  0.0156685263245236 | 4.86688342751006 | 1.30225133441406
+  0.0893348335770666 |  5.0608834230282 | 1.42488238861617
+   0.135752855572154 | 5.33192746100332 | 1.62718329150341
+   0.643125226048458 | 5.76814538295394 | 2.10750950120742
+(6 rows)
+</pre></li>
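+<li>As a follow-up sketch (the cp value is copied from the cross validation table above; 'cp=&lt;value&gt;' is the pruning parameter that applies it directly), the tree can be retrained and pruned at a chosen complexity: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT madlib.tree_train('mt_cars',         -- source table
+                         'train_output',    -- output model table
+                         'id',              -- id column
+                         'mpg',             -- dependent variable
+                         '*',               -- features
+                         'id, hp, drat, am, gear, carb',  -- exclude columns
+                         'mse',             -- split criterion
+                         NULL::text,        -- no grouping
+                         NULL::text,        -- no weights
+                         10,                -- max depth
+                         8,                 -- min split
+                         3,                 -- min bucket
+                         10,                -- number of bins per continuous variable
+                         'cp=0.0156685263245236'  -- prune using a cp value from the table above
+                         );
+</pre></li>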
+</ol>
+<h4>NULL Handling Example</h4>
+<ol type="1">
+<li>Create a toy example to illustrate 'null-as-category' handling for categorical features: <pre class="example">
+DROP TABLE IF EXISTS null_handling_example;
+CREATE TABLE null_handling_example (
+    id integer,
+    country text,
+    city text,
+    weather text,
+    response text
+);
+INSERT INTO null_handling_example VALUES
+(1,null,null,null,'a'),
+(2,'US',null,null,'b'),
+(3,'US','NY',null,'c'),
+(4,'US','NY','rainy','d');
+</pre></li>
+<li>Train the decision tree. Note that NULL is treated as a valid level for the categorical features country, weather and city: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT madlib.tree_train('null_handling_example',         -- source table
+                         'train_output',                  -- output model table
+                         'id',                            -- id column
+                         'response',                      -- dependent variable
+                         'country, weather, city',        -- features
+                         NULL,                            -- features to exclude
+                         'gini',                          -- split criterion
+                         NULL::text,                      -- no grouping
+                         NULL::text,                      -- no weights, all observations treated equally
+                         4,                               -- max depth
+                         1,                               -- min split
+                         1,                               -- min bucket
+                         10,                              -- number of bins per continuous variable
+                         NULL,                            -- pruning parameters
+                         'null_as_category=true'          -- null handling
+                         );
+SELECT cat_levels_in_text, cat_n_levels FROM train_output;
+</pre> <pre class="result">
+            cat_levels_in_text            | cat_n_levels 
+------------------------------------------+--------------
+ {US,__NULL__,rainy,__NULL__,NY,__NULL__} | {2,2,2}
+</pre> View the summary table: <pre class="example">
+\x on
+SELECT * FROM train_output_summary;
+</pre> <pre class="result">
+-[ RECORD 1 ]---------+-----------------------
+method                | tree_train
+is_classification     | t
+source_table          | null_handling_example
+model_table           | train_output
+id_col_name           | id
+dependent_varname     | response
+independent_varnames  | country, weather, city
+cat_features          | country,weather,city
+con_features          | 
+grouping_cols         | 
+num_all_groups        | 1
+num_failed_groups     | 0
+total_rows_processed  | 4
+total_rows_skipped    | 0
+dependent_var_levels  | "a","b","c","d"
+dependent_var_type    | text
+input_cp              | 0
+independent_var_types | text, text, text
+n_folds               | 0
+null_proxy            | __NULL__
+</pre></li>
+<li>Predict for data not seen during training. Feature levels that did not appear in the training data are treated as NULL: <pre class="example">
+\x off
+DROP TABLE IF EXISTS table_test;
+CREATE TABLE table_test (
+    id integer,
+    country text,
+    city text,
+    weather text,
+    expected_response text
+);
+INSERT INTO table_test VALUES
+(1,'IN','MUM','cloudy','a'),
+(2,'US','HOU','humid','b'),
+(3,'US','NY','sunny','c'),
+(4,'US','NY','rainy','d');
+&#160;
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',
+                           'table_test',
+                           'prediction_results',
+                           'response');
+SELECT s.id, expected_response, estimated_response
+FROM prediction_results p, table_test s
+WHERE s.id = p.id ORDER BY id;
+</pre> <pre class="result">
+ id | expected_response | estimated_response 
+----+-------------------+--------------------
+  1 | a                 | a
+  2 | b                 | b
+  3 | c                 | c
+  4 | d                 | d
+(4 rows)
+</pre> There is training data only for country 'US', so the response for country 'IN' is 'a', corresponding to a NULL (not 'US') country level. Likewise, any city in the 'US' other than 'NY' predicts response 'b', corresponding to a NULL (not 'NY') city level.</li>
+<li>Display the decision tree in basic text format: <pre class="example">
+SELECT madlib.tree_display('train_output', FALSE);
+</pre> <pre class="result">
+&#160; -------------------------------------
+&#160;- Each node represented by 'id' inside ().
+&#160;- Each internal nodes has the split condition at the end, while each
+&#160;    leaf node has a * at the end.
+&#160;- For each internal node (i), its child nodes are indented by 1 level
+&#160;    with ids (2i+1) for True node and (2i+2) for False node.
+&#160;- Number of rows and average response value inside []. For a leaf node, 
this is the prediction.
+&#160;-------------------------------------
+  (0)[1 1 1 1]  city in {NY}
+    (1)[0 0 1 1]  weather in {rainy}
+       (3)[0 0 0 1]  * --&gt; "d"
+       (4)[0 0 1 0]  * --&gt; "c"
+    (2)[1 1 0 0]  country in {US}
+       (5)[0 1 0 0]  * --&gt; "b"
+       (6)[1 0 0 0]  * --&gt; "a"
+&#160;-------------------------------------
+(1 row)
+</pre></li>
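+<li>A related sketch: because NULL is itself a level here, a row of actual NULLs routes through the tree above (False at node 0, then False at node 2) and lands in node (6), predicting 'a'. The all-NULL training row can be used to check this: <pre class="example">
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',
+                           'null_handling_example',
+                           'prediction_results',
+                           'response');
+SELECT p.id, s.response, p.estimated_response
+FROM prediction_results p JOIN null_handling_example s ON s.id = p.id
+WHERE s.country IS NULL;
+</pre></li>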
+</ol>
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd>[1] Breiman, Leo; Friedman, J. H.; Olshen, R. A.; 
Stone, C. J. (1984). Classification and regression trees. Monterey, CA: 
Wadsworth &amp; Brooks/Cole Advanced Books &amp; Software.</dd></dl>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" 
href="decision__tree_8sql__in.html">decision_tree.sql_in</a> documenting the 
training function</p>
+<p><a class="el" href="group__grp__random__forest.html">Random Forest</a></p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Wed May 2 2018 13:00:12 for MADlib by
+    <a href="http://www.doxygen.org/index.html";>
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.13 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/e283664c/docs/v1.14/group__grp__dense__linear__solver.html
----------------------------------------------------------------------
diff --git a/docs/v1.14/group__grp__dense__linear__solver.html 
b/docs/v1.14/group__grp__dense__linear__solver.html
new file mode 100644
index 0000000..5b739e1
--- /dev/null
+++ b/docs/v1.14/group__grp__dense__linear__solver.html
@@ -0,0 +1,262 @@
+<!-- standard doxygen page header, search box, and navigation tree (same boilerplate as the previous file) -->
+<div class="header">
+  <div class="headertitle">
+<div class="title">Dense Linear Systems<div class="ingroups"><a class="el" 
href="group__grp__utility__functions.html">Utility Functions</a> &raquo; <a 
class="el" href="group__grp__linear__solver.html">Linear 
Solvers</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b> <ul>
+<li class="level1">
+<a href="#dls_usage">Solution Function</a> </li>
+<li class="level1">
+<a href="#dls_opt_params">Optimizer Parameters</a> </li>
+<li class="level1">
+<a href="#dls_examples">Examples</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>The linear systems module implements solution methods for systems of 
consistent linear equations. Systems of linear equations take the form: </p><p 
class="formulaDsp">
+\[ Ax = b \]
+</p>
+<p>where \(x \in \mathbb{R}^{n}\), \(A \in \mathbb{R}^{m \times n} \) and \(b 
\in \mathbb{R}^{m}\). We assume that there are no rows of \(A\) where all 
elements are zero. The algorithms implemented in this module can handle large dense linear systems. Currently, they solve the linear system by a direct decomposition, and hence are known as <em>direct methods</em>.</p>
+<p><a class="anchor" id="dls_usage"></a></p><dl class="section 
user"><dt>Solution Function</dt><dd><pre class="syntax">
+linear_solver_dense( tbl_source,
+                     tbl_result,
+                     row_id,
+                     LHS,
+                     RHS,
+                     grouping_col,
+                     optimizer,
+                     optimizer_params
+                   )
+</pre> <b>Arguments</b> <dl class="arglist">
+<dt>tbl_source </dt>
+<dd><p class="startdd">TEXT. The name of the table containing the training 
data. The input data is expected to be of the following form: 
</p><pre>{TABLE|VIEW} <em>sourceName</em> (
+    ...
+    <em>row_id</em>          FLOAT8,
+    <em>left_hand_side</em>  FLOAT8[],
+    <em>right_hand_side</em> FLOAT8,
+    ...
+)</pre><p>Each row represents a single equation. The <em>right_hand_side</em> column contains the right hand side of the equation, while the <em>left_hand_side</em> column contains the coefficients of the variables on the left hand side of the same equation.</p>
+<p class="enddd"></p>
+</dd>
+<dt>tbl_result </dt>
+<dd><p class="startdd">TEXT. The name of the table where the output is saved. It contains the following columns: </p><table class="output">
+<tr>
+<th>solution </th><td>FLOAT8[]. The solution vector, with variables in the same order as the coefficients provided in the 'left_hand_side' column of <em>tbl_source</em>.  </td></tr>
+<tr>
+<th>residual_norm </th><td>FLOAT8. The scaled residual norm, defined as \( 
\frac{|Ax - b|}{|b|} \). This value is an indication of the accuracy of the 
solution.  </td></tr>
+<tr>
+<th>iters </th><td>INTEGER. Number of iterations required by the algorithm 
(only applicable for iterative algorithms). The output is NULL for 'direct' 
methods.   </td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>row_id </dt>
+<dd><p class="startdd">TEXT. The name of the column storing the 'row id' of 
the equations.</p>
+<p>For a system with \(n\) equations, the row_id values must form the contiguous integer range \( 0 \ldots n-1 \). </p>
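+<p>If the source rows are not already numbered this way, a zero-based id can be generated with a window function; this is a sketch, and the table name 'raw_equations' is illustrative: </p><pre class="example">
+CREATE TABLE numbered_equations AS
+SELECT (row_number() OVER ()) - 1 AS row_id,  -- zero-based, contiguous
+       lhs,
+       rhs
+FROM raw_equations;
+</pre>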
+<p class="enddd"></p>
+</dd>
+<dt>LHS </dt>
+<dd><p class="startdd">TEXT. The name of the column storing the 'left hand 
side' of the equations, stored as an array.</p>
+<p class="enddd"></p>
+</dd>
+<dt>RHS </dt>
+<dd><p class="startdd">TEXT. The name of the column storing the 'right hand 
side' of the equations.</p>
+<p class="enddd"></p>
+</dd>
+<dt>grouping_col (optional)  </dt>
+<dd>TEXT, default: NULL. Group by column names. <em>Not currently implemented. Any non-NULL value is ignored.</em> </dd>
+<dt>optimizer (optional)  </dt>
+<dd><p class="startdd">TEXT, default: 'direct'. The type of optimizer.</p>
+<p class="enddd"></p>
+</dd>
+<dt>optimizer_params (optional)  </dt>
+<dd>TEXT, default: NULL. Optimizer-specific parameters, e.g., 'algorithm=llt'; see Optimizer Parameters below. </dd>
+</dl>
+</dd></dl>
+<p><a class="anchor" id="dls_opt_params"></a></p><dl class="section 
user"><dt>Optimizer Parameters</dt><dd></dd></dl>
+<p>For each optimizer, there are specific parameters that can be tuned for 
better performance.</p>
+<dl class="arglist">
+<dt>algorithm (default: householderqr) </dt>
+<dd><p class="startdd">Several algorithms can be classified as 'direct' methods for solving linear systems; the MADlib dense linear system solver lets users choose among them.</p>
+<p>The following table provides a guideline on the choice of algorithm, based on conditions on the A matrix, speed, and numerical stability. </p><pre class="fragment"> Algorithm            | Conditions on A  | Speed | Accuracy
+ ----------------------------------------------------------
+ householderqr        | None             |  ++   |  +
+ partialpivlu         | Invertible       |  ++   |  +
+ fullpivlu            | None             |  -    |  +++
+ colpivhouseholderqr  | None             |  +    |  ++
+ fullpivhouseholderqr | None             |  -    |  +++
+ llt                  | Pos. Definite    |  +++  |  +
+ ldlt                 | Pos. or Neg. Def |  +++  |  ++
+</pre><p>For speed, '++' is faster than '+', which is faster than '-'. For accuracy, '+++' is better than '++'.</p>
+<p class="enddd">More details about the individual algorithms can be found in 
the <a 
href="http://eigen.tuxfamily.org/dox-devel/group__TutorialLinearAlgebra.html";>Eigen
 documentation</a>. Eigen is an open source library for linear algebra.  </p>
+</dd>
+</dl>
+<p><a class="anchor" id="dls_examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<ol type="1">
+<li>View online help for the linear systems solver function. <pre 
class="example">
+SELECT madlib.linear_solver_dense();
+</pre></li>
+<li>Create the sample data set. <pre class="example">
+CREATE TABLE linear_systems_test_data( id INTEGER NOT NULL,
+                                       lhs DOUBLE PRECISION[],
+                                       rhs DOUBLE PRECISION
+                                     );
+INSERT INTO linear_systems_test_data(id, lhs, rhs)
+       VALUES
+        (0, ARRAY[1,0,0], 20),
+        (1, ARRAY[0,1,0], 15),
+        (2, ARRAY[0,0,1], 20);
+</pre></li>
+<li>Solve the linear system with default parameters. <pre class="example">
+SELECT madlib.linear_solver_dense( 'linear_systems_test_data',
+                                   'output_table',
+                                   'id',
+                                   'lhs',
+                                   'rhs'
+                                 );
+</pre></li>
+<li>Obtain the output from the output table. <pre class="example">
+\x on
+SELECT * FROM output_table;
+</pre> Result: <pre class="result">
+--------------------+-------------------------------------
+solution            | {20,15,20}
+residual_norm       | 0
+iters               | NULL
+</pre></li>
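+<li>As a sanity-check sketch (plain SQL plus MADlib's array_dot helper; not part of the solver itself), substitute the solution back into each equation: <pre class="example">
+\x off
+-- residual of each equation: dot(A_row, x) - b; should be approximately 0
+SELECT t.id,
+       madlib.array_dot(t.lhs, o.solution) - t.rhs AS residual
+FROM linear_systems_test_data t, output_table o
+ORDER BY t.id;
+</pre></li>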
+<li>Choose an algorithm different from the default. <pre class="example">
+DROP TABLE IF EXISTS result_table;
+SELECT madlib.linear_solver_dense( 'linear_systems_test_data',
+                                   'result_table',
+                                   'id',
+                                   'lhs',
+                                   'rhs',
+                                   NULL,
+                                   'direct',
+                                   'algorithm=llt'
+                                 );
+</pre></li>
+</ol>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd>File <a class="el" href="dense__linear__systems_8sql__in.html" 
title="SQL functions for linear systems. ">dense_linear_systems.sql_in</a> 
documenting the SQL functions</dd></dl>
+</div><!-- contents -->
+</div><!-- doc-content -->
