http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__random__forest.html
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__random__forest.html 
b/docs/v1.15.1/group__grp__random__forest.html
new file mode 100644
index 0000000..5d2513c
--- /dev/null
+++ b/docs/v1.15.1/group__grp__random__forest.html
@@ -0,0 +1,1157 @@
+<!-- HTML header for doxygen 1.8.4-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";>
+<html xmlns="http://www.w3.org/1999/xhtml";>
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen 1.8.14"/>
+<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data 
mining,deep learning,ensemble methods,data science,market basket 
analysis,affinity analysis,pca,lda,regression,elastic net,huber 
white,proportional hazards,k-means,latent dirichlet allocation,bayes,support 
vector machines,svm"/>
+<title>MADlib: Random Forest</title>
+<link href="tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="jquery.js"></script>
+<script type="text/javascript" src="dynsections.js"></script>
+<link href="navtree.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="resize.js"></script>
+<script type="text/javascript" src="navtreedata.js"></script>
+<script type="text/javascript" src="navtree.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(initResizable);
+/* @license-end */</script>
+<link href="search/search.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="search/searchdata.js"></script>
+<script type="text/javascript" src="search/search.js"></script>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+  $(document).ready(function() { init_search(); });
+/* @license-end */
+</script>
+<script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"],
+    jax: ["input/TeX","output/HTML-CSS"],
+});
+</script><script type="text/javascript" async 
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js";></script>
+<!-- hack in the navigation tree -->
+<script type="text/javascript" src="eigen_navtree_hacks.js"></script>
+<link href="doxygen.css" rel="stylesheet" type="text/css" />
+<link href="madlib_extra.css" rel="stylesheet" type="text/css"/>
+<!-- google analytics -->
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+  
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+  ga('create', 'UA-45382226-1', 'madlib.apache.org');
+  ga('send', 'pageview');
+</script>
+</head>
+<body>
+<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
+<div id="titlearea">
+<table cellspacing="0" cellpadding="0">
+ <tbody>
+ <tr style="height: 56px;">
+  <td id="projectlogo"><a href="http://madlib.apache.org";><img alt="Logo" 
src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td>
+  <td style="padding-left: 0.5em;">
+   <div id="projectname">
+   <span id="projectnumber">1.15.1</span>
+   </div>
+   <div id="projectbrief">User Documentation for Apache MADlib</div>
+  </td>
+   <td>        <div id="MSearchBox" class="MSearchBoxInactive">
+        <span class="left">
+          <img id="MSearchSelect" src="search/mag_sel.png"
+               onmouseover="return searchBox.OnSearchSelectShow()"
+               onmouseout="return searchBox.OnSearchSelectHide()"
+               alt=""/>
+          <input type="text" id="MSearchField" value="Search" accesskey="S"
+               onfocus="searchBox.OnSearchFieldFocus(true)" 
+               onblur="searchBox.OnSearchFieldFocus(false)" 
+               onkeyup="searchBox.OnSearchFieldChange(event)"/>
+          </span><span class="right">
+            <a id="MSearchClose" 
href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" 
border="0" src="search/close.png" alt=""/></a>
+          </span>
+        </div>
+</td>
+ </tr>
+ </tbody>
+</table>
+</div>
+<!-- end header part -->
+<!-- Generated by Doxygen 1.8.14 -->
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+var searchBox = new SearchBox("searchBox", "search",false,'Search');
+/* @license-end */
+</script>
+</div><!-- top -->
+<div id="side-nav" class="ui-resizable side-nav-resizable">
+  <div id="nav-tree">
+    <div id="nav-tree-contents">
+      <div id="nav-sync" class="sync"></div>
+    </div>
+  </div>
+  <div id="splitbar" style="-moz-user-select:none;" 
+       class="ui-resizable-handle">
+  </div>
+</div>
+<script type="text/javascript">
+/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&amp;dn=gpl-2.0.txt 
GPL-v2 */
+$(document).ready(function(){initNavTree('group__grp__random__forest.html','');});
+/* @license-end */
+</script>
+<div id="doc-content">
+<!-- window showing the filter options -->
+<div id="MSearchSelectWindow"
+     onmouseover="return searchBox.OnSearchSelectShow()"
+     onmouseout="return searchBox.OnSearchSelectHide()"
+     onkeydown="return searchBox.OnSearchSelectKey(event)">
+</div>
+
+<!-- iframe showing the search results (closed by default) -->
+<div id="MSearchResultsWindow">
+<iframe src="javascript:void(0)" frameborder="0" 
+        name="MSearchResults" id="MSearchResults">
+</iframe>
+</div>
+
+<div class="header">
+  <div class="headertitle">
+<div class="title">Random Forest<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a> &raquo; <a class="el" 
href="group__grp__tree.html">Tree Methods</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<div class="toc"><b>Contents</b><ul>
+<li class="level1">
+<a href="#train">Training Function</a> </li>
+<li class="level1">
+<a href="#predict">Prediction Function</a> </li>
+<li class="level1">
+<a href="#get_tree">Tree Display</a> </li>
+<li class="level1">
+<a href="#get_importance">Importance Display</a> </li>
+<li class="level1">
+<a href="#examples">Examples</a> </li>
+<li class="level1">
+<a href="#literature">Literature</a> </li>
+<li class="level1">
+<a href="#related">Related Topics</a> </li>
+</ul>
+</div><p>Random forest builds an ensemble of classifiers, each of which is a 
tree model constructed using bootstrapped samples from the input data. The 
results of these models are then combined to yield a single prediction, which, 
at the expense of some loss in interpretation, can be highly accurate. Refer to 
Breiman et al. [1][2][3] for details on the implementation used here.</p>
+<p>Also refer to the <a href="group__grp__decision__tree.html">decision tree 
user documentation</a> since many parameters and examples are similar to random 
forest.</p>
+<p><a class="anchor" id="train"></a></p><dl class="section user"><dt>Training 
Function</dt><dd>The random forest training function has the following format: <pre 
class="syntax">
+forest_train(training_table_name,
+             output_table_name,
+             id_col_name,
+             dependent_variable,
+             list_of_features,
+             list_of_features_to_exclude,
+             grouping_cols,
+             num_trees,
+             num_random_features,
+             importance,
+             num_permutations,
+             max_tree_depth,
+             min_split,
+             min_bucket,
+             num_splits,
+             null_handling_params,
+             verbose,
+             sample_ratio
+             )
+</pre></dd></dl>
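+<p>The arguments from <em>grouping_cols</em> onward are marked optional below, 
+so trailing optional arguments can be omitted to accept their defaults. As a 
+minimal sketch, assuming a hypothetical table <em>my_source</em> with a unique 
+<em>id</em> column and a response column <em>y</em>, a call might look like: 
+</p><pre class="example">
+SELECT madlib.forest_train('my_source',   -- source table (hypothetical)
+                           'my_model',    -- output model table
+                           'id',          -- id column
+                           'y',           -- response
+                           '*',           -- features (all columns)
+                           'id'           -- exclude id from the features
+                           );
+</pre>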
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>training_table_name </dt>
+<dd><p class="startdd">text. Name of the table containing the training 
data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table_name </dt>
+<dd><p class="startdd">TEXT. Name of the generated table containing the model. 
If a table with the same name already exists, an error will be returned. A 
summary table named <em>&lt;output_table_name&gt;_summary</em> and a grouping 
table named <em>&lt;output_table_name&gt;_group</em> are also created. These 
are described later on this page. </p>
+<p class="enddd"></p>
+</dd>
+<dt>id_col_name </dt>
+<dd><p class="startdd">TEXT. Name of the column containing id information in 
the training data. This is a mandatory argument and is used for prediction and 
other purposes. The values are expected to be unique for each row.</p>
+<p class="enddd"></p>
+</dd>
+<dt>dependent_variable </dt>
+<dd><p class="startdd">TEXT. Name of the column that contains the output 
(response) for training. Boolean, integer and text types are considered to be 
classification outputs, while double precision values are considered to be 
regression outputs. The response variable for a classification tree can be 
multinomial, but the time and space complexity of the training function 
increases linearly as the number of response classes increases.</p>
+<p class="enddd"></p>
+</dd>
+<dt>list_of_features </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of column names or 
expressions to use as predictors. Can also be a '*' implying all columns are to 
be used as predictors (except for the ones included in the next argument that 
lists exclusions). The types of the features can be mixed: boolean, integer, 
and text columns are considered categorical and double precision columns are 
considered continuous. Categorical variables are not encoded and are used as is 
in the training.</p>
+<p>Array columns can also be included in the list, where the array is expanded 
to treat each element of the array as a feature.</p>
+<p>Note that not every combination of the levels of a categorical variable is 
checked when evaluating a split. The levels of the non-integer categorical 
variable are ordered by the entropy of the variable in predicting the response. 
The split at each node is evaluated between these ordered levels. Integer 
categorical variables, however, are simply ordered by their value. </p>
+<p class="enddd"></p>
+</dd>
+<dt>list_of_features_to_exclude </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of column names to exclude 
from the predictors list. If the <em>dependent_variable</em> is an expression 
(including cast of a column name), then this list should include the columns 
present in the <em>dependent_variable</em> expression, otherwise those columns 
will be included in the features (resulting in meaningless trees). The names in 
this parameter should be identical to the names used in the table and quoted 
appropriately. </p>
+<p class="enddd"></p>
+</dd>
+<dt>grouping_cols (optional) </dt>
+<dd><p class="startdd">TEXT, default: NULL. Comma-separated list of column 
names to group the data by. This will produce multiple random forests, one for 
each group.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_trees (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 100. Maximum number of trees to grow 
in the random forest model. Actual number of trees grown may be different, 
depending on the data.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_random_features (optional) </dt>
+<dd><p class="startdd">INTEGER, default: sqrt(n) for classification, n/3 for 
regression. This is the number of features to randomly select at each split.</p>
+<p class="enddd"></p>
+</dd>
+<dt>importance (optional) </dt>
+<dd><p class="startdd">BOOLEAN, default: true. Whether or not to calculate 
variable importance. If set to true, out-of-bag variable importance and 
impurity variable importance for categorical and continuous features will be 
output to the group table <em>&lt;model_table&gt;_group</em>. Note that total 
runtime will increase when variable importance is turned on. Refer to [1][2][3] 
for more information on variable importance. </p>
+<p class="enddd"></p>
+</dd>
+<dt>num_permutations (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 1. Number of times to permute each 
feature value while calculating the out-of-bag variable importance.</p>
+<dl class="section note"><dt>Note</dt><dd>Variable importance for a feature is 
determined by permuting the variable and computing the drop in predictive 
accuracy using out-of-bag samples [1]. Setting this greater than 1 performs an 
average over multiple importance calculations, but increases total run time. In 
most cases, the default value of 1 is sufficient to compute the importance. Due 
to the nature of permutation, the importance value can end up being negative if 
the number of levels for a categorical variable is small and unbalanced. In such 
a scenario, the importance values are shifted to ensure that the lowest 
importance value is 0. To see importance values normalized to sum to 100 across 
all variables, use the importance display helper function described later on 
this page.</dd></dl>
+<p class="enddd"></p>
+</dd>
+<dt>max_tree_depth (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 7. Maximum depth of any node of a 
tree, with the root node counted as depth 0. A deeper tree can lead to better 
prediction but will also result in longer processing time and higher memory 
usage.</p>
+<p class="enddd"></p>
+</dd>
+<dt>min_split (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 20. Minimum number of observations 
that must exist in a node for a split to be attempted.</p>
+<p class="enddd"></p>
+</dd>
+<dt>min_bucket (optional) </dt>
+<dd><p class="startdd">INTEGER, default: min_split/3. Minimum number of 
observations in any terminal node. If only one of min_bucket or min_split is 
specified, min_split is set to min_bucket*3 or min_bucket to min_split/3, as 
appropriate.</p>
+<p class="enddd"></p>
+</dd>
+<dt>num_splits (optional) </dt>
+<dd><p class="startdd">INTEGER, default: 20. Continuous-valued features are 
binned into discrete quantiles to compute split boundaries. This global 
parameter is used to compute the resolution of splits for continuous features. 
A higher number of bins can lead to better prediction, but will also result in 
longer processing time and higher memory usage.</p>
+<p class="enddd"></p>
+</dd>
+<dt>null_handling_params (optional) </dt>
+<dd><p class="startdd">TEXT. Comma-separated string of key-value pairs 
controlling the behavior of various features handling missing values. One of 
the following can be used if desired (not both): </p><table class="output">
+<tr>
+<th>max_surrogates </th><td>Default: 0. Number of surrogates to store for each 
node. One approach to handling NULLs is to use surrogate splits for each node. 
A surrogate variable enables you to make better use of the data by using 
another predictor variable that is associated (correlated) with the primary 
split variable. The surrogate variable comes into use when the primary 
predictor value is NULL. Surrogate rules implemented here are based on 
reference [1].  </td></tr>
+<tr>
+<th>null_as_category </th><td><p class="starttd">Default: FALSE. Whether to 
treat NULL as a valid level for categorical features. FALSE means that NULL is 
not a valid level, which is probably the most common situation.</p>
+<p>If set to TRUE, NULL values are considered a categorical value and placed 
at the end of the ordering of categorical levels. Placing at the end ensures 
that NULL is never used as a value to split a node on. One reason to make NULL 
a category is that it allows you to predict on categorical levels that were not 
in the training data by lumping them into an "other bucket."</p>
+<p class="endtd">This parameter is ignored for continuous-valued features.   
</p>
+</td></tr>
+</table>
+<p class="enddd"></p>
+</dd>
+<dt>verbose (optional) </dt>
+<dd><p class="startdd">BOOLEAN, default: FALSE. Provides verbose output of the 
results of training.</p>
+<p class="enddd"></p>
+</dd>
+<dt>sample_ratio (optional) </dt>
+<dd>DOUBLE PRECISION, in the range of (0, 1], default: 1. If 'sample_ratio' is 
less than 1, a bootstrap sample size smaller than the data table is used for 
training each tree in the forest. A ratio that is close to 0 may result in 
trees with only the root node. This sample parameter allows users to quickly 
experiment with the random forest function since it reduces run time by using 
only some of the data. </dd>
+</dl>
+<p><b>Output</b> </p><dl class="arglist">
+</dl>
+<p>The model table produced by the training function contains the following 
columns:</p>
+<table class="output">
+<tr>
+<th>gid </th><td>INTEGER. Group id that uniquely identifies a set of grouping 
column values.  </td></tr>
+<tr>
+<th>sample_id </th><td>INTEGER. The id of the bootstrap sample that this tree 
is a part of.  </td></tr>
+<tr>
+<th>tree </th><td>BYTEA8. Trained tree model stored in binary format (not 
human readable).  </td></tr>
+</table>
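+<p>Since the model table stores one row per tree, a quick sanity check on a 
+hypothetical model table <em>my_model</em> is to count the trees grown per 
+group: </p><pre class="example">
+SELECT gid, COUNT(*) AS num_trees FROM my_model GROUP BY gid;
+</pre>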
+<p>A summary table named <em>&lt;model_table&gt;_summary</em> is also created 
at the same time, which contains the following columns: </p><table 
class="output">
+<tr>
+<th>method </th><td><p class="starttd">'forest_train' </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>is_classification </th><td><p class="starttd">BOOLEAN. True if it is a 
classification model, false if it is a regression model. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>source_table </th><td><p class="starttd">TEXT. Data source table name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>model_table </th><td><p class="starttd">TEXT. Model table name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>id_col_name </th><td><p class="starttd">TEXT. The ID column name. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_varname </th><td><p class="starttd">TEXT. Dependent variable. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>independent_varnames </th><td><p class="starttd">TEXT. Independent 
variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_features </th><td><p class="starttd">TEXT. List of categorical 
features as a comma-separated string. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>con_features </th><td><p class="starttd">TEXT. List of continuous features 
as a comma-separated string. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>grouping_cols </th><td><p class="starttd">TEXT. Names of grouping 
columns. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_trees </th><td><p class="starttd">INTEGER. Number of trees grown by 
the model. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_random_features </th><td><p class="starttd">INTEGER. Number of 
features randomly selected for each split. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>max_tree_depth </th><td><p class="starttd">INTEGER. Maximum depth of any 
tree in the random forest model. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>min_split </th><td><p class="starttd">INTEGER. Minimum number of 
observations in a node for it to be split. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>min_bucket </th><td><p class="starttd">INTEGER. Minimum number of 
observations in any terminal node. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_splits </th><td><p class="starttd">INTEGER. Number of buckets for 
continuous variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>verbose </th><td><p class="starttd">BOOLEAN. Whether or not to display 
debug info. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>importance </th><td><p class="starttd">BOOLEAN. Whether or not to 
calculate variable importance. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_permutations </th><td><p class="starttd">INTEGER. Number of times 
feature values are permuted while calculating out-of-bag variable importance. 
</p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_all_groups </th><td><p class="starttd">INTEGER. Number of groups 
during forest training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>num_failed_groups </th><td><p class="starttd">INTEGER. Number of failed 
groups during forest training. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_processed </th><td><p class="starttd">BIGINT. Total number of 
rows processed in all groups. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>total_rows_skipped </th><td><p class="starttd">BIGINT. Total number of 
rows skipped in all groups due to missing values or failures. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_var_levels </th><td><p class="starttd">TEXT. For classification, 
the distinct levels of the dependent variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>dependent_var_type </th><td><p class="starttd">TEXT. The type of dependent 
variable. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>independent_var_types </th><td><p class="starttd">TEXT. A comma-separated 
string of the types of the independent variables. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>null_proxy </th><td>TEXT. Describes how NULLs are handled. If NULL is not 
treated as a separate categorical variable, this will be NULL. If NULL is 
treated as a separate categorical value, this will be set to "__NULL__".  
</td></tr>
+</table>
+<p>A table named <em>&lt;model_table&gt;_group</em> is also created at the 
same time, even if no grouping is specified. It contains the following columns: 
</p><table class="output">
+<tr>
+<th>gid </th><td><p class="starttd">integer. Group id that uniquely identifies 
a set of grouping column values. If grouping is not used, this will always be 
1. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>&lt;...&gt; </th><td><p class="starttd">Same type as in the training data 
table 'grouping_cols'. This could be multiple columns depending on the 
'grouping_cols' input. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>success </th><td><p class="starttd">BOOLEAN. Indicator of the success of 
the group. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_levels_in_text </th><td><p class="starttd">TEXT[]. Ordered levels 
(values) of categorical variables corresponding to the categorical features in 
the 'list_of_features' argument above. Used to help interpret the trained tree. 
For example, if the categorical features specified are <em>weather_outlook</em> 
and <em>windy</em> in that order, then 'cat_levels_in_text' might be 
<em>[overcast, rain, sunny, False, True]</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>cat_n_levels </th><td><p class="starttd">INTEGER[]. Number of levels for 
each categorical variable. Used to help interpret the trained tree. In the 
example from above, 'cat_n_levels' would be <em>[3, 2]</em> since there are 3 
levels for <em>weather_outlook</em> and 2 levels for <em>windy</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>oob_error </th><td><p class="starttd">DOUBLE PRECISION. Out-of-bag error 
for the random forest model. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>oob_var_importance </th><td><p class="starttd">DOUBLE PRECISION[]. 
Out-of-bag variable importance for both categorical and continuous features. 
The order corresponds to the order of the variables in 'independent_varnames' 
in <em> &lt;model_table&gt;_summary</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+<tr>
+<th>impurity_var_importance </th><td><p class="starttd">DOUBLE PRECISION[]. 
Impurity variable importance for both categorical and continuous features. The 
order corresponds to the order of the variables in 'independent_varnames' in 
<em> &lt;model_table&gt;_summary</em>. </p>
+<p class="endtd"></p>
+</td></tr>
+</table>
+<dl class="section note"><dt>Note</dt><dd>The main parameters that affect 
memory usage are: depth of tree ('max_tree_depth'), number of features, 
number of values per categorical feature, and number of bins for continuous 
features ('num_splits'). If you are hitting memory limits, consider 
reducing one or more of these parameters.</dd></dl>
+<p><a class="anchor" id="predict"></a></p><dl class="section 
user"><dt>Prediction Function</dt><dd>The prediction function estimates the 
conditional mean given a new predictor. It has the following syntax: <pre 
class="syntax">
+forest_predict(random_forest_model,
+               new_data_table,
+               output_table,
+               type)
+</pre></dd></dl>
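+<p>As a minimal sketch, assuming the hypothetical model table <em>my_model</em> 
+from the training sketch above and a scoring table <em>new_data</em> containing 
+the same features and id column (the 'type' argument is optional and defaults 
+to 'response'): </p><pre class="example">
+SELECT madlib.forest_predict('my_model',        -- random forest model
+                             'new_data',        -- new data table (hypothetical)
+                             'my_predictions'   -- output table
+                             );
+</pre>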
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>random_forest_model </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the random forest 
model from training.</p>
+<p class="enddd"></p>
+</dd>
+<dt>new_data_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing prediction data. 
This table is expected to contain the same features that were used during 
training. The table should also contain <em>id_col_name</em> used for 
identifying each row.</p>
+<p class="enddd"></p>
+</dd>
+<dt>output_table </dt>
+<dd><p class="startdd">TEXT. Name of the table to output prediction results. 
If this table already exists, an error is returned. The table contains the 
<em>id_col_name</em> column giving the 'id' for each prediction and the 
prediction columns for the dependent variable.</p>
+<p>If <em>type</em> = 'response', then the table has a single additional 
column with the prediction value of the response. The type of this column 
depends on the type of the response variable used during training.</p>
+<p>If <em>type</em> = 'prob', then the table has multiple additional columns, 
one for each possible value of the response variable. The columns are labeled 
as 'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents 
each value of the response variable.</p>
+<p class="enddd"></p>
+</dd>
+<dt>type </dt>
+<dd>TEXT, optional, default: 'response'. For regression trees, the output is 
always the predicted value of the dependent variable. For classification trees, 
the <em>type</em> variable can be 'response', giving the classification 
prediction as output, or 'prob', giving the class probabilities as output. For 
each value of the dependent variable, a column with the probabilities is added 
to the output table.  </dd>
+</dl>
+<p><a class="anchor" id="get_tree"></a></p><dl class="section user"><dt>Tree 
Display</dt><dd>The display function outputs a graph representation of a single 
tree of the random forest. The output can either be in the popular 'dot' format 
that can be visualized using various programs including those in the GraphViz 
package, or in a simple text format. The details of the text format are output 
with the tree. <pre class="syntax">
+get_tree(forest_model_table,
+         gid,
+         sample_id,
+         dot_format,
+         verbose)
+</pre></dd></dl>
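+<p>For instance, a sketch that displays, in text format, the tree built from 
+bootstrap sample 3 of group 1 of the hypothetical model table <em>my_model</em>: 
+</p><pre class="example">
+SELECT madlib.get_tree('my_model',   -- forest model table (hypothetical)
+                       1,            -- gid
+                       3,            -- sample_id
+                       FALSE         -- text format rather than dot
+                       );
+</pre>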
+<p>An additional display function is provided to output the surrogate splits 
chosen for each internal node: </p><pre class="syntax">
+get_tree_surr(forest_model_table,
+              gid,
+              sample_id)
+</pre><p>This output contains the list of surrogate splits for each internal 
node. The nodes are sorted in ascending order by id. This is equivalent to 
viewing the tree in a breadth-first manner. For each surrogate, we output the 
surrogate split (variable and threshold) and also give the number of rows that 
were common between the primary split and the surrogate split. Finally, the 
number of rows present in the majority branch of the primary split is also 
shown. Only surrogates that perform better than this majority branch are 
included in the surrogate list. When the primary variable has a NULL value, the 
surrogate variables are used in order to compute the split for that node. If 
all surrogate variables are NULL, then the majority branch is used to compute 
the split for a tuple.</p>
+<p><b>Arguments</b> </p><dl class="arglist">
+<dt>forest_model_table </dt>
+<dd><p class="startdd">TEXT. Name of the table containing the random forest 
model.</p>
+<p class="enddd"></p>
+</dd>
+<dt>gid </dt>
+<dd><p class="startdd">INTEGER. Id of the group that this tree is part of.</p>
+<p class="enddd"></p>
+</dd>
+<dt>sample_id </dt>
+<dd><p class="startdd">INTEGER. Id of the bootstrap sample that this tree is 
part of.</p>
+<p class="enddd"></p>
+</dd>
+<dt>dot_format (optional) </dt>
+<dd><p class="startdd">BOOLEAN, default = TRUE. Output can either be in a dot 
format or a text format. If TRUE, the result is in the dot format, else output 
is in text format.</p>
+<p class="enddd"></p>
+</dd>
+<dt>verbose (optional) </dt>
+<dd>BOOLEAN, default = FALSE. If true, the dot format output will contain 
additional information (impurity, sample size, number of weighted rows for each 
response variable, classification or prediction if the tree was pruned at this 
level). </dd>
+</dl>
+<p>The output is always returned as a 'TEXT'. For the dot format, the output 
can be redirected to a file on the client side and then rendered using 
visualization programs.</p>
+<p>To export the dot format result to an external file, use the method below. 
Please note that you should use unaligned table output mode for psql with the 
'-A' flag, or else you may get an error when you try to convert the dot file to 
another format for viewing (e.g., PDF). Inside the psql client, both '\t' 
and '\o' should be used:</p>
+<pre class="example">
+&gt; # under bash
+&gt; psql -A my_database
+# -- in psql now
+# \t
+# \o test.dot -- export to a file
+# select madlib.get_tree('tree_out', 1, 1);
+# \o
+# \t
+</pre><p>After the dot file has been generated, use third-party plotting 
software to plot the trees in a nice format: </p><pre class="example">
+&gt; # under bash, convert the dot file into a PDF file
+&gt; dot -Tpdf test.dot &gt; test.pdf
+&gt; xpdf test.pdf&amp;
+</pre><p>Please see the <a href="group__grp__decision__tree.html">decision 
tree user documentation</a> for more details on working with tree output 
formats.</p>
+<p><a class="anchor" id="get_importance"></a></p><dl class="section 
user"><dt>Importance Display</dt><dd>This is a helper function that creates a 
table to more easily view out-of-bag and impurity variable importance values 
for a given model table. This function rescales the importance values to 
represent them as percentages, i.e., importance values are scaled to sum to 
100.</dd></dl>
+<pre class="syntax">
+<a class="el" 
href="random__forest_8sql__in.html#aea5ca2b827a4ee552a8c0f4d4d947725">get_var_importance(model_table,
 output_table)</a>
+</pre><p><b>Arguments</b> </p><dl class="arglist">
+<dt>model_table </dt>
+<dd>TEXT. Name of the table containing the random forest model. </dd>
+<dt>output_table </dt>
+<dd>TEXT. Name of the table to create for importance values. </dd>
+</dl>
+<p>The summary and group tables generated by the forest_train function are 
required for this function to work.</p>
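+<p>A minimal sketch, assuming the hypothetical model table <em>my_model</em> 
+from above (with its <em>my_model_summary</em> and <em>my_model_group</em> 
+tables present): </p><pre class="example">
+DROP TABLE IF EXISTS my_model_importance;
+SELECT madlib.get_var_importance('my_model', 'my_model_importance');
+SELECT * FROM my_model_importance ORDER BY oob_var_importance DESC;
+</pre>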
+<p><a class="anchor" id="examples"></a></p><dl class="section 
user"><dt>Examples</dt><dd></dd></dl>
+<dl class="section note"><dt>Note</dt><dd><ul>
+<li>Not all random forest parameters are demonstrated in the examples below. 
Some are shown in the <a href="group__grp__decision__tree.html">decision tree 
user documentation</a> since usage is similar.</li>
+<li>Your results may look different from those below due to the random nature 
of random forests.</li>
+</ul>
+</dd></dl>
+<p><b>Random Forest Classification Example</b></p>
+<ol type="1">
+<li>Load input data set related to whether to play golf or not: <pre 
class="example">
+DROP TABLE IF EXISTS rf_golf CASCADE;
+CREATE TABLE rf_golf (
+    id integer NOT NULL,
+    "OUTLOOK" text,
+    temperature double precision,
+    humidity double precision,
+    "Temp_Humidity" double precision[],
+    clouds_airquality text[],
+    windy boolean,
+    class text
+);
+INSERT INTO rf_golf VALUES
+(1,'sunny', 85, 85, ARRAY[85, 85],ARRAY['none', 'unhealthy'], 'false','Don''t 
Play'),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['none', 'moderate'], 'true', 'Don''t 
Play'),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['low', 'moderate'], 'false', 
'Play'),
+(4, 'rain', 70, 96, ARRAY[70, 96], ARRAY['low', 'moderate'], 'false', 'Play'),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['medium', 'good'], 'false', 'Play'),
+(6, 'rain', 65, 70, ARRAY[65, 70], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play'),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['medium', 'moderate'], 'true', 
'Play'),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['high', 'unhealthy'], 'false', 
'Don''t Play'),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['high', 'good'], 'false', 'Play'),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['medium', 'good'], 'false', 'Play'),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['none', 'good'], 'true', 'Play'),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['medium', 'moderate'], 'true', 
'Play'),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['medium', 'moderate'], 'false', 
'Play'),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['low', 'unhealthy'], 'true', 'Don''t 
Play');
+</pre></li>
+<li>Train the random forest and view the summary table: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
+SELECT madlib.forest_train('rf_golf',         -- source table
+                           'train_output',    -- output model table
+                           'id',              -- id column
+                           'class',           -- response
+                           '"OUTLOOK", temperature, humidity, windy',   -- 
features
+                           NULL,              -- exclude columns
+                           NULL,              -- grouping columns
+                           20::integer,       -- number of trees
+                           2::integer,        -- number of random features
+                           TRUE::boolean,     -- variable importance
+                           1::integer,        -- num_permutations
+                           8::integer,        -- max depth
+                           3::integer,        -- min split
+                           1::integer,        -- min bucket
+                           10::integer        -- number of splits per 
continuous variable
+                           );
+\x on
+SELECT * FROM train_output_summary;
+</pre> <pre class="result">
+-[ RECORD 1 ]---------+--------------------------------------------------
+method                | forest_train
+is_classification     | t
+source_table          | rf_golf
+model_table           | train_output
+id_col_name           | id
+dependent_varname     | class
+independent_varnames  | "OUTLOOK",windy,temperature,humidity
+cat_features          | "OUTLOOK",windy
+con_features          | temperature,humidity
+grouping_cols         |
+num_trees             | 20
+num_random_features   | 2
+max_tree_depth        | 8
+min_split             | 3
+min_bucket            | 1
+num_splits            | 10
+verbose               | f
+importance            | t
+num_permutations      | 1
+num_all_groups        | 1
+num_failed_groups     | 0
+total_rows_processed  | 14
+total_rows_skipped    | 0
+dependent_var_levels  | "Don't Play","Play"
+dependent_var_type    | text
+independent_var_types | text, boolean, double precision, double precision
+null_proxy            | None
+</pre> View the group table output: <pre class="example">
+SELECT * FROM train_output_group;
+</pre> <pre class="result">
+-[ RECORD 1 
]-----------+----------------------------------------------------------------------
+gid                     | 1
+success                 | t
+cat_n_levels            | {3,2}
+cat_levels_in_text      | {overcast,sunny,rain,False,True}
+oob_error               | 0.64285714285714285714
+oob_var_importance      | 
{0.0525595238095238,0,0.0138095238095238,0.0276190476190476}
+impurity_var_importance | 
{0.254133481284938,0.0837130966399198,0.258520599370744,0.173196167388586}
+</pre> The 'cat_levels_in_text' array shows the levels of the categorical 
variables "OUTLOOK" and windy, which have 3 and 2 levels respectively. 
Out-of-bag and impurity variable importance arrays are ordered according to the 
order of the variables in 'independent_varnames' in 
&lt;model_table&gt;_summary. A higher value means higher importance for the 
variable. We can use the helper function to get a normalized view of variable 
importance: <pre class="example">
+\x off
+DROP TABLE IF EXISTS imp_output;
+SELECT madlib.get_var_importance('train_output','imp_output');
+SELECT * FROM imp_output ORDER BY oob_var_importance DESC;
+</pre> <pre class="result">
+   feature   | oob_var_importance | impurity_var_importance
+-------------+--------------------+-------------------------
+ "OUTLOOK"   |   55.9214692843572 |        33.0230751036133
+ humidity    |   29.3856871437619 |        22.5057714332356
+ temperature |    14.692843571881 |        33.5931539822541
+ windy       |                  0 |         10.877999480897
+(4 rows)
+</pre></li>
+<li>Predict output categories. For the purpose of this example, we use the 
same data that was used for training: <pre class="example">
+\x off
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.forest_predict('train_output',        -- tree model
+                             'rf_golf',             -- new data table
+                             'prediction_results',  -- output table
+                             'response');           -- show response
+SELECT g.id, class, estimated_class FROM prediction_results p,
+rf_golf g WHERE p.id = g.id ORDER BY g.id;
+</pre> <pre class="result">
+ id |   class    | estimated_class
+----+------------+-----------------
+  1 | Don't Play | Don't Play
+  2 | Don't Play | Don't Play
+  3 | Play       | Play
+  4 | Play       | Play
+  5 | Play       | Play
+  6 | Don't Play | Don't Play
+  7 | Play       | Play
+  8 | Don't Play | Don't Play
+  9 | Play       | Play
+ 10 | Play       | Play
+ 11 | Play       | Play
+ 12 | Play       | Play
+ 13 | Play       | Play
+ 14 | Don't Play | Don't Play
+(14 rows)
+</pre> To display the probabilities associated with each value of the 
dependent variable, set the 'type' parameter to 'prob': <pre class="example">
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.forest_predict('train_output',        -- tree model
+                             'rf_golf',             -- new data table
+                             'prediction_results',  -- output table
+                             'prob');               -- show probability
+SELECT g.id, class, "estimated_prob_Don't Play",  "estimated_prob_Play"
+FROM prediction_results p, rf_golf g WHERE p.id = g.id ORDER BY g.id;
+</pre> <pre class="result">
+ id |   class    | estimated_prob_Don't Play | estimated_prob_Play
+----+------------+---------------------------+---------------------
+  1 | Don't Play |                       0.9 |                 0.1
+  2 | Don't Play |                      0.85 |                0.15
+  3 | Play       |                         0 |                   1
+  4 | Play       |                      0.35 |                0.65
+  5 | Play       |                      0.05 |                0.95
+  6 | Don't Play |                      0.85 |                0.15
+  7 | Play       |                      0.25 |                0.75
+  8 | Don't Play |                      0.85 |                0.15
+  9 | Play       |                      0.15 |                0.85
+ 10 | Play       |                      0.15 |                0.85
+ 11 | Play       |                      0.35 |                0.65
+ 12 | Play       |                       0.1 |                 0.9
+ 13 | Play       |                         0 |                   1
+ 14 | Don't Play |                       0.8 |                 0.2
+(14 rows)
+</pre></li>
+<li>View, in text format, a single tree of the forest identified by 'gid' 
and 'sample_id', out of the several that were created: <pre class="example">
+SELECT madlib.get_tree('train_output',1,7, FALSE);
+</pre> <pre class="result">
+&#160;-------------------------------------
+&#160;- Each node represented by 'id' inside ().
+&#160;- Leaf nodes have a * while internal nodes have the split condition at 
the end.
+&#160;- For each internal node (i), it's children will be at (2i+1) and (2i+2).
+&#160;- For each split the first indented child (2i+1) is the 'True' node and
+second indented child (2i+2) is the 'False' node.
+&#160;- Number of (weighted) rows for each response variable inside [].
+&#160;- Order of values = ['"Don\'t Play"', '"Play"']
+&#160;-------------------------------------
+ (0)[ 5 10]  windy in {False}
+    (1)[2 8]  "OUTLOOK" in {overcast,sunny}
+       (3)[2 1]  humidity &lt;= 75
+          (7)[0 1]  * --&gt; "Play"
+          (8)[2 0]  * --&gt; "Don't Play"
+       (4)[0 7]  * --&gt; "Play"
+    (2)[3 2]  temperature &lt;= 75
+       (5)[1 2]  humidity &lt;= 70
+          (11)[1 1]  * --&gt; "Don't Play"
+          (12)[0 1]  * --&gt; "Play"
+       (6)[2 0]  * --&gt; "Don't Play"
+&#160;-------------------------------------
+</pre> Please see the <a href="group__grp__decision__tree.html">decision tree 
user documentation</a> for an explanation on how to interpret the tree display 
above.</li>
+<li>View tree in dot format: <pre class="example">
+SELECT madlib.get_tree('train_output',1,7);
+</pre> <pre class="result">
+&#160;----------------------------------------------------
+ digraph "Classification tree for rf_golf" {
+ "0" [label="windy &lt;= False", shape=ellipse];
+ "0" -&gt; "1"[label="yes"];
+ "0" -&gt; "2"[label="no"];
+ "1" [label="\"OUTLOOK" &lt;= sunny", shape=ellipse];
+ "1" -&gt; "3"[label="yes"];
+ "1" -&gt; "4"[label="no"];
+ "4" [label=""Play"",shape=box];
+ "2" [label="temperature &lt;= 75", shape=ellipse];
+ "2" -&gt; "5"[label="yes"];
+ "2" -&gt; "6"[label="no"];
+ "6" [label=""Don't Play"",shape=box];
+ "3" [label="humidity &lt;= 75", shape=ellipse];
+ "3" -&gt; "7"[label="yes"];
+ "7" [label=""Play"",shape=box];
+ "3" -&gt; "8"[label="no"];
+ "8" [label=""Don't Play"",shape=box];
+ "5" [label="humidity &lt;= 70", shape=ellipse];
+ "5" -&gt; "11"[label="yes"];
+ "11" [label=""Don't Play"",shape=box];
+ "5" -&gt; "12"[label="no"];
+ "12" [label=""Play"",shape=box];
+ } //---end of digraph---------
+</pre></li>
+<li>View tree in dot format with additional information: <pre class="example">
+SELECT madlib.get_tree('train_output',1,7, TRUE, TRUE);
+</pre> <pre class="result">
+&#160;---------------------------------------------------------------------------------------------------------------------------
+ digraph "Classification tree for rf_golf" {
+ "0" [label="windy &lt;= False\\n impurity = 0.444444\\n samples = 15\\n value 
= [ 5 10]\\n class = \"Play"", shape=ellipse];
+ "0" -&gt; "1"[label="yes"];
+ "0" -&gt; "2"[label="no"];
+ "1" [label=""OUTLOOK" &lt;= sunny\n impurity = 0.32\n samples = 10\n value = 
[2 8]\n class = "Play"", shape=ellipse];
+ "1" -&gt; "3"[label="yes"];
+ "1" -&gt; "4"[label="no"];
+ "4" [label=""Play"\n impurity = 0\n samples = 7\n value = [0 7]",shape=box];
+ "2" [label="temperature &lt;= 75\n impurity = 0.48\n samples = 5\n value = [3 
2]\n class = "Don't Play"", shape=ellipse];
+ "2" -&gt; "5"[label="yes"];
+ "2" -&gt; "6"[label="no"];
+ "6" [label=""Don't Play"\n impurity = 0\n samples = 2\n value = [2 
0]",shape=box];
+ "3" [label="humidity &lt;= 75\n impurity = 0.444444\n samples = 3\n value = 
[2 1]\n class = "Don't Play"", shape=ellipse];
+ "3" -&gt; "7"[label="yes"];
+ "7" [label=""Play"\n impurity = 0\n samples = 1\n value = [0 1]",shape=box];
+ "3" -&gt; "8"[label="no"];
+ "8" [label=""Don't Play"\n impurity = 0\n samples = 2\n value = [2 
0]",shape=box];
+ "5" [label="humidity &lt;= 70\n impurity = 0.444444\n samples = 3\n value = 
[1 2]\n class = "Play"", shape=ellipse];
+ "5" -&gt; "11"[label="yes"];
+ "11" [label=""Don't Play"\n impurity = 0.5\n samples = 2\n value = [1 
1]",shape=box];
+ "5" -&gt; "12"[label="no"];
+ "12" [label=""Play"\n impurity = 0\n samples = 1\n value = [0 1]",shape=box];
+ } //---end of digraph---------
+</pre></li>
+<li>Arrays of features. Categorical and continuous features can be array 
columns, in which case the array is expanded to treat each element of the array 
as a feature: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
+SELECT madlib.forest_train('rf_golf',         -- source table
+                           'train_output',    -- output model table
+                           'id',              -- id column
+                           'class',           -- response
+                           '"Temp_Humidity", clouds_airquality',   -- features
+                           NULL,              -- exclude columns
+                           NULL,              -- grouping columns
+                           20::integer,       -- number of trees
+                           2::integer,        -- number of random features
+                           TRUE::boolean,     -- variable importance
+                           1::integer,        -- num_permutations
+                           8::integer,        -- max depth
+                           3::integer,        -- min split
+                           1::integer,        -- min bucket
+                           10::integer        -- number of splits per 
continuous variable
+                           );
+\x on
+SELECT * FROM train_output_summary;
+</pre> <pre class="result">
+-[ RECORD 1 
]---------+----------------------------------------------------------------------------------------
+method                | forest_train
+is_classification     | t
+source_table          | rf_golf
+model_table           | train_output
+id_col_name           | id
+dependent_varname     | class
+independent_varnames  | 
(clouds_airquality)[1],(clouds_airquality)[2],("Temp_Humidity")[1],("Temp_Humidity")[2]
+cat_features          | (clouds_airquality)[1],(clouds_airquality)[2]
+con_features          | ("Temp_Humidity")[1],("Temp_Humidity")[2]
+grouping_cols         |
+num_trees             | 20
+num_random_features   | 2
+max_tree_depth        | 8
+min_split             | 3
+min_bucket            | 1
+num_splits            | 10
+verbose               | f
+importance            | t
+num_permutations      | 1
+num_all_groups        | 1
+num_failed_groups     | 0
+total_rows_processed  | 14
+total_rows_skipped    | 0
+dependent_var_levels  | "Don't Play","Play"
+dependent_var_type    | text
+independent_var_types | text, text, double precision, double precision
+null_proxy            | None
+</pre></li>
+<li>Sample ratio. Use the sample ratio parameter to train on a subset of the 
data: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
+SELECT madlib.forest_train('rf_golf',         -- source table
+                           'train_output',    -- output model table
+                           'id',              -- id column
+                           'class',           -- response
+                           '"OUTLOOK", temperature, humidity, windy',   -- 
features
+                           NULL,              -- exclude columns
+                           NULL,              -- grouping columns
+                           20::integer,       -- number of trees
+                           2::integer,        -- number of random features
+                           TRUE::boolean,     -- variable importance
+                           1::integer,        -- num_permutations
+                           8::integer,        -- max depth
+                           3::integer,        -- min split
+                           1::integer,        -- min bucket
+                           10::integer,       -- number of splits per 
continuous variable
+                           NULL,              -- NULL handling
+                           FALSE,             -- Verbose
+                           0.5                -- Sample ratio
+                           );
+SELECT * FROM train_output_group;
+</pre> <pre class="result">
+-[ RECORD 1 
]-----------+--------------------------------------------------------------------
+gid                     | 1
+success                 | t
+cat_n_levels            | {3,2}
+cat_levels_in_text      | {overcast,rain,sunny,False,True}
+oob_error               | 0.57142857142857142857
+oob_var_importance      | 
{0,0.0166666666666667,0.0166666666666667,0.0166666666666667}
+impurity_var_importance | 
{0.143759266026582,0.0342777777777778,0.157507369614512,0.0554953231292517}
+</pre></li>
+</ol>
+<p><b>Random Forest Regression Example</b></p>
+<ol type="1">
+<li>Load input data related to fuel consumption and 10 aspects of automobile 
design and performance for 32 automobiles (1973–74 models). Data was 
extracted from the 1974 Motor Trend US magazine. <pre class="example">
+DROP TABLE IF EXISTS mt_cars;
+CREATE TABLE mt_cars (
+    id integer NOT NULL,
+    mpg double precision,
+    cyl integer,
+    disp double precision,
+    hp integer,
+    drat double precision,
+    wt double precision,
+    qsec double precision,
+    vs integer,
+    am integer,
+    gear integer,
+    carb integer
+);
+INSERT INTO mt_cars VALUES
+(1,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2),
+(2,21,6,160,110,3.9,2.62,16.46,0,1,4,4),
+(3,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2),
+(4,21,6,160,110,3.9,2.875,17.02,0,1,4,4),
+(5,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4),
+(6,16.4,8,275.8,180,3.078,4.07,17.4,0,0,3,3),
+(7,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1),
+(8,17.3,8,275.8,180,3.078,3.73,17.6,0,0,3,3),
+(9,21.4,null,258,110,3.08,3.215,19.44,1,0,3,1),
+(10,15.2,8,275.8,180,3.078,3.78,18,0,0,3,3),
+(11,18.1,6,225,105,2.768,3.46,20.22,1,0,3,1),
+(12,32.4,4,78.7,66,4.08,2.20,19.47,1,1,4,1),
+(13,14.3,8,360,245,3.21,3.578,15.84,0,0,3,4),
+(14,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2),
+(15,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2),
+(16,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4),
+(17,33.9,4,71.14,65,4.22,1.835,19.9,1,1,4,1),
+(18,15.2,null,304,150,3.15,3.435,17.3,0,0,3,2),
+(19,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4),
+(20,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1),
+(21,10.4,8,460,215,3,5.424,17.82,0,0,3,4),
+(22,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2),
+(23,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4),
+(24,30.4,4,95.14,113,3.77,1.513,16.9,1,1,5,2),
+(25,21.5,4,120.1,97,3.70,2.465,20.01,1,0,3,1),
+(26,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4),
+(27,15.5,8,318,150,2.768,3.52,16.87,0,0,3,2),
+(28,15,8,301,335,3.54,3.578,14.6,0,1,5,8),
+(29,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4),
+(30,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2),
+(31,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6),
+(32,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2);
+</pre></li>
+<li>We train a regression random forest with grouping on transmission 
type (0 = automatic, 1 = manual) and use surrogates for NULL handling: <pre 
class="example">
+DROP TABLE IF EXISTS mt_cars_output, mt_cars_output_group, 
mt_cars_output_summary;
+SELECT madlib.forest_train('mt_cars',         -- source table
+                           'mt_cars_output',  -- output model table
+                           'id',              -- id column
+                           'mpg',             -- response
+                           '*',               -- features
+                           'id, hp, drat, am, gear, carb',  -- exclude columns
+                           'am',              -- grouping columns
+                           10::integer,       -- number of trees
+                           2::integer,        -- number of random features
+                           TRUE::boolean,     -- variable importance
+                           1,                 -- num_permutations
+                           10,                -- max depth
+                           8,                 -- min split
+                           3,                 -- min bucket
+                           10,                -- number of splits per 
continuous variable
+                           'max_surrogates=2' -- NULL handling
+                           );
+\x on
+SELECT * FROM mt_cars_output_summary;
+</pre> <pre class="result">
+-[ RECORD 1 
]---------+-----------------------------------------------------------------------
+method                | forest_train
+is_classification     | f
+source_table          | mt_cars
+model_table           | mt_cars_output
+id_col_name           | id
+dependent_varname     | mpg
+independent_varnames  | vs,cyl,disp,qsec,wt
+cat_features          | vs,cyl
+con_features          | disp,qsec,wt
+grouping_cols         | am
+num_trees             | 10
+num_random_features   | 2
+max_tree_depth        | 10
+min_split             | 8
+min_bucket            | 3
+num_splits            | 10
+verbose               | f
+importance            | t
+num_permutations      | 1
+num_all_groups        | 2
+num_failed_groups     | 0
+total_rows_processed  | 32
+total_rows_skipped    | 0
+dependent_var_levels  |
+dependent_var_type    | double precision
+independent_var_types | integer, integer, double precision, double precision, 
double precision
+null_proxy            | None
+</pre> Review the group table to see variable importance by group: <pre 
class="example">
+SELECT * FROM mt_cars_output_group ORDER BY gid;
+</pre> <pre class="result">
+-[ RECORD 1 
]-----------+----------------------------------------------------------------------------------------
+gid                     | 1
+am                      | 0
+success                 | t
+cat_n_levels            | {2,3}
+cat_levels_in_text      | {0,1,4,6,8}
+oob_error               | 8.64500988190963
+oob_var_importance      | 
{3.91269987042436,0,2.28278236607143,0.0994074074074073,3.42585277187264}
+impurity_var_importance | 
{5.07135586863621,3.72145581490929,5.06700415274492,0.594942174008333,8.10909642389614}
+-[ RECORD 2 
]-----------+----------------------------------------------------------------------------------------
+gid                     | 2
+am                      | 1
+success                 | t
+cat_n_levels            | {2,3}
+cat_levels_in_text      | {0,1,4,6,8}
+oob_error               | 16.5197718747446
+oob_var_importance      | 
{5.22711111111111,10.0872041666667,9.6875362244898,3.97782,2.99447839506173}
+impurity_var_importance | 
{5.1269704861111,7.04765974920884,20.9817274159476,4.02800949238769,10.5539079705215}
+</pre> Use the helper function to display normalized variable importance: <pre 
class="example">
+\x off
+DROP TABLE IF EXISTS mt_imp_output;
+SELECT madlib.get_var_importance('mt_cars_output','mt_imp_output');
+SELECT * FROM mt_imp_output ORDER BY am, oob_var_importance DESC;
+</pre> <pre class="result">
+ am | feature | oob_var_importance | impurity_var_importance
+----+---------+--------------------+-------------------------
+  0 | vs      |   40.2510395098467 |        22.4755743014842
+  0 | wt      |   35.2427070417256 |        35.9384361725319
+  0 | disp    |   23.4836216045257 |        22.4562880757909
+  0 | qsec    |   1.02263184390195 |        2.63670453886068
+  0 | cyl     |                  0 |        16.4929969113323
+  1 | cyl     |   31.5479979891794 |        14.7631219023997
+  1 | disp    |   30.2980259228064 |        43.9515825943964
+  1 | vs      |   16.3479283355324 |        10.7397480823277
+  1 | qsec    |   12.4407373230344 |         8.4376938269215
+  1 | wt      |    9.3653104294474 |        22.1078535939547
+</pre></li>
+<li>Predict regression output for the same data and compare with original: 
<pre class="example">
+\x off
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.forest_predict('mt_cars_output',
+                             'mt_cars',
+                             'prediction_results',
+                             'response');
+SELECT s.am, s.id, mpg, estimated_mpg, mpg-estimated_mpg as delta
+FROM prediction_results p, mt_cars s WHERE s.id = p.id ORDER BY s.am, s.id;
+</pre> <pre class="result">
+  am | id | mpg  |  estimated_mpg   |        delta
+----+----+------+------------------+----------------------
+  0 |  1 | 18.7 | 16.5055222816399 |     2.19447771836007
+  0 |  3 | 24.4 | 21.8437857142857 |     2.55621428571428
+  0 |  5 | 17.8 | 19.2085504201681 |    -1.40855042016807
+  0 |  6 | 16.4 | 15.7340778371955 |    0.665922162804513
+  0 |  8 | 17.3 | 15.7340778371955 |     1.56592216280452
+  0 |  9 | 21.4 | 18.2305980392157 |     3.16940196078431
+  0 | 10 | 15.2 | 15.2640778371955 |  -0.0640778371954838
+  0 | 11 | 18.1 | 18.9192647058824 |    -0.81926470588235
+  0 | 13 | 14.3 | 15.0690909090909 |   -0.769090909090908
+  0 | 14 | 22.8 | 21.8437857142857 |    0.956214285714289
+  0 | 16 | 19.2 | 19.2085504201681 | -0.00855042016807062
+  0 | 18 | 15.2 | 16.0805222816399 |    -0.88052228163993
+  0 | 19 | 10.4 | 14.7914111705288 |    -4.39141117052882
+  0 | 21 | 10.4 | 14.7914111705288 |    -4.39141117052882
+  0 | 23 | 14.7 | 15.0525222816399 |    -0.35252228163993
+  0 | 25 | 21.5 | 21.8437857142857 |   -0.343785714285712
+  0 | 27 | 15.5 | 15.4775222816399 |   0.0224777183600704
+  0 | 29 | 13.3 | 15.0690909090909 |    -1.76909090909091
+  0 | 30 | 19.2 | 15.4775222816399 |     3.72247771836007
+  1 |  2 |   21 |         19.53275 |              1.46725
+  1 |  4 |   21 | 20.3594166666667 |    0.640583333333332
+  1 |  7 | 22.8 | 23.0550833333333 |   -0.255083333333335
+  1 | 12 | 32.4 | 27.1501666666667 |     5.24983333333333
+  1 | 15 | 30.4 | 28.9628333333333 |     1.43716666666667
+  1 | 17 | 33.9 | 28.0211666666667 |     5.87883333333333
+  1 | 20 | 27.3 | 27.7138333333333 |   -0.413833333333333
+  1 | 22 |   26 | 26.8808333333333 |   -0.880833333333335
+  1 | 24 | 30.4 |          27.8225 |               2.5775
+  1 | 26 | 15.8 | 17.2924166666667 |    -1.49241666666666
+  1 | 28 |   15 | 17.2924166666667 |    -2.29241666666667
+  1 | 31 | 19.7 |         19.53275 |    0.167249999999999
+  1 | 32 | 21.4 | 23.0550833333333 |    -1.65508333333334
+(32 rows)
+</pre></li>
+<li>Display a single tree of the random forest in dot format: <pre class="example">
+SELECT madlib.get_tree('mt_cars_output',1,7);
+</pre> <pre class="result">
+ digraph "Regression tree for mt_cars" {
+ "0" [label="disp &lt;= 258", shape=ellipse];
+ "0" -&gt; "1"[label="yes"];
+ "1" [label="20.35",shape=box];
+ "0" -&gt; "2"[label="no"];
+ "2" [label="qsec &lt;= 17.6", shape=ellipse];
+ "2" -&gt; "5"[label="yes"];
+ "5" [label="15.8",shape=box];
+ "2" -&gt; "6"[label="no"];
+ "6" [label="12.8",shape=box];
+ } //---end of digraph---------
+</pre> Display the surrogate variables that are used to compute the split for each node when the primary variable is NULL: <pre class="example">
+SELECT madlib.get_tree_surr('mt_cars_output',1,7);
+</pre> <pre class="result">
+   &#160;-------------------------------------
+       Surrogates for internal nodes
+   &#160;-------------------------------------
+ (0) disp &lt;= 258
+      1: wt &lt;= 3.46    [common rows = 12]
+      2: cyl in {4,6}    [common rows = 11]
+      [Majority branch = 6 ]
+ (2) qsec &lt;= 17.6
+      1: wt &lt;= 3.435    [common rows = 6]
+      2: disp &gt; 275.8    [common rows = 5]
+      [Majority branch = 4 ]
+</pre></li>
+</ol>
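+<p>As a quick sanity check on the variable importance and prediction steps above (a hedged sketch in plain SQL, not part of the MADlib API): the normalized importances from get_var_importance should sum to roughly 100 within each group, and the regression error can be summarized directly from the prediction table:</p>
+<pre class="example">
+-- per-group totals of the normalized importances (expected: ~100 each)
+SELECT am,
+       sum(oob_var_importance)      AS oob_total,
+       sum(impurity_var_importance) AS impurity_total
+FROM mt_imp_output GROUP BY am;
+-- overall error of the regression predictions
+SELECT sqrt(avg((mpg - estimated_mpg)^2)) AS rmse,
+       avg(abs(mpg - estimated_mpg))      AS mae
+FROM prediction_results p JOIN mt_cars s ON s.id = p.id;
+</pre>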
+<h4>NULL Handling Example</h4>
+<ol type="1">
+<li>Create a toy example to illustrate 'null-as-category' handling for categorical features: <pre class="example">
+DROP TABLE IF EXISTS null_handling_example;
+CREATE TABLE null_handling_example (
+    id integer,
+    country text,
+    city text,
+    weather text,
+    response text
+);
+INSERT INTO null_handling_example VALUES
+(1,null,null,null,'a'),
+(2,'US',null,null,'b'),
+(3,'US','NY',null,'c'),
+(4,'US','NY','rainy','d');
+</pre></li>
+<li>Train a random forest. Note that 'NULL' is set as a valid level for the categorical features country, weather, and city: <pre class="example">
+DROP TABLE IF EXISTS train_output, train_output_group, train_output_summary;
+SELECT madlib.forest_train('null_handling_example',  -- source table
+                           'train_output',    -- output model table
+                           'id',              -- id column
+                           'response',        -- response
+                           'country, weather, city',   -- features
+                           NULL,              -- exclude columns
+                           NULL,              -- grouping columns
+                           10::integer,        -- number of trees
+                           2::integer,        -- number of random features
+                           TRUE::boolean,     -- variable importance
+                           1::integer,        -- num_permutations
+                           3::integer,        -- max depth
+                           2::integer,        -- min split
+                           1::integer,        -- min bucket
+                           3::integer,        -- number of splits per continuous variable
+                           'null_as_category=TRUE'
+                           );
+\x on
+SELECT * FROM train_output_summary;
+</pre> <pre class="result">
+-[ RECORD 1 ]---------+----------------------
+method                | forest_train
+is_classification     | t
+source_table          | null_handling_example
+model_table           | train_output
+id_col_name           | id
+dependent_varname     | response
+independent_varnames  | country,weather,city
+cat_features          | country,weather,city
+con_features          |
+grouping_cols         |
+num_trees             | 10
+num_random_features   | 2
+max_tree_depth        | 3
+min_split             | 2
+min_bucket            | 1
+num_splits            | 3
+verbose               | f
+importance            | t
+num_permutations      | 1
+num_all_groups        | 1
+num_failed_groups     | 0
+total_rows_processed  | 4
+total_rows_skipped    | 0
+dependent_var_levels  | "a","b","c","d"
+dependent_var_type    | text
+independent_var_types | text, text, text
+null_proxy            | __NULL__
+</pre> View the group table: <pre class="example">
+SELECT * FROM train_output_group;
+</pre> <pre class="result">
+-[ RECORD 1 ]-----------+-----------------------------------------
+gid                     | 1
+success                 | t
+cat_n_levels            | {2,2,2}
+cat_levels_in_text      | {US,__NULL__,rainy,__NULL__,NY,__NULL__}
+oob_error               | 1.00000000000000000000
+oob_var_importance      | {0,0,0}
+impurity_var_importance | {0.125,0.0944444444444,0.1836666666667}
+</pre></li>
+<li>Predict for data not seen in training; feature levels absent from the training data are treated as NULL by default (a probability-output variant follows this list): <pre class="example">
+\x off
+DROP TABLE IF EXISTS table_test;
+CREATE TABLE table_test (
+    id integer,
+    country text,
+    city text,
+    weather text,
+    expected_response text
+);
+INSERT INTO table_test VALUES
+(1,'IN','MUM','cloudy','a'),
+(2,'US','HOU','humid','b'),
+(3,'US','NY','sunny','c'),
+(4,'US','NY','rainy','d');
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.forest_predict('train_output',
+                             'table_test',
+                             'prediction_results',
+                             'response');
+SELECT s.id, expected_response, estimated_response
+FROM prediction_results p, table_test s
+WHERE s.id = p.id ORDER BY id;
+</pre> <pre class="result">
+ id | expected_response | estimated_response
+----+-------------------+--------------------
+  1 | a                 | a
+  2 | b                 | b
+  3 | c                 | c
+  4 | d                 | d
+(4 rows)
+</pre> There is training data only for country 'US', so the response for country 'IN' is 'a', corresponding to the NULL (not 'US') country level. Likewise, any city in the 'US' other than 'NY' predicts response 'b', corresponding to the NULL (not 'NY') city level.</li>
+</ol>
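+<p>For classification models, forest_predict also accepts 'prob' as the prediction type, returning per-class probabilities rather than a single label. Below is a minimal sketch against the model trained above; the exact per-class column names in the output table (e.g. an "estimated_prob_a" column) are an assumption here:</p>
+<pre class="example">
+DROP TABLE IF EXISTS prediction_prob;
+SELECT madlib.forest_predict('train_output',      -- trained model
+                             'table_test',        -- new data
+                             'prediction_prob',   -- output table
+                             'prob');             -- return class probabilities
+SELECT * FROM prediction_prob ORDER BY id;
+</pre>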
+<p><a class="anchor" id="literature"></a></p><dl class="section 
user"><dt>Literature</dt><dd>[1] L. Breiman and A. Cutler. Random Forests. <a 
href="http://www.stat.berkeley.edu/~breiman/RandomForests";>http://www.stat.berkeley.edu/~breiman/RandomForests</a></dd></dl>
+<p>[2] L. Breiman, A. Cutler, A. Liaw, and M. Wiener. randomForest: Breiman 
and Cutler's Random Forests for Classification and Regression. <a 
href="http://cran.r-project.org/web/packages/randomForest/index.html";>http://cran.r-project.org/web/packages/randomForest/index.html</a></p>
+<p>[3] L. Breiman, J. Friedman, R. Olshen, C. Stone. "Classification and 
Regression Trees", Chapman &amp; Hall, 1984.</p>
+<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related 
Topics</dt><dd></dd></dl>
+<p>File <a class="el" 
href="random__forest_8sql__in.html">random_forest.sql_in</a> documenting the 
training function</p>
+<p><a class="el" href="group__grp__decision__tree.html">Decision Tree</a></p>
+</div><!-- contents -->
+</div><!-- doc-content -->
+<!-- start footer part -->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    <li class="footer">Generated on Mon Oct 15 2018 11:24:30 for MADlib by
+    <a href="http://www.doxygen.org/index.html">
+    <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li>
+  </ul>
+</div>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__regml.html
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__regml.html b/docs/v1.15.1/group__grp__regml.html
new file mode 100644
index 0000000..b948081
--- /dev/null
+++ b/docs/v1.15.1/group__grp__regml.html
@@ -0,0 +1,173 @@
+<div class="header">
+  <div class="summary">
+<a href="#groups">Modules</a>  </div>
+  <div class="headertitle">
+<div class="title">Regression Models<div class="ingroups"><a class="el" 
href="group__grp__super.html">Supervised Learning</a></div></div>  </div>
+</div><!--header-->
+<div class="contents">
+<a name="details" id="details"></a><h2 class="groupheader">Detailed 
Description</h2>
+<p>A collection of methods for modeling conditional expectation of a response 
variable. </p>
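+<p>Most modules below share a similar SQL calling convention: a train function that writes a model table, followed by queries or a predict function over that table. As a small illustration, a linear regression fit is a single call; a minimal sketch, assuming a hypothetical table houses with columns price, tax, bath, and size:</p>
+<pre class="example">
+DROP TABLE IF EXISTS houses_linregr;
+SELECT madlib.linregr_train('houses',           -- source table (hypothetical)
+                            'houses_linregr',   -- output model table
+                            'price',            -- dependent variable
+                            'ARRAY[1, tax, bath, size]'  -- independent variables; 1 adds an intercept
+                           );
+SELECT * FROM houses_linregr;
+</pre>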
+<table class="memberdecls">
+<tr class="heading"><td colspan="2"><h2 class="groupheader"><a 
name="groups"></a>
+Modules</h2></td></tr>
+<tr class="memitem:group__grp__clustered__errors"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" 
href="group__grp__clustered__errors.html">Clustered Variance</a></td></tr>
+<tr class="memdesc:group__grp__clustered__errors"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Calculates clustered 
variance for linear, logistic, and multinomial logistic regression models, and 
Cox proportional hazards models. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__cox__prop__hazards"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" 
href="group__grp__cox__prop__hazards.html">Cox-Proportional Hazards 
Regression</a></td></tr>
+<tr class="memdesc:group__grp__cox__prop__hazards"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Models the relationship 
between one or more independent predictor variables and the amount of time 
before an event occurs. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__elasticnet"><td class="memItemLeft" 
align="right" valign="top">&#160;</td><td class="memItemRight" 
valign="bottom"><a class="el" href="group__grp__elasticnet.html">Elastic Net 
Regularization</a></td></tr>
+<tr class="memdesc:group__grp__elasticnet"><td 
class="mdescLeft">&#160;</td><td class="mdescRight">Generates a regularized 
regression model for variable selection in linear and logistic regression 
problems, combining the L1 and L2 penalties of the lasso and ridge methods. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__glm"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__glm.html">Generalized Linear Models</a></td></tr>
+<tr class="memdesc:group__grp__glm"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Estimate generalized linear model (GLM). GLM is a flexible 
generalization of ordinary linear regression that allows for response variables 
that have error distribution models other than a normal distribution. The GLM 
generalizes linear regression by allowing the linear model to be related to the 
response variable via a link function and by allowing the magnitude of the 
variance of each measurement to be a function of its predicted value. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__linreg"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__linreg.html">Linear Regression</a></td></tr>
+<tr class="memdesc:group__grp__linreg"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Also called Ordinary Least Squares Regression, models linear 
relationship between a dependent variable and one or more independent 
variables. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__logreg"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__logreg.html">Logistic Regression</a></td></tr>
+<tr class="memdesc:group__grp__logreg"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Models the relationship between one or more predictor 
variables and a binary categorical dependent variable by predicting the 
probability of the dependent variable using a logistic function. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__marginal"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__marginal.html">Marginal Effects</a></td></tr>
+<tr class="memdesc:group__grp__marginal"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Calculates marginal effects for the coefficients in 
regression problems. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__multinom"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__multinom.html">Multinomial Regression</a></td></tr>
+<tr class="memdesc:group__grp__multinom"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Multinomial regression is to model the conditional 
distribution of the multinomial response variable using a linear combination of 
predictors. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__ordinal"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__ordinal.html">Ordinal Regression</a></td></tr>
+<tr class="memdesc:group__grp__ordinal"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Regression to model data with ordinal response variable. <br 
/></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:group__grp__robust"><td class="memItemLeft" align="right" 
valign="top">&#160;</td><td class="memItemRight" valign="bottom"><a class="el" 
href="group__grp__robust.html">Robust Variance</a></td></tr>
+<tr class="memdesc:group__grp__robust"><td class="mdescLeft">&#160;</td><td 
class="mdescRight">Calculates Huber-White variance estimates for linear, 
logistic, and multinomial regression models, and for Cox proportional hazards 
models. <br /></td></tr>
+<tr class="separator:"><td class="memSeparator" colspan="2">&#160;</td></tr>
+</table>
+</div><!-- contents -->
+</div><!-- doc-content -->
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/madlib-site/blob/af0e5f14/docs/v1.15.1/group__grp__regml.js
----------------------------------------------------------------------
diff --git a/docs/v1.15.1/group__grp__regml.js b/docs/v1.15.1/group__grp__regml.js
new file mode 100644
index 0000000..76e3c69
--- /dev/null
+++ b/docs/v1.15.1/group__grp__regml.js
@@ -0,0 +1,13 @@
+var group__grp__regml =
+[
+    [ "Clustered Variance", "group__grp__clustered__errors.html", null ],
+    [ "Cox-Proportional Hazards Regression", 
"group__grp__cox__prop__hazards.html", null ],
+    [ "Elastic Net Regularization", "group__grp__elasticnet.html", null ],
+    [ "Generalized Linear Models", "group__grp__glm.html", null ],
+    [ "Linear Regression", "group__grp__linreg.html", null ],
+    [ "Logistic Regression", "group__grp__logreg.html", null ],
+    [ "Marginal Effects", "group__grp__marginal.html", null ],
+    [ "Multinomial Regression", "group__grp__multinom.html", null ],
+    [ "Ordinal Regression", "group__grp__ordinal.html", null ],
+    [ "Robust Variance", "group__grp__robust.html", null ]
+];
\ No newline at end of file
