http://git-wip-us.apache.org/repos/asf/madlib-site/blob/573d66d8/docs/rc/group__grp__sparse__linear__solver.html ---------------------------------------------------------------------- diff --git a/docs/rc/group__grp__sparse__linear__solver.html b/docs/rc/group__grp__sparse__linear__solver.html deleted file mode 100644 index 139e0d7..0000000 --- a/docs/rc/group__grp__sparse__linear__solver.html +++ /dev/null @@ -1,361 +0,0 @@ -<!-- HTML header for doxygen 1.8.4--> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml"> -<head> -<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> -<meta http-equiv="X-UA-Compatible" content="IE=9"/> -<meta name="generator" content="Doxygen 1.8.14"/> -<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> -<title>MADlib: Sparse Linear Systems</title> -<link href="tabs.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="jquery.js"></script> -<script type="text/javascript" src="dynsections.js"></script> -<link href="navtree.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="resize.js"></script> -<script type="text/javascript" src="navtreedata.js"></script> -<script type="text/javascript" src="navtree.js"></script> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ - $(document).ready(initResizable); -/* @license-end */</script> -<link href="search/search.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="search/searchdata.js"></script> -<script type="text/javascript" src="search/search.js"></script> -<script 
type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ - $(document).ready(function() { init_search(); }); -/* @license-end */ -</script> -<script type="text/x-mathjax-config"> - MathJax.Hub.Config({ - extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], - jax: ["input/TeX","output/HTML-CSS"], -}); -</script><script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js"></script> -<!-- hack in the navigation tree --> -<script type="text/javascript" src="eigen_navtree_hacks.js"></script> -<link href="doxygen.css" rel="stylesheet" type="text/css" /> -<link href="madlib_extra.css" rel="stylesheet" type="text/css"/> -<!-- google analytics --> -<script> - (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ - (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), - m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) - })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); - ga('create', 'UA-45382226-1', 'madlib.apache.org'); - ga('send', 'pageview'); -</script> -</head> -<body> -<div id="top"><!-- do not remove this div, it is closed by doxygen! 
--> -<div id="titlearea"> -<table cellspacing="0" cellpadding="0"> - <tbody> - <tr style="height: 56px;"> - <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td> - <td style="padding-left: 0.5em;"> - <div id="projectname"> - <span id="projectnumber">1.15</span> - </div> - <div id="projectbrief">User Documentation for Apache MADlib</div> - </td> - <td> <div id="MSearchBox" class="MSearchBoxInactive"> - <span class="left"> - <img id="MSearchSelect" src="search/mag_sel.png" - onmouseover="return searchBox.OnSearchSelectShow()" - onmouseout="return searchBox.OnSearchSelectHide()" - alt=""/> - <input type="text" id="MSearchField" value="Search" accesskey="S" - onfocus="searchBox.OnSearchFieldFocus(true)" - onblur="searchBox.OnSearchFieldFocus(false)" - onkeyup="searchBox.OnSearchFieldChange(event)"/> - </span><span class="right"> - <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> - </span> - </div> -</td> - </tr> - </tbody> -</table> -</div> -<!-- end header part --> -<!-- Generated by Doxygen 1.8.14 --> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ -var searchBox = new SearchBox("searchBox", "search",false,'Search'); -/* @license-end */ -</script> -</div><!-- top --> -<div id="side-nav" class="ui-resizable side-nav-resizable"> - <div id="nav-tree"> - <div id="nav-tree-contents"> - <div id="nav-sync" class="sync"></div> - </div> - </div> - <div id="splitbar" style="-moz-user-select:none;" - class="ui-resizable-handle"> - </div> -</div> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ -$(document).ready(function(){initNavTree('group__grp__sparse__linear__solver.html','');}); -/* @license-end */ -</script> -<div 
id="doc-content"> -<!-- window showing the filter options --> -<div id="MSearchSelectWindow" - onmouseover="return searchBox.OnSearchSelectShow()" - onmouseout="return searchBox.OnSearchSelectHide()" - onkeydown="return searchBox.OnSearchSelectKey(event)"> -</div> - -<!-- iframe showing the search results (closed by default) --> -<div id="MSearchResultsWindow"> -<iframe src="javascript:void(0)" frameborder="0" - name="MSearchResults" id="MSearchResults"> -</iframe> -</div> - -<div class="header"> - <div class="headertitle"> -<div class="title">Sparse Linear Systems<div class="ingroups"><a class="el" href="group__grp__other__functions.html">Utilities</a> » <a class="el" href="group__grp__linear__solver.html">Linear Solvers</a></div></div> </div> -</div><!--header--> -<div class="contents"> -<div class="toc"><b>Contents</b> <ul> -<li class="level1"> -<a href="#sls_usage">Solution Function</a> </li> -<li class="level1"> -<a href="#sls_opt_params">Optimizer Parameters</a> </li> -<li class="level1"> -<a href="#sls_output">Output Tables</a> </li> -<li class="level1"> -<a href="#sls_examples">Examples</a> </li> -<li> -<a href="#related">Related Topics</a> </li> -</ul> -</div><p>The sparse linear systems module implements solution methods for systems of consistent linear equations. Systems of linear equations take the form: </p><p class="formulaDsp"> -\[ Ax = b \] -</p> -<p>where \(x \in \mathbb{R}^{n}\), \(A \in \mathbb{R}^{m \times n} \) and \(b \in \mathbb{R}^{m}\). This module accepts sparse matrix input formats for \(A\) and \(b\). We assume that there are no rows of \(A\) where all elements are zero.</p> -<dl class="section note"><dt>Note</dt><dd>Algorithms will fail if there is a row of the input matrix containing all zeros.</dd></dl> -<p>The algorithms implemented in this module can handle large sparse square linear systems. 
Currently, the algorithms implemented in this module solve the linear system using direct or iterative methods.</p> -<p><a class="anchor" id="sls_usage"></a></p><dl class="section user"><dt>Sparse Linear Systems Solution Function</dt><dd></dd></dl> -<pre class="syntax"> -linear_solver_sparse( tbl_source_lhs, - tbl_source_rhs, - tbl_result, - lhs_row_id, - lhs_col_id, - lhs_value, - rhs_row_id, - rhs_value, - grouping_cols := NULL, - optimizer := 'direct', - optimizer_params := - 'algorithm = llt' - ) -</pre><p> <b>Arguments</b> </p><dl class="arglist"> -<dt>tbl_source_lhs </dt> -<dd><p class="startdd">The name of the table containing the left hand side matrix. For the LHS matrix, the input data is expected to be of the following form: </p><pre> -{TABLE|VIEW} <em>sourceName</em> ( - ... - <em>row_id</em> FLOAT8, - <em>col_id</em> FLOAT8, - <em>value</em> FLOAT8, - ... -)</pre><p> Each row represents a single equation. The <em>rhs</em> columns refer to the right hand side of the equations and the <em>lhs</em> columns refer to the multipliers on the variables on the left hand side of the same equations. </p> -<p class="enddd"></p> -</dd> -<dt>tbl_source_rhs </dt> -<dd><p class="startdd">TEXT. The name of the table containing the right hand side vector. For the RHS matrix, the input data is expected to be of the following form: </p><pre class="fragment">{TABLE|VIEW} <em>sourceName</em> ( - ... - <em>row_id</em> FLOAT8, - <em>value</em> FLOAT8 - ... -)</pre><p> Each row represents a single equation. The <em>rhs</em> columns refer to the right hand side of the equations while the <em>lhs</em> columns refer to the multipliers on the variables on the left hand side of the same equations. </p> -<p class="enddd"></p> -</dd> -<dt>tbl_result </dt> -<dd><p class="startdd">TEXT. The name of the table where the output is saved. Output is stored in the table named by the <em>tbl_result</em> argument. The table contains the following columns. 
The output contains the following columns: </p><table class="output"> -<tr> -<th>solution </th><td>FLOAT8[]. The solution is an array with the variables in the same order as that provided as input in the 'left_hand_side' column name of the 'source_table' </td></tr> -<tr> -<th>residual_norm </th><td>FLOAT8. Scaled residual norm, defined as \( \frac{|Ax - b|}{|b|} \). This value is an indication of the accuracy of the solution. </td></tr> -<tr> -<th>iters </th><td>INTEGER. Number of iterations required by the algorithm (only applicable for iterative algorithms). The output is NULL for 'direct' methods. </td></tr> -</table> -<p class="enddd"></p> -</dd> -<dt>lhs_row_id </dt> -<dd>TEXT. The name of the column storing the 'row id' of the equations. <dl class="section note"><dt>Note</dt><dd>For a system with N equations, the row_id's must be a continuous range of integers from \( 0 \ldots N-1 \).</dd></dl> -</dd> -<dt>lhs_col_id </dt> -<dd><p class="startdd">TEXT. The name of the column (in tbl_source_lhs) storing the 'col id' of the equations.</p> -<p class="enddd"></p> -</dd> -<dt>lhs_value </dt> -<dd><p class="startdd">TEXT. The name of the column (in tbl_source_lhs) storing the 'value' of the equations.</p> -<p class="enddd"></p> -</dd> -<dt>rhs_row_id </dt> -<dd><p class="startdd">TEXT. The name of the column (in tbl_source_rhs) storing the 'row id' of the equations.</p> -<p class="enddd"></p> -</dd> -<dt>rhs_value </dt> -<dd><p class="startdd">TEXT. The name of the column (in tbl_source_rhs) storing the 'value' of the equations.</p> -<p class="enddd"></p> -</dd> -<dt>num_vars </dt> -<dd><p class="startdd">INTEGER. The number of variables in the linear system equations.</p> -<p class="enddd"></p> -</dd> -<dt>grouping_col (optional) </dt> -<dd>TEXT, default: NULL. Group by column names. 
<dl class="section note"><dt>Note</dt><dd>The grouping feature is currently not implemented and this parameter is only a placeholder.</dd></dl> -</dd> -<dt>optimizer (optional) </dt> -<dd><p class="startdd">TEXT, default: 'direct'. Type of optimizer.</p> -<p class="enddd"></p> -</dd> -<dt>optimizer_params (optional) </dt> -<dd>TEXT, default: NULL. Optimizer specific parameters. </dd> -</dl> -<p><a class="anchor" id="sls_opt_params"></a></p><dl class="section user"><dt>Optimizer Parameters</dt><dd></dd></dl> -<p>For each optimizer, there are specific parameters that can be tuned for better performance.</p> -<dl class="arglist"> -<dt>algorithm (default: ldlt) </dt> -<dd><p class="startdd"></p> -<p>There are several algorithms that can be classified as 'direct' methods of solving linear systems. Madlib functions provide various algorithmic options available for users.</p> -<p>The following table provides a guideline on the choice of algorithm based on conditions on the A matrix, speed of the algorithms and numerical stability.</p> -<pre class="fragment"> Algorithm | Conditions on A | Speed | Memory - ---------------------------------------------------------- - llt | Sym. Pos Def | ++ | --- - ldlt | Sym. Pos Def | ++ | --- - - For speed '++' is faster than '+', which is faster than '-'. - For accuracy '+++' is better than '++'. - For memory, '-' uses less memory than '--'. - - Note: ldlt is often preferred over llt -</pre><p>There are several algorithms that can be classified as 'iterative' methods of solving linear systems. Madlib functions provide various algorithmic options available for users.</p> -<p>The following table provides a guideline on the choice of algorithm based on conditions on the A matrix, speed of the algorithms and numerical stability.</p> -<pre class="fragment"> Algorithm | Conditions on A | Speed | Memory | Convergence - ---------------------------------------------------------------------- - cg-mem | Sym. 
Pos Def | +++ | - | ++ - bicgstab-mem | Square | ++ | - | + - precond-cg-mem | Sym. Pos Def | ++ | - | +++ - precond-bicgstab-mem | Square | + | - | ++ - - For memory, '-' uses less memory than '--'. - For speed, '++' is faster than '+'. -</pre><p>Algorithm Details: </p><table class="output"> -<tr> -<th>cg-mem</th><td>In memory conjugate gradient with diagonal preconditioners. </td></tr> -<tr> -<th>bicgstab-mem</th><td>Bi-conjugate gradient (equivalent to performing CG on the least squares formulation of Ax=b) with incomplete LU preconditioners. </td></tr> -<tr> -<th>precond-cg-mem</th><td>In memory conjugate gradient with diagonal preconditioners. </td></tr> -<tr> -<th>precond-bicgstab-mem</th><td>Bi-conjugate gradient (equivalent to performing CG on the least squares formulation of Ax=b) with incomplete LU preconditioners. </td></tr> -</table> -<p class="enddd"></p> -</dd> -<dt>toler (default: 1e-5) </dt> -<dd><p class="startdd">Termination tolerance (applicable only for iterative methods) which determines the stopping criterion (with respect to residual norm) for iterative methods. </p> -<p class="enddd"></p> -</dd> -</dl> -<p><a class="anchor" id="sls_examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl> -<ol type="1"> -<li>View online help for the sparse linear systems solver function. <pre class="example"> -SELECT madlib.linear_solver_sparse(); -</pre></li> -<li>Create the sample data set. 
<pre class="example"> -DROP TABLE IF EXISTS sparse_linear_systems_lhs; -CREATE TABLE sparse_linear_systems_lhs ( - rid INTEGER NOT NULL, - cid INTEGER, - val DOUBLE PRECISION -); -DROP TABLE IF EXISTS sparse_linear_systems_rhs; -CREATE TABLE sparse_linear_systems_rhs ( - rid INTEGER NOT NULL, - val DOUBLE PRECISION -); -INSERT INTO sparse_linear_systems_lhs(rid, cid, val) VALUES -(0, 0, 1), -(1, 1, 1), -(2, 2, 1), -(3, 3, 1); -INSERT INTO sparse_linear_systems_rhs(rid, val) VALUES -(0, 10), -(1, 20), -(2, 30); -</pre></li> -<li>Solve the linear systems with default parameters. <pre class="example"> -SELECT madlib.linear_solver_sparse( 'sparse_linear_systems_lhs', - 'sparse_linear_systems_rhs', - 'output_table', - 'rid', - 'cid', - 'val', - 'rid', - 'val', - 4 - ); -</pre></li> -<li>View the contents of the output table. <pre class="example"> -\x on -SELECT * FROM output_table; -</pre> Result: <pre class="result"> ---------------------+------------------------------------- -solution | {10,20,30,0} -residual_norm | 0 -iters | NULL -</pre></li> -<li>Choose a different algorithm than the default algorithm. <pre class="example"> -DROP TABLE IF EXISTS output_table; -SELECT madlib.linear_solver_sparse( 'sparse_linear_systems_lhs', - 'sparse_linear_systems_rhs', - 'output_table', - 'rid', - 'cid', - 'val', - 'rid', - 'val', - 4, - NULL, - 'direct', - 'algorithm=llt' - ); -</pre></li> -<li>Choose a different algorithm than the default algorithm. 
<pre class="example"> -DROP TABLE IF EXISTS output_table; -SELECT madlib.linear_solver_sparse( - 'sparse_linear_systems_lhs', - 'sparse_linear_systems_rhs', - 'output_table', - 'rid', - 'cid', - 'val', - 'rid', - 'val', - 4, - NULL, - 'iterative', - 'algorithm=cg-mem, toler=1e-5' - ); -</pre></li> -</ol> -<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd>File sparse_linear_systems.sql_in documenting the SQL functions.</dd></dl> -</div><!-- contents --> -</div><!-- doc-content --> -<!-- start footer part --> -<div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> - <ul> - <li class="footer">Generated on Mon Aug 6 2018 21:55:39 for MADlib by - <a href="http://www.doxygen.org/index.html"> - <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li> - </ul> -</div> -</body> -</html>
http://git-wip-us.apache.org/repos/asf/madlib-site/blob/573d66d8/docs/rc/group__grp__sssp.html ---------------------------------------------------------------------- diff --git a/docs/rc/group__grp__sssp.html b/docs/rc/group__grp__sssp.html deleted file mode 100644 index 8281939..0000000 --- a/docs/rc/group__grp__sssp.html +++ /dev/null @@ -1,370 +0,0 @@ -<!-- HTML header for doxygen 1.8.4--> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml"> -<head> -<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> -<meta http-equiv="X-UA-Compatible" content="IE=9"/> -<meta name="generator" content="Doxygen 1.8.14"/> -<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> -<title>MADlib: Single Source Shortest Path</title> -<link href="tabs.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="jquery.js"></script> -<script type="text/javascript" src="dynsections.js"></script> -<link href="navtree.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="resize.js"></script> -<script type="text/javascript" src="navtreedata.js"></script> -<script type="text/javascript" src="navtree.js"></script> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ - $(document).ready(initResizable); -/* @license-end */</script> -<link href="search/search.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="search/searchdata.js"></script> -<script type="text/javascript" src="search/search.js"></script> -<script type="text/javascript"> -/* @license 
magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ - $(document).ready(function() { init_search(); }); -/* @license-end */ -</script> -<script type="text/x-mathjax-config"> - MathJax.Hub.Config({ - extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], - jax: ["input/TeX","output/HTML-CSS"], -}); -</script><script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js"></script> -<!-- hack in the navigation tree --> -<script type="text/javascript" src="eigen_navtree_hacks.js"></script> -<link href="doxygen.css" rel="stylesheet" type="text/css" /> -<link href="madlib_extra.css" rel="stylesheet" type="text/css"/> -<!-- google analytics --> -<script> - (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ - (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), - m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) - })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); - ga('create', 'UA-45382226-1', 'madlib.apache.org'); - ga('send', 'pageview'); -</script> -</head> -<body> -<div id="top"><!-- do not remove this div, it is closed by doxygen! 
--> -<div id="titlearea"> -<table cellspacing="0" cellpadding="0"> - <tbody> - <tr style="height: 56px;"> - <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td> - <td style="padding-left: 0.5em;"> - <div id="projectname"> - <span id="projectnumber">1.15</span> - </div> - <div id="projectbrief">User Documentation for Apache MADlib</div> - </td> - <td> <div id="MSearchBox" class="MSearchBoxInactive"> - <span class="left"> - <img id="MSearchSelect" src="search/mag_sel.png" - onmouseover="return searchBox.OnSearchSelectShow()" - onmouseout="return searchBox.OnSearchSelectHide()" - alt=""/> - <input type="text" id="MSearchField" value="Search" accesskey="S" - onfocus="searchBox.OnSearchFieldFocus(true)" - onblur="searchBox.OnSearchFieldFocus(false)" - onkeyup="searchBox.OnSearchFieldChange(event)"/> - </span><span class="right"> - <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> - </span> - </div> -</td> - </tr> - </tbody> -</table> -</div> -<!-- end header part --> -<!-- Generated by Doxygen 1.8.14 --> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ -var searchBox = new SearchBox("searchBox", "search",false,'Search'); -/* @license-end */ -</script> -</div><!-- top --> -<div id="side-nav" class="ui-resizable side-nav-resizable"> - <div id="nav-tree"> - <div id="nav-tree-contents"> - <div id="nav-sync" class="sync"></div> - </div> - </div> - <div id="splitbar" style="-moz-user-select:none;" - class="ui-resizable-handle"> - </div> -</div> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ -$(document).ready(function(){initNavTree('group__grp__sssp.html','');}); -/* @license-end */ -</script> -<div id="doc-content"> -<!-- 
window showing the filter options --> -<div id="MSearchSelectWindow" - onmouseover="return searchBox.OnSearchSelectShow()" - onmouseout="return searchBox.OnSearchSelectHide()" - onkeydown="return searchBox.OnSearchSelectKey(event)"> -</div> - -<!-- iframe showing the search results (closed by default) --> -<div id="MSearchResultsWindow"> -<iframe src="javascript:void(0)" frameborder="0" - name="MSearchResults" id="MSearchResults"> -</iframe> -</div> - -<div class="header"> - <div class="headertitle"> -<div class="title">Single Source Shortest Path<div class="ingroups"><a class="el" href="group__grp__graph.html">Graph</a></div></div> </div> -</div><!--header--> -<div class="contents"> -<div class="toc"><b>Contents</b> <ul> -<li> -<a href="#sssp">SSSP</a> </li> -<li> -<a href="#notes">Notes</a> </li> -<li> -<a href="#examples">Examples</a> </li> -<li> -<a href="#literature">Literature</a> </li> -</ul> -</div><p>Given a graph and a source vertex, the single source shortest path (SSSP) algorithm finds a path from the source vertex to every other vertex in the graph, such that the sum of the weights of the path edges is minimized.</p> -<p><a class="anchor" id="sssp"></a></p><dl class="section user"><dt>SSSP</dt><dd><pre class="syntax"> -graph_sssp( vertex_table, - vertex_id, - edge_table, - edge_args, - source_vertex, - out_table, - grouping_cols - ) -</pre></dd></dl> -<p><b>Arguments</b> </p><dl class="arglist"> -<dt>vertex_table </dt> -<dd><p class="startdd">TEXT. Name of the table containing the vertex data for the graph. Must contain the column specified in the 'vertex_id' parameter below.</p> -<p class="enddd"></p> -</dd> -<dt>vertex_id </dt> -<dd><p class="startdd">TEXT, default = 'id'. Name of the column in 'vertex_table' containing vertex ids. The vertex ids are of type INTEGER with no duplicates. They do not need to be contiguous.</p> -<p class="enddd"></p> -</dd> -<dt>edge_table </dt> -<dd><p class="startdd">TEXT. Name of the table containing the edge data. 
The edge table must contain columns for source vertex, destination vertex and edge weight. Column naming convention is described below in the 'edge_args' parameter.</p> -<p class="enddd"></p> -</dd> -<dt>edge_args </dt> -<dd><p class="startdd">TEXT. A comma-delimited string containing multiple named arguments of the form "name=value". The following parameters are supported for this string argument:</p><ul> -<li>src (INTEGER): Name of the column containing the source vertex ids in the edge table. Default column name is 'src'.</li> -<li>dest (INTEGER): Name of the column containing the destination vertex ids in the edge table. Default column name is 'dest'.</li> -<li>weight (FLOAT8): Name of the column containing the edge weights in the edge table. Default column name is 'weight'.</li> -</ul> -<p class="enddd"></p> -</dd> -<dt>source_vertex </dt> -<dd><p class="startdd">INTEGER. The source vertex id for the algorithm to start. This vertex id must exist in the 'vertex_id' column of 'vertex_table'.</p> -<p class="enddd"></p> -</dd> -<dt>out_table </dt> -<dd><p class="startdd">TEXT. Name of the table to store the result of SSSP. It contains a row for every vertex of every group and have the following columns (in addition to the grouping columns):</p><ul> -<li>vertex_id : The id for the destination. Will use the input parameter 'vertex_id' for column naming.</li> -<li>weight : The total weight of the shortest path from the source vertex to this particular vertex. Will use the input parameter 'weight' for column naming.</li> -<li>parent : The parent of this vertex in the shortest path from source. Will use 'parent' for column naming.</li> -</ul> -<p>A summary table named <out_table>_summary is also created. This is an internal table that keeps a record of the input parameters and is used by the path function described below. </p> -<p class="enddd"></p> -</dd> -<dt>grouping_cols (optional) </dt> -<dd>TEXT, default = NULL. 
List of columns used to group the input into discrete subgraphs. These columns must exist in the edge table. When this value is null, no grouping is used and a single SSSP result is generated. </dd> -</dl> -<dl class="section user"><dt>Path Retrieval</dt><dd></dd></dl> -<p>The path retrieval function returns the shortest path from the source vertex to a specified destination vertex.</p> -<pre class="syntax"> -graph_sssp_get_path( sssp_table, - dest_vertex, - path_table - ) -</pre><p><b>Arguments</b> </p><dl class="arglist"> -<dt>sssp_table </dt> -<dd><p class="startdd">TEXT. Name of the table that contains the SSSP output.</p> -<p class="enddd"></p> -</dd> -<dt>dest_vertex </dt> -<dd><p class="startdd">INTEGER. The vertex that will be the destination of the desired path.</p> -<p class="enddd"></p> -</dd> -<dt>path_table </dt> -<dd><p class="startdd">TEXT. Name of the output table that contains the path. It contains a row for every group and has the following columns:</p><ul> -<li>grouping_cols : The grouping columns given in the creation of the SSSP table. If there are no grouping columns, these columns will not exist and the table will have a single row.</li> -<li>path (ARRAY) : The shortest path from the source vertex (as specified in the SSSP execution) to the destination vertex. </li> -</ul> -<p class="enddd"></p> -</dd> -</dl> -<p><a class="anchor" id="notes"></a></p><dl class="section user"><dt>Notes</dt><dd></dd></dl> -<p>The Bellman-Ford algorithm [1] is used to implement SSSP. This algorithm allows negative edges but not negative cycles. 
In the case of graphs with negative cycles, an error will be given and no output table will be generated.</p> -<p>Also see the Grail project [2] for more background on graph analytics processing in relational databases.</p> -<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl> -<ol type="1"> -<li>Create vertex and edge tables to represent the graph: <pre class="syntax"> -DROP TABLE IF EXISTS vertex, edge; -CREATE TABLE vertex( - id INTEGER - ); -CREATE TABLE edge( - src INTEGER, - dest INTEGER, - weight FLOAT8 - ); -INSERT INTO vertex VALUES -(0), -(1), -(2), -(3), -(4), -(5), -(6), -(7); -INSERT INTO edge VALUES -(0, 1, 1.0), -(0, 2, 1.0), -(0, 4, 10.0), -(1, 2, 2.0), -(1, 3, 10.0), -(2, 3, 1.0), -(2, 5, 1.0), -(2, 6, 3.0), -(3, 0, 1.0), -(4, 0, -2.0), -(5, 6, 1.0), -(6, 7, 1.0); -</pre></li> -<li>Calculate the shortest paths from vertex 0: <pre class="syntax"> -DROP TABLE IF EXISTS out, out_summary; -SELECT madlib.graph_sssp( - 'vertex', -- Vertex table - NULL, -- Vertex id column (NULL means use default naming) - 'edge', -- Edge table - NULL, -- Edge arguments (NULL means use default naming) - 0, -- Source vertex for path calculation - 'out'); -- Output table of shortest paths -SELECT * FROM out ORDER BY id; -</pre> <pre class="result"> - id | weight | parent -----+--------+-------- - 0 | 0 | 0 - 1 | 1 | 0 - 2 | 1 | 0 - 3 | 2 | 2 - 4 | 10 | 0 - 5 | 2 | 2 - 6 | 3 | 5 - 7 | 4 | 6 -(8 rows) -</pre></li> -<li>Get the shortest path to vertex 5: <pre class="syntax"> -DROP TABLE IF EXISTS out_path; -SELECT madlib.graph_sssp_get_path('out',5,'out_path'); -SELECT * FROM out_path; -</pre> <pre class="result"> - path ---------- - {0,2,5} -</pre></li> -<li>Now let's do a similar example except using different column names in the tables (i.e., not the defaults). 
Create the vertex and edge tables: <pre class="syntax"> -DROP TABLE IF EXISTS vertex_alt, edge_alt; -CREATE TABLE vertex_alt AS SELECT id AS v_id FROM vertex; -CREATE TABLE edge_alt AS SELECT src AS e_src, dest, weight AS e_weight FROM edge; -</pre></li> -<li>Get the shortest path from vertex 1: <pre class="syntax"> -DROP TABLE IF EXISTS out_alt, out_alt_summary; -SELECT madlib.graph_sssp( - 'vertex_alt', -- Vertex table - 'v_id', -- Vertex id column (NULL means use default naming) - 'edge_alt', -- Edge table - 'src=e_src, weight=e_weight', -- Edge arguments (NULL means use default naming) - 1, -- Source vertex for path calculation - 'out_alt'); -- Output table of shortest paths -SELECT * FROM out_alt ORDER BY v_id; -</pre> <pre class="result"> - v_id | e_weight | parent -------+----------+-------- - 0 | 4 | 3 - 1 | 0 | 1 - 2 | 2 | 1 - 3 | 3 | 2 - 4 | 14 | 0 - 5 | 3 | 2 - 6 | 4 | 5 - 7 | 5 | 6 -(8 rows) -</pre></li> -<li>Create a graph with 2 groups: <pre class="syntax"> -DROP TABLE IF EXISTS edge_gr; -CREATE TABLE edge_gr AS -( - SELECT *, 0 AS grp FROM edge - UNION - SELECT *, 1 AS grp FROM edge WHERE src < 6 AND dest < 6 -); -INSERT INTO edge_gr VALUES -(4,5,-20,1); -</pre></li> -<li>Find SSSP for all groups <pre class="syntax"> -DROP TABLE IF EXISTS out_gr, out_gr_summary; -SELECT madlib.graph_sssp( - 'vertex', -- Vertex table - NULL, -- Vertex id column (NULL means use default naming) - 'edge_gr', -- Edge table - NULL, -- Edge arguments (NULL means use default naming) - 0, -- Source vertex for path calculation - 'out_gr', -- Output table of shortest paths - 'grp' -- Grouping columns -); -SELECT * FROM out_gr ORDER BY grp,id; -</pre> <pre class="result"> - grp | id | weight | parent ------+----+--------+-------- - 0 | 0 | 0 | 0 - 0 | 1 | 1 | 0 - 0 | 2 | 1 | 0 - 0 | 3 | 2 | 2 - 0 | 4 | 10 | 0 - 0 | 5 | 2 | 2 - 0 | 6 | 3 | 5 - 0 | 7 | 4 | 6 - 1 | 0 | 0 | 0 - 1 | 1 | 1 | 0 - 1 | 2 | 1 | 0 - 1 | 3 | 2 | 2 - 1 | 4 | 10 | 0 - 1 | 5 | -10 | 4 -</pre></li> -<li>Find 
the path to vertex 5 in every group <pre class="syntax"> -DROP TABLE IF EXISTS out_gr_path; -SELECT madlib.graph_sssp_get_path('out_gr',5,'out_gr_path'); -SELECT * FROM out_gr_path ORDER BY grp; -</pre> <pre class="result"> - grp | path ------+--------- - 0 | {0,2,5} - 1 | {0,4,5} -</pre></li> -</ol> -<p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl> -<p>[1] Bellman–Ford algorithm. <a href="https://en.wikipedia.org/wiki/Bellman%E2%80%93Ford_algorithm">https://en.wikipedia.org/wiki/Bellman%E2%80%93Ford_algorithm</a></p> -<p>[2] The case against specialized graph analytics engines, J. Fan, G. Soosai Raj, and J. M. Patel. CIDR 2015. <a href="http://cidrdb.org/cidr2015/Papers/CIDR15_Paper20.pdf">http://cidrdb.org/cidr2015/Papers/CIDR15_Paper20.pdf</a> </p> -</div><!-- contents --> -</div><!-- doc-content --> -<!-- start footer part --> -<div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> - <ul> - <li class="footer">Generated on Mon Aug 6 2018 21:55:39 for MADlib by - <a href="http://www.doxygen.org/index.html"> - <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li> - </ul> -</div> -</body> -</html> http://git-wip-us.apache.org/repos/asf/madlib-site/blob/573d66d8/docs/rc/group__grp__stats.html ---------------------------------------------------------------------- diff --git a/docs/rc/group__grp__stats.html b/docs/rc/group__grp__stats.html deleted file mode 100644 index d4afa83..0000000 --- a/docs/rc/group__grp__stats.html +++ /dev/null @@ -1,152 +0,0 @@ -<!-- HTML header for doxygen 1.8.4--> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml"> -<head> -<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> -<meta http-equiv="X-UA-Compatible" content="IE=9"/> -<meta name="generator" content="Doxygen 1.8.14"/> -<meta name="keywords"
content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> -<title>MADlib: Statistics</title> -<link href="tabs.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="jquery.js"></script> -<script type="text/javascript" src="dynsections.js"></script> -<link href="navtree.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="resize.js"></script> -<script type="text/javascript" src="navtreedata.js"></script> -<script type="text/javascript" src="navtree.js"></script> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ - $(document).ready(initResizable); -/* @license-end */</script> -<link href="search/search.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="search/searchdata.js"></script> -<script type="text/javascript" src="search/search.js"></script> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ - $(document).ready(function() { init_search(); }); -/* @license-end */ -</script> -<script type="text/x-mathjax-config"> - MathJax.Hub.Config({ - extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], - jax: ["input/TeX","output/HTML-CSS"], -}); -</script><script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js"></script> -<!-- hack in the navigation tree --> -<script type="text/javascript" src="eigen_navtree_hacks.js"></script> -<link href="doxygen.css" rel="stylesheet" type="text/css" /> -<link href="madlib_extra.css" rel="stylesheet" type="text/css"/> -<!-- google analytics --> -<script> - 
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ - (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), - m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) - })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); - ga('create', 'UA-45382226-1', 'madlib.apache.org'); - ga('send', 'pageview'); -</script> -</head> -<body> -<div id="top"><!-- do not remove this div, it is closed by doxygen! --> -<div id="titlearea"> -<table cellspacing="0" cellpadding="0"> - <tbody> - <tr style="height: 56px;"> - <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td> - <td style="padding-left: 0.5em;"> - <div id="projectname"> - <span id="projectnumber">1.15</span> - </div> - <div id="projectbrief">User Documentation for Apache MADlib</div> - </td> - <td> <div id="MSearchBox" class="MSearchBoxInactive"> - <span class="left"> - <img id="MSearchSelect" src="search/mag_sel.png" - onmouseover="return searchBox.OnSearchSelectShow()" - onmouseout="return searchBox.OnSearchSelectHide()" - alt=""/> - <input type="text" id="MSearchField" value="Search" accesskey="S" - onfocus="searchBox.OnSearchFieldFocus(true)" - onblur="searchBox.OnSearchFieldFocus(false)" - onkeyup="searchBox.OnSearchFieldChange(event)"/> - </span><span class="right"> - <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> - </span> - </div> -</td> - </tr> - </tbody> -</table> -</div> -<!-- end header part --> -<!-- Generated by Doxygen 1.8.14 --> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ -var searchBox = new SearchBox("searchBox", "search",false,'Search'); -/* @license-end */ -</script> -</div><!-- top --> -<div id="side-nav" class="ui-resizable 
side-nav-resizable"> - <div id="nav-tree"> - <div id="nav-tree-contents"> - <div id="nav-sync" class="sync"></div> - </div> - </div> - <div id="splitbar" style="-moz-user-select:none;" - class="ui-resizable-handle"> - </div> -</div> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ -$(document).ready(function(){initNavTree('group__grp__stats.html','');}); -/* @license-end */ -</script> -<div id="doc-content"> -<!-- window showing the filter options --> -<div id="MSearchSelectWindow" - onmouseover="return searchBox.OnSearchSelectShow()" - onmouseout="return searchBox.OnSearchSelectHide()" - onkeydown="return searchBox.OnSearchSelectKey(event)"> -</div> - -<!-- iframe showing the search results (closed by default) --> -<div id="MSearchResultsWindow"> -<iframe src="javascript:void(0)" frameborder="0" - name="MSearchResults" id="MSearchResults"> -</iframe> -</div> - -<div class="header"> - <div class="summary"> -<a href="#groups">Modules</a> </div> - <div class="headertitle"> -<div class="title">Statistics</div> </div> -</div><!--header--> -<div class="contents"> -<a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2> -<p>A collection of probability and statistics modules. </p> -<table class="memberdecls"> -<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="groups"></a> -Modules</h2></td></tr> -<tr class="memitem:group__grp__desc__stats"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__desc__stats.html">Descriptive Statistics</a></td></tr> -<tr class="memdesc:group__grp__desc__stats"><td class="mdescLeft"> </td><td class="mdescRight">Methods to compute descriptive statistics of a dataset. 
<br /></td></tr> -<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr> -<tr class="memitem:group__grp__inf__stats"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__inf__stats.html">Inferential Statistics</a></td></tr> -<tr class="memdesc:group__grp__inf__stats"><td class="mdescLeft"> </td><td class="mdescRight">Methods to compute inferential statistics of a dataset. <br /></td></tr> -<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr> -<tr class="memitem:group__grp__prob"><td class="memItemLeft" align="right" valign="top"> </td><td class="memItemRight" valign="bottom"><a class="el" href="group__grp__prob.html">Probability Functions</a></td></tr> -<tr class="memdesc:group__grp__prob"><td class="mdescLeft"> </td><td class="mdescRight">Provides cumulative distribution, density/mass, and quantile functions for a wide range of probability distributions. <br /></td></tr> -<tr class="separator:"><td class="memSeparator" colspan="2"> </td></tr> -</table> -</div><!-- contents --> -</div><!-- doc-content --> -<!-- start footer part --> -<div id="nav-path" class="navpath"><!-- id is needed for treeview function! 
--> - <ul> - <li class="footer">Generated on Mon Aug 6 2018 21:55:39 for MADlib by - <a href="http://www.doxygen.org/index.html"> - <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li> - </ul> -</div> -</body> -</html> http://git-wip-us.apache.org/repos/asf/madlib-site/blob/573d66d8/docs/rc/group__grp__stats.js ---------------------------------------------------------------------- diff --git a/docs/rc/group__grp__stats.js b/docs/rc/group__grp__stats.js deleted file mode 100644 index 0828141..0000000 --- a/docs/rc/group__grp__stats.js +++ /dev/null @@ -1,6 +0,0 @@ -var group__grp__stats = -[ - [ "Descriptive Statistics", "group__grp__desc__stats.html", "group__grp__desc__stats" ], - [ "Inferential Statistics", "group__grp__inf__stats.html", "group__grp__inf__stats" ], - [ "Probability Functions", "group__grp__prob.html", null ] -]; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/madlib-site/blob/573d66d8/docs/rc/group__grp__stats__tests.html ---------------------------------------------------------------------- diff --git a/docs/rc/group__grp__stats__tests.html b/docs/rc/group__grp__stats__tests.html deleted file mode 100644 index d1703f3..0000000 --- a/docs/rc/group__grp__stats__tests.html +++ /dev/null @@ -1,548 +0,0 @@ -<!-- HTML header for doxygen 1.8.4--> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml"> -<head> -<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> -<meta http-equiv="X-UA-Compatible" content="IE=9"/> -<meta name="generator" content="Doxygen 1.8.14"/> -<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> -<title>MADlib: 
Hypothesis Tests</title> -<link href="tabs.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="jquery.js"></script> -<script type="text/javascript" src="dynsections.js"></script> -<link href="navtree.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="resize.js"></script> -<script type="text/javascript" src="navtreedata.js"></script> -<script type="text/javascript" src="navtree.js"></script> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ - $(document).ready(initResizable); -/* @license-end */</script> -<link href="search/search.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="search/searchdata.js"></script> -<script type="text/javascript" src="search/search.js"></script> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ - $(document).ready(function() { init_search(); }); -/* @license-end */ -</script> -<script type="text/x-mathjax-config"> - MathJax.Hub.Config({ - extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], - jax: ["input/TeX","output/HTML-CSS"], -}); -</script><script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js"></script> -<!-- hack in the navigation tree --> -<script type="text/javascript" src="eigen_navtree_hacks.js"></script> -<link href="doxygen.css" rel="stylesheet" type="text/css" /> -<link href="madlib_extra.css" rel="stylesheet" type="text/css"/> -<!-- google analytics --> -<script> - (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ - (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), - m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) - })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); - ga('create', 'UA-45382226-1', 
'madlib.apache.org'); - ga('send', 'pageview'); -</script> -</head> -<body> -<div id="top"><!-- do not remove this div, it is closed by doxygen! --> -<div id="titlearea"> -<table cellspacing="0" cellpadding="0"> - <tbody> - <tr style="height: 56px;"> - <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td> - <td style="padding-left: 0.5em;"> - <div id="projectname"> - <span id="projectnumber">1.15</span> - </div> - <div id="projectbrief">User Documentation for Apache MADlib</div> - </td> - <td> <div id="MSearchBox" class="MSearchBoxInactive"> - <span class="left"> - <img id="MSearchSelect" src="search/mag_sel.png" - onmouseover="return searchBox.OnSearchSelectShow()" - onmouseout="return searchBox.OnSearchSelectHide()" - alt=""/> - <input type="text" id="MSearchField" value="Search" accesskey="S" - onfocus="searchBox.OnSearchFieldFocus(true)" - onblur="searchBox.OnSearchFieldFocus(false)" - onkeyup="searchBox.OnSearchFieldChange(event)"/> - </span><span class="right"> - <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> - </span> - </div> -</td> - </tr> - </tbody> -</table> -</div> -<!-- end header part --> -<!-- Generated by Doxygen 1.8.14 --> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ -var searchBox = new SearchBox("searchBox", "search",false,'Search'); -/* @license-end */ -</script> -</div><!-- top --> -<div id="side-nav" class="ui-resizable side-nav-resizable"> - <div id="nav-tree"> - <div id="nav-tree-contents"> - <div id="nav-sync" class="sync"></div> - </div> - </div> - <div id="splitbar" style="-moz-user-select:none;" - class="ui-resizable-handle"> - </div> -</div> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt 
GPL-v2 */ -$(document).ready(function(){initNavTree('group__grp__stats__tests.html','');}); -/* @license-end */ -</script> -<div id="doc-content"> -<!-- window showing the filter options --> -<div id="MSearchSelectWindow" - onmouseover="return searchBox.OnSearchSelectShow()" - onmouseout="return searchBox.OnSearchSelectHide()" - onkeydown="return searchBox.OnSearchSelectKey(event)"> -</div> - -<!-- iframe showing the search results (closed by default) --> -<div id="MSearchResultsWindow"> -<iframe src="javascript:void(0)" frameborder="0" - name="MSearchResults" id="MSearchResults"> -</iframe> -</div> - -<div class="header"> - <div class="headertitle"> -<div class="title">Hypothesis Tests<div class="ingroups"><a class="el" href="group__grp__stats.html">Statistics</a> » <a class="el" href="group__grp__inf__stats.html">Inferential Statistics</a></div></div> </div> -</div><!--header--> -<div class="contents"> -<div class="toc"><b>Contents</b> <ul> -<li> -<a href="#input">Input</a> </li> -<li> -<a href="#usage">Usage</a> </li> -<li> -<a href="#examples">Examples</a> </li> -<li> -<a href="#literature">Literature</a> </li> -<li> -<a href="#related">Related Topics</a> </li> -</ul> -</div><p>Hypothesis tests are used to confirm or reject a <em>null hypothesis</em> \( H_0 \) about the distribution of random variables, given realizations of these random variables. Since in general it is not possible to make statements with certainty, one is interested in the probability \( p \) of seeing random variates at least as extreme as the ones observed, assuming that \( H_0 \) is true. If this probability \( p \) is small, \( H_0 \) will be rejected by the test with <em>significance level</em> \( p \). Falsifying \( H_0 \) is the canonic goal when employing a hypothesis test. 
That is, hypothesis tests are typically used in order to substantiate that instead the <em>alternative hypothesis</em> \( H_1 \) is true.</p> -<p>Hypothesis tests may be divided into parametric and non-parametric tests. A parametric test assumes certain distributions and makes inferences about parameters of the distributions (e.g., the mean of a normal distribution). Formally, there is a given domain of possible parameters \( \Gamma \) and the null hypothesis \( H_0 \) is the event that the true parameter \( \gamma_0 \in \Gamma_0 \), where \( \Gamma_0 \subsetneq \Gamma \). Non-parametric tests, on the other hand, do not assume any particular distribution of the sample (e.g., a non-parametric test may simply test if two distributions are similar).</p> -<p>The first step of a hypothesis test is to compute a <em>test statistic</em>, which is a function of the random variates, i.e., a random variate itself. A hypothesis test relies on the distribution of the test statistic being (approximately) known. Now, the \( p \)-value is the probability of seeing a test statistic at least as extreme as the one observed, assuming that \( H_0 \) is true. In a case where the null hypothesis corresponds to a family of distributions (e.g., in a parametric test where \( \Gamma_0 \) is not a singleton set), the \( p \)-value is the supremum, over all possible distributions according to the null hypothesis, of these probabilities.</p> -<dl class="section note"><dt>Note</dt><dd>Please refer to <a class="el" href="hypothesis__tests_8sql__in.html">hypothesis_tests.sql_in</a> for additional technical information on the MADlib implementation of hypothesis tests, and for detailed function signatures for all tests.</dd></dl> -<p><a class="anchor" id="input"></a></p><dl class="section user"><dt>Input</dt><dd></dd></dl> -<p>Input data is assumed to be normalized with all values stored row-wise. 
In general, the following inputs are expected.</p> -<p><b>One-sample tests</b> expect the following form: </p><pre>{TABLE|VIEW} <em>source</em> ( - ... - <em>value</em> DOUBLE PRECISION - ... -)</pre><p><b>Two-sample tests</b> expect the following form: </p><pre>{TABLE|VIEW} <em>source</em> ( - ... - <em>first</em> BOOLEAN, - <em>value</em> DOUBLE PRECISION - ... -)</pre><p> The <code>first</code> column indicates whether a value is from the first sample (if <code>TRUE</code>) or the second sample (if <code>FALSE</code>).</p> -<p><b>Many-sample tests</b> expect the following form: </p><pre>{TABLE|VIEW} <em>source</em> ( - ... - <em>group</em> INTEGER, - <em>value</em> DOUBLE PRECISION - ... -)</pre><p><a class="anchor" id="usage"></a></p><dl class="section user"><dt>Usage</dt><dd></dd></dl> -<p>All tests are implemented as aggregate functions. The non-parametric (rank-based) tests are implemented as ordered aggregate functions and thus necessitate an <code>ORDER BY</code> clause. In the following, the simplest forms of usage are given. Specific function signatures, as described in <a class="el" href="hypothesis__tests_8sql__in.html">hypothesis_tests.sql_in</a>, may require more arguments or a different <code>ORDER BY</code> clause.</p> -<ul> -<li>Run a parametric one-sample test: <pre>SELECT <em>test</em>(<em>value</em>) FROM <em>source</em></pre> where '<em>test</em>' can be one of<ul> -<li><code>t_test_one</code> (one-sample or dependent paired Student's t-test)</li> -<li><code>chi2_gof_test</code> (Pearson's chi-squared goodness of fit test, also used for chi-squared independence test as shown in example section below)</li> -</ul> -</li> -<li>Run a parametric two-sample/multi-sample test: <pre>SELECT <em>test</em>(<em>first/group</em>, <em>value</em>) FROM <em>source</em></pre> where '<em>test</em>' can be one of<ul> -<li><code>f_test</code> (Fisher F-test)</li> -<li><code>t_test_two_pooled</code> (two-sample pooled Student's t-test, i.e., 
equal variances)</li> -<li><code>t_test_two_unpooled</code> (two-sample unpooled t-test, i.e., unequal variances, also known as Welch's t-test)</li> -<li><code>one_way_anova</code> (one-way analysis of variance, multi-sample)</li> -</ul> -</li> -<li><p class="startli">Run a non-parametric two-sample/multi-sample test: </p><pre>SELECT <em>test</em>(<em>first/group</em>, <em>value</em> ORDER BY <em>value</em>) FROM <em>source</em></pre><p> where '<em>test</em>' can be one of</p><ul> -<li><code>ks_test</code> (Kolmogorov-Smirnov test)</li> -<li><code>mw_test</code> (Mann-Whitney test)</li> -<li><code>wsr_test</code> (Wilcoxon signed-rank test, multi-sample)</li> -</ul> -<p class="startli"><b>Note on non-parametric tests:</b> The Kolmogorov-Smirnov two-sample test is based on asymptotic theory. The p-value is given by comparing the test statistic with the Kolmogorov distribution. The p-value is also adjusted for data with heavy-tailed distributions, which may give different results than those given by R's ks.test function. See [3] for a detailed explanation. The literature is not unanimous about the definitions of the Wilcoxon rank sum and Mann-Whitney tests. There are two possible definitions for the statistic; MADlib outputs the minimum of the two and uses it for significance testing. This might give different results for both mw_test and wsr_test compared to statistical functions in other popular packages (like R's wilcox.test function). 
See [4] for a detailed explanation.</p> -</li> -</ul> -<p><a class="anchor" id="examples"></a></p><dl class="section user"><dt>Examples</dt><dd></dd></dl> -<ul> -<li><b>One-sample and two-sample t-test</b> (data is subset of mpg data from <a href="http://www.itl.nist.gov/div898/handbook/eda/section3/eda352.htm">NIST/SEMATECH</a>)</li> -</ul> -<pre class="example"> --- Load data -DROP TABLE IF EXISTS auto83b; -CREATE TABLE auto83b ( - id SERIAL, - mpg_us DOUBLE PRECISION, - mpg_j DOUBLE PRECISION -); -COPY auto83b (mpg_us, mpg_j) FROM stdin DELIMITER '|'; -18|24 -15|27 -18|27 -16|25 -17|31 -15|35 -14|24 -14|19 -21|31 -10|32 -10|24 -11|26 -9| 9 -\N|32 -\N|37 -\N|38 -\N|34 -\N|34 -\N|32 -\N|33 -\N|32 -\N|25 -\N|24 -\N|37 -13|\N -12|\N -18|\N -21|\N -19|\N -21|\N -15|\N -16|\N -15|\N -11|\N -20|\N -21|\N -19|\N -15|\N -\. -</pre><pre class="example"> --- Create table for one sample tests -DROP TABLE IF EXISTS auto83b_one_sample; -CREATE TABLE auto83b_one_sample AS - SELECT mpg_us AS mpg - FROM auto83b - WHERE mpg_us is not NULL; --- Print table -SELECT * FROM auto83b_one_sample; -</pre><pre class="result"> -mpg - 18 - 15 - 18 - 16 - 17 - 15 - 14 - 14 - 21 - 10 - 10 - 11 - 9 - 13 - 12 - 18 - 21 - 19 - 21 - 15 - 16 - 15 - 11 - 20 - 21 - 19 - 15 -(27 rows) -</pre> <pre class="example"> --- Create table for two sample tests -DROP TABLE IF EXISTS auto83b_two_sample; -CREATE TABLE auto83b_two_sample AS -SELECT TRUE AS is_us, mpg_us AS mpg - FROM auto83b - WHERE mpg_us is not NULL - UNION ALL - SELECT FALSE, mpg_j - FROM auto83b - WHERE mpg_j is not NULL; --- Print table -SELECT * FROM auto83b_two_sample; -</pre> <pre class="result"> - is_us | mpg --------+----- - t | 18 - t | 15 - t | 18 - t | 16 - t | 17 - t | 15 - t | 14 - t | 14 - t | 21 - t | 10 - t | 10 - t | 11 - t | 9 - t | 13 - t | 12 - t | 18 - t | 21 - t | 19 - t | 21 - t | 15 - t | 16 - t | 15 - t | 11 - t | 20 - t | 21 - t | 19 - t | 15 - f | 24 - f | 27 - f | 27 - f | 25 - f | 31 - f | 35 - f | 24 - f | 19 - f | 
31 - f | 32 - f | 24 - f | 26 - f | 9 - f | 32 - f | 37 - f | 38 - f | 34 - f | 34 - f | 32 - f | 33 - f | 32 - f | 25 - f | 24 - f | 37 -(51 rows) -</pre> <pre class="example"> --- One sample tests -SELECT (madlib.t_test_one(mpg - 20)).* FROM auto83b_one_sample; -- test rejected for mean = 20 -</pre><pre class="result"> - statistic | df | p_value_one_sided | p_value_two_sided - ------------------+----+-------------------+---------------------- - -6.0532478722666 | 26 | 0.999998926789141 | 2.14642171769697e-06 - </pre><pre class="example"> -SELECT (madlib.t_test_one(mpg - 15.7)).* FROM auto83b_one_sample; -- test not rejected -</pre><pre class="result"> - statistic | df | p_value_one_sided | p_value_two_sided - ---------------------+----+-------------------+------------------- - 0.00521831713126531 | 26 | 0.497938118950661 | 0.995876237901321 -</pre><pre class="example"> --- Two sample tests -SELECT (madlib.t_test_two_pooled(is_us, mpg)).* FROM auto83b_two_sample; -</pre> <pre class="result"> - statistic | df | p_value_one_sided | p_value_two_sided - -------------------+----+-------------------+---------------------- - -8.89342267075968 | 49 | 0.999999999995748 | 8.50408632402377e-12 - </pre><pre class="example"> -SELECT (madlib.t_test_two_unpooled(is_us, mpg)).* FROM auto83b_two_sample; -</pre><pre class="result"> - statistic | df | p_value_one_sided | p_value_two_sided - -------------------+------------------+-------------------+---------------------- - -8.61746388524314 | 35.1283818346179 | 0.999999999821218 | 3.57563867403599e-10 -</pre><ul> -<li><b>F-Test</b> (Uses same data as above t-test)</li> -</ul> -<pre class="example"> -SELECT (madlib.f_test(is_us, mpg)).* FROM auto83b_two_sample; --- Test result indicates that the two distributions have different variances -</pre> <pre class="result"> - statistic | df1 | df2 | p_value_one_sided | p_value_two_sided - -------------------+-----+-----+-------------------+--------------------- - 0.311786921089247 | 26 | 23 
| 0.997559863672441 | 0.00488027265511803 -</pre><ul> -<li><b>Chi-squared goodness-of-fit test</b> (<a href="http://www.statsdirect.com/help/default.htm#nonparametric_methods/chisq_goodness_fit.htm">Data source</a>)</li> -</ul> -<pre class="example"> -CREATE TABLE chi2_test_blood_group ( - id SERIAL, - blood_group VARCHAR, - observed BIGINT, - expected DOUBLE PRECISION -); -INSERT INTO chi2_test_blood_group(blood_group, observed, expected) VALUES - ('O', 67, 82.28), - ('A', 83, 84.15), - ('B', 29, 14.96), - ('AB', 8, 5.61); -SELECT (madlib.chi2_gof_test(observed, expected)).* FROM chi2_test_blood_group; -</pre> <pre class="result"> - statistic | p_value | df | phi | contingency_coef - ------------------+----------------------+----+------------------+------------------- - 17.0481013341976 | 0.000690824622923826 | 3 | 2.06446732440826 | 0.899977280680593 - </pre><ul> -<li><b>Chi-squared independence test</b> (<a href="http://itl.nist.gov/div898/software/dataplot/refman1/auxillar/chistest.htm">Data source</a>)</li> -</ul> -<p>The Chi-squared independence test uses the Chi-squared goodness-of-fit function, as shown in the example below. The expected value needs to be computed and passed to the goodness-of-fit function. The expected value for MADlib is computed as <em>sum of rows * sum of columns</em>, for each element of the input matrix. 
For example, the expected value for element (2,1) would be <em>sum of row 2 * sum of column 1</em>.</p> -<pre class="example"> -CREATE TABLE chi2_test_friendly ( - id_x SERIAL, - values INTEGER[] -); -INSERT INTO chi2_test_friendly(values) VALUES - (array[5, 29, 14, 16]), - (array[15, 54, 14, 10]), - (array[20, 84, 17, 94]), - (array[68, 119, 26, 7]);</pre><pre class="example">-- Input table is expected to be unpivoted, so we need to unpivot it -CREATE TABLE chi2_test_friendly_unpivoted AS -SELECT id_x, id_y, values[id_y] AS observed -FROM - chi2_test_friendly, - generate_series(1,4) AS id_y;</pre><pre class="example">-- Compute Chi-squared independence statistic, by calculating expected value in the SQL and calling the goodness-of-fit function -SELECT (madlib.chi2_gof_test(observed, expected, deg_freedom)).* -FROM ( - -- Compute expected values and degrees of freedom - SELECT - observed, - sum(observed) OVER (PARTITION BY id_x)::DOUBLE PRECISION * - sum(observed) OVER (PARTITION BY id_y) AS expected - FROM chi2_test_friendly_unpivoted -) p, ( - SELECT - (count(DISTINCT id_x) - 1) * (count(DISTINCT id_y) - 1) AS deg_freedom - FROM chi2_test_friendly_unpivoted -) q; -</pre> <pre class="result"> - statistic | p_value | df | phi | contingency_coef - ------------------+----------------------+----+------------------+------------------- - 138.289841626008 | 2.32528678709871e-25 | 9 | 2.93991753313346 | 0.946730727519112 - </pre><ul> -<li><b>ANOVA test</b> (<a href="http://www.itl.nist.gov/div898/handbook/prc/section4/prc433.htm">Data source</a>)</li> -</ul> -<pre class="example"> -CREATE TABLE nist_anova_test ( - id SERIAL, - resistance FLOAT8[] -); -INSERT INTO nist_anova_test(resistance) VALUES - (array[6.9,8.3,8.0]), - (array[5.4,6.8,10.5]), - (array[5.8,7.8,8.1]), - (array[4.6,9.2,6.9]), - (array[4.0,6.5,9.3]);</pre><pre class="example">SELECT (madlib.one_way_anova(level, value)).* FROM ( - SELECT level, resistance[level] AS value - FROM - nist_anova_test, (SELECT * FROM 
generate_series(1,3) level) q1 -) q2; -</pre> <pre class="result"> - sum_squares_between | sum_squares_within | df_between | df_within | mean_squares_between | mean_squares_within | statistic | p_value - ---------------------+--------------------+------------+-----------+----------------------+---------------------+------------------+-------------------- - 27.8973333333333 | 17.452 | 2 | 12 | 13.9486666666667 | 1.45433333333333 | 9.59110703644281 | 0.0032482226008593 -</pre><ul> -<li><b>Kolmogorov-Smirnov test</b> (<a href="http://www.physics.csbsju.edu/stats/KS-test.html">Data source</a>)</li> -</ul> -<pre class="example"> -CREATE TABLE ks_sample_1 AS -SELECT - TRUE AS first, - unnest(ARRAY[0.22, -0.87, -2.39, -1.79, 0.37, -1.54, 1.28, -0.31, -0.74, 1.72, 0.38, -0.17, -0.62, -1.10, 0.30, 0.15, 2.30, 0.19, -0.50, -0.09]) AS value -UNION ALL -SELECT - FALSE, - unnest(ARRAY[-5.13, -2.19, -2.43, -3.83, 0.50, -3.25, 4.32, 1.63, 5.18, -0.43, 7.11, 4.87, -3.10, -5.81, 3.76, 6.31, 2.58, 0.07, 5.76, 3.50]);</pre><pre class="example">SELECT (madlib.ks_test(first, value, - (SELECT count(value) FROM ks_sample_1 WHERE first), - (SELECT count(value) FROM ks_sample_1 WHERE NOT first) - ORDER BY value)).* -FROM ks_sample_1; -</pre> <pre class="result"> - statistic | k_statistic | p_value - -----------+-----------------+-------------------- - 0.45 | 1.4926782214936 | 0.0232132758544496 -</pre><ul> -<li><b>Mann-Whitney test</b> (use same data as t-test)</li> -</ul> -<pre class="example"> -SELECT (madlib.mw_test(is_us, mpg ORDER BY mpg)).* from auto83b_two_sample; --- Note first parameter above is BOOLEAN -</pre> <pre class="result"> - statistic | u_statistic | p_value_one_sided | p_value_two_sided - -------------------+-------------+-------------------+---------------------- - -5.50097925755249 | 32.5 | 0.999999981115618 | 3.77687645883758e-08 -</pre><ul> -<li><b>Wilcoxon signed-rank test</b></li> -</ul> -<pre class="example"> -DROP TABLE IF EXISTS test_wsr; -CREATE TABLE test_wsr 
( - x DOUBLE PRECISION, - y DOUBLE PRECISION -); -COPY test_wsr (x, y) FROM stdin DELIMITER '|'; -0.32|0.39 -0.4|0.47 -0.11|0.11 -0.47|0.43 -0.32|0.42 -0.35|0.3 -0.32|0.43 -0.63|0.98 -0.5|0.86 -0.6|0.79 -0.38|0.33 -0.46|0.45 -0.2|0.22 -0.31|0.3 -0.62|0.6 -0.52|0.53 -0.77|0.85 -0.23|0.21 -0.3|0.33 -0.7|0.57 -0.41|0.43 -0.53|0.49 -0.19|0.2 -0.31|0.35 -0.48|0.4 -\. - -SELECT (madlib.wsr_test( - x - y, - 2 * 2^(-52) * greatest(x,y) - ORDER BY abs(x - y) -)).* -FROM test_wsr; -</pre> <pre class="result"> - statistic | rank_sum_pos | rank_sum_neg | num | z_statistic | p_value_one_sided | p_value_two_sided - -----------+--------------+--------------+-----+-------------------+-------------------+------------------- - 105.5 | 105.5 | 194.5 | 24 | -1.27318365656729 | 0.898523560667509 | 0.202952878664983 -</pre><p><a class="anchor" id="literature"></a></p><dl class="section user"><dt>Literature</dt><dd></dd></dl> -<p>[1] M. Hollander, D. Wolfe: <em>Nonparametric Statistical Methods</em>, 2nd edition, Wiley, 1999</p> -<p>[2] E. Lehmann, J. Romano: <em>Testing Statistical Hypotheses</em>, 3rd edition, Springer, 2005</p> -<p>[3] M. Stephens: <em>Use of the Kolmogorov-Smirnov, Cramer-Von Mises and related statistics without extensive tables</em>, Journal of the Royal Statistical Society. Series B (Methodological) (1970): 115-122.</p> -<p>[4] Wikipedia: Mann–Whitney U test calculation, <a href="http://en.wikipedia.org/wiki/Mann-Whitney_test#Calculations">http://en.wikipedia.org/wiki/Mann-Whitney_test#Calculations</a></p> -<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd></dd></dl> -<p>File <a class="el" href="hypothesis__tests_8sql__in.html" title="SQL functions for statistical hypothesis tests. ">hypothesis_tests.sql_in</a> documenting the SQL functions. </p> -</div><!-- contents --> -</div><!-- doc-content --> -<!-- start footer part --> -<div id="nav-path" class="navpath"><!-- id is needed for treeview function! 
--> - <ul> - <li class="footer">Generated on Mon Aug 6 2018 21:55:39 for MADlib by - <a href="http://www.doxygen.org/index.html"> - <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li> - </ul> -</div> -</body> -</html> http://git-wip-us.apache.org/repos/asf/madlib-site/blob/573d66d8/docs/rc/group__grp__stemmer.html ---------------------------------------------------------------------- diff --git a/docs/rc/group__grp__stemmer.html b/docs/rc/group__grp__stemmer.html deleted file mode 100644 index 82703d1..0000000 --- a/docs/rc/group__grp__stemmer.html +++ /dev/null @@ -1,247 +0,0 @@ -<!-- HTML header for doxygen 1.8.4--> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml"> -<head> -<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/> -<meta http-equiv="X-UA-Compatible" content="IE=9"/> -<meta name="generator" content="Doxygen 1.8.14"/> -<meta name="keywords" content="madlib,postgres,greenplum,machine learning,data mining,deep learning,ensemble methods,data science,market basket analysis,affinity analysis,pca,lda,regression,elastic net,huber white,proportional hazards,k-means,latent dirichlet allocation,bayes,support vector machines,svm"/> -<title>MADlib: Stemming</title> -<link href="tabs.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="jquery.js"></script> -<script type="text/javascript" src="dynsections.js"></script> -<link href="navtree.css" rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="resize.js"></script> -<script type="text/javascript" src="navtreedata.js"></script> -<script type="text/javascript" src="navtree.js"></script> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ - $(document).ready(initResizable); -/* @license-end */</script> -<link href="search/search.css" 
rel="stylesheet" type="text/css"/> -<script type="text/javascript" src="search/searchdata.js"></script> -<script type="text/javascript" src="search/search.js"></script> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ - $(document).ready(function() { init_search(); }); -/* @license-end */ -</script> -<script type="text/x-mathjax-config"> - MathJax.Hub.Config({ - extensions: ["tex2jax.js", "TeX/AMSmath.js", "TeX/AMSsymbols.js"], - jax: ["input/TeX","output/HTML-CSS"], -}); -</script><script type="text/javascript" async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.2/MathJax.js"></script> -<!-- hack in the navigation tree --> -<script type="text/javascript" src="eigen_navtree_hacks.js"></script> -<link href="doxygen.css" rel="stylesheet" type="text/css" /> -<link href="madlib_extra.css" rel="stylesheet" type="text/css"/> -<!-- google analytics --> -<script> - (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ - (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), - m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) - })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); - ga('create', 'UA-45382226-1', 'madlib.apache.org'); - ga('send', 'pageview'); -</script> -</head> -<body> -<div id="top"><!-- do not remove this div, it is closed by doxygen! 
--> -<div id="titlearea"> -<table cellspacing="0" cellpadding="0"> - <tbody> - <tr style="height: 56px;"> - <td id="projectlogo"><a href="http://madlib.apache.org"><img alt="Logo" src="madlib.png" height="50" style="padding-left:0.5em;" border="0"/ ></a></td> - <td style="padding-left: 0.5em;"> - <div id="projectname"> - <span id="projectnumber">1.15</span> - </div> - <div id="projectbrief">User Documentation for Apache MADlib</div> - </td> - <td> <div id="MSearchBox" class="MSearchBoxInactive"> - <span class="left"> - <img id="MSearchSelect" src="search/mag_sel.png" - onmouseover="return searchBox.OnSearchSelectShow()" - onmouseout="return searchBox.OnSearchSelectHide()" - alt=""/> - <input type="text" id="MSearchField" value="Search" accesskey="S" - onfocus="searchBox.OnSearchFieldFocus(true)" - onblur="searchBox.OnSearchFieldFocus(false)" - onkeyup="searchBox.OnSearchFieldChange(event)"/> - </span><span class="right"> - <a id="MSearchClose" href="javascript:searchBox.CloseResultsWindow()"><img id="MSearchCloseImg" border="0" src="search/close.png" alt=""/></a> - </span> - </div> -</td> - </tr> - </tbody> -</table> -</div> -<!-- end header part --> -<!-- Generated by Doxygen 1.8.14 --> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ -var searchBox = new SearchBox("searchBox", "search",false,'Search'); -/* @license-end */ -</script> -</div><!-- top --> -<div id="side-nav" class="ui-resizable side-nav-resizable"> - <div id="nav-tree"> - <div id="nav-tree-contents"> - <div id="nav-sync" class="sync"></div> - </div> - </div> - <div id="splitbar" style="-moz-user-select:none;" - class="ui-resizable-handle"> - </div> -</div> -<script type="text/javascript"> -/* @license magnet:?xt=urn:btih:cf05388f2679ee054f2beb29a391d25f4e673ac3&dn=gpl-2.0.txt GPL-v2 */ -$(document).ready(function(){initNavTree('group__grp__stemmer.html','');}); -/* @license-end */ -</script> -<div id="doc-content"> 
-<!-- window showing the filter options --> -<div id="MSearchSelectWindow" - onmouseover="return searchBox.OnSearchSelectShow()" - onmouseout="return searchBox.OnSearchSelectHide()" - onkeydown="return searchBox.OnSearchSelectKey(event)"> -</div> - -<!-- iframe showing the search results (closed by default) --> -<div id="MSearchResultsWindow"> -<iframe src="javascript:void(0)" frameborder="0" - name="MSearchResults" id="MSearchResults"> -</iframe> -</div> - -<div class="header"> - <div class="headertitle"> -<div class="title">Stemming<div class="ingroups"><a class="el" href="group__grp__datatrans.html">Data Types and Transformations</a></div></div> </div> -</div><!--header--> -<div class="contents"> -<div class="toc"><b>Contents</b> <ul> -<li> -<a href="#notes">Implementation Notes</a> </li> -<li> -<a href="#list">List of Stemmer Operations</a> </li> -<li> -<a href="#examples">Examples</a> </li> -<li> -<a href="#related">Related Topics</a> </li> -</ul> -</div><p>This module provides a basic stemming operation for text input. It is a support module for several machine learning algorithms that require a stemmer. Currently, it only supports English words.</p> -<p>This function is a SQL interface to the implementation of the <a href="http://tartarus.org/~martin/PorterStemmer/">Porter Stemming Algorithm</a>. The original stemming algorithm is written and maintained by Martin Porter.</p> -<p><a class="anchor" id="notes"></a></p><dl class="section user"><dt>Implementation Notes</dt><dd></dd></dl> -<p>All functions described in this module work with either a text value or a text array.</p> -<p>Several of the functions require TEXT values and return NULL for a NULL input. See the descriptions of the individual functions for details.</p> -<p><a class="anchor" id="list"></a></p><dl class="section user"><dt>Stemmer Operations</dt><dd><table class="output"> -<tr> -<th><a class="el" href="porter__stemmer_8sql__in.html#aca5bc24a9a8f5c33470b9f0bf0b3c515" title="Returns stem of input token. 
Returns NULL if input token is NULL. ">stem_token()</a></th><td><p class="starttd">Returns the stem of the token. Returns NULL if input is NULL.</p> -<p class="endtd"></p> -</td></tr> -<tr> -<th><a class="el" href="porter__stemmer_8sql__in.html#a1ac3a2fd645ddf807b36a1328134a4ea" title="Returns stems in an array of input token array. Returns NULL element for corresponding input NULL tok...">stem_token_arr()</a></th><td><p class="starttd">Returns an array containing the stem of each token in the input token array. The stem is NULL for each corresponding NULL token.</p> -<p class="endtd"><a class="anchor" id="examples"></a></p> -</td></tr> -</table> -</dd></dl> -<dl class="section user"><dt>Examples</dt><dd></dd></dl> -<ol type="1"> -<li>Create a table with some words to be stemmed. <pre class="example"> -CREATE TABLE token_tbl ( id integer, - word text - ); -INSERT INTO token_tbl VALUES - (1, 'kneel'), - (2, 'kneeled'), - (3, 'kneeling'), - (4, 'kneels'), - (5, 'knees'), - (6, 'knell'), - (7, 'knelt'), - (8, 'knew'), - (9, 'knick'), - (10, 'knif'), - (11, 'knife'), - (12, 'knight'), - (13, 'knightly'), - (14, 'knights'), - (15, 'knit'), - (16, 'knits'), - (17, 'knitted'), - (18, 'knitting'), - (19, 'knives'), - (20, 'knob'), - (21, 'knobs'), - (22, 'knock'), - (23, 'knocked'), - (24, 'knocker'), - (25, 'knockers'), - (26, 'knocking'), - (27, 'knocks'), - (28, 'knopp'), - (29, 'knot'), - (30, 'knots'); -</pre></li> -<li>Return the stem of each word. <pre class="example"> -SELECT madlib.stem_token(word) FROM token_tbl; -</pre> <pre class="result"> - stem_token - ------------ - kneel - kneel - kneel - kneel - knee - knell - knelt - knew - knick - knif - knife - knight - knight - knight - knit - knit - knit - knit - knive - knob - knob - knock - knock - knocker - knocker - knock - knock - knopp - knot - knot -(30 rows) -</pre></li> -<li>The input can be processed as an array <pre class="example"> -SELECT madlib.stem_token_arr(array_agg(word order by word)) FROM token_tbl; -</pre> <pre class="result"> - 
stem_token_arr - ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - {kneel,kneel,kneel,kneel,knee,knell,knelt,knew,knick,knif,knife,knight,knight,knight,knit,knit,knit,knit,knive,knob,knob,knock,knock,knocker,knocker,knock,knock,knopp,knot,knot} -(1 row) -</pre></li> -</ol> -<p><a class="anchor" id="related"></a></p><dl class="section user"><dt>Related Topics</dt><dd></dd></dl> -<p>File <a class="el" href="porter__stemmer_8sql__in.html" title="implementation of porter stemmer operations in SQL ">porter_stemmer.sql_in</a> for list of functions and usage. </p> -</div><!-- contents --> -</div><!-- doc-content --> -<!-- start footer part --> -<div id="nav-path" class="navpath"><!-- id is needed for treeview function! --> - <ul> - <li class="footer">Generated on Mon Aug 6 2018 21:55:39 for MADlib by - <a href="http://www.doxygen.org/index.html"> - <img class="footer" src="doxygen.png" alt="doxygen"/></a> 1.8.14 </li> - </ul> -</div> -</body> -</html>