use...

buildbot Thu, 19 Mar 2015 14:22:54 -0700

Added: 
websites/staging/mahout/trunk/content/users/mapreduce/clustering/streaming-k-means.html
==============================================================================
--- 
websites/staging/mahout/trunk/content/users/mapreduce/clustering/streaming-k-means.html
 (added)
+++ 
websites/staging/mahout/trunk/content/users/mapreduce/clustering/streaming-k-means.html
 Thu Mar 19 21:21:45 2015
@@ -0,0 +1,431 @@
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head><meta 
http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data 
framework, data integration,
+        data matching, data mining, data mining algorithms, data mining 
analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data 
mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning 
methods,
+        learning techniques, lucene, machine learning, machine translation, 
mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive 
bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data 
mining">
+  <link rel="shortcut icon" type="image/x-icon" 
href="http://mahout.apache.org/images/favicon.ico";>
+  <script type="text/javascript" src="/js/prototype.js"></script>
+  <script type="text/javascript" src="/js/effects.js"></script>
+  <script type="text/javascript" src="/js/search.js"></script>
+  <script type="text/javascript" src="/js/slides.js"></script>
+
+  <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
+  <link href="/css/bootstrap-responsive.css" rel="stylesheet">
+  <link rel="stylesheet" href="/css/global.css" type="text/css">
+
+  <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+    }
+  });
+  MathJax.Hub.Queue(function() {
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax';
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); 
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ?
+        
'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
 : 
+        
'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+       
+         var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s);
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/overview.html"></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search"; method="get" 
class="navbar-search pull-right">    
+      <input value="http://mahout.apache.org"; name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/images/mahout-lupe.png" 
alt="Search" />
+    </form>
+  </div>
+
+    <div class="navbar navbar-inverse" 
style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: 
none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" 
data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development 
Project</a> -->
+          <div class="nav-collapse collapse">
+            <ul class="nav">
+              <li><a href="/">Home</a></li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">General<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/general/downloads.html">Downloads</a>
+                  <li><a href="/general/who-we-are.html">Who we are</a>
+                  <li><a 
href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                  <li><a href="/general/release-notes.html">Release Notes</a> 
+                  <li><a href="/general/books-tutorials-and-talks.html">Books, 
Tutorials, Talks</a></li>
+                  <li><a href="/general/powered-by-mahout.html">Powered By 
Mahout</a>
+                  <li><a 
href="/general/professional-support.html">Professional Support</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Resources</li>
+                  <li><a href="/general/reference-reading.html">Reference 
Reading</a>
+                  <li><a href="/general/faq.html">FAQ</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Legal</li>
+                  <li><a 
href="http://www.apache.org/licenses/";>License</a></li>
+                  <li><a 
href="http://www.apache.org/security/";>Security</a></li>
+                  <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+                </ul>
+              </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Developers<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/developers/developer-resources.html">Developer 
resources</a></li>
+                  <li><a href="/developers/version-control.html">Version 
control</a></li>
+                  <li><a href="/developers/buildingmahout.html">Build from 
source</a></li>
+                  <li><a href="/developers/issue-tracker.html">Issue 
tracker</a></li>
+                  <li><a href="https://builds.apache.org/job/Mahout-Quality/"; 
target="_blank">Code quality reports</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Contributions</li>
+                  <li><a href="/developers/how-to-contribute.html">How to 
contribute</a></li>
+                  <li><a href="/developers/how-to-become-a-committer.html">How 
to become a committer</a></li>
+                  <li><a href="/developers/gsoc.html">GSoC</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">For committers</li>
+                  <li><a href="/developers/how-to-update-the-website.html">How 
to update the website</a></li>
+                  <li><a href="/developers/patch-check-list.html">Patch check 
list</a></li>
+                  <li><a href="/developers/github.html">Handling Github 
PRs</a></li>
+                  <li><a href="/developers/how-to-release.html">How to 
release</a></li>
+                  <li><a href="/developers/thirdparty-dependencies.html">Third 
party dependencies</a></li>
+                </ul>
+               </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Basics<b class="caret"></b></a>
+                 <ul class="dropdown-menu">
+                  <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                  <li><a href="/users/basics/quickstart.html">Quickstart</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Working with text</li>
+                  <li><a 
href="/users/basics/creating-vectors-from-text.html">Creating vectors from 
text</a>
+                  <li><a 
href="/users/basics/collocations.html">Collocations</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Dimensionality reduction</li>
+                  <li><a 
href="/users/dim-reduction/dimensional-reduction.html">Singular Value 
Decomposition</a></li>
+                  <li><a href="/users/dim-reduction/ssvd.html">Stochastic 
SVD</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Topic Models</li>      
+                  <li><a 
href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet 
Allocation</a></li>
+                </ul>
+                 </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Spark<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/sparkbindings/home.html">Scala &amp; 
Spark Bindings Overview</a></li>
+                  <li><a 
href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark 
Shell</a></li>
+                             <li class="divider"></li>
+                  <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                </ul>
+               </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Classification<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a 
href="/users/mapreduce/classification/bayesian.html">Naive Bayes</a></li>
+                  <li><a 
href="/users/mapreduce/classification/hidden-markov-models.html">Hidden Markov 
Models</a></li>
+                  <li><a 
href="/users/mapreduce/classification/logistic-regression.html">Logistic 
Regression</a></li>
+                  <li><a 
href="/users/mapreduce/classification/partial-implementation.html">Random 
Forest</a></li>
+
+                  <li class="divider"></li>
+                  <li class="nav-header">Examples</li>
+                  <li><a 
href="/users/mapreduce/classification/breiman-example.html">Breiman 
example</a></li>
+                  <li><a 
href="/users/mapreduce/classification/twenty-newsgroups.html">20 newsgroups 
example</a></li>
+                </ul></li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Clustering<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a 
href="/users/mapreduce/clustering/k-means-clustering.html">k-Means</a></li>
+                <li><a 
href="/users/mapreduce/clustering/canopy-clustering.html">Canopy</a></li>
+                <li><a 
href="/users/mapreduce/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li>
+                <li><a 
href="/users/mapreduce/clustering/streaming-k-means.html">Streaming 
KMeans</a></li>
+                <li><a 
href="/users/mapreduce/clustering/spectral-clustering.html">Spectral 
Clustering</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Commandline usage</li>
+                <li><a 
href="/users/mapreduce/clustering/k-means-commandline.html">Options for 
k-Means</a></li>
+                <li><a 
href="/users/mapreduce/clustering/canopy-commandline.html">Options for 
Canopy</a></li>
+                <li><a 
href="/users/mapreduce/clustering/fuzzy-k-means-commandline.html">Options for 
Fuzzy k-Means</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Examples</li>
+                <li><a 
href="/users/mapreduce/clustering/clustering-of-synthetic-control-data.html">Synthetic
 data</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Post processing</li>
+                <li><a 
href="/users/mapreduce/clustering/cluster-dumper.html">Cluster Dumper 
tool</a></li>
+                <li><a 
href="/users/mapreduce/clustering/visualizing-sample-clusters.html">Cluster 
visualisation</a></li>
+                </ul></li>
+                <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a 
href="/users/mapreduce/recommender/quickstart.html">Quickstart</a></li>
+                <li><a 
href="/users/mapreduce/recommender/recommender-first-timer-faq.html">First 
Timer FAQ</a></li>
+                <li><a 
href="/users/mapreduce/recommender/userbased-5-minutes.html">A user-based 
recommender <br/>in 5 minutes</a></li>
+               <li><a 
href="/users/mapreduce/recommender/matrix-factorization.html">Matrix 
factorization-based<br/> recommenders</a></li>
+                <li><a 
href="/users/mapreduce/recommender/recommender-documentation.html">Overview</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Hadoop</li>
+                <li><a 
href="/users/mapreduce/recommender/intro-itembased-hadoop.html">Intro to 
item-based recommendations<br/> with Hadoop</a></li>
+                <li><a 
href="/users/mapreduce/recommender/intro-als-hadoop.html">Intro to ALS 
recommendations<br/> with Hadoop</a></li>
+                <li class="nav-header">Spark</li>
+                <li><a 
href="/users/mapreduce/recommender/intro-cooccurrence-spark.html">Intro to 
cooccurrence-based<br/> recommendations with Spark</a></li>
+              </ul>
+            </li>
+           </ul>
+          </div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+       <ul class="sidemenu">
+               <li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout"; 
data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>!function(d,s,id){var 
js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
+</li>
+       </ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html";>How the 
ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html";>Get 
Involved</a></li>
+      <li><a href="http://www.apache.org/dev/";>Developer Resources</a></li>
+      <li><a 
href="http://www.apache.org/foundation/sponsorship.html";>Sponsorship</a></li>
+      <li><a 
href="http://www.apache.org/foundation/thanks.html";>Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/";>Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/";>Hadoop</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+    <h1 id="streamingkmeans-algorithm"><em>StreamingKMeans</em> algorithm</h1>
+<p>The <em>StreamingKMeans</em> algorithm is a variant of Algorithm 1 from <a 
href="http://nips.cc/Conferences/2011/Program/event.php?ID=2989" title="M. 
Shindler, A. Wong, A. Meyerson: Fast and Accurate k-means For Large 
Datasets">Shindler et al</a> and consists of two steps:</p>
+<ol>
+<li>Streaming step </li>
+<li>BallKMeans step. </li>
+</ol>
+<p>The streaming step is a randomized algorithm that makes one pass through 
the data and 
+produces as many centroids as it determines is optimal. This step can be 
viewed as 
+a preparatory dimensionality reduction. If the size of the data stream is 
<em>n</em> and the 
+expected number of clusters is <em>k</em>, the streaming step will produce 
roughly <em>k*log(n)</em> 
+clusters that will be passed on to the BallKMeans step which will further 
reduce the 
+number of clusters down to <em>k</em>. BallKMeans is a randomized Lloyd-type 
algorithm that
+has been studied in detail, see <a 
href="http://www.math.uwaterloo.ca/~cswamy/papers/kmeansfnl.pdf" title="R. 
Ostrovsky, Y. Rabani, L. Schulman, Ch. Swamy: The Effectiveness of Lloyd-Type 
Methods for the k-means Problem">Ostrovsky et al</a>.</p>
+<h2 id="streaming-step">Streaming step</h2>
+<hr />
+<h3 id="overview">Overview</h3>
+<p>The streaming step is a derivative of the streaming 
+portion of Algorithm 1 in <a 
href="http://nips.cc/Conferences/2011/Program/event.php?ID=2989" title="M. 
Shindler, A. Wong, A. Meyerson: Fast and Accurate k-means For Large 
Datasets">Shindler et al</a>. The main difference between the two is that 
+Algorithm 1 of <a 
href="http://nips.cc/Conferences/2011/Program/event.php?ID=2989" title="M. 
Shindler, A. Wong, A. Meyerson: Fast and Accurate k-means For Large 
Datasets">Shindler et al</a> assumes 
+the knowledge of the size of the data stream and uses it to set a key 
parameter 
+for the algorithm. More precisely, the initial <em>distanceCutoff</em> 
(defined below), which is 
+denoted by <em>f</em> in <a 
href="http://nips.cc/Conferences/2011/Program/event.php?ID=2989" title="M. 
Shindler, A. Wong, A. Meyerson: Fast and Accurate k-means For Large 
Datasets">Shindler et al</a>, is set to <em>1/(k(1+log(n)))</em>. The 
<em>distanceCutoff</em> influences the number of clusters that the algorithm 
+will produce. 
+In contrast, the Mahout implementation does not require knowledge of the size 
of the 
+data stream. Instead, it dynamically re-evaluates the parameters that depend 
on the size 
+of the data stream at runtime as more and more data is processed. In 
particular, 
+the parameter <em>numClusters</em> (defined below) changes its value as the 
data is processed.   </p>
+<h3 id="parameters">Parameters</h3>
+<ul>
+<li><strong>numClusters</strong> (int): Conceptually, <em>numClusters</em> 
represents the algorithm's guess at the optimal 
+number of clusters it is shooting for. In particular, <em>numClusters</em> 
will increase at run 
+time as more and more data is processed. Note that <em>numClusters</em> is not 
the number of clusters that the algorithm will produce. Also, 
<em>numClusters</em> should not be set to the final number of clusters that we 
expect to receive as the output of <em>StreamingKMeans</em>. </li>
+<li><strong>distanceCutoff</strong> (double): a parameter representing the 
value of the distance between a point and 
+its closest centroid after which
+the new point will definitely be assigned to a new cluster. 
<em>distanceCutoff</em> can be thought 
+of as an estimate of the variable <em>f</em> from Shindler et al. The default 
initial value for 
+<em>distanceCutoff</em> is <em>1.0/numClusters</em> and 
<em>distanceCutoff</em> grows as a geometric progression with 
+common ratio <em>beta</em> (see below).    </li>
+<li><strong>beta</strong> (double): a constant parameter that controls the 
growth of <em>distanceCutoff</em>. If the initial setting of 
<em>distanceCutoff</em> is <em>d0</em>, <em>distanceCutoff</em> will grow as 
the geometric progression with initial term <em>d0</em> and common ratio 
<em>beta</em>. The default value for <em>beta</em> is 1.3. </li>
+<li><strong>clusterLogFactor</strong> (double): a constant parameter such that 
<em>clusterLogFactor * log(numProcessedPoints)</em> is the runtime 
estimate of the number of clusters to be produced by the streaming step. If the 
final number of clusters (that we expect <em>StreamingKMeans</em> to output) is 
<em>k</em>, <em>clusterLogFactor</em> can be set to <em>k</em>.  </li>
+<li><strong>clusterOvershoot</strong> (double): a constant multiplicative 
slack factor that slows down the collapsing of clusters. The default value is 
2. </li>
+</ul>
+<h3 id="algorithm">Algorithm</h3>
+<p>The algorithm processes the data one-by-one and makes only one pass through 
the data.
+The first point from the data stream will form the centroid of the first 
cluster (this designation may change as more points are processed). Suppose 
there are <em>r</em> clusters at one point and a new point <em>p</em> is being 
processed. The new point can either be added to one of the existing <em>r</em> 
clusters or become a new cluster. To decide:</p>
+<ul>
+<li>let <em>c</em> be the closest cluster to point <em>p</em></li>
+<li>let <em>d</em> be the distance between <em>c</em> and <em>p</em></li>
+<li>if <em>d &gt; distanceCutoff</em>, create a new cluster from <em>p</em> 
(<em>p</em> is too far away from the clusters to be part of any one of 
them)</li>
+<li>else (<em>d &lt;= distanceCutoff</em>), create a new cluster with 
probability <em>d / distanceCutoff</em> (the probability of creating a new 
cluster increases as <em>d</em> increases). </li>
+</ul>
+<p>There will be either <em>r</em> or <em>r+1</em> clusters after processing a 
new point.</p>
+<p>As the number of clusters increases, it will go over the  
<em>clusterOvershoot * numClusters</em> limit (<em>numClusters</em> represents 
a recommendation for the number of clusters that the streaming step should aim 
for and <em>clusterOvershoot</em> is the slack). To decrease the number of 
clusters the existing clusters
+are treated as data points and are re-clustered (collapsed). This tends to 
make the number of clusters go down. If the number of clusters is still too 
high, <em>distanceCutoff</em> is increased.</p>
+<h2 id="ballkmeans-step">BallKMeans step</h2>
+<hr />
+<h3 id="overview_1">Overview</h3>
+<p>The algorithm is a Lloyd-type algorithm that takes a set of weighted 
vectors and returns k centroids, see <a 
href="http://www.math.uwaterloo.ca/~cswamy/papers/kmeansfnl.pdf" title="R. 
Ostrovsky, Y. Rabani, L. Schulman, Ch. Swamy: The Effectiveness of Lloyd-Type 
Methods for the k-means Problem">Ostrovsky et al</a> for details. The algorithm 
has two stages: </p>
+<ol>
+<li>Seeding </li>
+<li>Ball k-means </li>
+</ol>
+<p>The seeding stage is an initial guess of where the centroids should be. The 
initial guess is improved using the ball k-means stage. </p>
+<h3 id="parameters_1">Parameters</h3>
+<ul>
+<li>
+<p><strong>numClusters</strong> (int): the number k of centroids to return.  
The algorithm will return exactly this number of centroids.</p>
+</li>
+<li>
+<p><strong>maxNumIterations</strong> (int): After seeding, the iterative 
clustering procedure will be run at most <em>maxNumIterations</em> times.  1 or 
2 iterations are recommended.  Increasing beyond this will increase the 
accuracy of the result at the expense of runtime.  Each successive iteration 
yields diminishing returns in lowering the cost.</p>
+</li>
+<li>
+<p><strong>trimFraction</strong> (double): Outliers are ignored when computing 
the center of mass for a cluster.  For any datapoint <em>x</em>, let <em>c</em> 
be the nearest centroid.  Let <em>d</em> be the minimum distance from 
<em>c</em> to another centroid.  If the distance from <em>x</em> to <em>c</em> 
is greater than <em>trimFraction * d</em>, then <em>x</em> is considered an 
outlier during that iteration of ball k-means.  The default is 9/10.  In <a 
href="http://www.math.uwaterloo.ca/~cswamy/papers/kmeansfnl.pdf" title="R. 
Ostrovsky, Y. Rabani, L. Schulman, Ch. Swamy: The Effectiveness of Lloyd-Type 
Methods for the k-means Problem">Ostrovsky et al</a>, the authors use 
<em>trimFraction</em> = 1/3, but this does not mean that 1/3 is optimal in 
practice.</p>
+</li>
+<li>
+<p><strong>kMeansPlusPlusInit</strong> (boolean): If true, the seeding method 
is k-means++.  If false, the seeding method is to select points uniformly at 
random.  The default is true.</p>
+</li>
+<li>
+<p><strong>correctWeights</strong> (boolean): If <em>correctWeights</em> is 
true, outliers will be considered when calculating the weight of centroids.  
The default is true. Note that outliers are not considered when calculating the 
position of centroids.</p>
+</li>
+<li>
+<p><strong>testProbability</strong> (double): If <em>testProbability</em> is 
<em>p</em> (0 &lt; <em>p</em> &lt; 1), the data (of size n) is partitioned into 
a test set (of size <em>p*n</em>) and a training set (of size 
<em>(1-p)*n</em>).  If 0, no test set is created (the entire data set is used 
for both training and testing).  The default is 0.1 if <em>numRuns</em> &gt; 1. 
 If <em>numRuns</em> = 1, then no test set should be created (since it is only 
used to compare the cost between different runs).</p>
+</li>
+<li>
+<p><strong>numRuns</strong> (int): This is the number of runs to perform. The 
solution of lowest cost is returned.  The default is 1 run.</p>
+</li>
+</ul>
+<h3 id="algorithm_1">Algorithm</h3>
+<p>The algorithm can be instructed to take multiple independent runs (using 
the <em>numRuns</em> parameter) and the algorithm will select the best solution 
(i.e., the one with the lowest cost). In practice, one run is sufficient to 
find a good solution.  </p>
+<p>Each run operates as follows: a seeding procedure is used to select k 
centroids, and then ball k-means is run iteratively to refine the solution.</p>
+<p>The seeding procedure can be set to either 'uniformly at random' or 
'k-means++' using <em>kMeansPlusPlusInit</em> boolean variable. Seeding with 
k-means++ involves more computation but offers better results in practice. </p>
+<p>Each iteration of ball k-means runs as follows:</p>
+<ol>
+<li>Clusters are formed by assigning each datapoint to the nearest 
centroid</li>
+<li>The centers of mass of the trimmed clusters (see <em>trimFraction</em> 
parameter above) become the new centroids </li>
+</ol>
+<p>The data may be partitioned into a test set and a training set (see 
<em>testProbability</em>). The seeding procedure and ball k-means run on the 
training set. The cost is computed on the test set.</p>
+<h2 id="usage-of-streamingkmeans">Usage of <em>StreamingKMeans</em></h2>
+<div class="codehilite"><pre> <span class="n">bin</span><span 
class="o">/</span><span class="n">mahout</span> <span 
class="n">streamingkmeans</span>  
+   <span class="o">-</span><span class="nb">i</span> <span 
class="o">&lt;</span><span class="n">input</span><span class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">o</span> <span 
class="o">&lt;</span><span class="n">output</span><span class="o">&gt;</span> 
+   <span class="o">-</span><span class="n">ow</span>  
+   <span class="o">-</span><span class="n">k</span> <span 
class="o">&lt;</span><span class="n">k</span><span class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">km</span> <span 
class="o">&lt;</span><span class="n">estimatedNumMapClusters</span><span 
class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">e</span> <span 
class="o">&lt;</span><span class="n">estimatedDistanceCutoff</span><span 
class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">mi</span> <span 
class="o">&lt;</span><span class="n">maxNumIterations</span><span 
class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">tf</span> <span 
class="o">&lt;</span><span class="n">trimFraction</span><span 
class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">ri</span>                  
+   <span class="o">-</span><span class="n">iw</span>  
+   <span class="o">-</span><span class="n">testp</span> <span 
class="o">&lt;</span><span class="n">testProbability</span><span 
class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">nbkm</span> <span 
class="o">&lt;</span><span class="n">numBallKMeansRuns</span><span 
class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">dm</span> <span 
class="o">&lt;</span><span class="n">distanceMeasure</span><span 
class="o">&gt;</span>   
+   <span class="o">-</span><span class="n">sc</span> <span 
class="o">&lt;</span><span class="n">searcherClass</span><span 
class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">np</span> <span 
class="o">&lt;</span><span class="n">numProjections</span><span 
class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">s</span> <span 
class="o">&lt;</span><span class="n">searchSize</span><span 
class="o">&gt;</span>   
+   <span class="o">-</span><span class="n">rskm</span>  
+   <span class="o">-</span><span class="n">xm</span> <span 
class="o">&lt;</span><span class="n">method</span><span class="o">&gt;</span>  
+   <span class="o">-</span><span class="n">h</span>   
+   <span class="o">--</span><span class="n">tempDir</span> <span 
class="o">&lt;</span><span class="n">tempDir</span><span class="o">&gt;</span>  
 
+   <span class="o">--</span><span class="n">startPhase</span> <span 
class="o">&lt;</span><span class="n">startPhase</span><span 
class="o">&gt;</span>   
+   <span class="o">--</span><span class="n">endPhase</span> <span 
class="o">&lt;</span><span class="n">endPhase</span><span class="o">&gt;</span>
+</pre></div>
+
+
+<h3 id="details-on-job-specific-options">Details on Job-Specific Options:</h3>
+<ul>
+<li><code>--input (-i) &lt;input&gt;</code>: Path to job input directory.      
   </li>
+<li><code>--output (-o) &lt;output&gt;</code>: The directory pathname for 
output.            </li>
+<li><code>--overwrite (-ow)</code>: If present, overwrite the output directory 
before running job.</li>
+<li><code>--numClusters (-k) &lt;k&gt;</code>: The k in k-Means. Approximately 
this many clusters will be generated.      </li>
+<li><code>--estimatedNumMapClusters (-km) 
&lt;estimatedNumMapClusters&gt;</code>: The estimated number of clusters to use 
for the Map phase of the job when running StreamingKMeans. This should be 
around k * log(n), where k is the final number of clusters and n is the total 
number of data points to cluster.           </li>
+<li><code>--estimatedDistanceCutoff (-e) 
&lt;estimatedDistanceCutoff&gt;</code>: The initial estimated distance cutoff 
between two points for forming new clusters. If no value is given, it's 
estimated from the data set.  </li>
+<li><code>--maxNumIterations (-mi) &lt;maxNumIterations&gt;</code>: The 
maximum number of iterations to run for the BallKMeans algorithm used by the 
reducer. If no value is given, defaults to 10.    </li>
+<li><code>--trimFraction (-tf) &lt;trimFraction&gt;</code>: The 'ball' aspect 
of ball k-means means that only the closest points to the centroid will 
actually be used for updating. The fraction of the points to be used is those 
points whose distance to the center is within trimFraction * distance to the 
closest other center. If no value is given, defaults to 0.9.   </li>
+<li><code>--randomInit (-ri)</code>: Whether to use k-means++ 
initialization or random initialization of the seed centroids. Essentially, 
k-means++ provides better clusters, but takes longer, whereas random 
initialization takes less time, but produces worse clusters, and tends to fail 
more often and needs multiple runs to compare to k-means++. If set, uses the 
random initialization.</li>
+<li><code>--ignoreWeights (-iw)</code>: Whether to correct the weights of the 
centroids after the clustering is done. The weights end up being wrong because 
of the trimFraction and possible train/test splits. In some cases, especially 
in a pipeline, having an accurate count of the weights is useful. If set, 
ignores the final weights. </li>
+<li><code>--testProbability (-testp) &lt;testProbability&gt;</code>: A double 
value between 0 and 1 that represents the fraction of points to be used 
for 'testing' different clustering runs in the final BallKMeans step. If 
no value is given, defaults to 0.1.</li>
+<li><code>--numBallKMeansRuns (-nbkm) &lt;numBallKMeansRuns&gt;</code>: Number 
of  BallKMeans runs to  use at the end to  try to cluster the  points. If no  
value is given,  defaults to 4  </li>
+<li><code>--distanceMeasure (-dm) &lt;distanceMeasure&gt;</code>: The 
classname of  the  DistanceMeasure.  Default is  SquaredEuclidean.  </li>
+<li><code>--searcherClass (-sc) &lt;searcherClass&gt;</code>: The type of  
searcher to be  used when  performing nearest  neighbor searches.  Defaults to  
ProjectionSearch.  </li>
+<li><code>--numProjections (-np) &lt;numProjections&gt;</code>: The number of  
projections  considered in  estimating the  distances between  vectors. Only 
used when the searcher class requested is either ProjectionSearch or 
FastProjectionSearch. If no value is given, defaults to 3.  </li>
+<li><code>--searchSize (-s) &lt;searchSize&gt;</code>: In more efficient  
searches (non  BruteSearch), not all distances are calculated for determining 
the nearest neighbors. The number of elements whose distances from the query 
vector are actually computed is proportional to searchSize. If no value is 
given, defaults to 1.  </li>
+<li><code>--reduceStreamingKMeans (-rskm)</code>: There might be too many 
intermediate clusters from the mapper to fit into memory, so the reducer can 
run another pass of StreamingKMeans to collapse them down to fewer clusters. 
 </li>
+<li><code>--method (-xm) &lt;method&gt;</code>: The execution method to use: 
sequential or mapreduce. Default is mapreduce.</li>
+<li><code>--help (-h)</code>: Print out help.</li>
+<li><code>--tempDir &lt;tempDir&gt;</code>: Intermediate output directory.</li>
+<li><code>--startPhase &lt;startPhase&gt;</code>: First phase to run.</li>
+<li><code>--endPhase &lt;endPhase&gt;</code>: Last phase to run.</li>
+</ul>
+<h2 id="references">References</h2>
+<ol>
+<li><a href="http://nips.cc/Conferences/2011/Program/event.php?ID=2989" 
title="M. Shindler, A. Wong, A. Meyerson: Fast and Accurate k-means For Large 
Datasets">M. Shindler, A. Wong, A. Meyerson: Fast and Accurate k-means For 
Large Datasets</a></li>
+<li><a href="http://www.math.uwaterloo.ca/~cswamy/papers/kmeansfnl.pdf" 
title="R. Ostrovsky, Y. Rabani, L. Schulman, Ch. Swamy: The Effectiveness of 
Lloyd-Type Methods for the k-means Problem">R. Ostrovsky, Y. Rabani, L. 
Schulman, Ch. Swamy: The Effectiveness of Lloyd-Type Methods for the k-means 
Problem</a></li>
+</ol>
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014 The Apache Software Foundation, Licensed under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0";>Apache 
License, Version 2.0</a>.
+        <br />
+        Apache and the Apache feather logos are trademarks of The Apache 
Software Foundation.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/js/jquery-1.9.1.min.js"></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') 
+
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>


Added: 
websites/staging/mahout/trunk/content/users/mapreduce/clustering/viewing-result.html
==============================================================================
--- 
websites/staging/mahout/trunk/content/users/mapreduce/clustering/viewing-result.html
 (added)
+++ 
websites/staging/mahout/trunk/content/users/mapreduce/clustering/viewing-result.html
 Thu Mar 19 21:21:45 2015
@@ -0,0 +1,286 @@
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml"; xml:lang="en" lang="en"><head><meta 
http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data 
framework, data integration,
+        data matching, data mining, data mining algorithms, data mining 
analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data 
mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning 
methods,
+        learning techniques, lucene, machine learning, machine translation, 
mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive 
bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data 
mining">
+  <link rel="shortcut icon" type="image/x-icon" 
href="http://mahout.apache.org/images/favicon.ico";>
+  <script type="text/javascript" src="/js/prototype.js"></script>
+  <script type="text/javascript" src="/js/effects.js"></script>
+  <script type="text/javascript" src="/js/search.js"></script>
+  <script type="text/javascript" src="/js/slides.js"></script>
+
+  <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
+  <link href="/css/bootstrap-responsive.css" rel="stylesheet">
+  <link rel="stylesheet" href="/css/global.css" type="text/css">
+
+  <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+    }
+  });
+  MathJax.Hub.Queue(function() {
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax';
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); 
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ?
+        
'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
 : 
+        
'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+       
+         var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s);
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/overview.html"></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search"; method="get" 
class="navbar-search pull-right">    
+      <input value="http://mahout.apache.org"; name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/images/mahout-lupe.png" 
alt="Search" />
+    </form>
+  </div>
+
+    <div class="navbar navbar-inverse" 
style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: 
none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" 
data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development 
Project</a> -->
+          <div class="nav-collapse collapse">
+            <ul class="nav">
+              <li><a href="/">Home</a></li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">General<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/general/downloads.html">Downloads</a>
+                  <li><a href="/general/who-we-are.html">Who we are</a>
+                  <li><a 
href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                  <li><a href="/general/release-notes.html">Release Notes</a> 
+                  <li><a href="/general/books-tutorials-and-talks.html">Books, 
Tutorials, Talks</a></li>
+                  <li><a href="/general/powered-by-mahout.html">Powered By 
Mahout</a>
+                  <li><a 
href="/general/professional-support.html">Professional Support</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Resources</li>
+                  <li><a href="/general/reference-reading.html">Reference 
Reading</a>
+                  <li><a href="/general/faq.html">FAQ</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Legal</li>
+                  <li><a 
href="http://www.apache.org/licenses/";>License</a></li>
+                  <li><a 
href="http://www.apache.org/security/";>Security</a></li>
+                  <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+                </ul>
+              </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Developers<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/developers/developer-resources.html">Developer 
resources</a></li>
+                  <li><a href="/developers/version-control.html">Version 
control</a></li>
+                  <li><a href="/developers/buildingmahout.html">Build from 
source</a></li>
+                  <li><a href="/developers/issue-tracker.html">Issue 
tracker</a></li>
+                  <li><a href="https://builds.apache.org/job/Mahout-Quality/"; 
target="_blank">Code quality reports</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Contributions</li>
+                  <li><a href="/developers/how-to-contribute.html">How to 
contribute</a></li>
+                  <li><a href="/developers/how-to-become-a-committer.html">How 
to become a committer</a></li>
+                  <li><a href="/developers/gsoc.html">GSoC</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">For committers</li>
+                  <li><a href="/developers/how-to-update-the-website.html">How 
to update the website</a></li>
+                  <li><a href="/developers/patch-check-list.html">Patch check 
list</a></li>
+                  <li><a href="/developers/github.html">Handling Github 
PRs</a></li>
+                  <li><a href="/developers/how-to-release.html">How to 
release</a></li>
+                  <li><a href="/developers/thirdparty-dependencies.html">Third 
party dependencies</a></li>
+                </ul>
+               </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Basics<b class="caret"></b></a>
+                 <ul class="dropdown-menu">
+                  <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                  <li><a href="/users/basics/quickstart.html">Quickstart</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Working with text</li>
+                  <li><a 
href="/users/basics/creating-vectors-from-text.html">Creating vectors from 
text</a>
+                  <li><a 
href="/users/basics/collocations.html">Collocations</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Dimensionality reduction</li>
+                  <li><a 
href="/users/dim-reduction/dimensional-reduction.html">Singular Value 
Decomposition</a></li>
+                  <li><a href="/users/dim-reduction/ssvd.html">Stochastic 
SVD</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Topic Models</li>      
+                  <li><a 
href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet 
Allocation</a></li>
+                </ul>
+                 </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Spark<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/sparkbindings/home.html">Scala &amp; 
Spark Bindings Overview</a></li>
+                  <li><a 
href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark 
Shell</a></li>
+                             <li class="divider"></li>
+                  <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                </ul>
+               </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Classification<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a 
href="/users/mapreduce/classification/bayesian.html">Naive Bayes</a></li>
+                  <li><a 
href="/users/mapreduce/classification/hidden-markov-models.html">Hidden Markov 
Models</a></li>
+                  <li><a 
href="/users/mapreduce/classification/logistic-regression.html">Logistic 
Regression</a></li>
+                  <li><a 
href="/users/mapreduce/classification/partial-implementation.html">Random 
Forest</a></li>
+
+                  <li class="divider"></li>
+                  <li class="nav-header">Examples</li>
+                  <li><a 
href="/users/mapreduce/classification/breiman-example.html">Breiman 
example</a></li>
+                  <li><a 
href="/users/mapreduce/classification/twenty-newsgroups.html">20 newsgroups 
example</a></li>
+                </ul></li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Clustering<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a 
href="/users/mapreduce/clustering/k-means-clustering.html">k-Means</a></li>
+                <li><a 
href="/users/mapreduce/clustering/canopy-clustering.html">Canopy</a></li>
+                <li><a 
href="/users/mapreduce/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li>
+                <li><a 
href="/users/mapreduce/clustering/streaming-k-means.html">Streaming 
KMeans</a></li>
+                <li><a 
href="/users/mapreduce/clustering/spectral-clustering.html">Spectral 
Clustering</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Commandline usage</li>
+                <li><a 
href="/users/mapreduce/clustering/k-means-commandline.html">Options for 
k-Means</a></li>
+                <li><a 
href="/users/mapreduce/clustering/canopy-commandline.html">Options for 
Canopy</a></li>
+                <li><a 
href="/users/mapreduce/clustering/fuzzy-k-means-commandline.html">Options for 
Fuzzy k-Means</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Examples</li>
+                <li><a 
href="/users/mapreduce/clustering/clustering-of-synthetic-control-data.html">Synthetic
 data</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Post processing</li>
+                <li><a 
href="/users/mapreduce/clustering/cluster-dumper.html">Cluster Dumper 
tool</a></li>
+                <li><a 
href="/users/mapreduce/clustering/visualizing-sample-clusters.html">Cluster 
visualisation</a></li>
+                </ul></li>
+                <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a 
href="/users/mapreduce/recommender/quickstart.html">Quickstart</a></li>
+                <li><a 
href="/users/mapreduce/recommender/recommender-first-timer-faq.html">First 
Timer FAQ</a></li>
+                <li><a 
href="/users/mapreduce/recommender/userbased-5-minutes.html">A user-based 
recommender <br/>in 5 minutes</a></li>
+               <li><a 
href="/users/mapreduce/recommender/matrix-factorization.html">Matrix 
factorization-based<br/> recommenders</a></li>
+                <li><a 
href="/users/mapreduce/recommender/recommender-documentation.html">Overview</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Hadoop</li>
+                <li><a 
href="/users/mapreduce/recommender/intro-itembased-hadoop.html">Intro to 
item-based recommendations<br/> with Hadoop</a></li>
+                <li><a 
href="/users/mapreduce/recommender/intro-als-hadoop.html">Intro to ALS 
recommendations<br/> with Hadoop</a></li>
+                <li class="nav-header">Spark</li>
+                <li><a 
href="/users/mapreduce/recommender/intro-cooccurrence-spark.html">Intro to 
cooccurrence-based<br/> recommendations with Spark</a></li>
+              </ul>
+            </li>
+           </ul>
+          </div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+       <ul class="sidemenu">
+               <li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout"; 
data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>!function(d,s,id){var 
js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
+</li>
+       </ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html";>How the 
ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html";>Get 
Involved</a></li>
+      <li><a href="http://www.apache.org/dev/";>Developer Resources</a></li>
+      <li><a 
href="http://www.apache.org/foundation/sponsorship.html";>Sponsorship</a></li>
+      <li><a 
href="http://www.apache.org/foundation/thanks.html";>Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/";>Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/";>Hadoop</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+    <ul>
+<li><a href="#ViewingResult-AlgorithmViewingpages">Algorithm Viewing 
pages</a></li>
+</ul>
+<p>There are various technologies available to view the output of Mahout
+algorithms:</p>
+<ul>
+<li>Clusters</li>
+</ul>
+<p><a name="ViewingResult-AlgorithmViewingpages"></a></p>
+<h1 id="algorithm-viewing-pages">Algorithm Viewing pages</h1>
+<p>{pagetree:root=@self|excerpt=true|expandCollapseAll=true}</p>
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014 The Apache Software Foundation, Licensed under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0";>Apache 
License, Version 2.0</a>.
+        <br />
+        Apache and the Apache feather logos are trademarks of The Apache 
Software Foundation.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/js/jquery-1.9.1.min.js"></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') 
+
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>

Added: 
websites/staging/mahout/trunk/content/users/mapreduce/clustering/viewing-results.html
==============================================================================
--- 
websites/staging/mahout/trunk/content/users/mapreduce/clustering/viewing-results.html
 (added)
+++ 
websites/staging/mahout/trunk/content/users/mapreduce/clustering/viewing-results.html
 Thu Mar 19 21:21:45 2015
@@ -0,0 +1,309 @@
+<!DOCTYPE html>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<html xmlns="http://www.w3.org/1999/xhtml"; xml:lang="en" lang="en"><head><meta 
http-equiv="Content-Type" content="text/html; charset=UTF-8">
+  <title>Apache Mahout: Scalable machine learning and data mining</title>
+  <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+  <meta name="Distribution" content="Global">
+  <meta name="Robots" content="index,follow">
+  <meta name="keywords" content="apache, apache hadoop, apache lucene,
+        business data mining, cluster analysis,
+        collaborative filtering, data extraction, data filtering, data 
framework, data integration,
+        data matching, data mining, data mining algorithms, data mining 
analysis, data mining data,
+        data mining introduction, data mining software,
+        data mining techniques, data representation, data set, datamining,
+        feature extraction, fuzzy k means, genetic algorithm, hadoop,
+        hierarchical clustering, high dimensional, introduction to data 
mining, kmeans,
+        knowledge discovery, learning approach, learning approaches, learning 
methods,
+        learning techniques, lucene, machine learning, machine translation, 
mahout apache,
+        mahout taste, map reduce hadoop, mining data, mining methods, naive 
bayes,
+        natural language processing,
+        supervised, text mining, time series data, unsupervised, web data 
mining">
+  <link rel="shortcut icon" type="image/x-icon" 
href="http://mahout.apache.org/images/favicon.ico";>
+  <script type="text/javascript" src="/js/prototype.js"></script>
+  <script type="text/javascript" src="/js/effects.js"></script>
+  <script type="text/javascript" src="/js/search.js"></script>
+  <script type="text/javascript" src="/js/slides.js"></script>
+
+  <link href="/css/bootstrap.min.css" rel="stylesheet" media="screen">
+  <link href="/css/bootstrap-responsive.css" rel="stylesheet">
+  <link rel="stylesheet" href="/css/global.css" type="text/css">
+
+  <!-- mathJax stuff -- use `\(...\)` for inline style math in markdown -->
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    tex2jax: {
+      skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
+    }
+  });
+  MathJax.Hub.Queue(function() {
+    var all = MathJax.Hub.getAllJax(), i;
+    for(i = 0; i < all.length; i += 1) {
+      all[i].SourceElement().parentNode.className += ' has-jax';
+    }
+  });
+  </script>
+  <script type="text/javascript">
+    var mathjax = document.createElement('script'); 
+    mathjax.type = 'text/javascript'; 
+    mathjax.async = true;
+
+    mathjax.src = ('https:' == document.location.protocol) ?
+        
'https://c328740.ssl.cf1.rackcdn.com/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'
 : 
+        
'http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML';
+       
+         var s = document.getElementsByTagName('script')[0]; 
+    s.parentNode.insertBefore(mathjax, s);
+  </script>
+</head>
+
+<body id="home" data-twttr-rendered="true">
+  <div id="wrap">
+   <div id="header">
+    <div id="logo"><a href="/overview.html"></a></div>
+  <div id="search">
+    <form id="search-form" action="http://www.google.com/search"; method="get" 
class="navbar-search pull-right">    
+      <input value="http://mahout.apache.org"; name="sitesearch" type="hidden">
+      <input class="search-query" name="q" id="query" type="text">
+      <input id="submission" type="image" src="/images/mahout-lupe.png" 
alt="Search" />
+    </form>
+  </div>
+
+    <div class="navbar navbar-inverse" 
style="position:absolute;top:133px;padding-right:0px;padding-left:0px;">
+      <div class="navbar-inner" style="border: none; background: #999; border: 
none; border-radius: 0px;">
+        <div class="container">
+          <button type="button" class="btn btn-navbar" data-toggle="collapse" 
data-target=".nav-collapse">
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+            <span class="icon-bar"></span>
+          </button>
+          <!-- <a class="brand" href="#">Apache Community Development 
Project</a> -->
+          <div class="nav-collapse collapse">
+            <ul class="nav">
+              <li><a href="/">Home</a></li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">General<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/general/downloads.html">Downloads</a>
+                  <li><a href="/general/who-we-are.html">Who we are</a>
+                  <li><a 
href="/general/mailing-lists,-irc-and-archives.html">Mailing Lists</a>
+                  <li><a href="/general/release-notes.html">Release Notes</a> 
+                  <li><a href="/general/books-tutorials-and-talks.html">Books, 
Tutorials, Talks</a></li>
+                  <li><a href="/general/powered-by-mahout.html">Powered By 
Mahout</a>
+                  <li><a 
href="/general/professional-support.html">Professional Support</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Resources</li>
+                  <li><a href="/general/reference-reading.html">Reference 
Reading</a>
+                  <li><a href="/general/faq.html">FAQ</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Legal</li>
+                  <li><a 
href="http://www.apache.org/licenses/";>License</a></li>
+                  <li><a 
href="http://www.apache.org/security/";>Security</a></li>
+                  <li><a href="/general/privacy-policy.html">Privacy Policy</a>
+                </ul>
+              </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Developers<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/developers/developer-resources.html">Developer 
resources</a></li>
+                  <li><a href="/developers/version-control.html">Version 
control</a></li>
+                  <li><a href="/developers/buildingmahout.html">Build from 
source</a></li>
+                  <li><a href="/developers/issue-tracker.html">Issue 
tracker</a></li>
+                  <li><a href="https://builds.apache.org/job/Mahout-Quality/"; 
target="_blank">Code quality reports</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Contributions</li>
+                  <li><a href="/developers/how-to-contribute.html">How to 
contribute</a></li>
+                  <li><a href="/developers/how-to-become-a-committer.html">How 
to become a committer</a></li>
+                  <li><a href="/developers/gsoc.html">GSoC</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">For committers</li>
+                  <li><a href="/developers/how-to-update-the-website.html">How 
to update the website</a></li>
+                  <li><a href="/developers/patch-check-list.html">Patch check 
list</a></li>
+                  <li><a href="/developers/github.html">Handling Github 
PRs</a></li>
+                  <li><a href="/developers/how-to-release.html">How to 
release</a></li>
+                  <li><a href="/developers/thirdparty-dependencies.html">Third 
party dependencies</a></li>
+                </ul>
+               </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Basics<b class="caret"></b></a>
+                 <ul class="dropdown-menu">
+                  <li><a href="/users/basics/algorithms.html">List of 
algorithms</a>
+                  <li><a href="/users/basics/quickstart.html">Quickstart</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Working with text</li>
+                  <li><a 
href="/users/basics/creating-vectors-from-text.html">Creating vectors from 
text</a>
+                  <li><a 
href="/users/basics/collocations.html">Collocations</a>
+                  <li class="divider"></li>
+                  <li class="nav-header">Dimensionality reduction</li>
+                  <li><a 
href="/users/dim-reduction/dimensional-reduction.html">Singular Value 
Decomposition</a></li>
+                  <li><a href="/users/dim-reduction/ssvd.html">Stochastic 
SVD</a></li>
+                  <li class="divider"></li>
+                  <li class="nav-header">Topic Models</li>      
+                  <li><a 
href="/users/clustering/latent-dirichlet-allocation.html">Latent Dirichlet 
Allocation</a></li>
+                </ul>
+                 </li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Spark<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a href="/users/sparkbindings/home.html">Scala &amp; 
Spark Bindings Overview</a></li>
+                  <li><a 
href="/users/sparkbindings/play-with-shell.html">Playing with Mahout's Spark 
Shell</a></li>
+                             <li class="divider"></li>
+                  <li><a href="/users/sparkbindings/faq.html">FAQ</a></li>
+                </ul>
+               </li>
+              <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Classification<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                  <li><a 
href="/users/mapreduce/classification/bayesian.html">Naive Bayes</a></li>
+                  <li><a 
href="/users/mapreduce/classification/hidden-markov-models.html">Hidden Markov 
Models</a></li>
+                  <li><a 
href="/users/mapreduce/classification/logistic-regression.html">Logistic 
Regression</a></li>
+                  <li><a 
href="/users/mapreduce/classification/partial-implementation.html">Random 
Forest</a></li>
+
+                  <li class="divider"></li>
+                  <li class="nav-header">Examples</li>
+                  <li><a 
href="/users/mapreduce/classification/breiman-example.html">Breiman 
example</a></li>
+                  <li><a 
href="/users/mapreduce/classification/twenty-newsgroups.html">20 newsgroups 
example</a></li>
+                </ul></li>
+               <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Clustering<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a 
href="/users/mapreduce/clustering/k-means-clustering.html">k-Means</a></li>
+                <li><a 
href="/users/mapreduce/clustering/canopy-clustering.html">Canopy</a></li>
+                <li><a 
href="/users/mapreduce/clustering/fuzzy-k-means.html">Fuzzy k-Means</a></li>
+                <li><a 
href="/users/mapreduce/clustering/streaming-k-means.html">Streaming 
KMeans</a></li>
+                <li><a 
href="/users/mapreduce/clustering/spectral-clustering.html">Spectral 
Clustering</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Commandline usage</li>
+                <li><a 
href="/users/mapreduce/clustering/k-means-commandline.html">Options for 
k-Means</a></li>
+                <li><a 
href="/users/mapreduce/clustering/canopy-commandline.html">Options for 
Canopy</a></li>
+                <li><a 
href="/users/mapreduce/clustering/fuzzy-k-means-commandline.html">Options for 
Fuzzy k-Means</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Examples</li>
+                <li><a 
href="/users/mapreduce/clustering/clustering-of-synthetic-control-data.html">Synthetic
 data</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Post processing</li>
+                <li><a 
href="/users/mapreduce/clustering/cluster-dumper.html">Cluster Dumper 
tool</a></li>
+                <li><a 
href="/users/mapreduce/clustering/visualizing-sample-clusters.html">Cluster 
visualisation</a></li>
+                </ul></li>
+                <li class="dropdown"> <a href="#" class="dropdown-toggle" 
data-toggle="dropdown">Recommendations<b class="caret"></b></a>
+                <ul class="dropdown-menu">
+                <li><a 
href="/users/mapreduce/recommender/quickstart.html">Quickstart</a></li>
+                <li><a 
href="/users/mapreduce/recommender/recommender-first-timer-faq.html">First 
Timer FAQ</a></li>
+                <li><a 
href="/users/mapreduce/recommender/userbased-5-minutes.html">A user-based 
recommender <br/>in 5 minutes</a></li>
+               <li><a 
href="/users/mapreduce/recommender/matrix-factorization.html">Matrix 
factorization-based<br/> recommenders</a></li>
+                <li><a 
href="/users/mapreduce/recommender/recommender-documentation.html">Overview</a></li>
+                <li class="divider"></li>
+                <li class="nav-header">Hadoop</li>
+                <li><a 
href="/users/mapreduce/recommender/intro-itembased-hadoop.html">Intro to 
item-based recommendations<br/> with Hadoop</a></li>
+                <li><a 
href="/users/mapreduce/recommender/intro-als-hadoop.html">Intro to ALS 
recommendations<br/> with Hadoop</a></li>
+                <li class="nav-header">Spark</li>
+                <li><a 
href="/users/mapreduce/recommender/intro-cooccurrence-spark.html">Intro to 
cooccurrence-based<br/> recommendations with Spark</a></li>
+              </ul>
+            </li>
+           </ul>
+          </div><!--/.nav-collapse -->
+        </div>
+      </div>
+    </div>
+
+</div>
+
+ <div id="sidebar">
+  <div id="sidebar-wrap">
+    <h2>Twitter</h2>
+       <ul class="sidemenu">
+               <li>
+<a class="twitter-timeline" href="https://twitter.com/ApacheMahout" 
data-widget-id="422861673444028416">Tweets by @ApacheMahout</a>
+<script>!function(d,s,id){var 
js,fjs=d.getElementsByTagName(s)[0],p=/^http:/.test(d.location)?'http':'https';if(!d.getElementById(id)){js=d.createElement(s);js.id=id;js.src=p+"://platform.twitter.com/widgets.js";fjs.parentNode.insertBefore(js,fjs);}}(document,"script","twitter-wjs");</script>
+</li>
+       </ul>
+    <h2>Apache Software Foundation</h2>
+    <ul class="sidemenu">
+      <li><a href="http://www.apache.org/foundation/how-it-works.html">How the 
ASF works</a></li>
+      <li><a href="http://www.apache.org/foundation/getinvolved.html">Get 
Involved</a></li>
+      <li><a href="http://www.apache.org/dev/">Developer Resources</a></li>
+      <li><a 
href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+      <li><a 
href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+    </ul>
+    <h2>Related Projects</h2>
+    <ul class="sidemenu">
+      <li><a href="http://lucene.apache.org/">Lucene</a></li>
+      <li><a href="http://hadoop.apache.org/">Hadoop</a></li>
+    </ul>
+  </div>
+</div>
+
+  <div id="content-wrap" class="clearfix">
+   <div id="main">
+    <p><a name="ViewingResults-Intro"></a></p>
+<h1 id="intro">Intro</h1>
+<p>Many of the Mahout libraries run as batch jobs, dumping results into Hadoop
+sequence files or other data structures.  This page is intended to
+demonstrate the various ways one might inspect the outcome of various jobs.
+ The page is organized by algorithms.</p>
+<p><a name="ViewingResults-GeneralUtilities"></a></p>
+<h1 id="general-utilities">General Utilities</h1>
+<p><a name="ViewingResults-SequenceFileDumper"></a></p>
+<h2 id="sequence-file-dumper">Sequence File Dumper</h2>
+<p><a name="ViewingResults-Clustering"></a></p>
+<h1 id="clustering">Clustering</h1>
+<p><a name="ViewingResults-ClusterDumper"></a></p>
+<h2 id="cluster-dumper">Cluster Dumper</h2>
+<p>Run the following to print out all options:</p>
+<div class="codehilite"><pre><span class="n">java</span>  <span 
class="o">-</span><span class="n">cp</span> &quot;<span 
class="o">*</span>&quot; <span class="n">org</span><span 
class="p">.</span><span class="n">apache</span><span class="p">.</span><span 
class="n">mahout</span><span class="p">.</span><span 
class="n">utils</span><span class="p">.</span><span 
class="n">clustering</span><span class="p">.</span><span 
class="n">ClusterDumper</span> <span class="o">--</span><span 
class="n">help</span>
+</pre></div>
+
+
+<p><a name="ViewingResults-Example"></a></p>
+<h3 id="example">Example</h3>
+<div class="codehilite"><pre><span class="n">java</span>  <span 
class="o">-</span><span class="n">cp</span> &quot;<span 
class="o">*</span>&quot; <span class="n">org</span><span 
class="p">.</span><span class="n">apache</span><span class="p">.</span><span 
class="n">mahout</span><span class="p">.</span><span 
class="n">utils</span><span class="p">.</span><span 
class="n">clustering</span><span class="p">.</span><span 
class="n">ClusterDumper</span> <span class="o">--</span><span 
class="n">seqFileDir</span>
+</pre></div>
+
+
+<p>./solr-clust-n2/out/clusters-2
+          --dictionary ./solr-clust-n2/dictionary.txt
+          --substring 100 --pointsDir ./solr-clust-n2/out/points/</p>
+<p><a name="ViewingResults-ClusterLabels(MAHOUT-163)"></a></p>
+<h2 id="cluster-labels-mahout-163">Cluster Labels (MAHOUT-163)</h2>
+<p><a name="ViewingResults-Classification"></a></p>
+<h1 id="classification">Classification</h1>
+   </div>
+  </div>     
+</div> 
+  <footer class="footer" align="center">
+    <div class="container">
+      <p>
+        Copyright &copy; 2014 The Apache Software Foundation, Licensed under
+        the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache 
License, Version 2.0</a>.
+        <br />
+        Apache and the Apache feather logos are trademarks of The Apache 
Software Foundation.
+      </p>
+    </div>
+  </footer>
+  
+  <script src="/js/jquery-1.9.1.min.js"></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script>
+    (function() {
+      var cx = '012254517474945470291:vhsfv7eokdc';
+      var gcse = document.createElement('script');
+      gcse.type = 'text/javascript';
+      gcse.async = true;
+      gcse.src = (document.location.protocol == 'https:' ? 'https:' : 'http:') +
+          '//www.google.com/cse/cse.js?cx=' + cx;
+      var s = document.getElementsByTagName('script')[0];
+      s.parentNode.insertBefore(gcse, s);
+    })();
+  </script>
+</body>
+</html>

svn commit: r944380 [18/24] - in /websites/staging/mahout/trunk/content: ./ developers/ general/ users/basics/ users/classification/ users/clustering/ users/dim-reduction/ users/mapreduce/ users/mapreduce/classification/ users/mapreduce/clustering/ use...

Reply via email to