Author: yhuai
Date: Mon Jan 25 21:57:32 2016
New Revision: 1726699

URL: http://svn.apache.org/viewvc?rev=1726699&view=rev
Log:
Update the Spark example page to include examples using high level APIs
Modified:
    spark/_config.yml
    spark/_layouts/global.html
    spark/examples.md
    spark/site/community.html
    spark/site/documentation.html
    spark/site/downloads.html
    spark/site/examples.html
    spark/site/faq.html
    spark/site/graphx/index.html
    spark/site/index.html
    spark/site/mailing-lists.html
    spark/site/mllib/index.html
    spark/site/news/amp-camp-2013-registration-ope.html
    spark/site/news/announcing-the-first-spark-summit.html
    spark/site/news/fourth-spark-screencast-published.html
    spark/site/news/index.html
    spark/site/news/nsdi-paper.html
    spark/site/news/one-month-to-spark-summit-2015.html
    spark/site/news/proposals-open-for-spark-summit-east.html
    spark/site/news/registration-open-for-spark-summit-east.html
    spark/site/news/run-spark-and-shark-on-amazon-emr.html
    spark/site/news/spark-0-6-1-and-0-5-2-released.html
    spark/site/news/spark-0-6-2-released.html
    spark/site/news/spark-0-7-0-released.html
    spark/site/news/spark-0-7-2-released.html
    spark/site/news/spark-0-7-3-released.html
    spark/site/news/spark-0-8-0-released.html
    spark/site/news/spark-0-8-1-released.html
    spark/site/news/spark-0-9-0-released.html
    spark/site/news/spark-0-9-1-released.html
    spark/site/news/spark-0-9-2-released.html
    spark/site/news/spark-1-0-0-released.html
    spark/site/news/spark-1-0-1-released.html
    spark/site/news/spark-1-0-2-released.html
    spark/site/news/spark-1-1-0-released.html
    spark/site/news/spark-1-1-1-released.html
    spark/site/news/spark-1-2-0-released.html
    spark/site/news/spark-1-2-1-released.html
    spark/site/news/spark-1-2-2-released.html
    spark/site/news/spark-1-3-0-released.html
    spark/site/news/spark-1-4-0-released.html
    spark/site/news/spark-1-4-1-released.html
    spark/site/news/spark-1-5-0-released.html
    spark/site/news/spark-1-5-1-released.html
    spark/site/news/spark-1-5-2-released.html
    spark/site/news/spark-1-6-0-released.html
    spark/site/news/spark-accepted-into-apache-incubator.html
    spark/site/news/spark-and-shark-in-the-news.html
    spark/site/news/spark-becomes-tlp.html
    spark/site/news/spark-featured-in-wired.html
    spark/site/news/spark-mailing-lists-moving-to-apache.html
    spark/site/news/spark-meetups.html
    spark/site/news/spark-screencasts-published.html
    spark/site/news/spark-summit-2013-is-a-wrap.html
    spark/site/news/spark-summit-2014-videos-posted.html
    spark/site/news/spark-summit-2015-videos-posted.html
    spark/site/news/spark-summit-agenda-posted.html
    spark/site/news/spark-summit-east-2015-videos-posted.html
    spark/site/news/spark-summit-east-2016-cfp-closing.html
    spark/site/news/spark-summit-east-agenda-posted.html
    spark/site/news/spark-summit-europe-agenda-posted.html
    spark/site/news/spark-summit-europe.html
    spark/site/news/spark-tips-from-quantifind.html
    spark/site/news/spark-user-survey-and-powered-by-page.html
    spark/site/news/spark-version-0-6-0-released.html
    spark/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html
    spark/site/news/strata-exercises-now-available-online.html
    spark/site/news/submit-talks-to-spark-summit-2014.html
    spark/site/news/submit-talks-to-spark-summit-east-2016.html
    spark/site/news/two-weeks-to-spark-summit-2014.html
    spark/site/news/video-from-first-spark-development-meetup.html
    spark/site/releases/spark-release-0-3.html
    spark/site/releases/spark-release-0-5-0.html
    spark/site/releases/spark-release-0-5-1.html
    spark/site/releases/spark-release-0-5-2.html
    spark/site/releases/spark-release-0-6-0.html
    spark/site/releases/spark-release-0-6-1.html
    spark/site/releases/spark-release-0-6-2.html
    spark/site/releases/spark-release-0-7-0.html
    spark/site/releases/spark-release-0-7-2.html
    spark/site/releases/spark-release-0-7-3.html
    spark/site/releases/spark-release-0-8-0.html
    spark/site/releases/spark-release-0-8-1.html
    spark/site/releases/spark-release-0-9-0.html
    spark/site/releases/spark-release-0-9-1.html
    spark/site/releases/spark-release-0-9-2.html
    spark/site/releases/spark-release-1-0-0.html
    spark/site/releases/spark-release-1-0-1.html
    spark/site/releases/spark-release-1-0-2.html
    spark/site/releases/spark-release-1-1-0.html
    spark/site/releases/spark-release-1-1-1.html
    spark/site/releases/spark-release-1-2-0.html
    spark/site/releases/spark-release-1-2-1.html
    spark/site/releases/spark-release-1-2-2.html
    spark/site/releases/spark-release-1-3-0.html
    spark/site/releases/spark-release-1-3-1.html
    spark/site/releases/spark-release-1-4-0.html
    spark/site/releases/spark-release-1-4-1.html
    spark/site/releases/spark-release-1-5-0.html
    spark/site/releases/spark-release-1-5-1.html
    spark/site/releases/spark-release-1-5-2.html
    spark/site/releases/spark-release-1-6-0.html
    spark/site/research.html
    spark/site/screencasts/1-first-steps-with-spark.html
    spark/site/screencasts/2-spark-documentation-overview.html
    spark/site/screencasts/3-transformations-and-caching.html
    spark/site/screencasts/4-a-standalone-job-in-spark.html
    spark/site/screencasts/index.html
    spark/site/sql/index.html
    spark/site/streaming/index.html

Modified: spark/_config.yml
URL: http://svn.apache.org/viewvc/spark/_config.yml?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/_config.yml (original)
+++ spark/_config.yml Mon Jan 25 21:57:32 2016
@@ -1,4 +1,6 @@
-pygments: true
+# pygments option has been renamed to highlighter.
+# pygments: true
+highlighter: pygments
 markdown: kramdown
 kramdown:
   entity_output: symbol

Modified: spark/_layouts/global.html
URL: http://svn.apache.org/viewvc/spark/_layouts/global.html?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/_layouts/global.html (original)
+++ spark/_layouts/global.html Mon Jan 25 21:57:32 2016
@@ -24,6 +24,9 @@
   <link href="{{site.url}}css/cerulean.min.css" rel="stylesheet">
   <link href="{{site.url}}css/custom.css" rel="stylesheet">
 
+  <!-- Code highlighter CSS -->
+  <link href="{{site.url}}css/pygments-default.css" rel="stylesheet">
+
   <script type="text/javascript">
     <!-- Google Analytics initialization -->
     var _gaq = _gaq || [];

Modified: spark/examples.md
URL: http://svn.apache.org/viewvc/spark/examples.md?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/examples.md (original)
+++ spark/examples.md Mon Jan 25 21:57:32 2016
@@ -6,258 +6,411 @@ navigation:
   weight: 4
   show: true
 ---
-<h2>Spark Examples</h2>
+<h1>Spark Examples</h1>
 
 These examples give a quick overview of the Spark API. Spark is built on the concept of
 <em>distributed datasets</em>, which contain arbitrary Java or Python objects. You create a dataset
 from external data, then apply parallel operations
-to it. There are two types of operations: <em>transformations</em>, which define a new dataset based on
-previous ones, and <em>actions</em>, which kick off a job to execute on a cluster.
+to it. The building block of the Spark API is its [RDD API](http://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds).
+In the RDD API,
+there are two types of operations: <em>transformations</em>, which define a new dataset based on previous ones,
+and <em>actions</em>, which kick off a job to execute on a cluster.
+On top of Spark's RDD API, high-level APIs are provided, e.g. the
+[DataFrame API](http://spark.apache.org/docs/latest/sql-programming-guide.html#dataframes) and the
+[Machine Learning API](http://spark.apache.org/docs/latest/mllib-guide.html).
+These high-level APIs provide a concise way to conduct certain data operations.
+On this page, we show examples using the RDD API as well as examples using high-level APIs.
 
-<h3>Text Search</h3>
+<h2>RDD API Examples</h2>
 
-In this example, we search through the error messages in a log file:
+<h3>Word Count</h3>
+<p>In this example, we use a few transformations to build a dataset of (String, Int) pairs called <code>counts</code> and then save it to a file.</p>
 
 <ul class="nav nav-tabs">
   <li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
   <li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
   <li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
 </ul>
+
 <div class="tab-content">
-  <div class="tab-pane tab-pane-python active">
-    <div class="code code-tab">
-      text_file = spark.textFile(<span class="string">"hdfs://..."</span>)<br />
-      errors = text_file.<span class="sparkop">filter</span>(<span class="closure">lambda line: "ERROR" in line</span>)<br />
-      <span class="comment"># Count all the errors</span><br>
-      errors.<span class="sparkop">count</span>()<br>
-      <span class="comment"># Count errors mentioning MySQL</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">lambda line: "MySQL" in line</span>).<span class="sparkop">count</span>()<br>
-      <span class="comment"># Fetch the MySQL errors as an array of strings</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">lambda line: "MySQL" in line</span>).<span class="sparkop">collect</span>()<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-scala">
-    <div class="code code-tab">
-      <span class="keyword">val</span> textFile = spark.textFile(<span class="string">"hdfs://..."</span>)<br>
-      <span class="keyword">val</span> errors = textFile.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("ERROR")</span>)<br>
-      <span class="comment">// Count all the errors</span><br>
-      errors.<span class="sparkop">count</span>()<br>
-      <span class="comment">// Count errors mentioning MySQL</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("MySQL")</span>).<span class="sparkop">count</span>()<br>
-      <span class="comment">// Fetch the MySQL errors as an array of strings</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("MySQL")</span>).<span class="sparkop">collect</span>()<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-java">
-    <div class="code code-tab">
-      JavaRDD<String> textFile = spark.textFile(<span class="string">"hdfs://..."</span>);<br>
-      JavaRDD<String> errors = textFile.<span class="sparkop">filter</span>(<span class="closure">new Function<String, Boolean>() {<br>
-        public Boolean call(String s) { return s.contains("ERROR"); }<br>
-      }</span>);<br>
-      <span class="comment">// Count all the errors</span><br>
-      errors.<span class="sparkop">count</span>();<br>
-      <span class="comment">// Count errors mentioning MySQL</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">new Function<String, Boolean>() {<br>
-        public Boolean call(String s) { return s.contains("MySQL"); }<br>
-      }</span>).<span class="sparkop">count</span>();<br>
-      <span class="comment">// Fetch the MySQL errors as an array of strings</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">new Function<String, Boolean>() {<br>
-        public Boolean call(String s) { return s.contains("MySQL"); }<br>
-      }</span>).<span class="sparkop">collect</span>();<br>
-    </div>
-  </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+text_file = sc.textFile("hdfs://...")
+counts = text_file.flatMap(lambda line: line.split(" ")) \
+             .map(lambda word: (word, 1)) \
+             .reduceByKey(lambda a, b: a + b)
+counts.saveAsTextFile("hdfs://...")
+{% endhighlight %}
+</div>
 </div>

-<p>The red code fragments are function literals (closures) that get passed automatically to the cluster. The blue ones are Spark operations.</p>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+val textFile = sc.textFile("hdfs://...")
+val counts = textFile.flatMap(line => line.split(" "))
+                 .map(word => (word, 1))
+                 .reduceByKey(_ + _)
+counts.saveAsTextFile("hdfs://...")
+{% endhighlight %}
+</div>
+</div>

-<h3>In-Memory Text Search</h3>
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+JavaRDD<String> textFile = sc.textFile("hdfs://...");
+JavaRDD<String> words = textFile.flatMap(new FlatMapFunction<String, String>() {
+  public Iterable<String> call(String s) { return Arrays.asList(s.split(" ")); }
+});
+JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
+  public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); }
+});
+JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
+  public Integer call(Integer a, Integer b) { return a + b; }
+});
+counts.saveAsTextFile("hdfs://...");
+{% endhighlight %}
+</div>
+</div>
+</div>

-<p>Spark can <em>cache</em> datasets in memory to speed up reuse. In the example above, we can load just the error messages in RAM using:</p>
+<h3>Pi Estimation</h3>
+<p>Spark can also be used for compute-intensive tasks. This code estimates <span style="font-family: serif; font-size: 120%;">π</span> by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1, 1)) and see how many fall in the unit circle. The fraction should be <span style="font-family: serif; font-size: 120%;">π / 4</span>, so we use this to get our estimate.</p>
 
 <ul class="nav nav-tabs">
   <li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
   <li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
   <li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
 </ul>
+
 <div class="tab-content">
-  <div class="tab-pane tab-pane-python active">
-    <div class="code code-tab">
-      errors.<span class="sparkop">cache</span>()
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-scala">
-    <div class="code code-tab">
-      errors.<span class="sparkop">cache</span>()
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-java">
-    <div class="code code-tab">
-      errors.<span class="sparkop">cache</span>();
-    </div>
-  </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+def sample(p):
+    x, y = random(), random()
+    return 1 if x*x + y*y < 1 else 0
+
+count = sc.parallelize(xrange(0, NUM_SAMPLES)).map(sample) \
+             .reduce(lambda a, b: a + b)
+print "Pi is roughly %f" % (4.0 * count / NUM_SAMPLES)
+{% endhighlight %}
+</div>
 </div>

-<p>After the first action that uses <code>errors</code>, later ones will be much faster.</p>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+val count = sc.parallelize(1 to NUM_SAMPLES).map{i =>
+  val x = Math.random()
+  val y = Math.random()
+  if (x*x + y*y < 1) 1 else 0
+}.reduce(_ + _)
+println("Pi is roughly " + 4.0 * count / NUM_SAMPLES)
+{% endhighlight %}
+</div>
+</div>

+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+List<Integer> l = new ArrayList<Integer>(NUM_SAMPLES);
+for (int i = 0; i < NUM_SAMPLES; i++) {
+  l.add(i);
+}
+
+long count = sc.parallelize(l).filter(new Function<Integer, Boolean>() {
+  public Boolean call(Integer i) {
+    double x = Math.random();
+    double y = Math.random();
+    return x*x + y*y < 1;
+  }
+}).count();
+System.out.println("Pi is roughly " + 4.0 * count / NUM_SAMPLES);
{% endhighlight %}
+</div>
+</div>
+</div>

-<h3>Word Count</h3>
+<h2>DataFrame API Examples</h2>
+<p>
+In Spark, a <a href="http://spark.apache.org/docs/latest/sql-programming-guide.html#dataframes">DataFrame</a>
+is a distributed collection of data organized into named columns.
+Users can use the DataFrame API to perform various relational operations on both external
+data sources and Spark's built-in distributed collections without providing specific procedures for processing data.
+Also, programs based on the DataFrame API are automatically optimized by Spark's built-in optimizer, Catalyst.
+</p>

-<p>In this example, we use a few more transformations to build a dataset of (String, Int) pairs called <code>counts</code> and then save it to a file.</p>
+<h3>Text Search</h3>
+<p>In this example, we search through the error messages in a log file.</p>
 
 <ul class="nav nav-tabs">
   <li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
   <li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
   <li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
 </ul>
+
 <div class="tab-content">
-  <div class="tab-pane tab-pane-python active">
-    <div class="code code-tab">
-      text_file = spark.textFile(<span class="string">"hdfs://..."</span>)<br>
-      counts = text_file.<span class="sparkop">flatMap</span>(<span class="closure">lambda line: line.split(" ")</span>) \<br>
-        .<span class="sparkop">map</span>(<span class="closure">lambda word: (word, 1)</span>) \<br>
-        .<span class="sparkop">reduceByKey</span>(<span class="closure">lambda a, b: a + b</span>)<br>
-      counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>)
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-scala">
-    <div class="code code-tab">
-      <span class="keyword">val</span> textFile = spark.textFile(<span class="string">"hdfs://..."</span>)<br>
-      <span class="keyword">val</span> counts = textFile.<span class="sparkop">flatMap</span>(<span class="closure">line => line.split(" ")</span>)<br>
-        .<span class="sparkop">map</span>(<span class="closure">word => (word, 1)</span>)<br>
-        .<span class="sparkop">reduceByKey</span>(<span class="closure">_ + _</span>)<br>
-      counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>)
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-java">
-    <div class="code code-tab">
-      JavaRDD<String> textFile = spark.textFile(<span class="string">"hdfs://..."</span>);<br>
-      JavaRDD<String> words = textFile.<span class="sparkop">flatMap</span>(<span class="closure">new FlatMapFunction<String, String>() {<br>
-        public Iterable<String> call(String s) { return Arrays.asList(s.split(" ")); }<br>
-      }</span>);<br>
-      JavaPairRDD<String, Integer> pairs = words.<span class="sparkop">mapToPair</span>(<span class="closure">new PairFunction<String, String, Integer>() {<br>
-        public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); }<br>
-      }</span>);<br>
-      JavaPairRDD<String, Integer> counts = pairs.<span class="sparkop">reduceByKey</span>(<span class="closure">new Function2<Integer, Integer, Integer>() {<br>
-        public Integer call(Integer a, Integer b) { return a + b; }<br>
-      }</span>);<br>
-      counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>);
-    </div>
-  </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+textFile = sc.textFile("hdfs://...")
+
+# Creates a DataFrame having a single column named "line"
+df = textFile.map(lambda r: Row(r)).toDF(["line"])
+errors = df.filter(col("line").like("%ERROR%"))
+# Counts all the errors
+errors.count()
+# Counts errors mentioning MySQL
+errors.filter(col("line").like("%MySQL%")).count()
+# Fetches the MySQL errors as an array of strings
+errors.filter(col("line").like("%MySQL%")).collect()
+{% endhighlight %}
+</div>
 </div>

-<h3>Estimating Pi</h3>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+val textFile = sc.textFile("hdfs://...")
+
+// Creates a DataFrame having a single column named "line"
+val df = textFile.toDF("line")
+val errors = df.filter(col("line").like("%ERROR%"))
+// Counts all the errors
+errors.count()
+// Counts errors mentioning MySQL
+errors.filter(col("line").like("%MySQL%")).count()
+// Fetches the MySQL errors as an array of strings
+errors.filter(col("line").like("%MySQL%")).collect()
+{% endhighlight %}
+</div>
+</div>

-<p>Spark can also be used for compute-intensive tasks. This code estimates <span style="font-family: serif; font-size: 120%;">π</span> by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1,1)) and see how many fall in the unit circle. The fraction should be <span style="font-family: serif; font-size: 120%;">π / 4</span>, so we use this to get our estimate.</p>
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+// Creates a DataFrame having a single column named "line"
+JavaRDD<String> textFile = sc.textFile("hdfs://...");
+JavaRDD<Row> rowRDD = textFile.map(
+  new Function<String, Row>() {
+    public Row call(String line) throws Exception {
+      return RowFactory.create(line);
+    }
+  });
+List<StructField> fields = new ArrayList<StructField>();
+fields.add(DataTypes.createStructField("line", DataTypes.StringType, true));
+StructType schema = DataTypes.createStructType(fields);
+DataFrame df = sqlContext.createDataFrame(rowRDD, schema);
+
+DataFrame errors = df.filter(col("line").like("%ERROR%"));
+// Counts all the errors
+errors.count();
+// Counts errors mentioning MySQL
+errors.filter(col("line").like("%MySQL%")).count();
+// Fetches the MySQL errors as an array of strings
+errors.filter(col("line").like("%MySQL%")).collect();
+{% endhighlight %}
+</div>
+</div>
+</div>
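The Python snippet above uses `Row` and `col` without showing where they come from. A minimal sketch of the imports it assumes (the Scala version likewise needs `org.apache.spark.sql.functions.col` in scope):

{% highlight python %}
# Imports assumed by the Python example above (Spark 1.x packages).
from pyspark.sql import Row               # used to wrap each line in a Row
from pyspark.sql.functions import col     # column expression passed to filter()
{% endhighlight %}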
+
+<h3>Simple Data Operations</h3>
+<p>
+In this example, we read a table stored in a database and calculate the number of people for every age.
+Finally, we save the result to S3 in JSON format.
+The example uses a simple MySQL table "people" with two columns,
+"name" and "age".
+</p>
 
 <ul class="nav nav-tabs">
   <li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
   <li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
   <li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
 </ul>
+
 <div class="tab-content">
-  <div class="tab-pane tab-pane-python active">
-    <div class="code code-tab">
-      <span class="keyword">def</span> sample(p):<br>
-        x, y = random(), random()<br>
-        <span class="keyword">return</span> 1 <span class="keyword">if</span> x*x + y*y < 1 <span class="keyword">else</span> 0<br><br>
-      count = spark.parallelize(xrange(0, NUM_SAMPLES)).<span class="sparkop">map</span>(<span class="closure">sample</span>) \<br>
-        .<span class="sparkop">reduce</span>(<span class="closure">lambda a, b: a + b</span>)<br>
-      print <span class="string">"Pi is roughly %f"</span> % (4.0 * count / NUM_SAMPLES)<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-scala">
-    <div class="code code-tab">
-      <span class="keyword">val</span> count = spark.parallelize(1 to NUM_SAMPLES).<span class="sparkop">map</span>{<span class="closure">i =><br>
-        val x = Math.random()<br>
-        val y = Math.random()<br>
-        if (x*x + y*y < 1) 1 else 0<br>
-      </span>}.<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br>
-      println(<span class="string">"Pi is roughly "</span> + 4.0 * count / NUM_SAMPLES)<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-java">
-    <div class="code code-tab">
-      <span class="keyword">int</span> count = spark.parallelize(makeRange(1, NUM_SAMPLES)).<span class="sparkop">filter</span>(<span class="closure">new Function<Integer, Boolean>() {<br>
-        public Boolean call(Integer i) {<br>
-          double x = Math.random();<br>
-          double y = Math.random();<br>
-          return x*x + y*y < 1;<br>
-        }<br>
-      }</span>).<span class="sparkop">count</span>();<br>
-      System.out.println(<span class="string">"Pi is roughly "</span> + 4 * count / NUM_SAMPLES);<br>
-    </div>
-  </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+# Creates a DataFrame based on a table named "people"
+# stored in a MySQL database.
+url = \
+  "jdbc:mysql://yourIP:yourPort/test?user=yourUsername;password=yourPassword"
+df = sqlContext \
+  .read \
+  .format("jdbc") \
+  .option("url", url) \
+  .option("dbtable", "people") \
+  .load()
+
+# Prints the schema of this DataFrame.
+df.printSchema()
+
+# Counts people by age
+countsByAge = df.groupBy("age").count()
+countsByAge.show()
+
+# Saves countsByAge to S3 in the JSON format.
+countsByAge.write.format("json").save("s3a://...")
+{% endhighlight %}
+</div>
 </div>

-<h3>Logistic Regression</h3>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+// Creates a DataFrame based on a table named "people"
+// stored in a MySQL database.
+val url =
+  "jdbc:mysql://yourIP:yourPort/test?user=yourUsername;password=yourPassword"
+val df = sqlContext
+  .read
+  .format("jdbc")
+  .option("url", url)
+  .option("dbtable", "people")
+  .load()
+
+// Prints the schema of this DataFrame.
+df.printSchema()
+
+// Counts people by age
+val countsByAge = df.groupBy("age").count()
+countsByAge.show()
+
+// Saves countsByAge to S3 in the JSON format.
+countsByAge.write.format("json").save("s3a://...")
+{% endhighlight %}
+</div>
+</div>
+
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+// Creates a DataFrame based on a table named "people"
+// stored in a MySQL database.
+String url =
+  "jdbc:mysql://yourIP:yourPort/test?user=yourUsername;password=yourPassword";
+DataFrame df = sqlContext
+  .read()
+  .format("jdbc")
+  .option("url", url)
+  .option("dbtable", "people")
+  .load();
+
+// Prints the schema of this DataFrame.
+df.printSchema();
+
+// Counts people by age
+DataFrame countsByAge = df.groupBy("age").count();
+countsByAge.show();
+
+// Saves countsByAge to S3 in the JSON format.
+countsByAge.write().format("json").save("s3a://...");
+{% endhighlight %}
+</div>
+</div>
+</div>
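Because the aggregation above is purely relational, the same result can also be computed with a SQL query rather than DataFrame method calls. A minimal Python sketch, assuming the `df` read from the "people" table above and an existing `sqlContext`:

{% highlight python %}
# Sketch only: assumes `df` is the DataFrame read from the "people" table
# above and that `sqlContext` is an existing SQLContext.
# Register the DataFrame as a temporary table so it can be queried with SQL.
df.registerTempTable("people")

# The same aggregation as df.groupBy("age").count(), expressed in SQL.
countsByAge = sqlContext.sql(
    "SELECT age, COUNT(*) AS count FROM people GROUP BY age")
countsByAge.show()
{% endhighlight %}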
-<p>This is an iterative machine learning algorithm that seeks to find the best hyperplane that separates two sets of points in a multi-dimensional feature space. It can be used to classify messages into spam vs non-spam, for example. Because the algorithm applies the same MapReduce operation repeatedly to the same dataset, it benefits greatly from caching the input in RAM across iterations.</p>
+<h2>Machine Learning Example</h2>
+<p>
+<a href="http://spark.apache.org/docs/latest/mllib-guide.html">MLlib</a>, Spark's Machine Learning (ML) library, provides many distributed ML algorithms.
+These algorithms cover tasks such as feature extraction, classification, regression, clustering,
+recommendation, and more.
+MLlib also provides tools such as ML Pipelines for building workflows, CrossValidator for tuning parameters,
+and model persistence for saving and loading models.
+</p>
+
+<h3>Prediction with Logistic Regression</h3>
+<p>
+In this example, we take a dataset of labels and feature vectors.
+We learn to predict the labels from feature vectors using the Logistic Regression algorithm.
+</p>
 
 <ul class="nav nav-tabs">
   <li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
   <li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
   <li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
 </ul>
+
 <div class="tab-content">
-  <div class="tab-pane tab-pane-python active">
-    <div class="code code-tab">
-      points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br>
-      w = numpy.random.ranf(size = D) <span class="comment"># current separating plane</span><br>
-      <span class="keyword">for</span> i <span class="keyword">in</span> range(ITERATIONS):<br>
-        gradient = points.<span class="sparkop">map</span>(<span class="closure"><br>
-          lambda p: (1 / (1 + exp(-p.y*(w.dot(p.x)))) - 1) * p.y * p.x<br>
-        </span>).<span class="sparkop">reduce</span>(<span class="closure">lambda a, b: a + b</span>)<br>
-        w -= gradient<br>
-      print <span class="string">"Final separating plane: %s"</span> % w<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-scala">
-    <div class="code code-tab">
-      <span class="keyword">val</span> points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br>
-      <span class="keyword">var</span> w = Vector.random(D) <span class="comment">// current separating plane</span><br>
-      <span class="keyword">for</span> (i <- 1 to ITERATIONS) {<br>
-        <span class="keyword">val</span> gradient = points.<span class="sparkop">map</span>(<span class="closure">p =><br>
-          (1 / (1 + exp(-p.y*(w dot p.x))) - 1) * p.y * p.x<br>
-        </span>).<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br>
-        w -= gradient<br>
-      }<br>
-      println(<span class="string">"Final separating plane: "</span> + w)<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-java">
-    <div class="code code-tab">
-      <span class="keyword">class</span> ComputeGradient <span class="keyword">extends</span> Function<DataPoint, Vector> {<br>
-        <span class="keyword">private</span> Vector w;<br>
-        ComputeGradient(Vector w) { <span class="keyword">this</span>.w = w; }<br>
-        <span class="keyword">public</span> Vector call(DataPoint p) {<br>
-          <span class="keyword">return</span> p.x.times(p.y * (1 / (1 + Math.exp(w.dot(p.x))) - 1));<br>
-        }<br>
-      }<br>
-      <br>
-      JavaRDD<DataPoint> points = spark.textFile(...).<span class="sparkop">map</span>(<span class="closure">new ParsePoint()</span>).<span class="sparkop">cache</span>();<br>
-      Vector w = Vector.random(D); <span class="comment">// current separating plane</span><br>
-      <span class="keyword">for</span> (<span class="keyword">int</span> i = 0; i < ITERATIONS; i++) {<br>
-        Vector gradient = points.<span class="sparkop">map</span>(<span class="closure">new ComputeGradient(w)</span>).<span class="sparkop">reduce</span>(<span class="closure">new AddVectors()</span>);<br>
-        w = w.subtract(gradient);<br>
-      }<br>
-      System.out.println(<span class="string">"Final separating plane: "</span> + w);<br>
-    </div>
-  </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+# Every record of this DataFrame contains the label and
+# features represented by a vector.
+df = sqlContext.createDataFrame(data, ["label", "features"])
+
+# Set parameters for the algorithm.
+# Here, we limit the number of iterations to 10.
+lr = LogisticRegression(maxIter=10)
+
+# Fit the model to the data.
+model = lr.fit(df)
+
+# Given a dataset, predict each point's label, and show the results.
+model.transform(df).show()
+{% endhighlight %}
+</div>
 </div>

-<p>Note that the current separating plane, <code>w</code>, gets shipped automatically to the cluster with every <code>map</code> call.</p>
-
-<p>The graph below compares the running time per iteration of this Spark program against a Hadoop implementation on 100 GB of data on a 100-node cluster, showing the benefit of in-memory caching:</p>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+// Every record of this DataFrame contains the label and
+// features represented by a vector.
+val df = sqlContext.createDataFrame(data).toDF("label", "features")
+
+// Set parameters for the algorithm.
+// Here, we limit the number of iterations to 10.
+val lr = new LogisticRegression().setMaxIter(10)
+
+// Fit the model to the data.
+val model = lr.fit(df)
+
+// Inspect the model: get the feature weights.
+val weights = model.weights
+
+// Given a dataset, predict each point's label, and show the results.
+model.transform(df).show()
+{% endhighlight %}
+</div>
+</div>

-<p style="margin-top: 20px; margin-bottom: 30px;">
-<img src="{{site.url}}images/logistic-regression.png" alt="Logistic regression performance in Spark vs Hadoop">
-</p>
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+// Every record of this DataFrame contains the label and
+// features represented by a vector.
+StructType schema = new StructType(new StructField[]{
+  new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+  new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+
+// Set parameters for the algorithm.
+// Here, we limit the number of iterations to 10.
+LogisticRegression lr = new LogisticRegression().setMaxIter(10);
+
+// Fit the model to the data.
+LogisticRegressionModel model = lr.fit(df);
+
+// Inspect the model: get the feature weights.
+Vector weights = model.weights();
+
+// Given a dataset, predict each point's label, and show the results.
+model.transform(df).show();
+{% endhighlight %}
+</div>
+</div>
+</div>
 <a name="additional"></a>
-<h2>Additional Examples</h2>
+<h1>Additional Examples</h1>
 
 Many additional examples are distributed with Spark:
 
 * Basic Spark: [Scala examples](https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples), [Java examples](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples), [Python examples](https://github.com/apache/spark/tree/master/examples/src/main/python)
- * Spark Streaming: [Scala examples](https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples/streaming), [Java examples](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples/streaming)
-
+ * Spark Streaming: [Scala examples](https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples/streaming), [Java examples](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples/streaming)
\ No newline at end of file

Modified: spark/site/community.html
URL: http://svn.apache.org/viewvc/spark/site/community.html?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/site/community.html (original)
+++ spark/site/community.html Mon Jan 25 21:57:32 2016
@@ -18,6 +18,9 @@
   <link href="/css/cerulean.min.css" rel="stylesheet">
   <link href="/css/custom.css" rel="stylesheet">
 
+  <!-- Code highlighter CSS -->
+  <link href="/css/pygments-default.css" rel="stylesheet">
+
   <script type="text/javascript">
     <!-- Google Analytics initialization -->
     var _gaq = _gaq || [];

Modified: spark/site/documentation.html
URL: http://svn.apache.org/viewvc/spark/site/documentation.html?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/site/documentation.html (original)
+++ spark/site/documentation.html Mon Jan 25 21:57:32 2016
@@ -18,6 +18,9 @@
   <link href="/css/cerulean.min.css" rel="stylesheet">
   <link href="/css/custom.css" rel="stylesheet">
 
+  <!-- Code highlighter CSS -->
+  <link href="/css/pygments-default.css" rel="stylesheet">
+
   <script type="text/javascript">
     <!-- Google Analytics initialization -->
     var _gaq = _gaq || [];

Modified: spark/site/downloads.html
URL: http://svn.apache.org/viewvc/spark/site/downloads.html?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/site/downloads.html (original)
+++ spark/site/downloads.html Mon Jan 25 21:57:32 2016
@@ -18,6 +18,9 @@
   <link href="/css/cerulean.min.css" rel="stylesheet">
   <link href="/css/custom.css" rel="stylesheet">
 
+  <!-- Code highlighter CSS -->
+  <link href="/css/pygments-default.css" rel="stylesheet">
+
   <script type="text/javascript">
     <!-- Google Analytics initialization -->
     var _gaq = _gaq || [];