Author: yhuai
Date: Mon Jan 25 21:57:32 2016
New Revision: 1726699

URL: http://svn.apache.org/viewvc?rev=1726699&view=rev
Log:
Update the Spark example page to include examples using high level APIs
Modified:
    spark/_config.yml
    spark/_layouts/global.html
    spark/examples.md
    spark/site/community.html
    spark/site/documentation.html
    spark/site/downloads.html
    spark/site/examples.html
    spark/site/faq.html
    spark/site/graphx/index.html
    spark/site/index.html
    spark/site/mailing-lists.html
    spark/site/mllib/index.html
    spark/site/news/amp-camp-2013-registration-ope.html
    spark/site/news/announcing-the-first-spark-summit.html
    spark/site/news/fourth-spark-screencast-published.html
    spark/site/news/index.html
    spark/site/news/nsdi-paper.html
    spark/site/news/one-month-to-spark-summit-2015.html
    spark/site/news/proposals-open-for-spark-summit-east.html
    spark/site/news/registration-open-for-spark-summit-east.html
    spark/site/news/run-spark-and-shark-on-amazon-emr.html
    spark/site/news/spark-0-6-1-and-0-5-2-released.html
    spark/site/news/spark-0-6-2-released.html
    spark/site/news/spark-0-7-0-released.html
    spark/site/news/spark-0-7-2-released.html
    spark/site/news/spark-0-7-3-released.html
    spark/site/news/spark-0-8-0-released.html
    spark/site/news/spark-0-8-1-released.html
    spark/site/news/spark-0-9-0-released.html
    spark/site/news/spark-0-9-1-released.html
    spark/site/news/spark-0-9-2-released.html
    spark/site/news/spark-1-0-0-released.html
    spark/site/news/spark-1-0-1-released.html
    spark/site/news/spark-1-0-2-released.html
    spark/site/news/spark-1-1-0-released.html
    spark/site/news/spark-1-1-1-released.html
    spark/site/news/spark-1-2-0-released.html
    spark/site/news/spark-1-2-1-released.html
    spark/site/news/spark-1-2-2-released.html
    spark/site/news/spark-1-3-0-released.html
    spark/site/news/spark-1-4-0-released.html
    spark/site/news/spark-1-4-1-released.html
    spark/site/news/spark-1-5-0-released.html
    spark/site/news/spark-1-5-1-released.html
    spark/site/news/spark-1-5-2-released.html
    spark/site/news/spark-1-6-0-released.html
    spark/site/news/spark-accepted-into-apache-incubator.html
    spark/site/news/spark-and-shark-in-the-news.html
    spark/site/news/spark-becomes-tlp.html
    spark/site/news/spark-featured-in-wired.html
    spark/site/news/spark-mailing-lists-moving-to-apache.html
    spark/site/news/spark-meetups.html
    spark/site/news/spark-screencasts-published.html
    spark/site/news/spark-summit-2013-is-a-wrap.html
    spark/site/news/spark-summit-2014-videos-posted.html
    spark/site/news/spark-summit-2015-videos-posted.html
    spark/site/news/spark-summit-agenda-posted.html
    spark/site/news/spark-summit-east-2015-videos-posted.html
    spark/site/news/spark-summit-east-2016-cfp-closing.html
    spark/site/news/spark-summit-east-agenda-posted.html
    spark/site/news/spark-summit-europe-agenda-posted.html
    spark/site/news/spark-summit-europe.html
    spark/site/news/spark-tips-from-quantifind.html
    spark/site/news/spark-user-survey-and-powered-by-page.html
    spark/site/news/spark-version-0-6-0-released.html
    spark/site/news/spark-wins-daytona-gray-sort-100tb-benchmark.html
    spark/site/news/strata-exercises-now-available-online.html
    spark/site/news/submit-talks-to-spark-summit-2014.html
    spark/site/news/submit-talks-to-spark-summit-east-2016.html
    spark/site/news/two-weeks-to-spark-summit-2014.html
    spark/site/news/video-from-first-spark-development-meetup.html
    spark/site/releases/spark-release-0-3.html
    spark/site/releases/spark-release-0-5-0.html
    spark/site/releases/spark-release-0-5-1.html
    spark/site/releases/spark-release-0-5-2.html
    spark/site/releases/spark-release-0-6-0.html
    spark/site/releases/spark-release-0-6-1.html
    spark/site/releases/spark-release-0-6-2.html
    spark/site/releases/spark-release-0-7-0.html
    spark/site/releases/spark-release-0-7-2.html
    spark/site/releases/spark-release-0-7-3.html
    spark/site/releases/spark-release-0-8-0.html
    spark/site/releases/spark-release-0-8-1.html
    spark/site/releases/spark-release-0-9-0.html
    spark/site/releases/spark-release-0-9-1.html
    spark/site/releases/spark-release-0-9-2.html
    spark/site/releases/spark-release-1-0-0.html
    spark/site/releases/spark-release-1-0-1.html
    spark/site/releases/spark-release-1-0-2.html
    spark/site/releases/spark-release-1-1-0.html
    spark/site/releases/spark-release-1-1-1.html
    spark/site/releases/spark-release-1-2-0.html
    spark/site/releases/spark-release-1-2-1.html
    spark/site/releases/spark-release-1-2-2.html
    spark/site/releases/spark-release-1-3-0.html
    spark/site/releases/spark-release-1-3-1.html
    spark/site/releases/spark-release-1-4-0.html
    spark/site/releases/spark-release-1-4-1.html
    spark/site/releases/spark-release-1-5-0.html
    spark/site/releases/spark-release-1-5-1.html
    spark/site/releases/spark-release-1-5-2.html
    spark/site/releases/spark-release-1-6-0.html
    spark/site/research.html
    spark/site/screencasts/1-first-steps-with-spark.html
    spark/site/screencasts/2-spark-documentation-overview.html
    spark/site/screencasts/3-transformations-and-caching.html
    spark/site/screencasts/4-a-standalone-job-in-spark.html
    spark/site/screencasts/index.html
    spark/site/sql/index.html
    spark/site/streaming/index.html

Modified: spark/_config.yml
URL: http://svn.apache.org/viewvc/spark/_config.yml?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/_config.yml (original)
+++ spark/_config.yml Mon Jan 25 21:57:32 2016
@@ -1,4 +1,6 @@
-pygments: true
+# pygments option has been renamed to highlighter.
+# pygments: true
+highlighter: pygments
 markdown: kramdown
 kramdown:
   entity_output: symbol

Modified: spark/_layouts/global.html
URL: http://svn.apache.org/viewvc/spark/_layouts/global.html?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/_layouts/global.html (original)
+++ spark/_layouts/global.html Mon Jan 25 21:57:32 2016
@@ -24,6 +24,9 @@
   <link href="{{site.url}}css/cerulean.min.css" rel="stylesheet">
   <link href="{{site.url}}css/custom.css" rel="stylesheet">
 
+  <!-- Code highlighter CSS -->
+  <link href="{{site.url}}css/pygments-default.css" rel="stylesheet">
+
   <script type="text/javascript">
     <!-- Google Analytics initialization -->
     var _gaq = _gaq || [];

Modified: spark/examples.md
URL: http://svn.apache.org/viewvc/spark/examples.md?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/examples.md (original)
+++ spark/examples.md Mon Jan 25 21:57:32 2016
@@ -6,258 +6,411 @@ navigation:
   weight: 4
   show: true
 ---
-<h2>Spark Examples</h2>
+<h1>Spark Examples</h1>
 
 These examples give a quick overview of the Spark API. Spark is built on the concept of
 <em>distributed datasets</em>, which contain arbitrary Java or Python objects. You create a dataset
 from external data, then apply parallel operations
-to it. There are two types of operations: <em>transformations</em>, which define a new dataset based on
-previous ones, and <em>actions</em>, which kick off a job to execute on a cluster.
+to it. The building block of the Spark API is its [RDD API](http://spark.apache.org/docs/latest/programming-guide.html#resilient-distributed-datasets-rdds).
+In the RDD API,
+there are two types of operations: <em>transformations</em>, which define a new dataset based on previous ones,
+and <em>actions</em>, which kick off a job to execute on a cluster.
+On top of Spark's RDD API, high-level APIs are provided, e.g. the
+[DataFrame API](http://spark.apache.org/docs/latest/sql-programming-guide.html#dataframes) and the
+[Machine Learning API](http://spark.apache.org/docs/latest/mllib-guide.html).
+These high-level APIs provide a concise way to conduct certain data operations.
+On this page, we show examples using the RDD API as well as examples using high-level APIs.
 
-<h3>Text Search</h3>
+<h2>RDD API Examples</h2>
 
-In this example, we search through the error messages in a log file:
+<h3>Word Count</h3>
+<p>In this example, we use a few transformations to build a dataset of (String, Int) pairs called <code>counts</code> and then save it to a file.</p>
 
 <ul class="nav nav-tabs">
   <li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
   <li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
   <li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
 </ul>
+
 <div class="tab-content">
-  <div class="tab-pane tab-pane-python active">
-    <div class="code code-tab">
-      text_file = spark.textFile(<span class="string">"hdfs://..."</span>)<br />
-      errors = text_file.<span class="sparkop">filter</span>(<span class="closure">lambda line: "ERROR" in line</span>)<br />
-      <span class="comment"># Count all the errors</span><br>
-      errors.<span class="sparkop">count</span>()<br>
-      <span class="comment"># Count errors mentioning MySQL</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">lambda line: "MySQL" in line</span>).<span class="sparkop">count</span>()<br>
-      <span class="comment"># Fetch the MySQL errors as an array of strings</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">lambda line: "MySQL" in line</span>).<span class="sparkop">collect</span>()<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-scala">
-    <div class="code code-tab">
-      <span class="keyword">val</span> textFile = spark.textFile(<span class="string">"hdfs://..."</span>)<br>
-      <span class="keyword">val</span> errors = textFile.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("ERROR")</span>)<br>
-      <span class="comment">// Count all the errors</span><br>
-      errors.<span class="sparkop">count</span>()<br>
-      <span class="comment">// Count errors mentioning MySQL</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("MySQL")</span>).<span class="sparkop">count</span>()<br>
-      <span class="comment">// Fetch the MySQL errors as an array of strings</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">line => line.contains("MySQL")</span>).<span class="sparkop">collect</span>()<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-java">
-    <div class="code code-tab">
-      JavaRDD<String> textFile = spark.textFile(<span class="string">"hdfs://..."</span>);<br>
-      JavaRDD<String> errors = textFile.<span class="sparkop">filter</span>(<span class="closure">new Function<String, Boolean>() {<br>
-        public Boolean call(String s) { return s.contains("ERROR"); }<br>
-      }</span>);<br>
-      <span class="comment">// Count all the errors</span><br>
-      errors.<span class="sparkop">count</span>();<br>
-      <span class="comment">// Count errors mentioning MySQL</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">new Function<String, Boolean>() {<br>
-        public Boolean call(String s) { return s.contains("MySQL"); }<br>
-      }</span>).<span class="sparkop">count</span>();<br>
-      <span class="comment">// Fetch the MySQL errors as an array of strings</span><br>
-      errors.<span class="sparkop">filter</span>(<span class="closure">new Function<String, Boolean>() {<br>
-        public Boolean call(String s) { return s.contains("MySQL"); }<br>
-      }</span>).<span class="sparkop">collect</span>();<br>
-    </div>
-  </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+text_file = sc.textFile("hdfs://...")
+counts = text_file.flatMap(lambda line: line.split(" ")) \
+             .map(lambda word: (word, 1)) \
+             .reduceByKey(lambda a, b: a + b)
+counts.saveAsTextFile("hdfs://...")
+{% endhighlight %}
+</div>
 </div>

-<p>The red code fragments are function literals (closures) that get passed automatically to the cluster. The blue ones are Spark operations.</p>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+val textFile = sc.textFile("hdfs://...")
+val counts = textFile.flatMap(line => line.split(" "))
+                 .map(word => (word, 1))
+                 .reduceByKey(_ + _)
+counts.saveAsTextFile("hdfs://...")
+{% endhighlight %}
+</div>
+</div>

-<h3>In-Memory Text Search</h3>
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+JavaRDD<String> textFile = sc.textFile("hdfs://...");
+JavaRDD<String> words = textFile.flatMap(new FlatMapFunction<String, String>() {
+  public Iterable<String> call(String s) { return Arrays.asList(s.split(" ")); }
+});
+JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
+  public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); }
+});
+JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
+  public Integer call(Integer a, Integer b) { return a + b; }
+});
+counts.saveAsTextFile("hdfs://...");
+{% endhighlight %}
+</div>
+</div>
+</div>

-<p>Spark can <em>cache</em> datasets in memory to speed up reuse. In the example above, we can load just the error messages in RAM using:</p>
+<h3>Pi Estimation</h3>
+<p>Spark can also be used for compute-intensive tasks. This code estimates <span style="font-family: serif; font-size: 120%;">π</span> by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1, 1)) and see how many fall in the unit circle. The fraction should be <span style="font-family: serif; font-size: 120%;">π / 4</span>, so we use this to get our estimate.</p>
 
 <ul class="nav nav-tabs">
   <li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
   <li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
   <li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
 </ul>
+
 <div class="tab-content">
-  <div class="tab-pane tab-pane-python active">
-    <div class="code code-tab">
-      errors.<span class="sparkop">cache</span>()
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-scala">
-    <div class="code code-tab">
-      errors.<span class="sparkop">cache</span>()
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-java">
-    <div class="code code-tab">
-      errors.<span class="sparkop">cache</span>();
-    </div>
-  </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+def sample(p):
+    x, y = random(), random()
+    return 1 if x*x + y*y < 1 else 0
+
+count = sc.parallelize(xrange(0, NUM_SAMPLES)).map(sample) \
+             .reduce(lambda a, b: a + b)
+print "Pi is roughly %f" % (4.0 * count / NUM_SAMPLES)
+{% endhighlight %}
+</div>
 </div>

-<p>After the first action that uses <code>errors</code>, later ones will be much faster.</p>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+val count = sc.parallelize(1 to NUM_SAMPLES).map{i =>
+  val x = Math.random()
+  val y = Math.random()
+  if (x*x + y*y < 1) 1 else 0
+}.reduce(_ + _)
+println("Pi is roughly " + 4.0 * count / NUM_SAMPLES)
+{% endhighlight %}
+</div>
+</div>

+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+List<Integer> l = new ArrayList<Integer>(NUM_SAMPLES);
+for (int i = 0; i < NUM_SAMPLES; i++) {
+  l.add(i);
+}
+
+long count = sc.parallelize(l).filter(new Function<Integer, Boolean>() {
+  public Boolean call(Integer i) {
+    double x = Math.random();
+    double y = Math.random();
+    return x*x + y*y < 1;
+  }
+}).count();
+System.out.println("Pi is roughly " + 4.0 * count / NUM_SAMPLES);
{% endhighlight %}
+</div>
+</div>
+</div>

-<h3>Word Count</h3>
+<h2>DataFrame API Examples</h2>
+<p>
+In Spark, a <a href="http://spark.apache.org/docs/latest/sql-programming-guide.html#dataframes">DataFrame</a>
+is a distributed collection of data organized into named columns.
+Users can use the DataFrame API to perform various relational operations on both external
+data sources and Spark's built-in distributed collections without providing specific procedures for processing data.
+Also, programs based on the DataFrame API are automatically optimized by Spark's built-in optimizer, Catalyst.
+</p>

-<p>In this example, we use a few more transformations to build a dataset of (String, Int) pairs called <code>counts</code> and then save it to a file.</p>
+<h3>Text Search</h3>
+<p>In this example, we search through the error messages in a log file.</p>
 
 <ul class="nav nav-tabs">
   <li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
   <li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
   <li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
 </ul>
+
 <div class="tab-content">
-  <div class="tab-pane tab-pane-python active">
-    <div class="code code-tab">
-      text_file = spark.textFile(<span class="string">"hdfs://..."</span>)<br>
-      counts = text_file.<span class="sparkop">flatMap</span>(<span class="closure">lambda line: line.split(" ")</span>) \<br>
-        .<span class="sparkop">map</span>(<span class="closure">lambda word: (word, 1)</span>) \<br>
-        .<span class="sparkop">reduceByKey</span>(<span class="closure">lambda a, b: a + b</span>)<br>
-      counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>)
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-scala">
-    <div class="code code-tab">
-      <span class="keyword">val</span> textFile = spark.textFile(<span class="string">"hdfs://..."</span>)<br>
-      <span class="keyword">val</span> counts = textFile.<span class="sparkop">flatMap</span>(<span class="closure">line => line.split(" ")</span>)<br>
-        .<span class="sparkop">map</span>(<span class="closure">word => (word, 1)</span>)<br>
-        .<span class="sparkop">reduceByKey</span>(<span class="closure">_ + _</span>)<br>
-      counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>)
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-java">
-    <div class="code code-tab">
-      JavaRDD<String> textFile = spark.textFile(<span class="string">"hdfs://..."</span>);<br>
-      JavaRDD<String> words = textFile.<span class="sparkop">flatMap</span>(<span class="closure">new FlatMapFunction<String, String>() {<br>
-        public Iterable<String> call(String s) { return Arrays.asList(s.split(" ")); }<br>
-      }</span>);<br>
-      JavaPairRDD<String, Integer> pairs = words.<span class="sparkop">mapToPair</span>(<span class="closure">new PairFunction<String, String, Integer>() {<br>
-        public Tuple2<String, Integer> call(String s) { return new Tuple2<String, Integer>(s, 1); }<br>
-      }</span>);<br>
-      JavaPairRDD<String, Integer> counts = pairs.<span class="sparkop">reduceByKey</span>(<span class="closure">new Function2<Integer, Integer, Integer>() {<br>
-        public Integer call(Integer a, Integer b) { return a + b; }<br>
-      }</span>);<br>
-      counts.<span class="sparkop">saveAsTextFile</span>(<span class="string">"hdfs://..."</span>);
-    </div>
-  </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+textFile = sc.textFile("hdfs://...")
+
+# Creates a DataFrame having a single column named "line"
+df = textFile.map(lambda r: Row(r)).toDF(["line"])
+errors = df.filter(col("line").like("%ERROR%"))
+# Counts all the errors
+errors.count()
+# Counts errors mentioning MySQL
+errors.filter(col("line").like("%MySQL%")).count()
+# Fetches the MySQL errors as an array of strings
+errors.filter(col("line").like("%MySQL%")).collect()
+{% endhighlight %}
+</div>
 </div>

-<h3>Estimating Pi</h3>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+val textFile = sc.textFile("hdfs://...")
+
+// Creates a DataFrame having a single column named "line"
+val df = textFile.toDF("line")
+val errors = df.filter(col("line").like("%ERROR%"))
+// Counts all the errors
+errors.count()
+// Counts errors mentioning MySQL
+errors.filter(col("line").like("%MySQL%")).count()
+// Fetches the MySQL errors as an array of strings
+errors.filter(col("line").like("%MySQL%")).collect()
+{% endhighlight %}
+</div>
+</div>

-<p>Spark can also be used for compute-intensive tasks. This code estimates <span style="font-family: serif; font-size: 120%;">π</span> by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1,1)) and see how many fall in the unit circle. The fraction should be <span style="font-family: serif; font-size: 120%;">π / 4</span>, so we use this to get our estimate.</p>
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+// Creates a DataFrame having a single column named "line"
+JavaRDD<String> textFile = sc.textFile("hdfs://...");
+JavaRDD<Row> rowRDD = textFile.map(
+  new Function<String, Row>() {
+    public Row call(String line) throws Exception {
+      return RowFactory.create(line);
+    }
+  });
+List<StructField> fields = new ArrayList<StructField>();
+fields.add(DataTypes.createStructField("line", DataTypes.StringType, true));
+StructType schema = DataTypes.createStructType(fields);
+DataFrame df = sqlContext.createDataFrame(rowRDD, schema);
+
+DataFrame errors = df.filter(col("line").like("%ERROR%"));
+// Counts all the errors
+errors.count();
+// Counts errors mentioning MySQL
+errors.filter(col("line").like("%MySQL%")).count();
+// Fetches the MySQL errors as an array of strings
+errors.filter(col("line").like("%MySQL%")).collect();
+{% endhighlight %}
+</div>
+</div>
+</div>
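The Python snippet above uses `Row` and `col` without showing where they come from. A minimal sketch of the imports it assumes (the Scala version likewise needs `org.apache.spark.sql.functions.col` in scope):

{% highlight python %}
# Imports assumed by the Python example above (Spark 1.x packages).
from pyspark.sql import Row               # used to wrap each line in a Row
from pyspark.sql.functions import col     # column expression passed to filter()
{% endhighlight %}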
+
+<h3>Simple Data Operations</h3>
+<p>
+In this example, we read a table stored in a database and calculate the number of people for every age.
+Finally, we save the result to S3 in JSON format.
+The example uses a simple MySQL table "people" with two columns,
+"name" and "age".
+</p>
 
 <ul class="nav nav-tabs">
   <li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
   <li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
   <li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
 </ul>
+
 <div class="tab-content">
-  <div class="tab-pane tab-pane-python active">
-    <div class="code code-tab">
-      <span class="keyword">def</span> sample(p):<br>
-        x, y = random(), random()<br>
-        <span class="keyword">return</span> 1 <span class="keyword">if</span> x*x + y*y < 1 <span class="keyword">else</span> 0<br><br>
-      count = spark.parallelize(xrange(0, NUM_SAMPLES)).<span class="sparkop">map</span>(<span class="closure">sample</span>) \<br>
-        .<span class="sparkop">reduce</span>(<span class="closure">lambda a, b: a + b</span>)<br>
-      print <span class="string">"Pi is roughly %f"</span> % (4.0 * count / NUM_SAMPLES)<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-scala">
-    <div class="code code-tab">
-      <span class="keyword">val</span> count = spark.parallelize(1 to NUM_SAMPLES).<span class="sparkop">map</span>{<span class="closure">i =><br>
-        val x = Math.random()<br>
-        val y = Math.random()<br>
-        if (x*x + y*y < 1) 1 else 0<br>
-      </span>}.<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br>
-      println(<span class="string">"Pi is roughly "</span> + 4.0 * count / NUM_SAMPLES)<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-java">
-    <div class="code code-tab">
-      <span class="keyword">int</span> count = spark.parallelize(makeRange(1, NUM_SAMPLES)).<span class="sparkop">filter</span>(<span class="closure">new Function<Integer, Boolean>() {<br>
-        public Boolean call(Integer i) {<br>
-          double x = Math.random();<br>
-          double y = Math.random();<br>
-          return x*x + y*y < 1;<br>
-        }<br>
-      }</span>).<span class="sparkop">count</span>();<br>
-      System.out.println(<span class="string">"Pi is roughly "</span> + 4 * count / NUM_SAMPLES);<br>
-    </div>
-  </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+# Creates a DataFrame based on a table named "people"
+# stored in a MySQL database.
+url = \
+  "jdbc:mysql://yourIP:yourPort/test?user=yourUsername;password=yourPassword"
+df = sqlContext \
+  .read \
+  .format("jdbc") \
+  .option("url", url) \
+  .option("dbtable", "people") \
+  .load()
+
+# Prints the schema of this DataFrame.
+df.printSchema()
+
+# Counts people by age
+countsByAge = df.groupBy("age").count()
+countsByAge.show()
+
+# Saves countsByAge to S3 in the JSON format.
+countsByAge.write.format("json").save("s3a://...")
+{% endhighlight %}
+</div>
 </div>

-<h3>Logistic Regression</h3>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+// Creates a DataFrame based on a table named "people"
+// stored in a MySQL database.
+val url =
+  "jdbc:mysql://yourIP:yourPort/test?user=yourUsername;password=yourPassword"
+val df = sqlContext
+  .read
+  .format("jdbc")
+  .option("url", url)
+  .option("dbtable", "people")
+  .load()
+
+// Prints the schema of this DataFrame.
+df.printSchema()
+
+// Counts people by age
+val countsByAge = df.groupBy("age").count()
+countsByAge.show()
+
+// Saves countsByAge to S3 in the JSON format.
+countsByAge.write.format("json").save("s3a://...")
+{% endhighlight %}
+</div>
+</div>
+
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+// Creates a DataFrame based on a table named "people"
+// stored in a MySQL database.
+String url =
+  "jdbc:mysql://yourIP:yourPort/test?user=yourUsername;password=yourPassword";
+DataFrame df = sqlContext
+  .read()
+  .format("jdbc")
+  .option("url", url)
+  .option("dbtable", "people")
+  .load();
+
+// Prints the schema of this DataFrame.
+df.printSchema();
+
+// Counts people by age
+DataFrame countsByAge = df.groupBy("age").count();
+countsByAge.show();
+
+// Saves countsByAge to S3 in the JSON format.
+countsByAge.write().format("json").save("s3a://...");
+{% endhighlight %}
+</div>
+</div>
+</div>
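Because the aggregation above is purely relational, the same result can also be computed with a SQL query rather than DataFrame method calls. A minimal Python sketch, assuming the `df` read from the "people" table above and an existing `sqlContext`:

{% highlight python %}
# Sketch only: assumes `df` is the DataFrame read from the "people" table
# above and that `sqlContext` is an existing SQLContext.
# Register the DataFrame as a temporary table so it can be queried with SQL.
df.registerTempTable("people")

# The same aggregation as df.groupBy("age").count(), expressed in SQL.
countsByAge = sqlContext.sql(
    "SELECT age, COUNT(*) AS count FROM people GROUP BY age")
countsByAge.show()
{% endhighlight %}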
-<p>This is an iterative machine learning algorithm that seeks to find the best hyperplane that separates two sets of points in a multi-dimensional feature space. It can be used to classify messages into spam vs non-spam, for example. Because the algorithm applies the same MapReduce operation repeatedly to the same dataset, it benefits greatly from caching the input in RAM across iterations.</p>
+<h2>Machine Learning Example</h2>
+<p>
+<a href="http://spark.apache.org/docs/latest/mllib-guide.html">MLlib</a>, Spark's Machine Learning (ML) library, provides many distributed ML algorithms.
+These algorithms cover tasks such as feature extraction, classification, regression, clustering,
+recommendation, and more.
+MLlib also provides tools such as ML Pipelines for building workflows, CrossValidator for tuning parameters,
+and model persistence for saving and loading models.
+</p>
+
+<h3>Prediction with Logistic Regression</h3>
+<p>
+In this example, we take a dataset of labels and feature vectors.
+We learn to predict the labels from feature vectors using the Logistic Regression algorithm.
+</p>
 
 <ul class="nav nav-tabs">
   <li class="lang-tab lang-tab-python active"><a href="#">Python</a></li>
   <li class="lang-tab lang-tab-scala"><a href="#">Scala</a></li>
   <li class="lang-tab lang-tab-java"><a href="#">Java</a></li>
 </ul>
+
 <div class="tab-content">
-  <div class="tab-pane tab-pane-python active">
-    <div class="code code-tab">
-      points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br>
-      w = numpy.random.ranf(size = D) <span class="comment"># current separating plane</span><br>
-      <span class="keyword">for</span> i <span class="keyword">in</span> range(ITERATIONS):<br>
-        gradient = points.<span class="sparkop">map</span>(<span class="closure"><br>
-          lambda p: (1 / (1 + exp(-p.y*(w.dot(p.x)))) - 1) * p.y * p.x<br>
-        </span>).<span class="sparkop">reduce</span>(<span class="closure">lambda a, b: a + b</span>)<br>
-        w -= gradient<br>
-      print <span class="string">"Final separating plane: %s"</span> % w<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-scala">
-    <div class="code code-tab">
-      <span class="keyword">val</span> points = spark.textFile(...).<span class="sparkop">map</span>(parsePoint).<span class="sparkop">cache</span>()<br>
-      <span class="keyword">var</span> w = Vector.random(D) <span class="comment">// current separating plane</span><br>
-      <span class="keyword">for</span> (i <- 1 to ITERATIONS) {<br>
-        <span class="keyword">val</span> gradient = points.<span class="sparkop">map</span>(<span class="closure">p =><br>
-          (1 / (1 + exp(-p.y*(w dot p.x))) - 1) * p.y * p.x<br>
-        </span>).<span class="sparkop">reduce</span>(<span class="closure">_ + _</span>)<br>
-        w -= gradient<br>
-      }<br>
-      println(<span class="string">"Final separating plane: "</span> + w)<br>
-    </div>
-  </div>
-  <div class="tab-pane tab-pane-java">
-    <div class="code code-tab">
-      <span class="keyword">class</span> ComputeGradient <span class="keyword">extends</span> Function<DataPoint, Vector> {<br>
-        <span class="keyword">private</span> Vector w;<br>
-        ComputeGradient(Vector w) { <span class="keyword">this</span>.w = w; }<br>
-        <span class="keyword">public</span> Vector call(DataPoint p) {<br>
-          <span class="keyword">return</span> p.x.times(p.y * (1 / (1 + Math.exp(w.dot(p.x))) - 1));<br>
-        }<br>
-      }<br>
-      <br>
-      JavaRDD<DataPoint> points = spark.textFile(...).<span class="sparkop">map</span>(<span class="closure">new ParsePoint()</span>).<span class="sparkop">cache</span>();<br>
-      Vector w = Vector.random(D); <span class="comment">// current separating plane</span><br>
-      <span class="keyword">for</span> (<span class="keyword">int</span> i = 0; i < ITERATIONS; i++) {<br>
-        Vector gradient = points.<span class="sparkop">map</span>(<span class="closure">new ComputeGradient(w)</span>).<span class="sparkop">reduce</span>(<span class="closure">new AddVectors()</span>);<br>
-        w = w.subtract(gradient);<br>
-      }<br>
-      System.out.println(<span class="string">"Final separating plane: "</span> + w);<br>
-    </div>
-  </div>
+<div class="tab-pane tab-pane-python active">
+<div class="code code-tab">
+{% highlight python %}
+# Every record of this DataFrame contains the label and
+# features represented by a vector.
+df = sqlContext.createDataFrame(data, ["label", "features"])
+
+# Set parameters for the algorithm.
+# Here, we limit the number of iterations to 10.
+lr = LogisticRegression(maxIter=10)
+
+# Fit the model to the data.
+model = lr.fit(df)
+
+# Given a dataset, predict each point's label, and show the results.
+model.transform(df).show()
+{% endhighlight %}
+</div>
 </div>

-<p>Note that the current separating plane, <code>w</code>, gets shipped automatically to the cluster with every <code>map</code> call.</p>
-
-<p>The graph below compares the running time per iteration of this Spark program against a Hadoop implementation on 100 GB of data on a 100-node cluster, showing the benefit of in-memory caching:</p>
+<div class="tab-pane tab-pane-scala">
+<div class="code code-tab">
+{% highlight scala %}
+// Every record of this DataFrame contains the label and
+// features represented by a vector.
+val df = sqlContext.createDataFrame(data).toDF("label", "features")
+
+// Set parameters for the algorithm.
+// Here, we limit the number of iterations to 10.
+val lr = new LogisticRegression().setMaxIter(10)
+
+// Fit the model to the data.
+val model = lr.fit(df)
+
+// Inspect the model: get the feature weights.
+val weights = model.weights
+
+// Given a dataset, predict each point's label, and show the results.
+model.transform(df).show()
+{% endhighlight %}
+</div>
+</div>

-<p style="margin-top: 20px; margin-bottom: 30px;">
-<img src="{{site.url}}images/logistic-regression.png" alt="Logistic regression performance in Spark vs Hadoop">
-</p>
+<div class="tab-pane tab-pane-java">
+<div class="code code-tab">
+{% highlight java %}
+// Every record of this DataFrame contains the label and
+// features represented by a vector.
+StructType schema = new StructType(new StructField[]{
+  new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
+  new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+
+// Set parameters for the algorithm.
+// Here, we limit the number of iterations to 10.
+LogisticRegression lr = new LogisticRegression().setMaxIter(10);
+
+// Fit the model to the data.
+LogisticRegressionModel model = lr.fit(df);
+
+// Inspect the model: get the feature weights.
+Vector weights = model.weights();
+
+// Given a dataset, predict each point's label, and show the results.
+model.transform(df).show();
+{% endhighlight %}
+</div>
+</div>
+</div>
 <a name="additional"></a>
-<h2>Additional Examples</h2>
+<h1>Additional Examples</h1>
 
 Many additional examples are distributed with Spark:
 
 * Basic Spark: [Scala examples](https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples), [Java examples](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples), [Python examples](https://github.com/apache/spark/tree/master/examples/src/main/python)
- * Spark Streaming: [Scala examples](https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples/streaming), [Java examples](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples/streaming)
-
+ * Spark Streaming: [Scala examples](https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples/streaming), [Java examples](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples/streaming)
\ No newline at end of file

Modified: spark/site/community.html
URL: http://svn.apache.org/viewvc/spark/site/community.html?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/site/community.html (original)
+++ spark/site/community.html Mon Jan 25 21:57:32 2016
@@ -18,6 +18,9 @@
   <link href="/css/cerulean.min.css" rel="stylesheet">
   <link href="/css/custom.css" rel="stylesheet">
 
+  <!-- Code highlighter CSS -->
+  <link href="/css/pygments-default.css" rel="stylesheet">
+
   <script type="text/javascript">
     <!-- Google Analytics initialization -->
     var _gaq = _gaq || [];

Modified: spark/site/documentation.html
URL: http://svn.apache.org/viewvc/spark/site/documentation.html?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/site/documentation.html (original)
+++ spark/site/documentation.html Mon Jan 25 21:57:32 2016
@@ -18,6 +18,9 @@
   <link href="/css/cerulean.min.css" rel="stylesheet">
   <link href="/css/custom.css" rel="stylesheet">
 
+  <!-- Code highlighter CSS -->
+  <link href="/css/pygments-default.css" rel="stylesheet">
+
   <script type="text/javascript">
     <!-- Google Analytics initialization -->
     var _gaq = _gaq || [];

Modified: spark/site/downloads.html
URL: http://svn.apache.org/viewvc/spark/site/downloads.html?rev=1726699&r1=1726698&r2=1726699&view=diff
==============================================================================
--- spark/site/downloads.html (original)
+++ spark/site/downloads.html Mon Jan 25 21:57:32 2016
@@ -18,6 +18,9 @@
   <link href="/css/cerulean.min.css" rel="stylesheet">
   <link href="/css/custom.css" rel="stylesheet">
 
+  <!-- Code highlighter CSS -->
+  <link href="/css/pygments-default.css" rel="stylesheet">
+
   <script type="text/javascript">
     <!-- Google Analytics initialization -->
     var _gaq = _gaq || [];