http://git-wip-us.apache.org/repos/asf/spark-website/blob/d2bcf185/site/docs/2.1.0/sparkr.html ---------------------------------------------------------------------- diff --git a/site/docs/2.1.0/sparkr.html b/site/docs/2.1.0/sparkr.html index 0a1a347..e861a01 100644 --- a/site/docs/2.1.0/sparkr.html +++ b/site/docs/2.1.0/sparkr.html @@ -127,53 +127,53 @@ <ul id="markdown-toc"> - <li><a href="#overview" id="markdown-toc-overview">Overview</a></li> - <li><a href="#sparkdataframe" id="markdown-toc-sparkdataframe">SparkDataFrame</a> <ul> - <li><a href="#starting-up-sparksession" id="markdown-toc-starting-up-sparksession">Starting Up: SparkSession</a></li> - <li><a href="#starting-up-from-rstudio" id="markdown-toc-starting-up-from-rstudio">Starting Up from RStudio</a></li> - <li><a href="#creating-sparkdataframes" id="markdown-toc-creating-sparkdataframes">Creating SparkDataFrames</a> <ul> - <li><a href="#from-local-data-frames" id="markdown-toc-from-local-data-frames">From local data frames</a></li> - <li><a href="#from-data-sources" id="markdown-toc-from-data-sources">From Data Sources</a></li> - <li><a href="#from-hive-tables" id="markdown-toc-from-hive-tables">From Hive tables</a></li> + <li><a href="#overview">Overview</a></li> + <li><a href="#sparkdataframe">SparkDataFrame</a> <ul> + <li><a href="#starting-up-sparksession">Starting Up: SparkSession</a></li> + <li><a href="#starting-up-from-rstudio">Starting Up from RStudio</a></li> + <li><a href="#creating-sparkdataframes">Creating SparkDataFrames</a> <ul> + <li><a href="#from-local-data-frames">From local data frames</a></li> + <li><a href="#from-data-sources">From Data Sources</a></li> + <li><a href="#from-hive-tables">From Hive tables</a></li> </ul> </li> - <li><a href="#sparkdataframe-operations" id="markdown-toc-sparkdataframe-operations">SparkDataFrame Operations</a> <ul> - <li><a href="#selecting-rows-columns" id="markdown-toc-selecting-rows-columns">Selecting rows, columns</a></li> - <li><a href="#grouping-aggregation" id="markdown-toc-grouping-aggregation">Grouping, Aggregation</a></li> - <li><a href="#operating-on-columns" id="markdown-toc-operating-on-columns">Operating on Columns</a></li> - <li><a href="#applying-user-defined-function" id="markdown-toc-applying-user-defined-function">Applying User-Defined Function</a> <ul> - <li><a href="#run-a-given-function-on-a-large-dataset-using-dapply-or-dapplycollect" id="markdown-toc-run-a-given-function-on-a-large-dataset-using-dapply-or-dapplycollect">Run a given function on a large dataset using <code>dapply</code> or <code>dapplyCollect</code></a> <ul> - <li><a href="#dapply" id="markdown-toc-dapply">dapply</a></li> - <li><a href="#dapplycollect" id="markdown-toc-dapplycollect">dapplyCollect</a></li> + <li><a href="#sparkdataframe-operations">SparkDataFrame Operations</a> <ul> + <li><a href="#selecting-rows-columns">Selecting rows, columns</a></li> + <li><a href="#grouping-aggregation">Grouping, Aggregation</a></li> + <li><a href="#operating-on-columns">Operating on Columns</a></li> + <li><a href="#applying-user-defined-function">Applying User-Defined Function</a> <ul> + <li><a href="#run-a-given-function-on-a-large-dataset-using-dapply-or-dapplycollect">Run a given function on a large dataset using <code>dapply</code> or <code>dapplyCollect</code></a> <ul> + <li><a href="#dapply">dapply</a></li> + <li><a href="#dapplycollect">dapplyCollect</a></li> </ul> </li> - <li><a href="#run-a-given-function-on-a-large-dataset-grouping-by-input-columns-and-using-gapply-or-gapplycollect" id="markdown-toc-run-a-given-function-on-a-large-dataset-grouping-by-input-columns-and-using-gapply-or-gapplycollect">Run a given function on a large dataset grouping by input column(s) and using <code>gapply</code> or <code>gapplyCollect</code></a> <ul> - <li><a href="#gapply" id="markdown-toc-gapply">gapply</a></li> - <li><a href="#gapplycollect" id="markdown-toc-gapplycollect">gapplyCollect</a></li> + <li><a href="#run-a-given-function-on-a-large-dataset-grouping-by-input-columns-and-using-gapply-or-gapplycollect">Run a given function on a large dataset grouping by input column(s) and using <code>gapply</code> or <code>gapplyCollect</code></a> <ul> + <li><a href="#gapply">gapply</a></li> + <li><a href="#gapplycollect">gapplyCollect</a></li> </ul> </li> - <li><a href="#data-type-mapping-between-r-and-spark" id="markdown-toc-data-type-mapping-between-r-and-spark">Data type mapping between R and Spark</a></li> - <li><a href="#run-local-r-functions-distributed-using-sparklapply" id="markdown-toc-run-local-r-functions-distributed-using-sparklapply">Run local R functions distributed using <code>spark.lapply</code></a> <ul> - <li><a href="#sparklapply" id="markdown-toc-sparklapply">spark.lapply</a></li> + <li><a href="#data-type-mapping-between-r-and-spark">Data type mapping between R and Spark</a></li> + <li><a href="#run-local-r-functions-distributed-using-sparklapply">Run local R functions distributed using <code>spark.lapply</code></a> <ul> + <li><a href="#sparklapply">spark.lapply</a></li> </ul> </li> </ul> </li> </ul> </li> - <li><a href="#running-sql-queries-from-sparkr" id="markdown-toc-running-sql-queries-from-sparkr">Running SQL Queries from SparkR</a></li> + <li><a href="#running-sql-queries-from-sparkr">Running SQL Queries from SparkR</a></li> </ul> </li> - <li><a href="#machine-learning" id="markdown-toc-machine-learning">Machine Learning</a> <ul> - <li><a href="#algorithms" id="markdown-toc-algorithms">Algorithms</a></li> - <li><a href="#model-persistence" id="markdown-toc-model-persistence">Model persistence</a></li> + <li><a href="#machine-learning">Machine Learning</a> <ul> + <li><a href="#algorithms">Algorithms</a></li> + <li><a href="#model-persistence">Model persistence</a></li> </ul> </li> - <li><a href="#r-function-name-conflicts" id="markdown-toc-r-function-name-conflicts">R Function Name Conflicts</a></li> - <li><a href="#migration-guide" id="markdown-toc-migration-guide">Migration Guide</a> <ul> - <li><a href="#upgrading-from-sparkr-15x-to-16x" id="markdown-toc-upgrading-from-sparkr-15x-to-16x">Upgrading From SparkR 1.5.x to 1.6.x</a></li> - <li><a href="#upgrading-from-sparkr-16x-to-20" id="markdown-toc-upgrading-from-sparkr-16x-to-20">Upgrading From SparkR 1.6.x to 2.0</a></li> - <li><a href="#upgrading-to-sparkr-210" id="markdown-toc-upgrading-to-sparkr-210">Upgrading to SparkR 2.1.0</a></li> + <li><a href="#r-function-name-conflicts">R Function Name Conflicts</a></li> + <li><a href="#migration-guide">Migration Guide</a> <ul> + <li><a href="#upgrading-from-sparkr-15x-to-16x">Upgrading From SparkR 1.5.x to 1.6.x</a></li> + <li><a href="#upgrading-from-sparkr-16x-to-20">Upgrading From SparkR 1.6.x to 2.0</a></li> + <li><a href="#upgrading-to-sparkr-210">Upgrading to SparkR 2.1.0</a></li> </ul> </li> </ul> @@ -202,7 +202,7 @@ You can create a <code>SparkSession</code> using <code>sparkR.session</code> and <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r">sparkR.session<span class="p">()</span></code></pre></div> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span>sparkR.session<span class="p">()</span></code></pre></figure> </div> @@ -223,11 +223,11 @@ them, pass them as you would other configuration properties in the <code>sparkCo <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="kr">if</span> <span class="p">(</span><span class="kp">nchar</span><span class="p">(</span><span class="kp">Sys.getenv</span><span class="p">(</span><span class="s">"SPARK_HOME"</span><span class="p">))</span> <span class="o"><</span> <span class="m">1</span><span class="p">)</span> <span class="p">{</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="kr">if</span> <span class="p">(</span><span class="kp">nchar</span><span class="p">(</span><span class="kp">Sys.getenv</span><span class="p">(</span><span class="s">"SPARK_HOME"</span><span class="p">))</span> <span class="o"><</span> <span class="m">1</span><span class="p">)</span> <span class="p">{</span> <span class="kp">Sys.setenv</span><span class="p">(</span>SPARK_HOME <span class="o">=</span> <span class="s">"/home/spark"</span><span class="p">)</span> <span class="p">}</span> <span class="kn">library</span><span class="p">(</span>SparkR<span class="p">,</span> lib.loc <span class="o">=</span> <span class="kt">c</span><span class="p">(</span><span class="kp">file.path</span><span class="p">(</span><span class="kp">Sys.getenv</span><span class="p">(</span><span class="s">"SPARK_HOME"</span><span class="p">),</span> <span class="s">"R"</span><span class="p">,</span> <span class="s">"lib"</span><span class="p">)))</span> -sparkR.session<span class="p">(</span>master <span class="o">=</span> <span class="s">"local[*]"</span><span class="p">,</span> sparkConfig <span class="o">=</span> <span class="kt">list</span><span class="p">(</span>spark.driver.memory <span class="o">=</span> <span class="s">"2g"</span><span class="p">))</span></code></pre></div> +sparkR.session<span class="p">(</span>master <span class="o">=</span> <span class="s">"local[*]"</span><span class="p">,</span> sparkConfig <span class="o">=</span> <span class="kt">list</span><span class="p">(</span>spark.driver.memory <span class="o">=</span> <span class="s">"2g"</span><span class="p">))</span></code></pre></figure> </div> @@ -282,14 +282,14 @@ sparkR.session<span class="p">(</span>master <span class="o">=</span> <span clas <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r">df <span class="o"><-</span> as.DataFrame<span class="p">(</span>faithful<span class="p">)</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span>df <span class="o"><-</span> as.DataFrame<span class="p">(</span>faithful<span class="p">)</span> <span class="c1"># Displays the first part of the SparkDataFrame</span> <span class="kp">head</span><span class="p">(</span>df<span class="p">)</span> <span class="c1">## eruptions waiting</span> <span class="c1">##1 3.600 79</span> <span class="c1">##2 1.800 54</span> -<span class="c1">##3 3.333 74</span></code></pre></div> +<span class="c1">##3 3.333 74</span></code></pre></figure> </div> @@ -303,7 +303,7 @@ specifying <code>--packages</code> with <code>spark-submit</code> or <code>spark <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r">sparkR.session<span class="p">(</span>sparkPackages <span class="o">=</span> <span class="s">"com.databricks:spark-avro_2.11:3.0.0"</span><span class="p">)</span></code></pre></div> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span>sparkR.session<span class="p">(</span>sparkPackages <span class="o">=</span> <span class="s">"com.databricks:spark-avro_2.11:3.0.0"</span><span class="p">)</span></code></pre></figure> </div> @@ -311,7 +311,7 @@ specifying <code>--packages</code> with <code>spark-submit</code> or <code>spark <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r">people <span class="o"><-</span> read.df<span class="p">(</span><span class="s">"./examples/src/main/resources/people.json"</span><span class="p">,</span> <span class="s">"json"</span><span class="p">)</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span>people <span class="o"><-</span> read.df<span class="p">(</span><span class="s">"./examples/src/main/resources/people.json"</span><span class="p">,</span> <span class="s">"json"</span><span class="p">)</span> <span class="kp">head</span><span class="p">(</span>people<span class="p">)</span> <span class="c1">## age name</span> <span class="c1">##1 NA Michael</span> @@ -325,7 +325,7 @@ printSchema<span class="p">(</span>people<span class="p">)</span> <span class="c1"># |-- name: string (nullable = true)</span> <span class="c1"># Similarly, multiple files can be read with read.json</span> -people <span class="o"><-</span> read.json<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">"./examples/src/main/resources/people.json"</span><span class="p">,</span> <span class="s">"./examples/src/main/resources/people2.json"</span><span class="p">))</span></code></pre></div> +people <span class="o"><-</span> read.json<span class="p">(</span><span class="kt">c</span><span class="p">(</span><span class="s">"./examples/src/main/resources/people.json"</span><span class="p">,</span> <span class="s">"./examples/src/main/resources/people2.json"</span><span class="p">))</span></code></pre></figure> </div> @@ -333,7 +333,7 @@ people <span class="o"><-</span> read.json<span class="p">(</span><span class <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r">df <span class="o"><-</span> read.df<span class="p">(</span>csvPath<span class="p">,</span> <span class="s">"csv"</span><span class="p">,</span> header <span class="o">=</span> <span class="s">"true"</span><span class="p">,</span> inferSchema <span class="o">=</span> <span class="s">"true"</span><span class="p">,</span> na.strings <span class="o">=</span> <span class="s">"NA"</span><span class="p">)</span></code></pre></div> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span>df <span class="o"><-</span> read.df<span class="p">(</span>csvPath<span class="p">,</span> <span class="s">"csv"</span><span class="p">,</span> header <span class="o">=</span> <span class="s">"true"</span><span class="p">,</span> inferSchema <span class="o">=</span> <span class="s">"true"</span><span class="p">,</span> na.strings <span class="o">=</span> <span class="s">"NA"</span><span class="p">)</span></code></pre></figure> </div> @@ -342,7 +342,7 @@ to a Parquet file using <code>write.df</code>.</p> <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r">write.df<span class="p">(</span>people<span class="p">,</span> path <span class="o">=</span> <span class="s">"people.parquet"</span><span class="p">,</span> <span class="kn">source</span> <span class="o">=</span> <span class="s">"parquet"</span><span class="p">,</span> mode <span class="o">=</span> <span class="s">"overwrite"</span><span class="p">)</span></code></pre></div> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span>write.df<span class="p">(</span>people<span class="p">,</span> path <span class="o">=</span> <span class="s">"people.parquet"</span><span class="p">,</span> <span class="kn">source</span> <span class="o">=</span> <span class="s">"parquet"</span><span class="p">,</span> mode <span class="o">=</span> <span class="s">"overwrite"</span><span class="p">)</span></code></pre></figure> </div> @@ -352,7 +352,7 @@ to a Parquet file using <code>write.df</code>.</p> <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r">sparkR.session<span class="p">()</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span>sparkR.session<span class="p">()</span> sql<span class="p">(</span><span class="s">"CREATE TABLE IF NOT EXISTS src (key INT, value STRING)"</span><span class="p">)</span> sql<span class="p">(</span><span class="s">"LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"</span><span class="p">)</span> @@ -365,7 +365,7 @@ results <span class="o"><-</span> sql<span class="p">(</span><span class="s"> <span class="c1">## key value</span> <span class="c1">## 1 238 val_238</span> <span class="c1">## 2 86 val_86</span> -<span class="c1">## 3 311 val_311</span></code></pre></div> +<span class="c1">## 3 311 val_311</span></code></pre></figure> </div> @@ -378,7 +378,7 @@ Here we include some basic examples and a complete list can be found in the <a h <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Create the SparkDataFrame</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="c1"># Create the SparkDataFrame</span> df <span class="o"><-</span> as.DataFrame<span class="p">(</span>faithful<span class="p">)</span> <span class="c1"># Get basic information about the SparkDataFrame</span> @@ -400,7 +400,7 @@ df <span class="c1">## eruptions waiting</span> <span class="c1">##1 1.750 47</span> <span class="c1">##2 1.750 47</span> -<span class="c1">##3 1.867 48</span></code></pre></div> +<span class="c1">##3 1.867 48</span></code></pre></figure> </div> @@ -410,7 +410,7 @@ df <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># We use the `n` operator to count the number of times each waiting time appears</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="c1"># We use the `n` operator to count the number of times each waiting time appears</span> <span class="kp">head</span><span class="p">(</span>summarize<span class="p">(</span>groupBy<span class="p">(</span>df<span class="p">,</span> df<span class="o">$</span>waiting<span class="p">),</span> count <span class="o">=</span> n<span class="p">(</span>df<span class="o">$</span>waiting<span class="p">)))</span> <span class="c1">## waiting count</span> <span class="c1">##1 70 4</span> @@ -423,7 +423,7 @@ waiting_counts <span class="o"><-</span> summarize<span class="p">(</span>gro <span class="c1">## waiting count</span> <span class="c1">##1 78 15</span> <span class="c1">##2 83 14</span> -<span class="c1">##3 81 13</span></code></pre></div> +<span class="c1">##3 81 13</span></code></pre></figure> </div> @@ -433,14 +433,14 @@ waiting_counts <span class="o"><-</span> summarize<span class="p">(</span>gro <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Convert waiting time from hours to seconds.</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="c1"># Convert waiting time from hours to seconds.</span> <span class="c1"># Note that we can assign this to a new column in the same SparkDataFrame</span> df<span class="o">$</span>waiting_secs <span class="o"><-</span> df<span class="o">$</span>waiting <span class="o">*</span> <span class="m">60</span> <span class="kp">head</span><span class="p">(</span>df<span class="p">)</span> <span class="c1">## eruptions waiting waiting_secs</span> <span class="c1">##1 3.600 79 4740</span> <span class="c1">##2 1.800 54 3240</span> -<span class="c1">##3 3.333 74 4440</span></code></pre></div> +<span class="c1">##3 3.333 74 4440</span></code></pre></figure> </div> @@ -455,7 +455,7 @@ and should have only one parameter, to which a <code>data.frame</code> correspon <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Convert waiting time from hours to seconds.</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="c1"># Convert waiting time from hours to seconds.</span> <span class="c1"># Note that we can apply UDF to DataFrame.</span> schema <span class="o"><-</span> structType<span class="p">(</span>structField<span class="p">(</span><span class="s">"eruptions"</span><span class="p">,</span> <span class="s">"double"</span><span class="p">),</span> structField<span class="p">(</span><span class="s">"waiting"</span><span class="p">,</span> <span class="s">"double"</span><span class="p">),</span> structField<span class="p">(</span><span class="s">"waiting_secs"</span><span class="p">,</span> <span class="s">"double"</span><span class="p">))</span> @@ -467,7 +467,7 @@ df1 <span class="o"><-</span> dapply<span class="p">(</span>df<span class="p" <span class="c1">##3 3.333 74 4440</span> <span class="c1">##4 2.283 62 3720</span> <span class="c1">##5 4.533 85 5100</span> -<span class="c1">##6 2.883 55 3300</span></code></pre></div> +<span class="c1">##6 2.883 55 3300</span></code></pre></figure> </div> @@ -477,7 +477,7 @@ should be a <code>data.frame</code>. But, Schema is not required to be passed. N <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Convert waiting time from hours to seconds.</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="c1"># Convert waiting time from hours to seconds.</span> <span class="c1"># Note that we can apply UDF to DataFrame and return a R's data.frame</span> ldf <span class="o"><-</span> dapplyCollect<span class="p">(</span> df<span class="p">,</span> @@ -488,7 +488,7 @@ ldf <span class="o"><-</span> dapplyCollect<span class="p">(</span> <span class="c1">## eruptions waiting waiting_secs</span> <span class="c1">##1 3.600 79 4740</span> <span class="c1">##2 1.800 54 3240</span> -<span class="c1">##3 3.333 74 4440</span></code></pre></div> +<span class="c1">##3 3.333 74 4440</span></code></pre></figure> </div> @@ -502,7 +502,7 @@ The output of function should be a <code>data.frame</code>. Schema specifies the <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Determine six waiting times with the largest eruption time in minutes.</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="c1"># Determine six waiting times with the largest eruption time in minutes.</span> schema <span class="o"><-</span> structType<span class="p">(</span>structField<span class="p">(</span><span class="s">"waiting"</span><span class="p">,</span> <span class="s">"double"</span><span class="p">),</span> structField<span class="p">(</span><span class="s">"max_eruption"</span><span class="p">,</span> <span class="s">"double"</span><span class="p">))</span> result <span class="o"><-</span> gapply<span class="p">(</span> df<span class="p">,</span> @@ -519,7 +519,7 @@ result <span class="o"><-</span> gapply<span class="p">(</span> <span class="c1">##3 71 5.033</span> <span class="c1">##4 87 5.000</span> <span class="c1">##5 63 4.933</span> -<span class="c1">##6 89 4.900</span></code></pre></div> +<span class="c1">##6 89 4.900</span></code></pre></figure> </div> @@ -528,7 +528,7 @@ result <span class="o"><-</span> gapply<span class="p">(</span> <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Determine six waiting times with the largest eruption time in minutes.</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="c1"># Determine six waiting times with the largest eruption time in minutes.</span> result <span class="o"><-</span> gapplyCollect<span class="p">(</span> df<span class="p">,</span> <span class="s">"waiting"</span><span class="p">,</span> @@ -545,7 +545,7 @@ result <span class="o"><-</span> gapplyCollect<span class="p">(</span> <span class="c1">##3 71 5.033</span> <span class="c1">##4 87 5.000</span> <span class="c1">##5 63 4.933</span> -<span class="c1">##6 89 4.900</span></code></pre></div> +<span class="c1">##6 89 4.900</span></code></pre></figure> </div> @@ -628,7 +628,7 @@ should fit in a single machine. If that is not the case they can do something li <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Perform distributed training of multiple models with spark.lapply. Here, we pass</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="c1"># Perform distributed training of multiple models with spark.lapply. Here, we pass</span> <span class="c1"># a read-only list of arguments which specifies family the generalized linear model should be.</span> families <span class="o"><-</span> <span class="kt">c</span><span class="p">(</span><span class="s">"gaussian"</span><span class="p">,</span> <span class="s">"poisson"</span><span class="p">)</span> train <span class="o"><-</span> <span class="kr">function</span><span class="p">(</span>family<span class="p">)</span> <span class="p">{</span> @@ -639,7 +639,7 @@ train <span class="o"><-</span> <span class="kr">function</span><span class=" model.summaries <span class="o"><-</span> spark.lapply<span class="p">(</span>families<span class="p">,</span> train<span class="p">)</span> <span class="c1"># Print the summary of each model</span> -<span class="kp">print</span><span class="p">(</span>model.summaries<span class="p">)</span></code></pre></div> +<span class="kp">print</span><span class="p">(</span>model.summaries<span class="p">)</span></code></pre></figure> </div> @@ -649,7 +649,7 @@ The <code>sql</code> function enables applications to run SQL queries programmat <div data-lang="r"> - <div class="highlight"><pre><code class="language-r" data-lang="r"><span class="c1"># Load a JSON file</span> + <figure class="highlight"><pre><code class="language-r" data-lang="r"><span></span><span class="c1"># Load a JSON file</span> people <span class="o"><-</span> read.df<span class="p">(</span><span class="s">"./examples/src/main/resources/people.json"</span><span class="p">,</span> <span class="s">"json"</span><span class="p">)</span> <span class="c1"># Register this SparkDataFrame as a temporary view.</span> @@ -659,7 +659,7 @@ createOrReplaceTempView<span class="p">(</span>people<span class="p">,</span> <s teenagers <span class="o"><-</span> sql<span class="p">(</span><span class="s">"SELECT name FROM people WHERE age >= 13 AND age <= 19"</span><span class="p">)</span> <span class="kp">head</span><span class="p">(</span>teenagers<span class="p">)</span> <span class="c1">## name</span> -<span class="c1">##1 Justin</span></code></pre></div> +<span class="c1">##1 Justin</span></code></pre></figure> </div> @@ -691,28 +691,27 @@ SparkR supports a subset of the available R formula operators for model fitting, <h2 id="model-persistence">Model persistence</h2> -<p>The following example shows how to save/load a MLlib model by SparkR.</p> -<div class="highlight"><pre>irisDF <span class="o"><-</span> <span class="kp">suppressWarnings</span><span class="p">(</span>createDataFrame<span class="p">(</span>iris<span class="p">))</span> +<p>The following example shows how to save/load a MLlib model by SparkR. +<div class="highlight"><pre><span></span>irisDF <span class="o"><-</span> <span class="kp">suppressWarnings</span><span class="p">(</span>createDataFrame<span class="p">(</span>iris<span class="p">))</span> <span class="c1"># Fit a generalized linear model of family "gaussian" with spark.glm</span> gaussianDF <span class="o"><-</span> irisDF gaussianTestDF <span class="o"><-</span> irisDF -gaussianGLM <span class="o"><-</span> spark.glm<span class="p">(</span>gaussianDF<span class="p">,</span> Sepal_Length <span class="o">~</span> Sepal_Width <span class="o">+</span> Species<span class="p">,</span> family <span class="o">=</span> <span class="s">"gaussian"</span><span class="p">)</span> +gaussianGLM <span class="o"><-</span> spark.glm<span class="p">(</span>gaussianDF<span class="p">,</span> Sepal_Length <span class="o">~</span> Sepal_Width <span class="o">+</span> Species<span class="p">,</span> family <span class="o">=</span> <span class="s">"gaussian"</span><span class="p">)</span></p> -<span class="c1"># Save and then load a fitted MLlib model</span> +<p><span class="c1"># Save and then load a fitted MLlib model</span> modelPath <span class="o"><-</span> <span class="kp">tempfile</span><span class="p">(</span>pattern <span class="o">=</span> <span class="s">"ml"</span><span class="p">,</span> fileext <span class="o">=</span> <span class="s">".tmp"</span><span class="p">)</span> write.ml<span class="p">(</span>gaussianGLM<span class="p">,</span> modelPath<span class="p">)</span> -gaussianGLM2 <span class="o"><-</span> read.ml<span class="p">(</span>modelPath<span class="p">)</span> +gaussianGLM2 <span class="o"><-</span> read.ml<span class="p">(</span>modelPath<span class="p">)</span></p> -<span class="c1"># Check model summary</span> -<span class="kp">summary</span><span class="p">(</span>gaussianGLM2<span class="p">)</span> +<p><span class="c1"># Check model summary</span> +<span class="kp">summary</span><span class="p">(</span>gaussianGLM2<span class="p">)</span></p> -<span class="c1"># Check model prediction</span> +<p><span class="c1"># Check model prediction</span> gaussianPredictions <span class="o"><-</span> predict<span class="p">(</span>gaussianGLM2<span class="p">,</span> gaussianTestDF<span class="p">)</span> -showDF<span class="p">(</span>gaussianPredictions<span class="p">)</span> +showDF<span class="p">(</span>gaussianPredictions<span class="p">)</span></p> -<span class="kp">unlink</span><span class="p">(</span>modelPath<span class="p">)</span> -</pre></div> -<div><small>Find full example code at "examples/src/main/r/ml/ml.R" in the Spark repo.</small></div> +<p><span class="kp">unlink</span><span class="p">(</span>modelPath<span class="p">)</span> +</pre></div><div><small>Find full example code at “examples/src/main/r/ml/ml.R” in the Spark repo.</small></div></p> <h1 id="r-function-name-conflicts">R Function Name Conflicts</h1>
--------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org