Repository: spark
Updated Branches:
  refs/heads/master 39e4e7e4d -> 293225e0c
[SPARK-8124] [SPARKR] Created more examples on SparkR DataFrames

Here are more examples on SparkR DataFrames including creating a Spark
Context and a SQL context, loading data and simple data manipulation.

Author: Daniel Emaasit (PhD Student) <daniel.emaa...@gmail.com>

Closes #6668 from Emaasit/dan-dev and squashes the following commits:

3a97867 [Daniel Emaasit (PhD Student)] Used fewer rows for createDataFrame
f7227f9 [Daniel Emaasit (PhD Student)] Using command line arguments
a550f70 [Daniel Emaasit (PhD Student)] Used base R functions
33f9882 [Daniel Emaasit (PhD Student)] Renamed file
b6603e3 [Daniel Emaasit (PhD Student)] changed "Describe" function to "describe"
90565dd [Daniel Emaasit (PhD Student)] Deleted the getting-started file
b95a103 [Daniel Emaasit (PhD Student)] Deleted this file
cc55cd8 [Daniel Emaasit (PhD Student)] combined all the code into one .R file
c6933af [Daniel Emaasit (PhD Student)] changed variable name to SQLContext
8e0fe14 [Daniel Emaasit (PhD Student)] provided two options for creating DataFrames
2653573 [Daniel Emaasit (PhD Student)] Updates to a comment and variable name
275b787 [Daniel Emaasit (PhD Student)] Added the Apache License at the top of the file
2e8f724 [Daniel Emaasit (PhD Student)] Added the Apache License at the top of the file
486f44e [Daniel Emaasit (PhD Student)] Added the Apache License at the file
d705112 [Daniel Emaasit (PhD Student)] Created more examples on SparkR DataFrames


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/293225e0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/293225e0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/293225e0

Branch: refs/heads/master
Commit: 293225e0cd9318ad368dde30ac6a17725d33ebb6
Parents: 39e4e7e
Author: Daniel Emaasit (PhD Student) <daniel.emaa...@gmail.com>
Authored: Mon Jul 6 10:36:02 2015 -0700
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Mon Jul 6 11:08:36 2015 -0700

----------------------------------------------------------------------
 examples/src/main/r/data-manipulation.R | 107 +++++++++++++++++++++++++++
 1 file changed, 107 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/293225e0/examples/src/main/r/data-manipulation.R
----------------------------------------------------------------------
diff --git a/examples/src/main/r/data-manipulation.R b/examples/src/main/r/data-manipulation.R
new file mode 100644
index 0000000..aa2336e
--- /dev/null
+++ b/examples/src/main/r/data-manipulation.R
@@ -0,0 +1,107 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# For this example, we shall use the "flights" dataset
+# The dataset consists of every flight departing Houston in 2011.
+# The dataset is made up of 227,496 rows x 14 columns.
+
+# To run this example use
+# ./bin/sparkR --packages com.databricks:spark-csv_2.10:1.0.3
+#   examples/src/main/r/data-manipulation.R <path_to_csv>
+
+# Load the SparkR library into your R session
+library(SparkR)
+
+args <- commandArgs(trailing = TRUE)
+
+if (length(args) != 1) {
+  print("Usage: data-manipulation.R <path-to-flights.csv>")
+  print("The data can be downloaded from: http://s3-us-west-2.amazonaws.com/sparkr-data/flights.csv")
+  q("no")
+}
+
+## Initialize SparkContext
+sc <- sparkR.init(appName = "SparkR-data-manipulation-example")
+
+## Initialize SQLContext
+sqlContext <- sparkRSQL.init(sc)
+
+flightsCsvPath <- args[[1]]
+
+# Create a local R data frame
+flights_df <- read.csv(flightsCsvPath, header = TRUE)
+flights_df$date <- as.Date(flights_df$date)
+
+## Filter flights whose destination is San Francisco into a local data frame
+SFO_df <- flights_df[flights_df$dest == "SFO", ]
+
+# Convert the local data frame into a SparkR DataFrame
+SFO_DF <- createDataFrame(sqlContext, SFO_df)
+
+# Directly create a SparkR DataFrame from the source data
+flightsDF <- read.df(sqlContext, flightsCsvPath, source = "com.databricks.spark.csv", header = "true")
+
+# Print the schema of this Spark DataFrame
+printSchema(flightsDF)
+
+# Cache the DataFrame
+cache(flightsDF)
+
+# Print the first 6 rows of the DataFrame
+showDF(flightsDF, numRows = 6)  ## Or
+head(flightsDF)
+
+# Show the column names in the DataFrame
+columns(flightsDF)
+
+# Show the number of rows in the DataFrame
+count(flightsDF)
+
+# Select specific columns
+destDF <- select(flightsDF, "dest", "cancelled")
+
+# Using SQL to select columns of data
+# First, register the flights DataFrame as a table
+registerTempTable(flightsDF, "flightsTable")
+destDF <- sql(sqlContext, "SELECT dest, cancelled FROM flightsTable")
+
+# Use collect to create a local R data frame
+local_df <- collect(destDF)
+
+# Print the newly created local data frame
+head(local_df)
+
+# Filter flights whose destination is JFK
+jfkDF <- filter(flightsDF, "dest = \"JFK\"")  ## OR
+jfkDF <- filter(flightsDF, flightsDF$dest == "JFK")
+
+# If the magrittr library is available, we can use it to
+# chain data frame operations
+if ("magrittr" %in% rownames(installed.packages())) {
+  library(magrittr)
+
+  # Group the flights by date and then find the average daily delay
+  # Write the result into a DataFrame
+  groupBy(flightsDF, flightsDF$date) %>%
+    summarize(avg(flightsDF$dep_delay), avg(flightsDF$arr_delay)) -> dailyDelayDF
+
+  # Print the computed data frame
+  head(dailyDelayDF)
+}
+
+# Stop the SparkContext now
+sparkR.stop()
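
A note for readers following along: if magrittr is not installed, the daily-delay
aggregation in the branch above can be written with plain nested SparkR calls
instead of a pipe. A minimal sketch, reusing the flightsDF and dailyDelayDF names
from the example; groupBy, summarize, and avg are the same SparkR functions the
file already uses, no new API is assumed:

  # Same aggregation as the magrittr branch, expressed as nested calls:
  # group the flights by date, then average departure and arrival delays.
  dailyDelayDF <- summarize(groupBy(flightsDF, flightsDF$date),
                            avg(flightsDF$dep_delay), avg(flightsDF$arr_delay))

  # Print the first rows of the computed DataFrame, as in the example
  head(dailyDelayDF)

The pipe version and this nested version are equivalent; the conditional in the
example exists only so the script still runs when magrittr is absent.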