Repository: spark Updated Branches: refs/heads/master c6610a997 -> 02c91e03f
[SPARK-22063][R] Fix lint check failures in R by updating lint-r to the latest lintr commit SHA-1 ## What changes were proposed in this pull request? Currently, we pin lintr to jimhester/lintr@a769c0b (see [this commit](https://github.com/apache/spark/commit/7d1175011c976756efcd4e4e4f70a8fd6f287026) and [SPARK-14074](https://issues.apache.org/jira/browse/SPARK-14074)). I first tested lintr-1.0.1, but many important fixes appear to be missing from it (for example, the 100-character line-length check). So I instead tried the latest commit, https://github.com/jimhester/lintr/commit/5431140ffea65071f1327625d4a8de9688fa7e72, locally and fixed the resulting check failures. It fixes many bugs and now catches many instances that I had observed and thought should be flagged; I filed [the results](https://gist.github.com/HyukjinKwon/4f59ddcc7b6487a02da81800baca533c). The downside is that the check now takes about 7 minutes locally (it previously took about 2 minutes). ## How was this patch tested? Manually, by running `./dev/lint-r` after manually updating the lintr package. Author: hyukjinkwon <gurwls...@gmail.com> Author: zuotingbing <zuo.tingbi...@zte.com.cn> Closes #19290 from HyukjinKwon/upgrade-r-lint. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/02c91e03 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/02c91e03 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/02c91e03 Branch: refs/heads/master Commit: 02c91e03f975c2a6a05a9d5327057bb6b3c4a66f Parents: c6610a9 Author: hyukjinkwon <gurwls...@gmail.com> Authored: Sun Oct 1 18:42:45 2017 +0900 Committer: hyukjinkwon <gurwls...@gmail.com> Committed: Sun Oct 1 18:42:45 2017 +0900 ---------------------------------------------------------------------- R/pkg/.lintr | 2 +- R/pkg/R/DataFrame.R | 30 ++--- R/pkg/R/RDD.R | 6 +- R/pkg/R/WindowSpec.R | 2 +- R/pkg/R/column.R | 2 + R/pkg/R/context.R | 2 +- R/pkg/R/deserialize.R | 2 +- R/pkg/R/functions.R | 79 +++++++------ R/pkg/R/generics.R | 4 +- R/pkg/R/group.R | 4 +- R/pkg/R/mllib_classification.R | 137 +++++++++++++--------- R/pkg/R/mllib_clustering.R | 15 +-- R/pkg/R/mllib_regression.R | 62 +++++----- R/pkg/R/mllib_tree.R | 36 ++++-- R/pkg/R/pairRDD.R | 4 +- R/pkg/R/schema.R | 2 +- R/pkg/R/stats.R | 14 +-- R/pkg/R/utils.R | 4 +- R/pkg/inst/worker/worker.R | 2 +- R/pkg/tests/fulltests/test_binary_function.R | 2 +- R/pkg/tests/fulltests/test_rdd.R | 6 +- R/pkg/tests/fulltests/test_sparkSQL.R | 14 +-- dev/lint-r.R | 4 +- 23 files changed, 242 insertions(+), 193 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/.lintr ---------------------------------------------------------------------- diff --git a/R/pkg/.lintr b/R/pkg/.lintr index ae50b28..c83ad2a 100644 --- a/R/pkg/.lintr +++ b/R/pkg/.lintr @@ -1,2 +1,2 @@ -linters: with_defaults(line_length_linter(100), multiple_dots_linter = NULL, camel_case_linter = NULL, open_curly_linter(allow_single_line = TRUE), closed_curly_linter(allow_single_line = TRUE)) +linters: with_defaults(line_length_linter(100), multiple_dots_linter = NULL, object_name_linter = NULL, camel_case_linter = NULL, open_curly_linter(allow_single_line = TRUE), closed_curly_linter(allow_single_line = TRUE)) exclusions: list("inst/profile/general.R" = 1, "inst/profile/shell.R") http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/DataFrame.R ---------------------------------------------------------------------- diff --git
a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 0728141..176bb3b 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1923,13 +1923,15 @@ setMethod("[", signature(x = "SparkDataFrame"), #' @param i,subset (Optional) a logical expression to filter on rows. #' For extract operator [[ and replacement operator [[<-, the indexing parameter for #' a single Column. -#' @param j,select expression for the single Column or a list of columns to select from the SparkDataFrame. +#' @param j,select expression for the single Column or a list of columns to select from the +#' SparkDataFrame. #' @param drop if TRUE, a Column will be returned if the resulting dataset has only one column. #' Otherwise, a SparkDataFrame will always be returned. #' @param value a Column or an atomic vector in the length of 1 as literal value, or \code{NULL}. #' If \code{NULL}, the specified Column is dropped. #' @param ... currently not used. -#' @return A new SparkDataFrame containing only the rows that meet the condition with selected columns. +#' @return A new SparkDataFrame containing only the rows that meet the condition with selected +#' columns. #' @export #' @family SparkDataFrame functions #' @aliases subset,SparkDataFrame-method @@ -2608,12 +2610,12 @@ setMethod("merge", } else { # if by or both by.x and by.y have length 0, use Cartesian Product joinRes <- crossJoin(x, y) - return (joinRes) + return(joinRes) } # sets alias for making colnames unique in dataframes 'x' and 'y' - colsX <- generateAliasesForIntersectedCols(x, by, suffixes[1]) - colsY <- generateAliasesForIntersectedCols(y, by, suffixes[2]) + colsX <- genAliasesForIntersectedCols(x, by, suffixes[1]) + colsY <- genAliasesForIntersectedCols(y, by, suffixes[2]) # selects columns with their aliases from dataframes # in case same column names are present in both data frames @@ -2661,9 +2663,8 @@ setMethod("merge", #' @param intersectedColNames a list of intersected column names of the SparkDataFrame #' @param suffix a suffix for the column name #' @return list of columns -#' -#' @note generateAliasesForIntersectedCols since 1.6.0 -generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { +#' @noRd +genAliasesForIntersectedCols <- function(x, intersectedColNames, suffix) { allColNames <- names(x) # sets alias for making colnames unique in dataframe 'x' cols <- lapply(allColNames, function(colName) { @@ -2671,7 +2672,7 @@ generateAliasesForIntersectedCols <- function (x, intersectedColNames, suffix) { if (colName %in% intersectedColNames) { newJoin <- paste(colName, suffix, sep = "") if (newJoin %in% allColNames){ - stop ("The following column name: ", newJoin, " occurs more than once in the 'DataFrame'.", + stop("The following column name: ", newJoin, " occurs more than once in the 'DataFrame'.", "Please use different suffixes for the intersected columns.") } col <- alias(col, newJoin) @@ -3058,7 +3059,8 @@ setMethod("describe", #' summary(select(df, "age", "height")) #' } #' @note summary(SparkDataFrame) since 1.5.0 -#' @note The statistics provided by \code{summary} were change in 2.3.0 use \link{describe} for previous defaults. +#' @note The statistics provided by \code{summary} were change in 2.3.0 use \link{describe} for +#' previous defaults. #' @seealso \link{describe} setMethod("summary", signature(object = "SparkDataFrame"), @@ -3765,8 +3767,8 @@ setMethod("checkpoint", #' #' Create a multi-dimensional cube for the SparkDataFrame using the specified columns. 
#' -#' If grouping expression is missing \code{cube} creates a single global aggregate and is equivalent to -#' direct application of \link{agg}. +#' If grouping expression is missing \code{cube} creates a single global aggregate and is +#' equivalent to direct application of \link{agg}. #' #' @param x a SparkDataFrame. #' @param ... character name(s) or Column(s) to group on. @@ -3800,8 +3802,8 @@ setMethod("cube", #' #' Create a multi-dimensional rollup for the SparkDataFrame using the specified columns. #' -#' If grouping expression is missing \code{rollup} creates a single global aggregate and is equivalent to -#' direct application of \link{agg}. +#' If grouping expression is missing \code{rollup} creates a single global aggregate and is +#' equivalent to direct application of \link{agg}. #' #' @param x a SparkDataFrame. #' @param ... character name(s) or Column(s) to group on. http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/RDD.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/RDD.R b/R/pkg/R/RDD.R index 15ca212..6e89b4b 100644 --- a/R/pkg/R/RDD.R +++ b/R/pkg/R/RDD.R @@ -131,7 +131,7 @@ PipelinedRDD <- function(prev, func) { # Return the serialization mode for an RDD. setGeneric("getSerializedMode", function(rdd, ...) { standardGeneric("getSerializedMode") }) # For normal RDDs we can directly read the serializedMode -setMethod("getSerializedMode", signature(rdd = "RDD"), function(rdd) rdd@env$serializedMode ) +setMethod("getSerializedMode", signature(rdd = "RDD"), function(rdd) rdd@env$serializedMode) # For pipelined RDDs if jrdd_val is set then serializedMode should exist # if not we return the defaultSerialization mode of "byte" as we don't know the serialization # mode at this point in time. @@ -145,7 +145,7 @@ setMethod("getSerializedMode", signature(rdd = "PipelinedRDD"), }) # The jrdd accessor function. -setMethod("getJRDD", signature(rdd = "RDD"), function(rdd) rdd@jrdd ) +setMethod("getJRDD", signature(rdd = "RDD"), function(rdd) rdd@jrdd) setMethod("getJRDD", signature(rdd = "PipelinedRDD"), function(rdd, serializedMode = "byte") { if (!is.null(rdd@env$jrdd_val)) { @@ -893,7 +893,7 @@ setMethod("sampleRDD", if (withReplacement) { count <- stats::rpois(1, fraction) if (count > 0) { - res[ (len + 1) : (len + count) ] <- rep(list(elem), count) + res[(len + 1) : (len + count)] <- rep(list(elem), count) len <- len + count } } else { http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/WindowSpec.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 81beac9..debc7cb 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -73,7 +73,7 @@ setMethod("show", "WindowSpec", setMethod("partitionBy", signature(x = "WindowSpec"), function(x, col, ...) { - stopifnot (class(col) %in% c("character", "Column")) + stopifnot(class(col) %in% c("character", "Column")) if (class(col) == "character") { windowSpec(callJMethod(x@sws, "partitionBy", col, list(...))) http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/column.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index a5c2ea8..3095adb 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -238,8 +238,10 @@ setMethod("between", signature(x = "Column"), #' @param x a Column. #' @param dataType a character object describing the target data type. 
#' See +# nolint start #' \href{https://spark.apache.org/docs/latest/sparkr.html#data-type-mapping-between-r-and-spark}{ #' Spark Data Types} for available data types. +# nolint end #' @rdname cast #' @name cast #' @family colum_func http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/context.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 8349b57..443c2ff 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -329,7 +329,7 @@ spark.addFile <- function(path, recursive = FALSE) { #' spark.getSparkFilesRootDirectory() #'} #' @note spark.getSparkFilesRootDirectory since 2.1.0 -spark.getSparkFilesRootDirectory <- function() { +spark.getSparkFilesRootDirectory <- function() { # nolint if (Sys.getenv("SPARKR_IS_RUNNING_ON_WORKER") == "") { # Running on driver. callJStatic("org.apache.spark.SparkFiles", "getRootDirectory") http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/deserialize.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/deserialize.R b/R/pkg/R/deserialize.R index 0e99b17..a90f7d3 100644 --- a/R/pkg/R/deserialize.R +++ b/R/pkg/R/deserialize.R @@ -43,7 +43,7 @@ readObject <- function(con) { } readTypedObject <- function(con, type) { - switch (type, + switch(type, "i" = readInt(con), "c" = readString(con), "b" = readBoolean(con), http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/functions.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 9f28626..0143a3e 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -38,7 +38,8 @@ NULL #' #' Date time functions defined for \code{Column}. #' -#' @param x Column to compute on. In \code{window}, it must be a time Column of \code{TimestampType}. +#' @param x Column to compute on. In \code{window}, it must be a time Column of +#' \code{TimestampType}. #' @param format For \code{to_date} and \code{to_timestamp}, it is the string to use to parse #' Column \code{x} to DateType or TimestampType. For \code{trunc}, it is the string #' to use to specify the truncation method. For example, "year", "yyyy", "yy" for @@ -90,8 +91,8 @@ NULL #' #' Math functions defined for \code{Column}. #' -#' @param x Column to compute on. In \code{shiftLeft}, \code{shiftRight} and \code{shiftRightUnsigned}, -#' this is the number of bits to shift. +#' @param x Column to compute on. In \code{shiftLeft}, \code{shiftRight} and +#' \code{shiftRightUnsigned}, this is the number of bits to shift. #' @param y Column to compute on. #' @param ... additional argument(s). #' @name column_math_functions @@ -480,7 +481,7 @@ setMethod("ceiling", setMethod("coalesce", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -676,7 +677,7 @@ setMethod("crc32", setMethod("hash", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -1310,9 +1311,9 @@ setMethod("round", #' Also known as Gaussian rounding or bankers' rounding that rounds to the nearest even number. #' bround(2.5, 0) = 2, bround(3.5, 0) = 4. 
#' -#' @param scale round to \code{scale} digits to the right of the decimal point when \code{scale} > 0, -#' the nearest even number when \code{scale} = 0, and \code{scale} digits to the left -#' of the decimal point when \code{scale} < 0. +#' @param scale round to \code{scale} digits to the right of the decimal point when +#' \code{scale} > 0, the nearest even number when \code{scale} = 0, and \code{scale} digits +#' to the left of the decimal point when \code{scale} < 0. #' @rdname column_math_functions #' @aliases bround bround,Column-method #' @export @@ -2005,8 +2006,9 @@ setMethod("months_between", signature(y = "Column"), }) #' @details -#' \code{nanvl}: Returns the first column (\code{y}) if it is not NaN, or the second column (\code{x}) if -#' the first column is NaN. Both inputs should be floating point columns (DoubleType or FloatType). +#' \code{nanvl}: Returns the first column (\code{y}) if it is not NaN, or the second column +#' (\code{x}) if the first column is NaN. Both inputs should be floating point columns +#' (DoubleType or FloatType). #' #' @rdname column_nonaggregate_functions #' @aliases nanvl nanvl,Column-method @@ -2061,7 +2063,7 @@ setMethod("approxCountDistinct", setMethod("countDistinct", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(...), function (x) { + jcols <- lapply(list(...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -2090,7 +2092,7 @@ setMethod("countDistinct", setMethod("concat", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -2110,7 +2112,7 @@ setMethod("greatest", signature(x = "Column"), function(x, ...) { stopifnot(length(list(...)) > 0) - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -2130,7 +2132,7 @@ setMethod("least", signature(x = "Column"), function(x, ...) { stopifnot(length(list(...)) > 0) - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -2406,8 +2408,8 @@ setMethod("shiftLeft", signature(y = "Column", x = "numeric"), }) #' @details -#' \code{shiftRight}: (Signed) shifts the given value numBits right. If the given value is a long value, -#' it will return a long value else it will return an integer value. +#' \code{shiftRight}: (Signed) shifts the given value numBits right. If the given value is a long +#' value, it will return a long value else it will return an integer value. #' #' @rdname column_math_functions #' @aliases shiftRight shiftRight,Column,numeric-method @@ -2505,9 +2507,10 @@ setMethod("format_string", signature(format = "character", x = "Column"), }) #' @details -#' \code{from_unixtime}: Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a -#' string representing the timestamp of that moment in the current system time zone in the JVM in the -#' given format. See \href{http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html}{ +#' \code{from_unixtime}: Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) +#' to a string representing the timestamp of that moment in the current system time zone in the JVM +#' in the given format. +#' See \href{http://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html}{ #' Customizing Formats} for available options. 
#' #' @rdname column_datetime_functions @@ -2634,8 +2637,8 @@ setMethod("lpad", signature(x = "Column", len = "numeric", pad = "character"), }) #' @details -#' \code{rand}: Generates a random column with independent and identically distributed (i.i.d.) samples -#' from U[0.0, 1.0]. +#' \code{rand}: Generates a random column with independent and identically distributed (i.i.d.) +#' samples from U[0.0, 1.0]. #' #' @rdname column_nonaggregate_functions #' @param seed a random seed. Can be missing. @@ -2664,8 +2667,8 @@ setMethod("rand", signature(seed = "numeric"), }) #' @details -#' \code{randn}: Generates a column with independent and identically distributed (i.i.d.) samples from -#' the standard normal distribution. +#' \code{randn}: Generates a column with independent and identically distributed (i.i.d.) samples +#' from the standard normal distribution. #' #' @rdname column_nonaggregate_functions #' @aliases randn randn,missing-method @@ -2831,8 +2834,8 @@ setMethod("unix_timestamp", signature(x = "Column", format = "character"), }) #' @details -#' \code{when}: Evaluates a list of conditions and returns one of multiple possible result expressions. -#' For unmatched expressions null is returned. +#' \code{when}: Evaluates a list of conditions and returns one of multiple possible result +#' expressions. For unmatched expressions null is returned. #' #' @rdname column_nonaggregate_functions #' @param condition the condition to test on. Must be a Column expression. @@ -2859,8 +2862,8 @@ setMethod("when", signature(condition = "Column", value = "ANY"), }) #' @details -#' \code{ifelse}: Evaluates a list of conditions and returns \code{yes} if the conditions are satisfied. -#' Otherwise \code{no} is returned for unmatched conditions. +#' \code{ifelse}: Evaluates a list of conditions and returns \code{yes} if the conditions are +#' satisfied. Otherwise \code{no} is returned for unmatched conditions. #' #' @rdname column_nonaggregate_functions #' @param test a Column expression that describes the condition. @@ -2990,7 +2993,8 @@ setMethod("ntile", }) #' @details -#' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window partition. +#' \code{percent_rank}: Returns the relative rank (i.e. percentile) of rows within a window +#' partition. #' This is computed by: (rank of row in its partition - 1) / (number of rows in the partition - 1). #' This is equivalent to the \code{PERCENT_RANK} function in SQL. #' The method should be used with no argument. @@ -3160,7 +3164,8 @@ setMethod("posexplode", }) #' @details -#' \code{create_array}: Creates a new array column. The input columns must all have the same data type. +#' \code{create_array}: Creates a new array column. The input columns must all have the same data +#' type. #' #' @rdname column_nonaggregate_functions #' @aliases create_array create_array,Column-method @@ -3169,7 +3174,7 @@ setMethod("posexplode", setMethod("create_array", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -3178,8 +3183,8 @@ setMethod("create_array", }) #' @details -#' \code{create_map}: Creates a new map column. The input columns must be grouped as key-value pairs, -#' e.g. (key1, value1, key2, value2, ...). +#' \code{create_map}: Creates a new map column. The input columns must be grouped as key-value +#' pairs, e.g. (key1, value1, key2, value2, ...). 
#' The key columns must all have the same data type, and can't be null. #' The value columns must all have the same data type. #' @@ -3190,7 +3195,7 @@ setMethod("create_array", setMethod("create_map", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) @@ -3352,9 +3357,9 @@ setMethod("not", }) #' @details -#' \code{grouping_bit}: Indicates whether a specified column in a GROUP BY list is aggregated or not, -#' returns 1 for aggregated or 0 for not aggregated in the result set. Same as \code{GROUPING} in SQL -#' and \code{grouping} function in Scala. +#' \code{grouping_bit}: Indicates whether a specified column in a GROUP BY list is aggregated or +#' not, returns 1 for aggregated or 0 for not aggregated in the result set. Same as \code{GROUPING} +#' in SQL and \code{grouping} function in Scala. #' #' @rdname column_aggregate_functions #' @aliases grouping_bit grouping_bit,Column-method @@ -3412,7 +3417,7 @@ setMethod("grouping_bit", setMethod("grouping_id", signature(x = "Column"), function(x, ...) { - jcols <- lapply(list(x, ...), function (x) { + jcols <- lapply(list(x, ...), function(x) { stopifnot(class(x) == "Column") x@jc }) http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/generics.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 0fe8f04..4e42748 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -385,7 +385,7 @@ setGeneric("value", function(bcast) { standardGeneric("value") }) #' @return A SparkDataFrame. #' @rdname summarize #' @export -setGeneric("agg", function (x, ...) { standardGeneric("agg") }) +setGeneric("agg", function(x, ...) { standardGeneric("agg") }) #' alias #' @@ -731,7 +731,7 @@ setGeneric("schema", function(x) { standardGeneric("schema") }) #' @rdname select #' @export -setGeneric("select", function(x, col, ...) { standardGeneric("select") } ) +setGeneric("select", function(x, col, ...) { standardGeneric("select") }) #' @rdname selectExpr #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/group.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/group.R b/R/pkg/R/group.R index 0a7be0e..54ef9f0 100644 --- a/R/pkg/R/group.R +++ b/R/pkg/R/group.R @@ -133,8 +133,8 @@ setMethod("summarize", # Aggregate Functions by name methods <- c("avg", "max", "mean", "min", "sum") -# These are not exposed on GroupedData: "kurtosis", "skewness", "stddev", "stddev_samp", "stddev_pop", -# "variance", "var_samp", "var_pop" +# These are not exposed on GroupedData: "kurtosis", "skewness", "stddev", "stddev_samp", +# "stddev_pop", "variance", "var_samp", "var_pop" #' Pivot a column of the GroupedData and perform the specified aggregation. #' http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/mllib_classification.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 15af829..7cd072a 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -58,22 +58,25 @@ setClass("NaiveBayesModel", representation(jobj = "jobj")) #' @param regParam The regularization parameter. Only supports L2 regularization currently. #' @param maxIter Maximum iteration number. #' @param tol Convergence tolerance of iterations. 
-#' @param standardization Whether to standardize the training features before fitting the model. The coefficients -#' of models will be always returned on the original scale, so it will be transparent for -#' users. Note that with/without standardization, the models should be always converged -#' to the same solution when no regularization is applied. +#' @param standardization Whether to standardize the training features before fitting the model. +#' The coefficients of models will be always returned on the original scale, +#' so it will be transparent for users. Note that with/without +#' standardization, the models should be always converged to the same +#' solution when no regularization is applied. #' @param threshold The threshold in binary classification applied to the linear model prediction. #' This threshold can be any real number, where Inf will make all predictions 0.0 #' and -Inf will make all predictions 1.0. #' @param weightCol The weight column name. -#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features -#' or the number of partitions are large, this param could be adjusted to a larger size. +#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the +#' dimensions of features or the number of partitions are large, this param +#' could be adjusted to a larger size. #' This is an expert parameter. Default value should be good for most cases. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @return \code{spark.svmLinear} returns a fitted linear SVM model. #' @rdname spark.svmLinear @@ -175,62 +178,80 @@ function(object, path, overwrite = FALSE) { #' Logistic Regression Model #' -#' Fits an logistic regression model against a SparkDataFrame. It supports "binomial": Binary logistic regression -#' with pivoting; "multinomial": Multinomial logistic (softmax) regression without pivoting, similar to glmnet. -#' Users can print, make predictions on the produced model and save the model to the input path. +#' Fits an logistic regression model against a SparkDataFrame. It supports "binomial": Binary +#' logistic regression with pivoting; "multinomial": Multinomial logistic (softmax) regression +#' without pivoting, similar to glmnet. Users can print, make predictions on the produced model +#' and save the model to the input path. #' #' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', '.', ':', '+', and '-'. #' @param regParam the regularization parameter. -#' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 penalty. -#' For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, the penalty is a combination -#' of L1 and L2. Default is 0.0 which is an L2 penalty. 
+#' @param elasticNetParam the ElasticNet mixing parameter. For alpha = 0.0, the penalty is an L2 +#' penalty. For alpha = 1.0, it is an L1 penalty. For 0.0 < alpha < 1.0, +#' the penalty is a combination of L1 and L2. Default is 0.0 which is an +#' L2 penalty. #' @param maxIter maximum iteration number. #' @param tol convergence tolerance of iterations. -#' @param family the name of family which is a description of the label distribution to be used in the model. +#' @param family the name of family which is a description of the label distribution to be used +#' in the model. #' Supported options: #' \itemize{ #' \item{"auto": Automatically select the family based on the number of classes: #' If number of classes == 1 || number of classes == 2, set to "binomial". #' Else, set to "multinomial".} #' \item{"binomial": Binary logistic regression with pivoting.} -#' \item{"multinomial": Multinomial logistic (softmax) regression without pivoting.} +#' \item{"multinomial": Multinomial logistic (softmax) regression without +#' pivoting.} #' } -#' @param standardization whether to standardize the training features before fitting the model. The coefficients -#' of models will be always returned on the original scale, so it will be transparent for -#' users. Note that with/without standardization, the models should be always converged -#' to the same solution when no regularization is applied. Default is TRUE, same as glmnet. -#' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of class label 1 -#' is > threshold, then predict 1, else 0. A high threshold encourages the model to predict 0 -#' more often; a low threshold encourages the model to predict 1 more often. Note: Setting this with -#' threshold p is equivalent to setting thresholds c(1-p, p). In multiclass (or binary) classification to adjust the probability of -#' predicting each class. Array must have length equal to the number of classes, with values > 0, -#' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p -#' is the original probability of that class and t is the class's threshold. +#' @param standardization whether to standardize the training features before fitting the model. +#' The coefficients of models will be always returned on the original scale, +#' so it will be transparent for users. Note that with/without +#' standardization, the models should be always converged to the same +#' solution when no regularization is applied. Default is TRUE, same as +#' glmnet. +#' @param thresholds in binary classification, in range [0, 1]. If the estimated probability of +#' class label 1 is > threshold, then predict 1, else 0. A high threshold +#' encourages the model to predict 0 more often; a low threshold encourages the +#' model to predict 1 more often. Note: Setting this with threshold p is +#' equivalent to setting thresholds c(1-p, p). In multiclass (or binary) +#' classification to adjust the probability of predicting each class. Array must +#' have length equal to the number of classes, with values > 0, excepting that +#' at most one value may be 0. The class with largest value p/t is predicted, +#' where p is the original probability of that class and t is the class's +#' threshold. #' @param weightCol The weight column name. -#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features -#' or the number of partitions are large, this param could be adjusted to a larger size. 
-#' This is an expert parameter. Default value should be good for most cases. -#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization. -#' The bound matrix must be compatible with the shape (1, number of features) for binomial -#' regression, or (number of classes, number of features) for multinomial regression. +#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the +#' dimensions of features or the number of partitions are large, this param +#' could be adjusted to a larger size. This is an expert parameter. Default +#' value should be good for most cases. +#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound +#' constrained optimization. +#' The bound matrix must be compatible with the shape (1, number +#' of features) for binomial regression, or (number of classes, +#' number of features) for multinomial regression. #' It is a R matrix. -#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization. -#' The bound matrix must be compatible with the shape (1, number of features) for binomial -#' regression, or (number of classes, number of features) for multinomial regression. +#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound +#' constrained optimization. +#' The bound matrix must be compatible with the shape (1, number +#' of features) for binomial regression, or (number of classes, +#' number of features) for multinomial regression. #' It is a R matrix. -#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization. -#' The bounds vector size must be equal to 1 for binomial regression, or the number -#' of classes for multinomial regression. -#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization. -#' The bound vector size must be equal to 1 for binomial regression, or the number +#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained +#' optimization. +#' The bounds vector size must be equal to 1 for binomial regression, +#' or the number #' of classes for multinomial regression. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type. +#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained +#' optimization. +#' The bound vector size must be equal to 1 for binomial regression, +#' or the number of classes for multinomial regression. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @return \code{spark.logit} returns a fitted logistic regression model. #' @rdname spark.logit @@ -412,11 +433,12 @@ setMethod("write.ml", signature(object = "LogisticRegressionModel", path = "char #' @param seed seed parameter for weights initialization. 
#' @param initialWeights initialWeights parameter for weights initialization, it should be a #' numeric vector. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @return \code{spark.mlp} returns a fitted Multilayer Perceptron Classification Model. #' @rdname spark.mlp @@ -452,11 +474,11 @@ setMethod("spark.mlp", signature(data = "SparkDataFrame", formula = "formula"), handleInvalid = c("error", "keep", "skip")) { formula <- paste(deparse(formula), collapse = "") if (is.null(layers)) { - stop ("layers must be a integer vector with length > 1.") + stop("layers must be a integer vector with length > 1.") } layers <- as.integer(na.omit(layers)) if (length(layers) <= 1) { - stop ("layers must be a integer vector with length > 1.") + stop("layers must be a integer vector with length > 1.") } if (!is.null(seed)) { seed <- as.character(as.integer(seed)) @@ -538,11 +560,12 @@ setMethod("write.ml", signature(object = "MultilayerPerceptronClassificationMode #' @param formula a symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', '.', ':', '+', and '-'. #' @param smoothing smoothing parameter. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional argument(s) passed to the method. Currently only \code{smoothing}. #' @return \code{spark.naiveBayes} returns a fitted naive Bayes model. #' @rdname spark.naiveBayes http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/mllib_clustering.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R index 97c9fa1..a25bf81 100644 --- a/R/pkg/R/mllib_clustering.R +++ b/R/pkg/R/mllib_clustering.R @@ -60,9 +60,9 @@ setClass("LDAModel", representation(jobj = "jobj")) #' @param maxIter maximum iteration number. #' @param seed the random seed. #' @param minDivisibleClusterSize The minimum number of points (if greater than or equal to 1.0) -#' or the minimum proportion of points (if less than 1.0) of a divisible cluster. -#' Note that it is an expert parameter. The default value should be good enough -#' for most cases. +#' or the minimum proportion of points (if less than 1.0) of a +#' divisible cluster. Note that it is an expert parameter. The +#' default value should be good enough for most cases. #' @param ... additional argument(s) passed to the method. 
#' @return \code{spark.bisectingKmeans} returns a fitted bisecting k-means model. #' @rdname spark.bisectingKmeans @@ -325,10 +325,11 @@ setMethod("write.ml", signature(object = "GaussianMixtureModel", path = "charact #' Note that the response variable of formula is empty in spark.kmeans. #' @param k number of centers. #' @param maxIter maximum iteration number. -#' @param initMode the initialization algorithm choosen to fit the model. +#' @param initMode the initialization algorithm chosen to fit the model. #' @param seed the random seed for cluster initialization. #' @param initSteps the number of steps for the k-means|| initialization mode. -#' This is an advanced setting, the default of 2 is almost always enough. Must be > 0. +#' This is an advanced setting, the default of 2 is almost always enough. +#' Must be > 0. #' @param tol convergence tolerance of iterations. #' @param ... additional argument(s) passed to the method. #' @return \code{spark.kmeans} returns a fitted k-means model. @@ -548,8 +549,8 @@ setMethod("spark.lda", signature(data = "SparkDataFrame"), #' \item{\code{topics}}{top 10 terms and their weights of all topics} #' \item{\code{vocabulary}}{whole terms of the training corpus, NULL if libsvm format file #' used as training set} -#' \item{\code{trainingLogLikelihood}}{Log likelihood of the observed tokens in the training set, -#' given the current parameter estimates: +#' \item{\code{trainingLogLikelihood}}{Log likelihood of the observed tokens in the +#' training set, given the current parameter estimates: #' log P(docs | topics, topic distributions for docs, Dirichlet hyperparameters) #' It is only for distributed LDA model (i.e., optimizer = "em")} #' \item{\code{logPrior}}{Log probability of the current parameter estimate: http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/mllib_regression.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_regression.R b/R/pkg/R/mllib_regression.R index ebaeae9..f734a08 100644 --- a/R/pkg/R/mllib_regression.R +++ b/R/pkg/R/mllib_regression.R @@ -58,8 +58,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' Note that there are two ways to specify the tweedie family. #' \itemize{ #' \item Set \code{family = "tweedie"} and specify the var.power and link.power; -#' \item When package \code{statmod} is loaded, the tweedie family is specified using the -#' family definition therein, i.e., \code{tweedie(var.power, link.power)}. +#' \item When package \code{statmod} is loaded, the tweedie family is specified +#' using the family definition therein, i.e., \code{tweedie(var.power, link.power)}. #' } #' @param tol positive convergence tolerance of iterations. #' @param maxIter integer giving the maximal number of IRLS iterations. @@ -71,13 +71,15 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj")) #' applicable to the Tweedie family. #' @param link.power the index in the power link function. Only applicable to the Tweedie family. #' @param stringIndexerOrderType how to order categories of a string feature column. This is used to -#' decide the base level of a string feature as the last category after -#' ordering is dropped when encoding strings. Supported options are -#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc". -#' The default value is "frequencyDesc". When the ordering is set to -#' "alphabetDesc", this drops the same category as R when encoding strings. -#' @param offsetCol the offset column name. 
If this is not set or empty, we treat all instance offsets -#' as 0.0. The feature specified as offset has a constant coefficient of 1.0. +#' decide the base level of a string feature as the last category +#' after ordering is dropped when encoding strings. Supported options +#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and +#' "alphabetAsc". The default value is "frequencyDesc". When the +#' ordering is set to "alphabetDesc", this drops the same category +#' as R when encoding strings. +#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance +#' offsets as 0.0. The feature specified as offset has a constant coefficient of +#' 1.0. #' @param ... additional arguments passed to the method. #' @aliases spark.glm,SparkDataFrame,formula-method #' @return \code{spark.glm} returns a fitted generalized linear model. @@ -197,13 +199,15 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' @param var.power the index of the power variance function in the Tweedie family. #' @param link.power the index of the power link function in the Tweedie family. #' @param stringIndexerOrderType how to order categories of a string feature column. This is used to -#' decide the base level of a string feature as the last category after -#' ordering is dropped when encoding strings. Supported options are -#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc". -#' The default value is "frequencyDesc". When the ordering is set to -#' "alphabetDesc", this drops the same category as R when encoding strings. -#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets -#' as 0.0. The feature specified as offset has a constant coefficient of 1.0. +#' decide the base level of a string feature as the last category +#' after ordering is dropped when encoding strings. Supported options +#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and +#' "alphabetAsc". The default value is "frequencyDesc". When the +#' ordering is set to "alphabetDesc", this drops the same category +#' as R when encoding strings. +#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance +#' offsets as 0.0. The feature specified as offset has a constant coefficient of +#' 1.0. #' @return \code{glm} returns a fitted generalized linear model. #' @rdname glm #' @export @@ -233,11 +237,11 @@ setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDat #' @param object a fitted generalized linear model. #' @return \code{summary} returns summary information of the fitted model, which is a list. -#' The list of components includes at least the \code{coefficients} (coefficients matrix, which includes -#' coefficients, standard error of coefficients, t value and p value), +#' The list of components includes at least the \code{coefficients} (coefficients matrix, +#' which includes coefficients, standard error of coefficients, t value and p value), #' \code{null.deviance} (null/residual degrees of freedom), \code{aic} (AIC) -#' and \code{iter} (number of iterations IRLS takes). If there are collinear columns in the data, -#' the coefficients matrix only provides coefficients. +#' and \code{iter} (number of iterations IRLS takes). If there are collinear columns in +#' the data, the coefficients matrix only provides coefficients. 
#' @rdname spark.glm #' @export #' @note summary(GeneralizedLinearRegressionModel) since 2.0.0 @@ -457,15 +461,17 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char #' @param formula a symbolic description of the model to be fitted. Currently only a few formula #' operators are supported, including '~', ':', '+', and '-'. #' Note that operator '.' is not supported currently. -#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features -#' or the number of partitions are large, this param could be adjusted to a larger size. -#' This is an expert parameter. Default value should be good for most cases. +#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the +#' dimensions of features or the number of partitions are large, this +#' param could be adjusted to a larger size. This is an expert parameter. +#' Default value should be good for most cases. #' @param stringIndexerOrderType how to order categories of a string feature column. This is used to -#' decide the base level of a string feature as the last category after -#' ordering is dropped when encoding strings. Supported options are -#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc". -#' The default value is "frequencyDesc". When the ordering is set to -#' "alphabetDesc", this drops the same category as R when encoding strings. +#' decide the base level of a string feature as the last category +#' after ordering is dropped when encoding strings. Supported options +#' are "frequencyDesc", "frequencyAsc", "alphabetDesc", and +#' "alphabetAsc". The default value is "frequencyDesc". When the +#' ordering is set to "alphabetDesc", this drops the same category +#' as R when encoding strings. #' @param ... additional arguments passed to the method. #' @return \code{spark.survreg} returns a fitted AFT survival regression model. #' @rdname spark.survreg http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/mllib_tree.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/mllib_tree.R b/R/pkg/R/mllib_tree.R index 33c4653..89a58bf 100644 --- a/R/pkg/R/mllib_tree.R +++ b/R/pkg/R/mllib_tree.R @@ -132,10 +132,12 @@ print.summary.decisionTree <- function(x) { #' Gradient Boosted Tree model, \code{predict} to make predictions on new data, and #' \code{write.ml}/\code{read.ml} to save/load fitted models. #' For more details, see +# nolint start #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-regression}{ #' GBT Regression} and #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#gradient-boosted-tree-classifier}{ #' GBT Classification} +# nolint end #' #' @param data a SparkDataFrame for training. #' @param formula a symbolic description of the model to be fitted. Currently only a few formula @@ -164,11 +166,12 @@ print.summary.decisionTree <- function(x) { #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching #' can speed up training of deeper trees. Users can set how often should the #' cache be checkpointed or disable it by setting checkpointInterval. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type in classification model. 
+#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type in classification model. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @aliases spark.gbt,SparkDataFrame,formula-method #' @return \code{spark.gbt} returns a fitted Gradient Boosted Tree model. @@ -352,10 +355,12 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' model, \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to #' save/load fitted models. #' For more details, see +# nolint start #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-regression}{ #' Random Forest Regression} and #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#random-forest-classifier}{ #' Random Forest Classification} +# nolint end #' #' @param data a SparkDataFrame for training. #' @param formula a symbolic description of the model to be fitted. Currently only a few formula @@ -382,11 +387,12 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching #' can speed up training of deeper trees. Users can set how often should the #' cache be checkpointed or disable it by setting checkpointInterval. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type in classification model. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type in classification model. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @aliases spark.randomForest,SparkDataFrame,formula-method #' @return \code{spark.randomForest} returns a fitted Random Forest model. @@ -567,10 +573,12 @@ setMethod("write.ml", signature(object = "RandomForestClassificationModel", path #' model, \code{predict} to make predictions on new data, and \code{write.ml}/\code{read.ml} to #' save/load fitted models. #' For more details, see +# nolint start #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-regression}{ #' Decision Tree Regression} and #' \href{http://spark.apache.org/docs/latest/ml-classification-regression.html#decision-tree-classifier}{ #' Decision Tree Classification} +# nolint end #' #' @param data a SparkDataFrame for training. #' @param formula a symbolic description of the model to be fitted. Currently only a few formula @@ -592,11 +600,12 @@ setMethod("write.ml", signature(object = "RandomForestClassificationModel", path #' nodes. If TRUE, the algorithm will cache node IDs for each instance. Caching #' can speed up training of deeper trees. 
Users can set how often should the #' cache be checkpointed or disable it by setting checkpointInterval. -#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and label -#' column of string type in classification model. +#' @param handleInvalid How to handle invalid data (unseen labels or NULL values) in features and +#' label column of string type in classification model. #' Supported options: "skip" (filter out rows with invalid data), -#' "error" (throw an error), "keep" (put invalid data in a special additional -#' bucket, at index numLabels). Default is "error". +#' "error" (throw an error), "keep" (put invalid data in +#' a special additional bucket, at index numLabels). Default +#' is "error". #' @param ... additional arguments passed to the method. #' @aliases spark.decisionTree,SparkDataFrame,formula-method #' @return \code{spark.decisionTree} returns a fitted Decision Tree model. @@ -671,7 +680,8 @@ setMethod("spark.decisionTree", signature(data = "SparkDataFrame", formula = "fo #' @return \code{summary} returns summary information of the fitted model, which is a list. #' The list of components includes \code{formula} (formula), #' \code{numFeatures} (number of features), \code{features} (list of features), -#' \code{featureImportances} (feature importances), and \code{maxDepth} (max depth of trees). +#' \code{featureImportances} (feature importances), and \code{maxDepth} (max depth of +#' trees). #' @rdname spark.decisionTree #' @aliases summary,DecisionTreeRegressionModel-method #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/pairRDD.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/pairRDD.R b/R/pkg/R/pairRDD.R index 8fa21be..9c2e57d 100644 --- a/R/pkg/R/pairRDD.R +++ b/R/pkg/R/pairRDD.R @@ -860,7 +860,7 @@ setMethod("subtractByKey", other, numPartitions = numPartitions), filterFunction), - function (v) { v[[1]] }) + function(v) { v[[1]] }) }) #' Return a subset of this RDD sampled by key. @@ -925,7 +925,7 @@ setMethod("sampleByKey", if (withReplacement) { count <- stats::rpois(1, frac) if (count > 0) { - res[ (len + 1) : (len + count) ] <- rep(list(elem), count) + res[(len + 1) : (len + count)] <- rep(list(elem), count) len <- len + count } } else { http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/schema.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/schema.R b/R/pkg/R/schema.R index d1ed683..65f4187 100644 --- a/R/pkg/R/schema.R +++ b/R/pkg/R/schema.R @@ -155,7 +155,7 @@ checkType <- function(type) { } else { # Check complex types firstChar <- substr(type, 1, 1) - switch (firstChar, + switch(firstChar, a = { # Array type m <- regexec("^array<(.+)>$", type) http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/stats.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/stats.R b/R/pkg/R/stats.R index 9a9fa84..c8af798 100644 --- a/R/pkg/R/stats.R +++ b/R/pkg/R/stats.R @@ -29,9 +29,9 @@ setOldClass("jobj") #' @param col1 name of the first column. Distinct items will make the first item of each row. #' @param col2 name of the second column. Distinct items will make the column names of the output. #' @return a local R data.frame representing the contingency table. The first column of each row -#' will be the distinct values of \code{col1} and the column names will be the distinct values -#' of \code{col2}. 
The name of the first column will be "\code{col1}_\code{col2}". Pairs -#' that have no occurrences will have zero as their counts. +#' will be the distinct values of \code{col1} and the column names will be the distinct +#' values of \code{col2}. The name of the first column will be "\code{col1}_\code{col2}". +#' Pairs that have no occurrences will have zero as their counts. #' #' @rdname crosstab #' @name crosstab @@ -53,8 +53,8 @@ setMethod("crosstab", }) #' @details -#' \code{cov}: When applied to SparkDataFrame, this calculates the sample covariance of two numerical -#' columns of \emph{one} SparkDataFrame. +#' \code{cov}: When applied to SparkDataFrame, this calculates the sample covariance of two +#' numerical columns of \emph{one} SparkDataFrame. #' #' @param colName1 the name of the first column #' @param colName2 the name of the second column @@ -159,8 +159,8 @@ setMethod("freqItems", signature(x = "SparkDataFrame", cols = "character"), #' @param relativeError The relative target precision to achieve (>= 0). If set to zero, #' the exact quantiles are computed, which could be very expensive. #' Note that values greater than 1 are accepted but give the same result as 1. -#' @return The approximate quantiles at the given probabilities. If the input is a single column name, -#' the output is a list of approximate quantiles in that column; If the input is +#' @return The approximate quantiles at the given probabilities. If the input is a single column +#' name, the output is a list of approximate quantiles in that column; If the input is #' multiple column names, the output should be a list, and each element in it is a list of #' numeric values which represents the approximate quantiles in corresponding column. #' http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/utils.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R index 91483a4..4b71699 100644 --- a/R/pkg/R/utils.R +++ b/R/pkg/R/utils.R @@ -625,7 +625,7 @@ appendPartitionLengths <- function(x, other) { x <- lapplyPartition(x, appendLength) other <- lapplyPartition(other, appendLength) } - list (x, other) + list(x, other) } # Perform zip or cartesian between elements from two RDDs in each partition @@ -657,7 +657,7 @@ mergePartitions <- function(rdd, zip) { keys <- list() } if (lengthOfValues > 1) { - values <- part[ (lengthOfKeys + 1) : (len - 1) ] + values <- part[(lengthOfKeys + 1) : (len - 1)] } else { values <- list() } http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/inst/worker/worker.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R index 03e7450..00789d8 100644 --- a/R/pkg/inst/worker/worker.R +++ b/R/pkg/inst/worker/worker.R @@ -68,7 +68,7 @@ compute <- function(mode, partition, serializer, deserializer, key, } else { output <- computeFunc(partition, inputData) } - return (output) + return(output) } outputResult <- function(serializer, output, outputCon) { http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/tests/fulltests/test_binary_function.R ---------------------------------------------------------------------- diff --git a/R/pkg/tests/fulltests/test_binary_function.R b/R/pkg/tests/fulltests/test_binary_function.R index 442bed5..c5d240f 100644 --- a/R/pkg/tests/fulltests/test_binary_function.R +++ b/R/pkg/tests/fulltests/test_binary_function.R @@ -73,7 +73,7 @@ test_that("zipPartitions() on RDDs", { rdd2 <- 

http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/R/utils.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/utils.R b/R/pkg/R/utils.R
index 91483a4..4b71699 100644
--- a/R/pkg/R/utils.R
+++ b/R/pkg/R/utils.R
@@ -625,7 +625,7 @@ appendPartitionLengths <- function(x, other) {
     x <- lapplyPartition(x, appendLength)
     other <- lapplyPartition(other, appendLength)
   }
-  list (x, other)
+  list(x, other)
 }

 # Perform zip or cartesian between elements from two RDDs in each partition
@@ -657,7 +657,7 @@ mergePartitions <- function(rdd, zip) {
           keys <- list()
         }
         if (lengthOfValues > 1) {
-          values <- part[ (lengthOfKeys + 1) : (len - 1) ]
+          values <- part[(lengthOfKeys + 1) : (len - 1)]
         } else {
           values <- list()
         }

http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/inst/worker/worker.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/worker/worker.R b/R/pkg/inst/worker/worker.R
index 03e7450..00789d8 100644
--- a/R/pkg/inst/worker/worker.R
+++ b/R/pkg/inst/worker/worker.R
@@ -68,7 +68,7 @@ compute <- function(mode, partition, serializer, deserializer, key,
   } else {
     output <- computeFunc(partition, inputData)
   }
-  return (output)
+  return(output)
 }

 outputResult <- function(serializer, output, outputCon) {

http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/tests/fulltests/test_binary_function.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_binary_function.R b/R/pkg/tests/fulltests/test_binary_function.R
index 442bed5..c5d240f 100644
--- a/R/pkg/tests/fulltests/test_binary_function.R
+++ b/R/pkg/tests/fulltests/test_binary_function.R
@@ -73,7 +73,7 @@ test_that("zipPartitions() on RDDs", {
   rdd2 <- parallelize(sc, 1:4, 2L)  # 1:2, 3:4
   rdd3 <- parallelize(sc, 1:6, 2L)  # 1:3, 4:6
   actual <- collectRDD(zipPartitions(rdd1, rdd2, rdd3,
-                                     func = function(x, y, z) { list(list(x, y, z))} ))
+                                     func = function(x, y, z) { list(list(x, y, z))}))
   expect_equal(actual,
                list(list(1, c(1, 2), c(1, 2, 3)),
                     list(2, c(3, 4), c(4, 5, 6))))

http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/R/pkg/tests/fulltests/test_rdd.R
----------------------------------------------------------------------
diff --git a/R/pkg/tests/fulltests/test_rdd.R b/R/pkg/tests/fulltests/test_rdd.R
index 6ee1fce..0c702ea 100644
--- a/R/pkg/tests/fulltests/test_rdd.R
+++ b/R/pkg/tests/fulltests/test_rdd.R
@@ -698,14 +698,14 @@ test_that("fullOuterJoin() on pairwise RDDs", {
 })

 test_that("sortByKey() on pairwise RDDs", {
-  numPairsRdd <- map(rdd, function(x) { list (x, x) })
+  numPairsRdd <- map(rdd, function(x) { list(x, x) })
   sortedRdd <- sortByKey(numPairsRdd, ascending = FALSE)
   actual <- collectRDD(sortedRdd)
-  numPairs <- lapply(nums, function(x) { list (x, x) })
+  numPairs <- lapply(nums, function(x) { list(x, x) })
   expect_equal(actual, sortKeyValueList(numPairs, decreasing = TRUE))

   rdd2 <- parallelize(sc, sort(nums, decreasing = TRUE), 2L)
-  numPairsRdd2 <- map(rdd2, function(x) { list (x, x) })
+  numPairsRdd2 <- map(rdd2, function(x) { list(x, x) })
   sortedRdd2 <- sortByKey(numPairsRdd2)
   actual <- collectRDD(sortedRdd2)
   expect_equal(actual, numPairs)
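The fixes in these sources and tests are representative of what the newer lintr revision now flags: a space between a call (or keywords such as return and switch) and its opening parenthesis, and padding just inside square brackets. A minimal before/after sketch in plain R, with made-up object names:

# Old style, now flagged by the stricter lint run.
pair_old <- function(x) { list (x, x) }
slice_old <- function(v, len, count) { v[ (len + 1) : (len + count) ] }

# Style after this change: the call hugs its parenthesis and brackets are unpadded.
pair <- function(x) { list(x, x) }
slice <- function(v, len, count) { v[(len + 1) : (len + count)] }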
c("a", "b", "c", "d")) expected <- collect(df) @@ -3135,7 +3135,7 @@ test_that("gapply() and gapplyCollect() on a DataFrame", { actual <- df3Collect[order(df3Collect$a), ] expect_identical(actual$avg, expected$avg) - irisDF <- suppressWarnings(createDataFrame (iris)) + irisDF <- suppressWarnings(createDataFrame(iris)) schema <- structType(structField("Sepal_Length", "double"), structField("Avg", "double")) # Groups by `Sepal_Length` and computes the average for `Sepal_Width` df4 <- gapply( http://git-wip-us.apache.org/repos/asf/spark/blob/02c91e03/dev/lint-r.R ---------------------------------------------------------------------- diff --git a/dev/lint-r.R b/dev/lint-r.R index 87ee36d..a4261d2 100644 --- a/dev/lint-r.R +++ b/dev/lint-r.R @@ -26,8 +26,8 @@ if (! library(SparkR, lib.loc = LOCAL_LIB_LOC, logical.return = TRUE)) { # Installs lintr from Github in a local directory. # NOTE: The CRAN's version is too old to adapt to our rules. -if ("lintr" %in% row.names(installed.packages()) == FALSE) { - devtools::install_github("jimhester/lintr@a769c0b") +if ("lintr" %in% row.names(installed.packages()) == FALSE) { + devtools::install_github("jimhester/lintr@5431140") } library(lintr) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org