Repository: spark
Updated Branches:
  refs/heads/master 3e7dc8296 -> 69993217f


[SPARK-24807][CORE] Adding files/jars twice: output a warning and add a note

## What changes were proposed in this pull request?

In the PR, I propose to output a warning if the `addFile()` or `addJar()` 
methods are called more than once for the same path. Currently, overwriting of 
already added files is not supported. The new comments and the warning reflect 
the existing behaviour.
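
For illustration, a minimal sketch of the resulting behaviour (assuming a live `SparkContext` named `sc` and an existing local file at the hypothetical path `/tmp/data.txt`):

```scala
// Hedged sketch: sc is a running SparkContext; the file path is hypothetical.
sc.addFile("/tmp/data.txt")  // first call: the file is registered and distributed
sc.addFile("/tmp/data.txt")  // second call: a no-op that logs a warning like:
// WARN SparkContext: The path /tmp/data.txt has been added already.
// Overwriting of added paths is not supported in the current version.
```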

Author: Maxim Gekk <maxim.g...@databricks.com>

Closes #21771 from MaxGekk/warning-on-adding-file.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69993217
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69993217
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69993217

Branch: refs/heads/master
Commit: 69993217fc4f5e5e41a297702389e86fe534dc2f
Parents: 3e7dc82
Author: Maxim Gekk <maxim.g...@databricks.com>
Authored: Sat Jul 14 22:07:49 2018 -0700
Committer: Xiao Li <gatorsm...@gmail.com>
Committed: Sat Jul 14 22:07:49 2018 -0700

----------------------------------------------------------------------
 R/pkg/R/context.R                                       |  2 ++
 core/src/main/scala/org/apache/spark/SparkContext.scala | 12 ++++++++++++
 .../org/apache/spark/api/java/JavaSparkContext.scala    |  6 ++++++
 python/pyspark/context.py                               |  4 ++++
 4 files changed, 24 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/69993217/R/pkg/R/context.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R
index 8ec727d..3e996a5 100644
--- a/R/pkg/R/context.R
+++ b/R/pkg/R/context.R
@@ -305,6 +305,8 @@ setCheckpointDirSC <- function(sc, dirName) {
 #' Currently directories are only supported for Hadoop-supported filesystems.
 #' Refer Hadoop-supported filesystems at \url{https://wiki.apache.org/hadoop/HCFS}.
 #'
+#' Note: A path can be added only once. Subsequent additions of the same path are ignored.
+#'
 #' @rdname spark.addFile
 #' @param path The path of the file to be added
 #' @param recursive Whether to add files recursively from the path. Default is FALSE.

http://git-wip-us.apache.org/repos/asf/spark/blob/69993217/core/src/main/scala/org/apache/spark/SparkContext.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 74bfb5d..531384a 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -1496,6 +1496,8 @@ class SparkContext(config: SparkConf) extends Logging {
    * @param path can be either a local file, a file in HDFS (or other Hadoop-supported
    * filesystems), or an HTTP, HTTPS or FTP URI. To access the file in Spark jobs,
    * use `SparkFiles.get(fileName)` to find its download location.
+   *
+   * @note A path can be added only once. Subsequent additions of the same path are ignored.
    */
   def addFile(path: String): Unit = {
     addFile(path, false)
@@ -1516,6 +1518,8 @@ class SparkContext(config: SparkConf) extends Logging {
    * use `SparkFiles.get(fileName)` to find its download location.
    * @param recursive if true, a directory can be given in `path`. Currently directories are
    * only supported for Hadoop-supported filesystems.
+   *
+   * @note A path can be added only once. Subsequent additions of the same path are ignored.
    */
   def addFile(path: String, recursive: Boolean): Unit = {
     val uri = new Path(path).toUri
@@ -1555,6 +1559,9 @@ class SparkContext(config: SparkConf) extends Logging {
       Utils.fetchFile(uri.toString, new File(SparkFiles.getRootDirectory()), conf,
         env.securityManager, hadoopConfiguration, timestamp, useCache = false)
       postEnvironmentUpdate()
+    } else {
+      logWarning(s"The path $path has been added already. Overwriting of added paths " +
+       "is not supported in the current version.")
     }
   }
 
@@ -1803,6 +1810,8 @@ class SparkContext(config: SparkConf) extends Logging {
    *
    * @param path can be either a local file, a file in HDFS (or other Hadoop-supported filesystems),
    * an HTTP, HTTPS or FTP URI, or local:/path for a file on every worker node.
+   *
+   * @note A path can be added only once. Subsequent additions of the same path are ignored.
    */
   def addJar(path: String) {
     def addJarFile(file: File): String = {
@@ -1849,6 +1858,9 @@ class SparkContext(config: SparkConf) extends Logging {
         if (addedJars.putIfAbsent(key, timestamp).isEmpty) {
           logInfo(s"Added JAR $path at $key with timestamp $timestamp")
           postEnvironmentUpdate()
+        } else {
+          logWarning(s"The jar $path has been added already. Overwriting of added jars " +
+            "is not supported in the current version.")
         }
       }
     }
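
Both warnings hinge on `putIfAbsent` semantics over the concurrent maps that track added files and jars: only the first insertion of a key returns `None`, so exactly one caller sees the path as newly added. A self-contained sketch of that idiom (the map, object, and method names below are illustrative stand-ins, not Spark internals):

```scala
import java.util.concurrent.ConcurrentHashMap
import scala.collection.JavaConverters._

object PutIfAbsentDemo {
  // Illustrative stand-in for SparkContext's internal addedJars map;
  // names and types here are assumptions for the sketch, not Spark code.
  private val addedJars = new ConcurrentHashMap[String, Long]().asScala

  def register(key: String, timestamp: Long): Unit = {
    // putIfAbsent returns None only on the first insertion of a key,
    // so exactly one caller treats the path as newly added.
    if (addedJars.putIfAbsent(key, timestamp).isEmpty) {
      println(s"Added JAR at $key with timestamp $timestamp")
    } else {
      println(s"The jar $key has been added already. Overwriting of added jars " +
        "is not supported in the current version.")
    }
  }

  def main(args: Array[String]): Unit = {
    register("spark://host:7077/jars/app.jar", 1L)  // added
    register("spark://host:7077/jars/app.jar", 2L)  // ignored, warning printed
  }
}
```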

http://git-wip-us.apache.org/repos/asf/spark/blob/69993217/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala
index f1936bf..09c8384 100644
--- a/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/api/java/JavaSparkContext.scala
@@ -668,6 +668,8 @@ class JavaSparkContext(val sc: SparkContext)
    * The `path` passed can be either a local file, a file in HDFS (or other Hadoop-supported
    * filesystems), or an HTTP, HTTPS or FTP URI.  To access the file in Spark jobs,
    * use `SparkFiles.get(fileName)` to find its download location.
+   *
+   * @note A path can be added only once. Subsequent additions of the same path are ignored.
    */
   def addFile(path: String) {
     sc.addFile(path)
@@ -681,6 +683,8 @@ class JavaSparkContext(val sc: SparkContext)
    *
    * A directory can be given if the recursive option is set to true. Currently directories are only
    * supported for Hadoop-supported filesystems.
+   *
+   * @note A path can be added only once. Subsequent additions of the same path are ignored.
    */
   def addFile(path: String, recursive: Boolean): Unit = {
     sc.addFile(path, recursive)
@@ -690,6 +694,8 @@ class JavaSparkContext(val sc: SparkContext)
    * Adds a JAR dependency for all tasks to be executed on this SparkContext in the future.
    * The `path` passed can be either a local file, a file in HDFS (or other Hadoop-supported
    * filesystems), or an HTTP, HTTPS or FTP URI.
+   *
+   * @note A path can be added only once. Subsequent additions of the same path are ignored.
    */
   def addJar(path: String) {
     sc.addJar(path)

http://git-wip-us.apache.org/repos/asf/spark/blob/69993217/python/pyspark/context.py
----------------------------------------------------------------------
diff --git a/python/pyspark/context.py b/python/pyspark/context.py
index ede3b6a..2cb3117 100644
--- a/python/pyspark/context.py
+++ b/python/pyspark/context.py
@@ -847,6 +847,8 @@ class SparkContext(object):
         A directory can be given if the recursive option is set to True.
         Currently directories are only supported for Hadoop-supported filesystems.
 
+        .. note:: A path can be added only once. Subsequent additions of the same path are ignored.
+
         >>> from pyspark import SparkFiles
         >>> path = os.path.join(tempdir, "test.txt")
         >>> with open(path, "w") as testFile:
@@ -867,6 +869,8 @@ class SparkContext(object):
         SparkContext in the future.  The C{path} passed can be either a local
         file, a file in HDFS (or other Hadoop-supported filesystems), or an
         HTTP, HTTPS or FTP URI.
+
+        .. note:: A path can be added only once. Subsequent additions of the same path are ignored.
         """
         self.addFile(path)
         (dirname, filename) = os.path.split(path)  # dirname may be directory or HDFS/S3 prefix
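
For the retrieval side, mirroring the Python doctest above, a hedged Scala sketch; `SparkFiles.get` resolves the local copy of a file previously registered with `addFile`:

```scala
import org.apache.spark.SparkFiles

// Assumes sc.addFile("/tmp/test.txt") ran earlier; the path is hypothetical.
val localPath: String = SparkFiles.get("test.txt")  // absolute path to the local copy
```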

