Repository: spark
Updated Branches:
  refs/heads/branch-2.3 4c49b12da -> 414e4e3d7


[SPARK-10878][CORE] Fix race condition when multiple clients resolve artifacts 
at the same time

## What changes were proposed in this pull request?

When multiple clients attempt to resolve artifacts via the `--packages` 
parameter, they could run into a race condition when they each attempt to 
modify the dummy `org.apache.spark-spark-submit-parent-default.xml` file 
created in the default ivy cache dir.
This PR changes the behavior to encode UUID in the dummy module descriptor so 
each client will operate on a different resolution file in the ivy cache dir. 
In addition, this patch changes the behavior of when and which resolution files 
are cleaned to prevent accumulation of resolution files in the default ivy 
cache dir.

Since this PR is a successor of #18801, it closes #18801. Much of the code was 
ported from #18801. **A lot of effort went into this; I think this PR should 
be credited to Victsm.**

## How was this patch tested?

Added a unit test to `SparkSubmitUtilsSuite`.

Author: Kazuaki Ishizaki <ishiz...@jp.ibm.com>

Closes #21251 from kiszk/SPARK-10878.

(cherry picked from commit d3c426a5b02abdec49ff45df12a8f11f9e473a88)
Signed-off-by: Marcelo Vanzin <van...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/414e4e3d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/414e4e3d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/414e4e3d

Branch: refs/heads/branch-2.3
Commit: 414e4e3d70caa950a63fab1c8cac3314fb961b0c
Parents: 4c49b12
Author: Kazuaki Ishizaki <ishiz...@jp.ibm.com>
Authored: Thu May 10 14:41:55 2018 -0700
Committer: Marcelo Vanzin <van...@cloudera.com>
Committed: Thu May 10 14:42:06 2018 -0700

----------------------------------------------------------------------
 .../org/apache/spark/deploy/SparkSubmit.scala   | 42 +++++++++++++++-----
 .../spark/deploy/SparkSubmitUtilsSuite.scala    | 15 +++++++
 2 files changed, 47 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/414e4e3d/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala 
b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
index deb52a4..d1347f1 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
@@ -22,6 +22,7 @@ import java.lang.reflect.{InvocationTargetException, 
Modifier, UndeclaredThrowab
 import java.net.URL
 import java.security.PrivilegedExceptionAction
 import java.text.ParseException
+import java.util.UUID
 
 import scala.annotation.tailrec
 import scala.collection.mutable.{ArrayBuffer, HashMap, Map}
@@ -1209,7 +1210,33 @@ private[spark] object SparkSubmitUtils {
 
   /** A nice function to use in tests as well. Values are dummy strings. */
   def getModuleDescriptor: DefaultModuleDescriptor = 
DefaultModuleDescriptor.newDefaultInstance(
-    ModuleRevisionId.newInstance("org.apache.spark", "spark-submit-parent", 
"1.0"))
+    // Include UUID in module name, so multiple clients resolving maven 
coordinate at the same time
+    // do not modify the same resolution file concurrently.
+    ModuleRevisionId.newInstance("org.apache.spark",
+      s"spark-submit-parent-${UUID.randomUUID.toString}",
+      "1.0"))
+
+  /**
+   * Clear ivy resolution from current launch. The resolution file is usually 
at
+   * ~/.ivy2/org.apache.spark-spark-submit-parent-$UUID-default.xml,
+   * ~/.ivy2/resolved-org.apache.spark-spark-submit-parent-$UUID-1.0.xml, and
+   * 
~/.ivy2/resolved-org.apache.spark-spark-submit-parent-$UUID-1.0.properties.
+   * Since each launch will have its own resolution files created, delete them 
after
+   * each resolution to prevent accumulation of these files in the ivy cache 
dir.
+   */
+  private def clearIvyResolutionFiles(
+      mdId: ModuleRevisionId,
+      ivySettings: IvySettings,
+      ivyConfName: String): Unit = {
+    val currentResolutionFiles = Seq(
+      s"${mdId.getOrganisation}-${mdId.getName}-$ivyConfName.xml",
+      
s"resolved-${mdId.getOrganisation}-${mdId.getName}-${mdId.getRevision}.xml",
+      
s"resolved-${mdId.getOrganisation}-${mdId.getName}-${mdId.getRevision}.properties"
+    )
+    currentResolutionFiles.foreach { filename =>
+      new File(ivySettings.getDefaultCache, filename).delete()
+    }
+  }
 
   /**
    * Resolves any dependencies that were supplied through maven coordinates
@@ -1260,14 +1287,6 @@ private[spark] object SparkSubmitUtils {
 
         // A Module descriptor must be specified. Entries are dummy strings
         val md = getModuleDescriptor
-        // clear ivy resolution from previous launches. The resolution file is 
usually at
-        // ~/.ivy2/org.apache.spark-spark-submit-parent-default.xml. In 
between runs, this file
-        // leads to confusion with Ivy when the files can no longer be found 
at the repository
-        // declared in that file/
-        val mdId = md.getModuleRevisionId
-        val previousResolution = new File(ivySettings.getDefaultCache,
-          s"${mdId.getOrganisation}-${mdId.getName}-$ivyConfName.xml")
-        if (previousResolution.exists) previousResolution.delete
 
         md.setDefaultConf(ivyConfName)
 
@@ -1288,7 +1307,10 @@ private[spark] object SparkSubmitUtils {
           packagesDirectory.getAbsolutePath + File.separator +
             "[organization]_[artifact]-[revision](-[classifier]).[ext]",
           retrieveOptions.setConfs(Array(ivyConfName)))
-        resolveDependencyPaths(rr.getArtifacts.toArray, packagesDirectory)
+        val paths = resolveDependencyPaths(rr.getArtifacts.toArray, 
packagesDirectory)
+        val mdId = md.getModuleRevisionId
+        clearIvyResolutionFiles(mdId, ivySettings, ivyConfName)
+        paths
       } finally {
         System.setOut(sysOut)
       }

http://git-wip-us.apache.org/repos/asf/spark/blob/414e4e3d/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
----------------------------------------------------------------------
diff --git 
a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala 
b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
index eb8c203..a0f0989 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitUtilsSuite.scala
@@ -256,4 +256,19 @@ class SparkSubmitUtilsSuite extends SparkFunSuite with 
BeforeAndAfterAll {
       assert(jarPath.indexOf("mydep") >= 0, "should find dependency")
     }
   }
+
+  test("SPARK-10878: test resolution files cleaned after resolving artifact") {
+    val main = new MavenCoordinate("my.great.lib", "mylib", "0.1")
+
+    IvyTestUtils.withRepository(main, None, None) { repo =>
+      val ivySettings = SparkSubmitUtils.buildIvySettings(Some(repo), 
Some(tempIvyPath))
+      val jarPath = SparkSubmitUtils.resolveMavenCoordinates(
+        main.toString,
+        ivySettings,
+        isTest = true)
+      val r = """.*org.apache.spark-spark-submit-parent-.*""".r
+      assert(!ivySettings.getDefaultCache.listFiles.map(_.getName)
+        .exists(r.findFirstIn(_).isDefined), "resolution files should be 
cleaned")
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to