[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-12-14 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r542527770



##
File path: core/src/main/scala/org/apache/spark/SparkContext.scala
##
@@ -1890,47 +1890,58 @@ class SparkContext(config: SparkConf) extends Logging {
 throw new IllegalArgumentException(
   s"Directory ${path} is not allowed for addJar")
   }
-  path
+  Seq(path)
 } catch {
   case NonFatal(e) =>
 logError(s"Failed to add $path to Spark environment", e)
-null
+Nil
 }
   } else {
-path
+Seq(path)
   }
 }
 
 if (path == null || path.isEmpty) {
   logWarning("null or empty path specified as parameter to addJar")
 } else {
-  val key = if (path.contains("\\") && Utils.isWindows) {
+  val (keys, schema) = if (path.contains("\\") && Utils.isWindows) {
 // For local paths with backslashes on Windows, URI throws an exception
-addLocalJarFile(new File(path))
+(addLocalJarFile(new File(path)), "local")
   } else {
 val uri = new Path(path).toUri
 // SPARK-17650: Make sure this is a valid URL before adding it to the list of dependencies
 Utils.validateURL(uri)
-uri.getScheme match {
+val uriSchema = uri.getScheme
+val jarPaths = uriSchema match {
   // A JAR file which exists only on the driver node
   case null =>
 // SPARK-22585 path without schema is not url encoded
 addLocalJarFile(new File(uri.getPath))
   // A JAR file which exists only on the driver node
   case "file" => addLocalJarFile(new File(uri.getPath))
   // A JAR file which exists locally on every worker node
-  case "local" => "file:" + uri.getPath
+  case "local" => Seq("file:" + uri.getPath)
+  case "ivy" =>
+// Since `new Path(path).toUri` will lose query information,
+// so here we use `URI.create(path)`
+DependencyUtils.resolveMavenDependencies(URI.create(path))
+  .map(jar => env.rpcEnv.fileServer.addJar(new File(jar)))

Review comment:
   Looks good to me, thanks!








[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-12-07 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r537648126



##
File path: core/src/main/scala/org/apache/spark/SparkContext.scala
##
@@ -1890,47 +1890,58 @@ class SparkContext(config: SparkConf) extends Logging {
 throw new IllegalArgumentException(
   s"Directory ${path} is not allowed for addJar")
   }
-  path
+  Seq(path)
 } catch {
   case NonFatal(e) =>
 logError(s"Failed to add $path to Spark environment", e)
-null
+Nil
 }
   } else {
-path
+Seq(path)
   }
 }
 
 if (path == null || path.isEmpty) {
   logWarning("null or empty path specified as parameter to addJar")
 } else {
-  val key = if (path.contains("\\") && Utils.isWindows) {
+  val (keys, schema) = if (path.contains("\\") && Utils.isWindows) {
 // For local paths with backslashes on Windows, URI throws an exception
-addLocalJarFile(new File(path))
+(addLocalJarFile(new File(path)), "local")
   } else {
 val uri = new Path(path).toUri
 // SPARK-17650: Make sure this is a valid URL before adding it to the list of dependencies
 Utils.validateURL(uri)
-uri.getScheme match {
+val uriSchema = uri.getScheme
+val jarPaths = uriSchema match {
   // A JAR file which exists only on the driver node
   case null =>
 // SPARK-22585 path without schema is not url encoded
 addLocalJarFile(new File(uri.getPath))
   // A JAR file which exists only on the driver node
   case "file" => addLocalJarFile(new File(uri.getPath))
   // A JAR file which exists locally on every worker node
-  case "local" => "file:" + uri.getPath
+  case "local" => Seq("file:" + uri.getPath)
+  case "ivy" =>
+// Since `new Path(path).toUri` will lose query information,
+// so here we use `URI.create(path)`
+DependencyUtils.resolveMavenDependencies(URI.create(path))
+  .map(jar => env.rpcEnv.fileServer.addJar(new File(jar)))

Review comment:
   Should we use the existing method for the sake of consistency:
   ```
   .map(jar => addLocalJarFile(new File(jar)))
   ```
   Core logic will be the same, but there are some additional checks (verifying that `DependencyUtils` did its job correctly), and it consolidates the logic.
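
For illustration, a minimal self-contained sketch of the shape this suggestion implies. The helper names mirror the diff above but are passed in as parameters here, and it assumes `addLocalJarFile` returns the `Seq[String]` of registered keys (hence `flatMap` to keep the result flat); this is a sketch, not the PR's code.

```
import java.io.File
import java.net.URI

object AddIvyJarsSketch {
  // Sketch only: route ivy-resolved jars through the same helper as the other branches.
  // `resolveMavenDependencies` and `addLocalJarFile` stand in for the methods in the diff.
  def addIvyJars(
      path: String,
      resolveMavenDependencies: URI => Seq[String],
      addLocalJarFile: File => Seq[String]): Seq[String] = {
    resolveMavenDependencies(URI.create(path))
      .flatMap(jar => addLocalJarFile(new File(jar)))
  }
}
```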








[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-12-03 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r535381382



##
File path: core/src/main/scala/org/apache/spark/util/DependencyUtils.scala
##
@@ -15,22 +15,112 @@
  * limitations under the License.
  */
 
-package org.apache.spark.deploy
+package org.apache.spark.util
 
 import java.io.File
-import java.net.URI
+import java.net.{URI, URISyntaxException}
 
 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 
 import org.apache.spark.{SecurityManager, SparkConf, SparkException}
+import org.apache.spark.deploy.SparkSubmitUtils
 import org.apache.spark.internal.Logging
-import org.apache.spark.util.{MutableURLClassLoader, Utils}
 
-private[deploy] object DependencyUtils extends Logging {
+private[spark] object DependencyUtils extends Logging {
+
+  def getIvyProperties(): Seq[String] = {
+Seq(
+  "spark.jars.excludes",
+  "spark.jars.packages",
+  "spark.jars.repositories",
+  "spark.jars.ivy",
+  "spark.jars.ivySettings"
+).map(sys.props.get(_).orNull)
+  }
+
+  private def parseQueryParams(uriQuery: String): (Boolean, String) = {
+if (uriQuery == null) {
+  (false, "")
+} else {
+  val mapTokens = uriQuery.split("&").map(_.split("="))
+  if (mapTokens.exists(_.length != 2)) {
+throw new URISyntaxException(uriQuery, s"Invalid query string: $uriQuery")
+  }
+  val groupedParams = mapTokens.map(kv => (kv(0), kv(1))).groupBy(_._1)
+  // Parse transitive parameters (e.g., transitive=true) in an ivy URL, default value is false
+  var transitive = false

Review comment:
   Whoops, forgot about having to unpack the value. To be a bit more concise you can also do:
   ```
   val transitive = transitiveParams.flatMap(_.takeRight(1).map(_._2 == "true").headOption).getOrElse(false)
   ```
   But I think either way looks good!
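
For readers following the thread, a self-contained sketch (with a made-up query string) of how that one-liner behaves when `groupedParams` is built the same way as in the diff above:

```
object TransitiveParamSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical ivy URI query; the last `transitive` occurrence should win.
    val uriQuery = "exclude=org.slf4j:slf4j&transitive=false&transitive=true"
    val groupedParams: Map[String, Array[(String, String)]] =
      uriQuery.split("&").map(_.split("=")).map(kv => (kv(0), kv(1))).groupBy(_._1)

    val transitiveParams = groupedParams.get("transitive")
    // Take the last occurrence, treat "true" as true, and default to false when absent.
    val transitive = transitiveParams
      .flatMap(_.takeRight(1).map(_._2 == "true").headOption)
      .getOrElse(false)

    println(transitive) // prints: true
  }
}
```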








[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-12-02 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r534312342



##
File path: core/src/main/scala/org/apache/spark/util/DependencyUtils.scala
##
@@ -15,22 +15,112 @@
  * limitations under the License.
  */
 
-package org.apache.spark.deploy
+package org.apache.spark.util
 
 import java.io.File
-import java.net.URI
+import java.net.{URI, URISyntaxException}
 
 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 
 import org.apache.spark.{SecurityManager, SparkConf, SparkException}
+import org.apache.spark.deploy.SparkSubmitUtils
 import org.apache.spark.internal.Logging
-import org.apache.spark.util.{MutableURLClassLoader, Utils}
 
-private[deploy] object DependencyUtils extends Logging {
+private[spark] object DependencyUtils extends Logging {
+
+  def getIvyProperties(): Seq[String] = {
+Seq(
+  "spark.jars.excludes",
+  "spark.jars.packages",
+  "spark.jars.repositories",
+  "spark.jars.ivy",
+  "spark.jars.ivySettings"
+).map(sys.props.get(_).orNull)
+  }
+
+  private def parseQueryParams(uriQuery: String): (Boolean, String) = {
+if (uriQuery == null) {
+  (false, "")
+} else {
+  val mapTokens = uriQuery.split("&").map(_.split("="))
+  if (mapTokens.exists(_.length != 2)) {
+throw new URISyntaxException(uriQuery, s"Invalid query string: $uriQuery")
+  }
+  val groupedParams = mapTokens.map(kv => (kv(0), kv(1))).groupBy(_._1)
+  // Parse transitive parameters (e.g., transitive=true) in an ivy URL, default value is false
+  var transitive = false

Review comment:
   Maybe:
   ```
   val transitiveParams = groupedParams.get("transitive")
   if (transitiveParams.map(_.size).getOrElse(0) > 1) {
 // log warning
   }
   val transitive = transitiveParams.flatMap(_.takeRight(1).headOption).getOrElse(false)
   ```








[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-12-02 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r534310229



##
File path: core/src/main/scala/org/apache/spark/util/DependencyUtils.scala
##
@@ -15,22 +15,112 @@
  * limitations under the License.
  */
 
-package org.apache.spark.deploy
+package org.apache.spark.util
 
 import java.io.File
-import java.net.URI
+import java.net.{URI, URISyntaxException}
 
 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 
 import org.apache.spark.{SecurityManager, SparkConf, SparkException}
+import org.apache.spark.deploy.SparkSubmitUtils
 import org.apache.spark.internal.Logging
-import org.apache.spark.util.{MutableURLClassLoader, Utils}
 
-private[deploy] object DependencyUtils extends Logging {
+private[spark] object DependencyUtils extends Logging {
+
+  def getIvyProperties(): Seq[String] = {

Review comment:
   Sounds good.








[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-12-01 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r533573706



##
File path: core/src/main/scala/org/apache/spark/util/DependencyUtils.scala
##
@@ -15,22 +15,112 @@
  * limitations under the License.
  */
 
-package org.apache.spark.deploy
+package org.apache.spark.util
 
 import java.io.File
-import java.net.URI
+import java.net.{URI, URISyntaxException}
 
 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 
 import org.apache.spark.{SecurityManager, SparkConf, SparkException}
+import org.apache.spark.deploy.SparkSubmitUtils
 import org.apache.spark.internal.Logging
-import org.apache.spark.util.{MutableURLClassLoader, Utils}
 
-private[deploy] object DependencyUtils extends Logging {
+private[spark] object DependencyUtils extends Logging {
+
+  def getIvyProperties(): Seq[String] = {
+Seq(
+  "spark.jars.excludes",
+  "spark.jars.packages",
+  "spark.jars.repositories",
+  "spark.jars.ivy",
+  "spark.jars.ivySettings"
+).map(sys.props.get(_).orNull)
+  }
+
+  private def parseQueryParams(uriQuery: String): (Boolean, String) = {

Review comment:
   Can we document the fields of the return tuple?
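
For example, the requested documentation could look roughly like the sketch below; the meaning of the tuple fields is inferred from the surrounding diff (transitive flag plus the raw `exclude` value), so treat the wording as illustrative rather than the PR's final Scaladoc.

```
object ParseQueryParamsDocSketch {
  /**
   * Parse the query component of an ivy:// URI.
   *
   * @param uriQuery the raw query string of the ivy URI, or null if the URI has no query
   * @return a pair (transitive, excludeList): `transitive` is whether transitive
   *         dependencies should be resolved (defaulting to false), and `excludeList` is
   *         the raw value of the `exclude` parameter ("" when not specified)
   */
  private def parseQueryParams(uriQuery: String): (Boolean, String) = ???
}
```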

##
File path: core/src/main/scala/org/apache/spark/util/DependencyUtils.scala
##
@@ -15,22 +15,112 @@
  * limitations under the License.
  */
 
-package org.apache.spark.deploy
+package org.apache.spark.util
 
 import java.io.File
-import java.net.URI
+import java.net.{URI, URISyntaxException}
 
 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 
 import org.apache.spark.{SecurityManager, SparkConf, SparkException}
+import org.apache.spark.deploy.SparkSubmitUtils
 import org.apache.spark.internal.Logging
-import org.apache.spark.util.{MutableURLClassLoader, Utils}
 
-private[deploy] object DependencyUtils extends Logging {
+private[spark] object DependencyUtils extends Logging {
+
+  def getIvyProperties(): Seq[String] = {
+Seq(
+  "spark.jars.excludes",
+  "spark.jars.packages",
+  "spark.jars.repositories",
+  "spark.jars.ivy",
+  "spark.jars.ivySettings"
+).map(sys.props.get(_).orNull)
+  }
+
+  private def parseQueryParams(uriQuery: String): (Boolean, String) = {
+if (uriQuery == null) {
+  (false, "")
+} else {
+  val mapTokens = uriQuery.split("&").map(_.split("="))
+  if (mapTokens.exists(_.length != 2)) {
+throw new URISyntaxException(uriQuery, s"Invalid query string: $uriQuery")
+  }
+  val groupedParams = mapTokens.map(kv => (kv(0), kv(1))).groupBy(_._1)
+  // Parse transitive parameters (e.g., transitive=true) in an ivy URL, default value is false
+  var transitive = false

Review comment:
   To be a bit more functional / Scala-idiomatic, I think we can do something like `groupedParams.get("transitive").takeRight(1).headOption.getOrElse(false)` instead of the `foreach` call.

##
File path: core/src/main/scala/org/apache/spark/util/DependencyUtils.scala
##
@@ -15,22 +15,112 @@
  * limitations under the License.
  */
 
-package org.apache.spark.deploy
+package org.apache.spark.util
 
 import java.io.File
-import java.net.URI
+import java.net.{URI, URISyntaxException}
 
 import org.apache.commons.lang3.StringUtils
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, Path}
 
 import org.apache.spark.{SecurityManager, SparkConf, SparkException}
+import org.apache.spark.deploy.SparkSubmitUtils
 import org.apache.spark.internal.Logging
-import org.apache.spark.util.{MutableURLClassLoader, Utils}
 
-private[deploy] object DependencyUtils extends Logging {
+private[spark] object DependencyUtils extends Logging {
+
+  def getIvyProperties(): Seq[String] = {
+Seq(
+  "spark.jars.excludes",
+  "spark.jars.packages",
+  "spark.jars.repositories",
+  "spark.jars.ivy",
+  "spark.jars.ivySettings"
+).map(sys.props.get(_).orNull)
+  }
+
+  private def parseQueryParams(uriQuery: String): (Boolean, String) = {
+if (uriQuery == null) {
+  (false, "")
+} else {
+  val mapTokens = uriQuery.split("&").map(_.split("="))
+  if (mapTokens.exists(_.length != 2)) {
+throw new URISyntaxException(uriQuery, s"Invalid query string: $uriQuery")
+  }
+  val groupedParams = mapTokens.map(kv => (kv(0), kv(1))).groupBy(_._1)
+  // Parse transitive parameters (e.g., transitive=true) in an ivy URL, default value is false
+  var transitive = false
+  groupedParams.get("transitive").foreach { params =>
+if (params.length > 1) {
+  logWarning("It's best to specify `transitive` parameter in ivy URL query only once." +
+" If there are multiple `transitive` parameter, we will select the last one")
+}
+ 

[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-11-30 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r532888035



##
File path: core/src/main/scala/org/apache/spark/util/Utils.scala
##
@@ -2980,6 +2980,77 @@ private[spark] object Utils extends Logging {
 metadata.toString
   }
 
+  /**
+   * Download Ivy URIs dependent jars.
+   *
+   * @param uri Ivy uri need to be downloaded.
+   * @return Comma separated string list of URIs of downloaded jars

Review comment:
   Sure, makes sense.








[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-11-24 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r529847248



##
File path: sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
##
@@ -159,6 +161,13 @@ class SessionResourceLoader(session: SparkSession) extends FunctionResourceLoader
 }
   }
 
+  protected def resolveJars(path: String): List[String] = {
+new Path(path).toUri.getScheme match {

Review comment:
   Similar comment for `addJar` below.








[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-11-24 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r529840209



##
File path: core/src/main/scala/org/apache/spark/util/Utils.scala
##
@@ -2980,6 +2980,77 @@ private[spark] object Utils extends Logging {
 metadata.toString
   }
 
+  /**
+   * Download Ivy URIs dependent jars.
+   *
+   * @param uri Ivy uri need to be downloaded.
+   * @return Comma separated string list of URIs of downloaded jars
+   */
+  def resolveMavenDependencies(uri: URI): String = {
+val Seq(repositories, ivyRepoPath, ivySettingsPath) =
+  Seq(
+"spark.jars.repositories",
+"spark.jars.ivy",
+"spark.jars.ivySettings"
+  ).map(sys.props.get(_).orNull)
+// Create the IvySettings, either load from file or build defaults
+val ivySettings = Option(ivySettingsPath) match {
+  case Some(path) =>
+SparkSubmitUtils.loadIvySettings(path, Option(repositories), Option(ivyRepoPath))
+
+  case None =>
+SparkSubmitUtils.buildIvySettings(Option(repositories), Option(ivyRepoPath))
+}
+SparkSubmitUtils.resolveMavenCoordinates(uri.getAuthority, ivySettings,
+  parseExcludeList(uri.getQuery), parseTransitive(uri.getQuery))
+  }
+
+  private def parseURLQueryParameter(queryString: String, queryTag: String): Array[String] = {
+if (queryString == null || queryString.isEmpty) {
+  Array.empty[String]
+} else {
+  val mapTokens = queryString.split("&")
+  assert(mapTokens.forall(_.split("=").length == 2), "Invalid query string: " + queryString)
+  mapTokens.map(_.split("=")).map(kv => (kv(0), kv(1))).filter(_._1 == queryTag).map(_._2)
+}
+  }
+
+  /**
+   * Parse excluded list in ivy URL. When download ivy URL jar, Spark won't download transitive jar
+   * in excluded list.
+   *
+   * @param queryString Ivy URI query part string.
+   * @return Exclude list which contains grape parameters of exclude.
+   * Example: Input:  exclude=org.mortbay.jetty:jetty,org.eclipse.jetty:jetty-http
+   * Output:  [org.mortbay.jetty:jetty, org.eclipse.jetty:jetty-http]
+   */
+  private def parseExcludeList(queryString: String): Array[String] = {
+parseURLQueryParameter(queryString, "exclude")
+  .flatMap { excludeString =>
+val excludes: Array[String] = excludeString.split(",")
+assert(excludes.forall(_.split(":").length == 2),
+  "Invalid exclude string: expected 'org:module,org:module,..', found " + excludeString)
+excludes
+  }
+  }
+
+  /**
+   * Parse transitive parameter in ivy URL, default value is false.
+   *
+   * @param queryString Ivy URI query part string.
+   * @return Exclude list which contains grape parameters of transitive.
+   * Example: Input:  exclude=org.mortbay.jetty:jetty=true
+   * Output:  true
+   */
+  private def parseTransitive(queryString: String): Boolean = {
+val transitive = parseURLQueryParameter(queryString, "transitive")
+if (transitive.isEmpty) {
+  false
+} else {
+  transitive.last.toBoolean

Review comment:
   Maybe print a warning if it is specified multiple times? This is not expected.
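
A self-contained sketch of what such a warning could look like; `queryValues` and the `println` are simplified stand-ins for the PR's `parseURLQueryParameter` and `logWarning`, not its actual code.

```
object ParseTransitiveSketch {
  // Simplified stand-in for parseURLQueryParameter: returns every value of `tag` in order.
  private def queryValues(queryString: String, tag: String): Seq[String] =
    Option(queryString).toSeq
      .flatMap(_.split("&"))
      .map(_.split("=", 2))
      .collect { case Array(`tag`, value) => value }

  def parseTransitive(queryString: String): Boolean = {
    val values = queryValues(queryString, "transitive")
    if (values.length > 1) {
      // Stand-in for logWarning.
      println(s"WARN: `transitive` specified ${values.length} times; using the last value")
    }
    values.lastOption.contains("true")
  }

  def main(args: Array[String]): Unit = {
    println(parseTransitive("transitive=true&transitive=false")) // warns, then prints false
  }
}
```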

##
File path: core/src/main/scala/org/apache/spark/util/Utils.scala
##
@@ -2980,6 +2980,77 @@ private[spark] object Utils extends Logging {
 metadata.toString
   }
 
+  /**
+   * Download Ivy URIs dependent jars.
+   *
+   * @param uri Ivy uri need to be downloaded.
+   * @return Comma separated string list of URIs of downloaded jars
+   */
+  def resolveMavenDependencies(uri: URI): String = {
+val Seq(repositories, ivyRepoPath, ivySettingsPath) =
+  Seq(
+"spark.jars.repositories",
+"spark.jars.ivy",
+"spark.jars.ivySettings"
+  ).map(sys.props.get(_).orNull)
+// Create the IvySettings, either load from file or build defaults
+val ivySettings = Option(ivySettingsPath) match {
+  case Some(path) =>
+SparkSubmitUtils.loadIvySettings(path, Option(repositories), Option(ivyRepoPath))

Review comment:
   Some of this logic is duplicated from `DependencyUtils.resolveMavenDependencies`; it seems it would be better if we can unify? `DependencyUtils` seems like a more suitable place for this logic anyway.
   
   With the addition of `Utils.resolveMavenDependencies` we have three identically-named methods in 3 different utility classes... I worry this will become very confusing. Better to consolidate.

##
File path: sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
##
@@ -159,6 +161,13 @@ class SessionResourceLoader(session: SparkSession) extends FunctionResourceLoader
 }
   }
 
+  protected def resolveJars(path: String): List[String] = {
+new Path(path).toUri.getScheme match {

Review comment:
   Similar comment for `addJar` below






[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-11-24 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r529838764



##
File path: core/src/main/scala/org/apache/spark/util/Utils.scala
##
@@ -2980,6 +2980,75 @@ private[spark] object Utils extends Logging {
 metadata.toString
   }
 
+  /**
+   * Download Ivy URIs dependent jars.
+   *
+   * @param uri Ivy uri need to be downloaded.
+   * @return Comma separated string list of URIs of downloaded jars
+   */
+  def resolveMavenDependencies(uri: URI): String = {
+val Seq(repositories, ivyRepoPath, ivySettingsPath) =
+  Seq(
+"spark.jars.repositories",
+"spark.jars.ivy",
+"spark.jars.ivySettings"
+  ).map(sys.props.get(_).orNull)
+// Create the IvySettings, either load from file or build defaults
+val ivySettings = Option(ivySettingsPath) match {
+  case Some(path) =>
+SparkSubmitUtils.loadIvySettings(path, Option(repositories), Option(ivyRepoPath))

Review comment:
   Let's not use default Ivy settings... In my experience with some custom logic we have, it's very valuable to ensure that all of the Ivy resolution obeys the settings.
   
   Maybe we can pull this out into a common utility that can be leveraged here and `DriverWrapper`? Then there is no need for testing twice. But I don't think saying that the logic is tested elsewhere then copy-pasted is sufficient -- if there is drift in the two code paths, we will lose the validation.








[GitHub] [spark] xkrogen commented on a change in pull request #29966: [SPARK-33084][CORE][SQL] Add jar support ivy path

2020-11-24 Thread GitBox


xkrogen commented on a change in pull request #29966:
URL: https://github.com/apache/spark/pull/29966#discussion_r529837613



##
File path: core/src/main/scala/org/apache/spark/util/Utils.scala
##
@@ -2980,6 +2980,75 @@ private[spark] object Utils extends Logging {
 metadata.toString
   }
 
+  /**
+   * Download Ivy URIs dependent jars.
+   *
+   * @param uri Ivy uri need to be downloaded.
+   * @return Comma separated string list of URIs of downloaded jars
+   */
+  def resolveMavenDependencies(uri: URI): String = {
+val Seq(repositories, ivyRepoPath, ivySettingsPath) =
+  Seq(
+"spark.jars.repositories",
+"spark.jars.ivy",
+"spark.jars.ivySettings"
+  ).map(sys.props.get(_).orNull)
+// Create the IvySettings, either load from file or build defaults
+val ivySettings = Option(ivySettingsPath) match {
+  case Some(path) =>
+SparkSubmitUtils.loadIvySettings(path, Option(repositories), Option(ivyRepoPath))

Review comment:
   Let's not use default Ivy settings... In my experience with some custom logic we have, it's very valuable to ensure that all of the Ivy resolution obeys the settings.
   
   Maybe we can pull this out into a common utility that can be leveraged here and `DriverWrapper`? Then there is no need for testing twice. But I don't think saying that the logic is tested elsewhere then copy-pasted is sufficient -- if there is drift in the two code paths, we will lose the validation.

##
File path: core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala
##
@@ -1348,6 +1348,7 @@ private[spark] object SparkSubmitUtils {
   coordinates: String,
   ivySettings: IvySettings,
   exclusions: Seq[String] = Nil,
+  transitive: Boolean = true,

Review comment:
   Nit: Add Scaladoc for this new parameter
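
For instance, the Scaladoc could read roughly as follows (wording is illustrative, not the PR's final text):

```
/**
 * @param transitive whether to also resolve the transitive dependencies of the given
 *                   coordinates; defaults to true so existing callers keep the current
 *                   behavior of `resolveMavenCoordinates`.
 */
```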

##
File path: sql/core/src/main/scala/org/apache/spark/sql/internal/SessionState.scala
##
@@ -159,6 +161,13 @@ class SessionResourceLoader(session: SparkSession) extends FunctionResourceLoader
 }
   }
 
+  protected def resolveJars(path: String): List[String] = {
+new Path(path).toUri.getScheme match {

Review comment:
   Shouldn't we use `URI.create(path)` a single time, then re-use the URI in this line and the one below? I also think I remember you mentioning elsewhere that `new Path(path).toUri` can lose information.
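
A sketch of the shape this suggestion implies for `resolveJars`; the wiring is illustrative, and `resolveIvy` stands in for the resolution helper, which in the diffs above returns a comma-separated string of downloaded jar paths.

```
import java.net.URI

object ResolveJarsSketch {
  def resolveJars(path: String, resolveIvy: URI => String): List[String] = {
    // Build the URI once and reuse it; `new Path(path).toUri` can drop the ivy query part.
    val uri = URI.create(path)
    uri.getScheme match {
      case "ivy" => resolveIvy(uri).split(",").toList
      case _ => List(path)
    }
  }
}
```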

##
File path: core/src/main/scala/org/apache/spark/util/Utils.scala
##
@@ -2980,6 +2980,77 @@ private[spark] object Utils extends Logging {
 metadata.toString
   }
 
+  /**
+   * Download Ivy URIs dependent jars.
+   *
+   * @param uri Ivy uri need to be downloaded.
+   * @return Comma separated string list of URIs of downloaded jars
+   */
+  def resolveMavenDependencies(uri: URI): String = {
+val Seq(repositories, ivyRepoPath, ivySettingsPath) =
+  Seq(
+"spark.jars.repositories",
+"spark.jars.ivy",
+"spark.jars.ivySettings"
+  ).map(sys.props.get(_).orNull)
+// Create the IvySettings, either load from file or build defaults
+val ivySettings = Option(ivySettingsPath) match {
+  case Some(path) =>
+SparkSubmitUtils.loadIvySettings(path, Option(repositories), Option(ivyRepoPath))
+
+  case None =>
+SparkSubmitUtils.buildIvySettings(Option(repositories), Option(ivyRepoPath))
+}
+SparkSubmitUtils.resolveMavenCoordinates(uri.getAuthority, ivySettings,
+  parseExcludeList(uri.getQuery), parseTransitive(uri.getQuery))
+  }
+
+  private def parseURLQueryParameter(queryString: String, queryTag: String): Array[String] = {
+if (queryString == null || queryString.isEmpty) {
+  Array.empty[String]
+} else {
+  val mapTokens = queryString.split("&")
+  assert(mapTokens.forall(_.split("=").length == 2), "Invalid query string: " + queryString)

Review comment:
   IIRC this will accept URLs that look like `?=foo`, `?foo=`, or `?bar==foo`. It would be good to add tests for this to confirm, and adjust as necessary.
   
   Same comment for `parseExcludeList` below.
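
A self-contained sketch of the kind of check being asked for, using a stand-in that mirrors the splitting and assertion logic quoted above, to see which of those query strings the `assert` actually rejects:

```
import scala.util.Try

object QueryParamEdgeCaseSketch {
  // Stand-in mirroring the splitting/assertion logic quoted in the diff above.
  private def parse(queryString: String, queryTag: String): Array[String] = {
    if (queryString == null || queryString.isEmpty) {
      Array.empty[String]
    } else {
      val mapTokens = queryString.split("&")
      assert(mapTokens.forall(_.split("=").length == 2), "Invalid query string: " + queryString)
      mapTokens.map(_.split("=")).map(kv => (kv(0), kv(1))).filter(_._1 == queryTag).map(_._2)
    }
  }

  def main(args: Array[String]): Unit = {
    // "=foo" splits into ("", "foo") and passes the length-2 check with an empty key;
    // "foo=" and "bar==foo" split into 1 and 3 tokens respectively and trip the assert.
    Seq("=foo", "foo=", "bar==foo").foreach { q =>
      println(s"$q -> ${Try(parse(q, "foo").toList).toEither}")
    }
  }
}
```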

##
File path: core/src/main/scala/org/apache/spark/util/Utils.scala
##
@@ -2980,6 +2980,77 @@ private[spark] object Utils extends Logging {
 metadata.toString
   }
 
+  /**
+   * Download Ivy URIs dependent jars.
+   *
+   * @param uri Ivy uri need to be downloaded.
+   * @return Comma separated string list of URIs of downloaded jars

Review comment:
   Should we be returning a `List[String]` instead of `String` (here and in `SparkSubmitUtils`)? It seems odd to have `SparkSubmitUtils` do a `mkString` to convert a list to string, then re-convert back to a list later.
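
For reference, the `SparkContext` diff at the top of this thread already maps over the result of `DependencyUtils.resolveMavenDependencies(...)`, i.e. the collection-returning shape suggested here; a rough signature sketch (illustrative only):

```
object ResolveMavenDependenciesSignatureSketch {
  // Illustrative signature only (not the final API): return the resolved jar paths as a
  // collection end-to-end instead of joining with mkString and re-splitting later.
  def resolveMavenDependencies(uri: java.net.URI): Seq[String] = ???
}
```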




