This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new abf7e81  [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate 
dependency check and add more strict Hadoop version check
abf7e81 is described below

commit abf7e81712c9fb5a8c2e747a405a58bb8e1154e5
Author: Chao Sun <sunc...@apple.com>
AuthorDate: Tue Jan 26 15:34:55 2021 -0800

    [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate dependency check and 
add more strict Hadoop version check
    
    ### What changes were proposed in this pull request?
    
    1. Add back Maven enforcer for duplicate dependencies check
    2. Stricter check on Hadoop versions which support the shaded client in 
`IsolatedClientLoader`. To do a proper version check, this adds a util function 
`majorMinorPatchVersion` to extract the major/minor/patch version from a string.
    3. Cleanup unnecessary code
    
    ### Why are the changes needed?
    
    The Maven enforcer was removed as part of #30556. This proposes to add it 
back.
    
    Also, Hadoop shaded client doesn't work in certain cases (see [these 
comments](https://github.com/apache/spark/pull/30701#discussion_r558522227) for 
details). This strictly checks that the current Hadoop version (i.e., 3.2.2 at 
the moment) has good support of shaded client or otherwise fallback to old 
unshaded ones.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Existing tests.
    
    Closes #31203 from sunchao/SPARK-33212-followup.
    
    Lead-authored-by: Chao Sun <sunc...@apple.com>
    Co-authored-by: Chao Sun <sunc...@apache.org>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../scala/org/apache/spark/util/VersionUtils.scala | 33 ++++++++++++++++++++++
 .../org/apache/spark/util/VersionUtilsSuite.scala  | 14 +++++++++
 pom.xml                                            | 11 ++++++++
 .../sql/hive/client/IsolatedClientLoader.scala     | 21 +++++++-------
 .../sql/hive/client/HadoopVersionInfoSuite.scala   | 16 +++++++++++
 5 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/VersionUtils.scala 
b/core/src/main/scala/org/apache/spark/util/VersionUtils.scala
index c0f8866..e97d1c9 100644
--- a/core/src/main/scala/org/apache/spark/util/VersionUtils.scala
+++ b/core/src/main/scala/org/apache/spark/util/VersionUtils.scala
@@ -24,6 +24,7 @@ private[spark] object VersionUtils {
 
   private val majorMinorRegex = """^(\d+)\.(\d+)(\..*)?$""".r
   private val shortVersionRegex = """^(\d+\.\d+\.\d+)(.*)?$""".r
+  private val majorMinorPatchRegex = 
"""^(\d+)(?:\.(\d+)(?:\.(\d+)(?:[.-].*)?)?)?$""".r
 
   /**
    * Given a Spark version string, return the major version number.
@@ -63,4 +64,36 @@ private[spark] object VersionUtils {
           s" version string, but it could not find the major and minor version 
numbers.")
     }
   }
+
+  /**
+   * Extracts the major, minor and patch parts from the input `version`. Note 
that if minor or patch
+   * version is missing from the input, this will return 0 for these parts. 
Returns `None` if the
+   * input is not of a valid format.
+   *
+   * Examples of valid version:
+   *  - 1   (extracts to (1, 0, 0))
+   *  - 2.4   (extracts to (2, 4, 0))
+   *  - 3.2.2   (extracts to (3, 2, 2))
+   *  - 3.2.2.4   (extracts to (3, 2, 2))
+   *  - 3.3.1-SNAPSHOT   (extracts to (3, 3, 1))
+   *  - 3.2.2.4SNAPSHOT   (extracts to (3, 2, 2), only the first 3 components)
+   *
+   * Examples of invalid version:
+   *  - ABC
+   *  - 1X
+   *  - 2.4XYZ
+   *  - 2.4-SNAPSHOT
+   *  - 3.4.5ABC
+   *
+   *  @return A non-empty option containing a 3-value tuple (major, minor, 
patch) iff the
+   *          input is a valid version. `None` otherwise.
+   */
+  def majorMinorPatchVersion(version: String): Option[(Int, Int, Int)] = {
+    majorMinorPatchRegex.findFirstMatchIn(version).map { m =>
+      val major = m.group(1).toInt
+      val minor = Option(m.group(2)).map(_.toInt).getOrElse(0)
+      val patch = Option(m.group(3)).map(_.toInt).getOrElse(0)
+      (major, minor, patch)
+    }
+  }
 }
diff --git a/core/src/test/scala/org/apache/spark/util/VersionUtilsSuite.scala 
b/core/src/test/scala/org/apache/spark/util/VersionUtilsSuite.scala
index 56623eb..ff68dd1 100644
--- a/core/src/test/scala/org/apache/spark/util/VersionUtilsSuite.scala
+++ b/core/src/test/scala/org/apache/spark/util/VersionUtilsSuite.scala
@@ -98,4 +98,18 @@ class VersionUtilsSuite extends SparkFunSuite {
       }
     }
   }
+
+  test("SPARK-33212: retrieve major/minor/patch version parts") {
+    assert(VersionUtils.majorMinorPatchVersion("3.2.2").contains((3, 2, 2)))
+    assert(VersionUtils.majorMinorPatchVersion("3.2.2.4").contains((3, 2, 2)))
+    assert(VersionUtils.majorMinorPatchVersion("3.2.2-SNAPSHOT").contains((3, 
2, 2)))
+    assert(VersionUtils.majorMinorPatchVersion("3.2.2.4XXX").contains((3, 2, 
2)))
+    assert(VersionUtils.majorMinorPatchVersion("3.2").contains((3, 2, 0)))
+    assert(VersionUtils.majorMinorPatchVersion("3").contains((3, 0, 0)))
+
+    // illegal cases
+    Seq("ABC", "3X", "3.2-SNAPSHOT", "3.2ABC", "3-ABC", "3.2.4XYZ").foreach { 
version =>
+      assert(VersionUtils.majorMinorPatchVersion(version).isEmpty, s"version 
$version")
+    }
+  }
 }
diff --git a/pom.xml b/pom.xml
index 94f3230..653d192 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2448,6 +2448,17 @@
                 </rules>
               </configuration>
             </execution>
+            <execution>
+              <id>enforce-no-duplicate-dependencies</id>
+              <goals>
+                <goal>enforce</goal>
+              </goals>
+              <configuration>
+                <rules>
+                  <banDuplicatePomDependencyVersions/>
+                </rules>
+              </configuration>
+            </execution>
           </executions>
         </plugin>
        <plugin>
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
index 58ca476..e520a0a 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala
@@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.util.quietly
 import org.apache.spark.sql.hive.HiveUtils
 import org.apache.spark.sql.internal.NonClosableMutableURLClassLoader
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.util.{MutableURLClassLoader, Utils}
+import org.apache.spark.util.{MutableURLClassLoader, Utils, VersionUtils}
 
 /** Factory for `IsolatedClientLoader` with specific versions of hive. */
 private[hive] object IsolatedClientLoader extends Logging {
@@ -107,12 +107,19 @@ private[hive] object IsolatedClientLoader extends Logging 
{
         s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid 
version.")
   }
 
+  def supportsHadoopShadedClient(hadoopVersion: String): Boolean = {
+    VersionUtils.majorMinorPatchVersion(hadoopVersion).exists {
+      case (3, 2, v) if v >= 2 => true
+      case _ => false
+    }
+  }
+
   private def downloadVersion(
       version: HiveVersion,
       hadoopVersion: String,
       ivyPath: Option[String],
       remoteRepos: String): Seq[URL] = {
-    val hadoopJarNames = if (hadoopVersion.startsWith("3")) {
+    val hadoopJarNames = if (supportsHadoopShadedClient(hadoopVersion)) {
       Seq(s"org.apache.hadoop:hadoop-client-api:$hadoopVersion",
         s"org.apache.hadoop:hadoop-client-runtime:$hadoopVersion")
     } else {
@@ -123,14 +130,6 @@ private[hive] object IsolatedClientLoader extends Logging {
         .map(a => s"org.apache.hive:$a:${version.fullVersion}") ++
       Seq("com.google.guava:guava:14.0.1") ++ hadoopJarNames
 
-    val extraExclusions = if (hadoopVersion.startsWith("3")) {
-      // this introduced from lower version of Hive could conflict with jars 
in Hadoop 3.2+, so
-      // exclude here in favor of the ones in Hadoop 3.2+
-      Seq("org.apache.hadoop:hadoop-auth")
-    } else {
-      Seq.empty
-    }
-
     val classpaths = quietly {
       SparkSubmitUtils.resolveMavenCoordinates(
         hiveArtifacts.mkString(","),
@@ -138,7 +137,7 @@ private[hive] object IsolatedClientLoader extends Logging {
           Some(remoteRepos),
           ivyPath),
         transitive = true,
-        exclusions = version.exclusions ++ extraExclusions)
+        exclusions = version.exclusions)
     }
     val allFiles = classpaths.map(new File(_)).toSet
 
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala
 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala
index 8d55356..5701a2a 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala
@@ -21,6 +21,7 @@ import java.io.File
 import java.net.URLClassLoader
 
 import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.util.VersionInfo
 
 import org.apache.spark.{SparkConf, SparkFunSuite}
 import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils}
@@ -68,4 +69,19 @@ class HadoopVersionInfoSuite extends SparkFunSuite {
       Utils.deleteRecursively(ivyPath)
     }
   }
+
+  test("SPARK-33212: test supportsHadoopShadedClient()") {
+    Seq("3.2.2", "3.2.3", "3.2.2.1", "3.2.2-XYZ", "3.2.2.4-SNAPSHOT").foreach 
{ version =>
+      assert(IsolatedClientLoader.supportsHadoopShadedClient(version), 
s"version $version")
+    }
+
+    // negative cases
+    Seq("3.1.3", "3.2", "3.2.1", "4").foreach { version =>
+      assert(!IsolatedClientLoader.supportsHadoopShadedClient(version), 
s"version $version")
+    }
+  }
+
+  test("SPARK-33212: built-in Hadoop version should support shaded client") {
+    
assert(IsolatedClientLoader.supportsHadoopShadedClient(VersionInfo.getVersion))
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to