This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new abf7e81 [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate dependency check and add more strict Hadoop version check abf7e81 is described below commit abf7e81712c9fb5a8c2e747a405a58bb8e1154e5 Author: Chao Sun <sunc...@apple.com> AuthorDate: Tue Jan 26 15:34:55 2021 -0800 [SPARK-33212][FOLLOW-UP][BUILD] Bring back duplicate dependency check and add more strict Hadoop version check ### What changes were proposed in this pull request? 1. Add back Maven enforcer for duplicate dependencies check 2. More strict check on Hadoop versions which support shaded client in `IsolatedClientLoader`. To do proper version check, this adds a util function `majorMinorPatchVersion` to extract major/minor/patch version from a string. 3. Cleanup unnecessary code ### Why are the changes needed? The Maven enforcer was removed as part of #30556. This proposes to add it back. Also, Hadoop shaded client doesn't work in certain cases (see [these comments](https://github.com/apache/spark/pull/30701#discussion_r558522227) for details). This strictly checks that the current Hadoop version (i.e., 3.2.2 at the moment) has good support of shaded client or otherwise fallback to old unshaded ones. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing tests. Closes #31203 from sunchao/SPARK-33212-followup. Lead-authored-by: Chao Sun <sunc...@apple.com> Co-authored-by: Chao Sun <sunc...@apache.org> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../scala/org/apache/spark/util/VersionUtils.scala | 33 ++++++++++++++++++++++ .../org/apache/spark/util/VersionUtilsSuite.scala | 14 +++++++++ pom.xml | 11 ++++++++ .../sql/hive/client/IsolatedClientLoader.scala | 21 +++++++------- .../sql/hive/client/HadoopVersionInfoSuite.scala | 16 +++++++++++ 5 files changed, 84 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/VersionUtils.scala b/core/src/main/scala/org/apache/spark/util/VersionUtils.scala index c0f8866..e97d1c9 100644 --- a/core/src/main/scala/org/apache/spark/util/VersionUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/VersionUtils.scala @@ -24,6 +24,7 @@ private[spark] object VersionUtils { private val majorMinorRegex = """^(\d+)\.(\d+)(\..*)?$""".r private val shortVersionRegex = """^(\d+\.\d+\.\d+)(.*)?$""".r + private val majorMinorPatchRegex = """^(\d+)(?:\.(\d+)(?:\.(\d+)(?:[.-].*)?)?)?$""".r /** * Given a Spark version string, return the major version number. @@ -63,4 +64,36 @@ private[spark] object VersionUtils { s" version string, but it could not find the major and minor version numbers.") } } + + /** + * Extracts the major, minor and patch parts from the input `version`. Note that if minor or patch + * version is missing from the input, this will return 0 for these parts. Returns `None` if the + * input is not of a valid format. + * + * Examples of valid version: + * - 1 (extracts to (1, 0, 0)) + * - 2.4 (extracts to (2, 4, 0)) + * - 3.2.2 (extracts to (3, 2, 2)) + * - 3.2.2.4 (extracts to 3, 2, 2)) + * - 3.3.1-SNAPSHOT (extracts to (3, 3, 1)) + * - 3.2.2.4SNAPSHOT (extracts to (3, 2, 2), only the first 3 components) + * + * Examples of invalid version: + * - ABC + * - 1X + * - 2.4XYZ + * - 2.4-SNAPSHOT + * - 3.4.5ABC + * + * @return A non-empty option containing a 3-value tuple (major, minor, patch) iff the + * input is a valid version. `None` otherwise. + */ + def majorMinorPatchVersion(version: String): Option[(Int, Int, Int)] = { + majorMinorPatchRegex.findFirstMatchIn(version).map { m => + val major = m.group(1).toInt + val minor = Option(m.group(2)).map(_.toInt).getOrElse(0) + val patch = Option(m.group(3)).map(_.toInt).getOrElse(0) + (major, minor, patch) + } + } } diff --git a/core/src/test/scala/org/apache/spark/util/VersionUtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/VersionUtilsSuite.scala index 56623eb..ff68dd1 100644 --- a/core/src/test/scala/org/apache/spark/util/VersionUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/VersionUtilsSuite.scala @@ -98,4 +98,18 @@ class VersionUtilsSuite extends SparkFunSuite { } } } + + test("SPARK-33212: retrieve major/minor/patch version parts") { + assert(VersionUtils.majorMinorPatchVersion("3.2.2").contains((3, 2, 2))) + assert(VersionUtils.majorMinorPatchVersion("3.2.2.4").contains((3, 2, 2))) + assert(VersionUtils.majorMinorPatchVersion("3.2.2-SNAPSHOT").contains((3, 2, 2))) + assert(VersionUtils.majorMinorPatchVersion("3.2.2.4XXX").contains((3, 2, 2))) + assert(VersionUtils.majorMinorPatchVersion("3.2").contains((3, 2, 0))) + assert(VersionUtils.majorMinorPatchVersion("3").contains((3, 0, 0))) + + // illegal cases + Seq("ABC", "3X", "3.2-SNAPSHOT", "3.2ABC", "3-ABC", "3.2.4XYZ").foreach { version => + assert(VersionUtils.majorMinorPatchVersion(version).isEmpty, s"version $version") + } + } } diff --git a/pom.xml b/pom.xml index 94f3230..653d192 100644 --- a/pom.xml +++ b/pom.xml @@ -2448,6 +2448,17 @@ </rules> </configuration> </execution> + <execution> + <id>enforce-no-duplicate-dependencies</id> + <goals> + <goal>enforce</goal> + </goals> + <configuration> + <rules> + <banDuplicatePomDependencyVersions/> + </rules> + </configuration> + </execution> </executions> </plugin> <plugin> diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala index 58ca476..e520a0a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/IsolatedClientLoader.scala @@ -37,7 +37,7 @@ import org.apache.spark.sql.catalyst.util.quietly import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.internal.NonClosableMutableURLClassLoader import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.util.{MutableURLClassLoader, Utils} +import org.apache.spark.util.{MutableURLClassLoader, Utils, VersionUtils} /** Factory for `IsolatedClientLoader` with specific versions of hive. */ private[hive] object IsolatedClientLoader extends Logging { @@ -107,12 +107,19 @@ private[hive] object IsolatedClientLoader extends Logging { s"Please set ${HiveUtils.HIVE_METASTORE_VERSION.key} with a valid version.") } + def supportsHadoopShadedClient(hadoopVersion: String): Boolean = { + VersionUtils.majorMinorPatchVersion(hadoopVersion).exists { + case (3, 2, v) if v >= 2 => true + case _ => false + } + } + private def downloadVersion( version: HiveVersion, hadoopVersion: String, ivyPath: Option[String], remoteRepos: String): Seq[URL] = { - val hadoopJarNames = if (hadoopVersion.startsWith("3")) { + val hadoopJarNames = if (supportsHadoopShadedClient(hadoopVersion)) { Seq(s"org.apache.hadoop:hadoop-client-api:$hadoopVersion", s"org.apache.hadoop:hadoop-client-runtime:$hadoopVersion") } else { @@ -123,14 +130,6 @@ private[hive] object IsolatedClientLoader extends Logging { .map(a => s"org.apache.hive:$a:${version.fullVersion}") ++ Seq("com.google.guava:guava:14.0.1") ++ hadoopJarNames - val extraExclusions = if (hadoopVersion.startsWith("3")) { - // this introduced from lower version of Hive could conflict with jars in Hadoop 3.2+, so - // exclude here in favor of the ones in Hadoop 3.2+ - Seq("org.apache.hadoop:hadoop-auth") - } else { - Seq.empty - } - val classpaths = quietly { SparkSubmitUtils.resolveMavenCoordinates( hiveArtifacts.mkString(","), @@ -138,7 +137,7 @@ private[hive] object IsolatedClientLoader extends Logging { Some(remoteRepos), ivyPath), transitive = true, - exclusions = version.exclusions ++ extraExclusions) + exclusions = version.exclusions) } val allFiles = classpaths.map(new File(_)).toSet diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala index 8d55356..5701a2a 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/HadoopVersionInfoSuite.scala @@ -21,6 +21,7 @@ import java.io.File import java.net.URLClassLoader import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.util.VersionInfo import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} @@ -68,4 +69,19 @@ class HadoopVersionInfoSuite extends SparkFunSuite { Utils.deleteRecursively(ivyPath) } } + + test("SPARK-32212: test supportHadoopShadedClient()") { + Seq("3.2.2", "3.2.3", "3.2.2.1", "3.2.2-XYZ", "3.2.2.4-SNAPSHOT").foreach { version => + assert(IsolatedClientLoader.supportsHadoopShadedClient(version), s"version $version") + } + + // negative cases + Seq("3.1.3", "3.2", "3.2.1", "4").foreach { version => + assert(!IsolatedClientLoader.supportsHadoopShadedClient(version), s"version $version") + } + } + + test("SPARK-32212: built-in Hadoop version should support shaded client") { + assert(IsolatedClientLoader.supportsHadoopShadedClient(VersionInfo.getVersion)) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org