This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 011b843687d8 Revert "[SPARK-35878][CORE] Revert S3A endpoint fixup logic of SPARK-35878"
011b843687d8 is described below

commit 011b843687d8ae36b03e8d3d177b0bf43e7d29b6
Author: Dongjoon Hyun <dh...@apple.com>
AuthorDate: Tue Feb 20 22:26:56 2024 -0800

    Revert "[SPARK-35878][CORE] Revert S3A endpoint fixup logic of SPARK-35878"

    This reverts commit 36f199d1e41276c78036355eac1dac092e65aabe.
---
 .../org/apache/spark/deploy/SparkHadoopUtil.scala  | 10 +++++++
 .../apache/spark/deploy/SparkHadoopUtilSuite.scala | 33 ++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index 2edd80db2637..628b688dedba 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -529,6 +529,16 @@ private[spark] object SparkHadoopUtil extends Logging {
     if (conf.getOption("spark.hadoop.fs.s3a.downgrade.syncable.exceptions").isEmpty) {
       hadoopConf.set("fs.s3a.downgrade.syncable.exceptions", "true", setBySpark)
     }
+    // In Hadoop 3.3.1, AWS region handling with the default "" endpoint only works
+    // in EC2 deployments or when the AWS CLI is installed.
+    // The workaround is to set the name of the S3 endpoint explicitly,
+    // if not already set. See HADOOP-17771.
+    if (hadoopConf.get("fs.s3a.endpoint", "").isEmpty &&
+      hadoopConf.get("fs.s3a.endpoint.region") == null) {
+      // set to US central endpoint which can also connect to buckets
+      // in other regions at the expense of a HEAD request during fs creation
+      hadoopConf.set("fs.s3a.endpoint", "s3.amazonaws.com", setBySpark)
+    }
   }
 
   private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = {
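The restored fixup above only fires when neither "fs.s3a.endpoint" nor "fs.s3a.endpoint.region" has
been configured. As an illustrative sketch (not part of this commit), an application running outside
EC2 can keep control of the endpoint through the usual "spark.hadoop." pass-through; the endpoint
and region values below are placeholders:

    import org.apache.spark.SparkConf

    val conf = new SparkConf()
      // an explicit endpoint is copied through untouched, so the default above never applies
      .set("spark.hadoop.fs.s3a.endpoint", "s3-eu-west-1.amazonaws.com")
      // alternatively, set only the region (even "") so the endpoint stays unset and the
      // AWS SDK works out the region itself, as the tests below exercise
      // .set("spark.hadoop.fs.s3a.endpoint.region", "eu-west-1")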
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
index 9a81cb947257..2326d10d4164 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
@@ -39,6 +39,19 @@ class SparkHadoopUtilSuite extends SparkFunSuite {
     assertConfigMatches(hadoopConf, "orc.filterPushdown", "true", SOURCE_SPARK_HADOOP)
     assertConfigMatches(hadoopConf, "fs.s3a.downgrade.syncable.exceptions", "true",
       SET_TO_DEFAULT_VALUES)
+    assertConfigMatches(hadoopConf, "fs.s3a.endpoint", "s3.amazonaws.com", SET_TO_DEFAULT_VALUES)
+  }
+
+  /**
+   * An empty S3A endpoint will be overridden just as a null value
+   * would.
+   */
+  test("appendSparkHadoopConfigs with S3A endpoint set to empty string") {
+    val sc = new SparkConf()
+    val hadoopConf = new Configuration(false)
+    sc.set("spark.hadoop.fs.s3a.endpoint", "")
+    new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
+    assertConfigMatches(hadoopConf, "fs.s3a.endpoint", "s3.amazonaws.com", SET_TO_DEFAULT_VALUES)
   }
 
   /**
@@ -48,8 +61,28 @@ class SparkHadoopUtilSuite extends SparkFunSuite {
     val sc = new SparkConf()
     val hadoopConf = new Configuration(false)
     sc.set("spark.hadoop.fs.s3a.downgrade.syncable.exceptions", "false")
+    sc.set("spark.hadoop.fs.s3a.endpoint", "s3-eu-west-1.amazonaws.com")
     new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
     assertConfigValue(hadoopConf, "fs.s3a.downgrade.syncable.exceptions", "false")
+    assertConfigValue(hadoopConf, "fs.s3a.endpoint",
+      "s3-eu-west-1.amazonaws.com")
+  }
+
+  /**
+   * If the endpoint region is set (even to a blank string) in
+   * "spark.hadoop.fs.s3a.endpoint.region" then the endpoint is not set,
+   * even when the s3a endpoint is "".
+   * This supports a feature in hadoop 3.3.1 where this configuration
+   * pair triggers a revert to the "SDK to work out the region" algorithm,
+   * which works on EC2 deployments.
+   */
+  test("appendSparkHadoopConfigs with S3A endpoint region set to an empty string") {
+    val sc = new SparkConf()
+    val hadoopConf = new Configuration(false)
+    sc.set("spark.hadoop.fs.s3a.endpoint.region", "")
+    new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
+    // the endpoint value will not have been set
+    assertConfigValue(hadoopConf, "fs.s3a.endpoint", null)
   }
 
   /**
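The last test above pins down the opt-out: a configured (even blank) "fs.s3a.endpoint.region" keeps
Spark from injecting the default endpoint. A hypothetical companion case, not part of this commit,
could sit alongside it in the same suite (it assumes the suite's existing assertConfigValue helper
and the package-private SparkHadoopUtil) and illustrates the scaladoc claim that the region setting
also wins when the endpoint itself is set to an empty string:

    test("appendSparkHadoopConfigs with S3A endpoint and region both set to empty strings") {
      val sc = new SparkConf()
      val hadoopConf = new Configuration(false)
      sc.set("spark.hadoop.fs.s3a.endpoint", "")
      sc.set("spark.hadoop.fs.s3a.endpoint.region", "")
      new SparkHadoopUtil().appendSparkHadoopConfigs(sc, hadoopConf)
      // the blank region suppresses the fixup, so the endpoint keeps the
      // user-supplied empty value instead of defaulting to s3.amazonaws.com
      assertConfigValue(hadoopConf, "fs.s3a.endpoint", "")
    }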