This is an automated email from the ASF dual-hosted git repository.
chengchengjin pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new a70837d11b [GLUTEN-11238][VL] Deprecate parquet encrypted check config (#11240)
a70837d11b is described below
commit a70837d11b7747037dc88146f0fc21a04c739e74
Author: Jin Chengcheng <[email protected]>
AuthorDate: Fri Dec 5 06:52:08 2025 +0000
[GLUTEN-11238][VL] Deprecate parquet encrypted check config (#11240)
---
.../gluten/backendsapi/velox/VeloxBackend.scala | 1 -
.../apache/gluten/utils/ParquetMetadataUtils.scala | 36 +++++++++-------------
.../utils/ParquetEncryptionDetectionSuite.scala | 1 -
docs/Configuration.md | 2 --
.../org/apache/gluten/config/GlutenConfig.scala | 28 -----------------
5 files changed, 15 insertions(+), 53 deletions(-)
diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
index 743e6bd9e5..91f20dbc77 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala
@@ -200,7 +200,6 @@ object VeloxBackendSettings extends BackendSettingsApi {
return None
}
val fileLimit = GlutenConfig.get.parquetMetadataFallbackFileLimit
- .max(GlutenConfig.get.parquetEncryptionValidationFileLimit)
val parquetOptions = new ParquetOptions(CaseInsensitiveMap(properties), SQLConf.get)
val parquetMetadataValidationResult =
ParquetMetadataUtils.validateMetadata(rootPaths, hadoopConf, parquetOptions, fileLimit)
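For context, the caller side in VeloxBackend now reduces to roughly the sketch below. It is assembled only from the hunk above; the wrapper name maybeFallbackReason, the parameter list, and the Seq[String] root-path type are illustrative assumptions, not Gluten API.

    // Minimal sketch of the simplified caller path (illustrative wrapper, not Gluten API).
    import org.apache.gluten.config.GlutenConfig
    import org.apache.gluten.utils.ParquetMetadataUtils
    import org.apache.hadoop.conf.Configuration
    import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
    import org.apache.spark.sql.execution.datasources.parquet.ParquetOptions
    import org.apache.spark.sql.internal.SQLConf

    def maybeFallbackReason(
        rootPaths: Seq[String],
        hadoopConf: Configuration,
        properties: Map[String, String]): Option[String] = {
      // The file limit now comes from the unified metadata-fallback config only;
      // the encryption-specific limit no longer exists.
      val fileLimit = GlutenConfig.get.parquetMetadataFallbackFileLimit
      val parquetOptions = new ParquetOptions(CaseInsensitiveMap(properties), SQLConf.get)
      // Some(reason) means the scan is not offloaded and falls back to the java scan.
      ParquetMetadataUtils.validateMetadata(rootPaths, hadoopConf, parquetOptions, fileLimit)
    }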
diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
index 1085630dd5..4ea4ebad65 100644
--- a/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
+++ b/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala
@@ -48,20 +48,22 @@ object ParquetMetadataUtils {
parquetOptions: ParquetOptions,
fileLimit: Int
): Option[String] = {
+ var remaining = fileLimit
rootPaths.foreach {
rootPath =>
val fs = new Path(rootPath).getFileSystem(hadoopConf)
try {
- val reason =
+ val (maybeReason, filesScanned) =
checkForUnexpectedMetadataWithLimit(
fs,
new Path(rootPath),
hadoopConf,
parquetOptions,
fileLimit = fileLimit)
- if (reason.nonEmpty) {
- return reason
+ if (maybeReason.isDefined) {
+ return maybeReason
}
+ remaining -= filesScanned
} catch {
case e: Exception =>
}
@@ -95,7 +97,8 @@ object ParquetMetadataUtils {
* @param fileLimit
* Maximum number of files to inspect
* @return
- * True if an encrypted file is detected, false otherwise
+ * (Option[String], Int): Some(reason) if unsupported metadata is detected, None otherwise,
+ * together with the number of files checked
*/
private def checkForUnexpectedMetadataWithLimit(
fs: FileSystem,
@@ -103,7 +106,7 @@ object ParquetMetadataUtils {
conf: Configuration,
parquetOptions: ParquetOptions,
fileLimit: Int
- ): Option[String] = {
+ ): (Option[String], Int) = {
val filesIterator = fs.listFiles(path, true)
var checkedFileCount = 0
while (filesIterator.hasNext && checkedFileCount < fileLimit) {
@@ -111,10 +114,10 @@ object ParquetMetadataUtils {
checkedFileCount += 1
val metadataUnsupported = isUnsupportedMetadata(fileStatus, conf, parquetOptions)
if (metadataUnsupported.isDefined) {
- return metadataUnsupported
+ return (metadataUnsupported, checkedFileCount)
}
}
- None
+ (None, checkedFileCount)
}
/**
@@ -126,9 +129,7 @@ object ParquetMetadataUtils {
fileStatus: LocatedFileStatus,
conf: Configuration,
parquetOptions: ParquetOptions): Option[String] = {
- val isEncryptionValidationEnabled = GlutenConfig.get.parquetEncryptionValidationEnabled
- val isMetadataValidationEnabled = GlutenConfig.get.parquetMetadataValidationEnabled
- if (!isMetadataValidationEnabled && !isEncryptionValidationEnabled) {
+ if (!GlutenConfig.get.parquetMetadataValidationEnabled) {
return None
}
val footer =
@@ -136,9 +137,6 @@ object ParquetMetadataUtils {
ParquetFooterReader.readFooter(conf, fileStatus, ParquetMetadataConverter.NO_FILTER)
} catch {
case e: Exception if ExceptionUtils.hasCause(e, classOf[ParquetCryptoRuntimeException]) =>
- if (!isEncryptionValidationEnabled) {
- return None
- }
return Some("Encrypted Parquet footer detected.")
case _: RuntimeException =>
// Ignored as it could be a "Not a Parquet file" exception.
@@ -149,19 +147,15 @@ object ParquetMetadataUtils {
isTimezoneFoundInMetadata(footer, parquetOptions)
)
- if (isMetadataValidationEnabled) {
- for (check <- validationChecks) {
- if (check.isDefined) {
- return check
- }
+ for (check <- validationChecks) {
+ if (check.isDefined) {
+ return check
}
}
// Previous Spark3.4 version uses toString to check if the data is encrypted,
// so place the check to the end
- if (
- isEncryptionValidationEnabled && SparkShimLoader.getSparkShims.isParquetFileEncrypted(footer)
- ) {
+ if (SparkShimLoader.getSparkShims.isParquetFileEncrypted(footer)) {
return Some("Encrypted Parquet file detected.")
}
None
diff --git a/backends-velox/src/test/scala/org/apache/gluten/utils/ParquetEncryptionDetectionSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/utils/ParquetEncryptionDetectionSuite.scala
index 6003caaa5a..834fa8907f 100644
--- a/backends-velox/src/test/scala/org/apache/gluten/utils/ParquetEncryptionDetectionSuite.scala
+++ b/backends-velox/src/test/scala/org/apache/gluten/utils/ParquetEncryptionDetectionSuite.scala
@@ -73,7 +73,6 @@ class ParquetEncryptionDetectionSuite extends SharedSparkSession {
override def sparkConf: SparkConf = {
super.sparkConf
- .set(GlutenConfig.ENCRYPTED_PARQUET_FALLBACK_ENABLED.key, "true")
.set(GlutenConfig.PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED.key, "true")
}
diff --git a/docs/Configuration.md b/docs/Configuration.md
index 9521f47478..8ef5060351 100644
--- a/docs/Configuration.md
+++ b/docs/Configuration.md
@@ -115,8 +115,6 @@ nav_order: 15
| spark.gluten.sql.columnarToRowMemoryThreshold | 64MB | |
| spark.gluten.sql.countDistinctWithoutExpand | false | Convert Count Distinct to a UDAF called count_distinct to prevent SparkPlanner converting it to Expand+Count. WARNING: When enabled, count distinct queries will fail to fallback!!! |
| spark.gluten.sql.extendedColumnPruning.enabled | true | Do extended nested column pruning for cases ignored by vanilla Spark. |
-| spark.gluten.sql.fallbackEncryptedParquet | <undefined> | If enabled, Gluten will not offload scan when encrypted parquet files are detected. Defaulted to spark.gluten.sql.fallbackUnexpectedMetadataParquet. |
-| spark.gluten.sql.fallbackEncryptedParquet.limit | <undefined> | If supplied, `limit` number of files will be checked to determine encryption and falling back to java scan. Defaulted to spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit. |
| spark.gluten.sql.fallbackRegexpExpressions | false | If true, fall back all regexp expressions. There are a few incompatible cases between RE2 (used by native engine) and java.util.regex (used by Spark). User should enable this property if their incompatibility is intolerable. |
| spark.gluten.sql.fallbackUnexpectedMetadataParquet | false | If enabled, Gluten will not offload scan when unexpected metadata is detected. |
| spark.gluten.sql.fallbackUnexpectedMetadataParquet.limit | 10 | If supplied, metadata of `limit` number of Parquet files will be checked to determine whether to fall back to java scan. |
diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index 9896bd7cd1..340c490993 100644
--- a/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++ b/gluten-substrait/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -375,16 +375,6 @@ class GlutenConfig(conf: SQLConf) extends GlutenCoreConfig(conf) {
getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT)
}
- def parquetEncryptionValidationEnabled: Boolean = {
- getConf(ENCRYPTED_PARQUET_FALLBACK_ENABLED)
- .getOrElse(getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED))
- }
-
- def parquetEncryptionValidationFileLimit: Int = {
- getConf(PARQUET_ENCRYPTED_FALLBACK_FILE_LIMIT).getOrElse(
- getConf(PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT))
- }
-
def enableColumnarRange: Boolean = getConf(COLUMNAR_RANGE_ENABLED)
def enableColumnarCollectLimit: Boolean = getConf(COLUMNAR_COLLECT_LIMIT_ENABLED)
def enableColumnarCollectTail: Boolean = getConf(COLUMNAR_COLLECT_TAIL_ENABLED)
@@ -1576,24 +1566,6 @@ object GlutenConfig extends ConfigRegistry {
.checkValue(_ > 0, s"must be positive.")
.createWithDefault(10)
- val ENCRYPTED_PARQUET_FALLBACK_ENABLED =
- buildConf("spark.gluten.sql.fallbackEncryptedParquet")
- .doc(
- "If enabled, Gluten will not offload scan when encrypted parquet files
are" +
- " detected. Defaulted to " +
s"${PARQUET_UNEXPECTED_METADATA_FALLBACK_ENABLED.key}.")
- .booleanConf
- .createOptional
-
- val PARQUET_ENCRYPTED_FALLBACK_FILE_LIMIT =
- buildConf("spark.gluten.sql.fallbackEncryptedParquet.limit")
- .doc(
- "If supplied, `limit` number of files will be checked to determine
encryption " +
- s"and falling back to java scan. Defaulted to " +
- s"${PARQUET_UNEXPECTED_METADATA_FALLBACK_FILE_LIMIT.key}.")
- .intConf
- .checkValue(_ > 0, s"must be positive.")
- .createOptional
-
val COLUMNAR_RANGE_ENABLED =
buildConf("spark.gluten.sql.columnar.range")
.doc("Enable or disable columnar range.")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]