(spark) branch master updated: [SPARK-48506][CORE] Compression codec short names are case insensitive except for event logging

yangjie01 Tue, 04 Jun 2024 05:34:09 -0700

This is an automated email from the ASF dual-hosted git repository.

yangjie01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new f4afa2215a1a [SPARK-48506][CORE] Compression codec short names are 
case insensitive except for event logging
f4afa2215a1a is described below

commit f4afa2215a1a390d9f099a26155fbefc5beefbe9
Author: Kent Yao <y...@apache.org>
AuthorDate: Tue Jun 4 20:33:51 2024 +0800

    [SPARK-48506][CORE] Compression codec short names are case insensitive 
except for event logging
    
    ### What changes were proposed in this pull request?
    
    Compression codec short names, e.g. those used for map statuses, broadcasts, 
shuffle, and parquet/orc/avro outputs, are case insensitive except for event 
logging. Calling `org.apache.spark.io.CompressionCodec.getShortName` causes this issue.
    
    In this PR, we make `CompressionCodec.getShortName` handle case sensitivity 
correctly.
    
    ### Why are the changes needed?
    
    Feature parity
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, spark.eventLog.compression.codec now accepts not only the lowercased 
form of lz4, lzf, snappy, and zstd, but also forms in which any of the 
characters are uppercased.
    
    ### How was this patch tested?
    
    new tests
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #46847 from yaooqinn/SPARK-48506.
    
    Authored-by: Kent Yao <y...@apache.org>
    Signed-off-by: yangjie01 <yangji...@baidu.com>
---
 .../main/scala/org/apache/spark/io/CompressionCodec.scala |  5 +++--
 .../scala/org/apache/spark/io/CompressionCodecSuite.scala | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala 
b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
index 7d5a86d1a81d..233228a9c6d4 100644
--- a/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
+++ b/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala
@@ -101,8 +101,9 @@ private[spark] object CompressionCodec {
    * If it is already a short name, just return it.
    */
   def getShortName(codecName: String): String = {
-    if (shortCompressionCodecNames.contains(codecName)) {
-      codecName
+    val lowercasedCodec = codecName.toLowerCase(Locale.ROOT)
+    if (shortCompressionCodecNames.contains(lowercasedCodec)) {
+      lowercasedCodec
     } else {
       shortCompressionCodecNames
         .collectFirst { case (k, v) if v == codecName => k }
diff --git 
a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala 
b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala
index 729fcecff120..5c09a1f965b9 100644
--- a/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala
+++ b/core/src/test/scala/org/apache/spark/io/CompressionCodecSuite.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.io
 
 import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
+import java.util.Locale
 
 import com.google.common.io.ByteStreams
 
@@ -160,4 +161,18 @@ class CompressionCodecSuite extends SparkFunSuite {
     ByteStreams.readFully(concatenatedBytes, decompressed)
     assert(decompressed.toSeq === (0 to 127))
   }
+
+  test("SPARK-48506: CompressionCodec getShortName is case insensitive for 
short names") {
+    CompressionCodec.shortCompressionCodecNames.foreach { case (shortName, 
codecClass) =>
+      assert(CompressionCodec.getShortName(shortName) === shortName)
+      assert(CompressionCodec.getShortName(shortName.toUpperCase(Locale.ROOT)) 
=== shortName)
+      assert(CompressionCodec.getShortName(codecClass) === shortName)
+      checkError(
+        exception = intercept[SparkIllegalArgumentException] {
+          CompressionCodec.getShortName(codecClass.toUpperCase(Locale.ROOT))
+        },
+        errorClass = "CODEC_SHORT_NAME_NOT_FOUND",
+        parameters = Map("codecName" -> codecClass.toUpperCase(Locale.ROOT)))
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch master updated: [SPARK-48506][CORE] Compression codec short names are case insensitive except for event logging

Reply via email to