This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push: new 1a77846 [SPARK-30201][SQL][2.4] HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT 1a77846 is described below commit 1a77846775a62f48661c39c0f00914524d2e7014 Author: ulysses <youxi...@weidian.com> AuthorDate: Mon Oct 5 20:13:04 2020 -0700 [SPARK-30201][SQL][2.4] HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT ### What changes were proposed in this pull request? This is a backport of #26831. Spark currently uses `ObjectInspectorCopyOption.JAVA` as the object-inspector copy option, which converts any string to a UTF-8 string. When writing data that is not valid UTF-8, the replacement bytes `EFBFBD` appear. We should use `ObjectInspectorCopyOption.DEFAULT` to pass the bytes through unchanged. ### Why are the changes needed? Here is the way to reproduce: 1. create a file containing the hex bytes 'AABBCC', which are not valid UTF-8. 2. create table test1 (c string) location '$file_path'; 3. select hex(c) from test1; // AABBCC 4. create table test2 (c string) as select c from test1; 5. select hex(c) from test2; // EFBFBDEFBFBDEFBFBD ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CI. Closes #29948 from anuragmantri/SPARK-30201-2.4. 
Authored-by: ulysses <youxi...@weidian.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../org/apache/spark/sql/hive/HiveInspectors.scala | 9 ++++++-- .../spark/sql/hive/execution/HiveFileFormat.scala | 7 ++++++- .../org/apache/spark/sql/hive/InsertSuite.scala | 24 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 4dec2f7..65c6fcc 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -304,12 +304,17 @@ private[hive] trait HiveInspectors { withNullSafe(o => getByteWritable(o)) case _: ByteObjectInspector => withNullSafe(o => o.asInstanceOf[java.lang.Byte]) - case _: JavaHiveVarcharObjectInspector => + // To spark HiveVarchar and HiveChar are same as string + case _: HiveVarcharObjectInspector if x.preferWritable() => + withNullSafe(o => getStringWritable(o)) + case _: HiveVarcharObjectInspector => withNullSafe { o => val s = o.asInstanceOf[UTF8String].toString new HiveVarchar(s, s.length) } - case _: JavaHiveCharObjectInspector => + case _: HiveCharObjectInspector if x.preferWritable() => + withNullSafe(o => getStringWritable(o)) + case _: HiveCharObjectInspector => withNullSafe { o => val s = o.asInstanceOf[UTF8String].toString new HiveChar(s, s.length) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala index 4a7cd69..293b693 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala @@ -128,10 +128,15 @@ class HiveOutputWriter( new Path(path), Reporter.NULL) + /** + * Since SPARK-30201 ObjectInspectorCopyOption.JAVA change 
to ObjectInspectorCopyOption.DEFAULT. + * The reason is DEFAULT option can convert `UTF8String` to `Text` with bytes and + * we can compatible with non UTF-8 code bytes during write. + */ private val standardOI = ObjectInspectorUtils .getStandardObjectInspector( tableDesc.getDeserializer(jobConf).getObjectInspector, - ObjectInspectorCopyOption.JAVA) + ObjectInspectorCopyOption.DEFAULT) .asInstanceOf[StructObjectInspector] private val fieldOIs = diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index 510de3a..224a219 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive import java.io.File +import com.google.common.io.Files import org.scalatest.BeforeAndAfter import org.apache.spark.SparkException @@ -785,4 +786,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter } } } + + test("SPARK-30201 HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT") { + withTable("t1", "t2") { + withTempDir { dir => + val file = new File(dir, "test.hex") + val hex = "AABBCC" + val bs = org.apache.commons.codec.binary.Hex.decodeHex(hex.toCharArray) + Files.write(bs, file) + val path = file.getParent + sql(s"create table t1 (c string) STORED AS TEXTFILE location '$path'") + checkAnswer( + sql("select hex(c) from t1"), + Row(hex) + ) + + sql("create table t2 as select c from t1") + checkAnswer( + sql("select hex(c) from t2"), + Row(hex) + ) + } + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org