This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push: new 1a77846 [SPARK-30201][SQL][2.4] HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT 1a77846 is described below commit 1a77846775a62f48661c39c0f00914524d2e7014 Author: ulysses <youxi...@weidian.com> AuthorDate: Mon Oct 5 20:13:04 2020 -0700 [SPARK-30201][SQL][2.4] HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT ### What changes were proposed in this pull request? This is a backport of #26831. Spark currently uses `ObjectInspectorCopyOption.JAVA` as the object-inspector copy option, which converts any string to a UTF-8 string. When writing data that is not valid UTF-8, the replacement bytes `EFBFBD` appear. We should use `ObjectInspectorCopyOption.DEFAULT` to pass the bytes through unchanged. ### Why are the changes needed? Here is the way to reproduce: 1. create a file containing the hex bytes 'AABBCC', which are not valid UTF-8. 2. create table test1 (c string) location '$file_path'; 3. select hex(c) from test1; // AABBCC 4. create table test2 (c string) as select c from test1; 5. select hex(c) from test2; // EFBFBDEFBFBDEFBFBD ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CI. Closes #29948 from anuragmantri/SPARK-30201-2.4. 
Authored-by: ulysses <youxi...@weidian.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../org/apache/spark/sql/hive/HiveInspectors.scala | 9 ++++++-- .../spark/sql/hive/execution/HiveFileFormat.scala | 7 ++++++- .../org/apache/spark/sql/hive/InsertSuite.scala | 24 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 4dec2f7..65c6fcc 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -304,12 +304,17 @@ private[hive] trait HiveInspectors { withNullSafe(o => getByteWritable(o)) case _: ByteObjectInspector => withNullSafe(o => o.asInstanceOf[java.lang.Byte]) - case _: JavaHiveVarcharObjectInspector => + // To spark HiveVarchar and HiveChar are same as string + case _: HiveVarcharObjectInspector if x.preferWritable() => + withNullSafe(o => getStringWritable(o)) + case _: HiveVarcharObjectInspector => withNullSafe { o => val s = o.asInstanceOf[UTF8String].toString new HiveVarchar(s, s.length) } - case _: JavaHiveCharObjectInspector => + case _: HiveCharObjectInspector if x.preferWritable() => + withNullSafe(o => getStringWritable(o)) + case _: HiveCharObjectInspector => withNullSafe { o => val s = o.asInstanceOf[UTF8String].toString new HiveChar(s, s.length) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala index 4a7cd69..293b693 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/HiveFileFormat.scala @@ -128,10 +128,15 @@ class HiveOutputWriter( new Path(path), Reporter.NULL) + /** + * Since SPARK-30201 ObjectInspectorCopyOption.JAVA change 
to ObjectInspectorCopyOption.DEFAULT. + * The reason is DEFAULT option can convert `UTF8String` to `Text` with bytes and + * we can compatible with non UTF-8 code bytes during write. + */ private val standardOI = ObjectInspectorUtils .getStandardObjectInspector( tableDesc.getDeserializer(jobConf).getObjectInspector, - ObjectInspectorCopyOption.JAVA) + ObjectInspectorCopyOption.DEFAULT) .asInstanceOf[StructObjectInspector] private val fieldOIs = diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala index 510de3a..224a219 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive import java.io.File +import com.google.common.io.Files import org.scalatest.BeforeAndAfter import org.apache.spark.SparkException @@ -785,4 +786,27 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter } } } + + test("SPARK-30201 HiveOutputWriter standardOI should use ObjectInspectorCopyOption.DEFAULT") { + withTable("t1", "t2") { + withTempDir { dir => + val file = new File(dir, "test.hex") + val hex = "AABBCC" + val bs = org.apache.commons.codec.binary.Hex.decodeHex(hex.toCharArray) + Files.write(bs, file) + val path = file.getParent + sql(s"create table t1 (c string) STORED AS TEXTFILE location '$path'") + checkAnswer( + sql("select hex(c) from t1"), + Row(hex) + ) + + sql("create table t2 as select c from t1") + checkAnswer( + sql("select hex(c) from t2"), + Row(hex) + ) + } + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org