This is an automated email from the ASF dual-hosted git repository. yumwang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new d5fa41efe2b [SPARK-41741][SQL] Encode the string using the UTF_8 charset in ParquetFilters d5fa41efe2b is described below commit d5fa41efe2b1aa0aa41f558c1bef048b4632cf5c Author: Yuming Wang <yumw...@ebay.com> AuthorDate: Mon Feb 20 19:15:30 2023 +0800 [SPARK-41741][SQL] Encode the string using the UTF_8 charset in ParquetFilters ### What changes were proposed in this pull request? This PR makes `ParquetFilters` encode the string explicitly using the `UTF_8` charset instead of relying on the JVM default charset. ### Why are the changes needed? This fixes a data issue that occurs when the JVM default charset is not `UTF_8`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual test. Closes #40090 from wangyum/SPARK-41741. Authored-by: Yuming Wang <yumw...@ebay.com> Signed-off-by: Yuming Wang <yumw...@ebay.com> --- .../spark/sql/execution/datasources/parquet/ParquetFilters.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala index c34f2827659..6994e1ba39d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilters.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.datasources.parquet import java.lang.{Boolean => JBoolean, Double => JDouble, Float => JFloat, Long => JLong} import java.math.{BigDecimal => JBigDecimal} +import java.nio.charset.StandardCharsets.UTF_8 import java.sql.{Date, Timestamp} import java.time.{Duration, Instant, LocalDate, Period} import java.util.HashSet @@ -776,7 +777,7 @@ class ParquetFilters( Option(prefix).map { v => FilterApi.userDefined(binaryColumn(nameToParquetField(name).fieldNames), new UserDefinedPredicate[Binary] with Serializable 
{ - private val strToBinary = Binary.fromReusedByteArray(v.getBytes) + private val strToBinary = Binary.fromReusedByteArray(v.getBytes(UTF_8)) private val size = strToBinary.length override def canDrop(statistics: Statistics[Binary]): Boolean = { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org