This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new eb3b380 [SPARK-34909][SQL] Fix conversion of negative to unsigned in conv() eb3b380 is described below commit eb3b3801153f2cd355ab2a4c21177ea8772d43e0 Author: Tim Armstrong <tim.armstr...@databricks.com> AuthorDate: Wed Mar 31 12:58:29 2021 +0800 [SPARK-34909][SQL] Fix conversion of negative to unsigned in conv() ### What changes were proposed in this pull request? Use `java.lang.Long.divideUnsigned()` to do integer division in `NumberConverter` to avoid a bug in `unsignedLongDiv` that produced invalid results. ### Why are the changes needed? The previous results are incorrect, the result of the below query should be 45012021522523134134555 ``` scala> spark.sql("select conv('-10', 11, 7)").show(20, 150) +-----------------------+ | conv(-10, 11, 7)| +-----------------------+ |4501202152252313413456| +-----------------------+ scala> spark.sql("select hex(conv('-10', 11, 7))").show(20, 150) +----------------------------------------------+ | hex(conv(-10, 11, 7))| +----------------------------------------------+ |3435303132303231353232353233313334313334353600| +----------------------------------------------+ ``` ### Does this PR introduce _any_ user-facing change? `conv()` will produce different results because the bug is fixed. ### How was this patch tested? Added a simple unit test. Closes #32006 from timarmstrong/conv-unsigned. Authored-by: Tim Armstrong <tim.armstr...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit 13b255fefd881beb68fd8bb6741c7f88318baf9b) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/sql/catalyst/util/NumberConverter.scala | 26 +++------------------- .../sql/catalyst/util/NumberConverterSuite.scala | 4 ++++ 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala index 7dbdd1e..dca75e5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/NumberConverter.scala @@ -22,26 +22,6 @@ import org.apache.spark.unsafe.types.UTF8String object NumberConverter { /** - * Divide x by m as if x is an unsigned 64-bit integer. Examples: - * unsignedLongDiv(-1, 2) == Long.MAX_VALUE unsignedLongDiv(6, 3) == 2 - * unsignedLongDiv(0, 5) == 0 - * - * @param x is treated as unsigned - * @param m is treated as signed - */ - private def unsignedLongDiv(x: Long, m: Int): Long = { - if (x >= 0) { - x / m - } else { - // Let uval be the value of the unsigned long with the same bits as x - // Two's complement => x = uval - 2*MAX - 2 - // => uval = x + 2*MAX + 2 - // Now, use the fact: (a+b)/c = a/c + b/c + (a%c+b%c)/c - x / m + 2 * (Long.MaxValue / m) + 2 / m + (x % m + 2 * (Long.MaxValue % m) + 2 % m) / m - } - } - - /** * Decode v into value[]. * * @param v is treated as an unsigned 64-bit integer @@ -52,7 +32,7 @@ object NumberConverter { java.util.Arrays.fill(value, 0.asInstanceOf[Byte]) var i = value.length - 1 while (tmpV != 0) { - val q = unsignedLongDiv(tmpV, radix) + val q = java.lang.Long.divideUnsigned(tmpV, radix) value(i) = (tmpV - q * radix).asInstanceOf[Byte] tmpV = q i -= 1 @@ -69,12 +49,12 @@ object NumberConverter { */ private def encode(radix: Int, fromPos: Int, value: Array[Byte]): Long = { var v: Long = 0L - val bound = unsignedLongDiv(-1 - radix, radix) // Possible overflow once + val bound = java.lang.Long.divideUnsigned(-1 - radix, radix) // Possible overflow once var i = fromPos while (i < value.length && value(i) >= 0) { if (v >= bound) { // Check for overflow - if (unsignedLongDiv(-1 - value(i), radix) < v) { + if (java.lang.Long.divideUnsigned(-1 - value(i), radix) < v) { return -1 } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala index ec73f45..eb257b7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/NumberConverterSuite.scala @@ -40,6 +40,10 @@ class NumberConverterSuite extends SparkFunSuite { checkConv("11abc", 10, 16, "B") } + test("SPARK-34909: convert negative to unsigned") { + checkConv("-10", 11, 7, "45012021522523134134555") + } + test("byte to binary") { checkToBinary(0.toByte) checkToBinary(1.toByte) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org