This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 30d45f8  Fix truncateStringMax in UnicodeUtil (#334)
30d45f8 is described below

commit 30d45f88d1f10afdef0884b89860e6fd1f25365d
Author: Vinitha Gankidi <[email protected]>
AuthorDate: Tue Jul 30 19:18:59 2019 -0700

    Fix truncateStringMax in UnicodeUtil (#334)
    
    Fixes #328, fixes #329.
    
    Index to codePointAt should be the offset calculated by code points
---
 api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java    |  6 +++---
 .../test/java/org/apache/iceberg/TestMetricsTruncation.java   | 11 +++++++++++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
index 1eaed21..f76ec73 100644
--- a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
+++ b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
@@ -79,11 +79,11 @@ public class UnicodeUtil {
 
     // Try incrementing the code points from the end
     for (int i = length - 1; i >= 0; i--) {
-      int nextCodePoint = truncatedStringBuffer.codePointAt(i) + 1;
+      // Get the offset in the truncated string buffer where the number of unicode characters = i
+      int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i);
+      int nextCodePoint = truncatedStringBuffer.codePointAt(offsetByCodePoint) + 1;
       // No overflow
       if (nextCodePoint != 0 && Character.isValidCodePoint(nextCodePoint)) {
-        // Get the offset in the truncated string buffer where the number of unicode characters = i
-        int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i);
         truncatedStringBuffer.setLength(offsetByCodePoint);
         // Append next code point to the truncated substring
         truncatedStringBuffer.appendCodePoint(nextCodePoint);
diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
index 7a99904..af304da 100644
--- a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
+++ b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
@@ -139,6 +139,9 @@ public class TestMetricsTruncation {
     String test6 = "\uD800\uDFFF\uD800\uDFFF";
     // Increment the previous character
     String test6_2_expected = "\uD801\uDC00";
+    String test7 = "\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02";
+    String test7_2_expected = "\uD83D\uDE02\uD83D\uDE03";
+    String test7_1_expected = "\uD83D\uDE03";
 
     Comparator<CharSequence> cmp = Literal.of(test1).comparator();
     Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound",
@@ -176,5 +179,13 @@ public class TestMetricsTruncation {
     Assert.assertTrue("Test 4 byte UTF-8 character increment. Output must have one character with " +
         "the first character incremented", cmp.compare(
         truncateStringMax(Literal.of(test6), 1).value(), test6_2_expected) == 0);
+    Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound",
+        cmp.compare(truncateStringMax(Literal.of(test7), 2).value(), test7) >= 0);
+    Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the second unicode " +
+        "character should be incremented", cmp.compare(
+            truncateStringMax(Literal.of(test7), 2).value(), test7_2_expected) == 0);
+    Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the first unicode " +
+        "character should be incremented", cmp.compare(
+            truncateStringMax(Literal.of(test7), 1).value(), test7_1_expected) == 0);
   }
 }

Reply via email to