This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 30d45f8 Fix truncateStringMax in UnicodeUtil (#334)
30d45f8 is described below
commit 30d45f88d1f10afdef0884b89860e6fd1f25365d
Author: Vinitha Gankidi <[email protected]>
AuthorDate: Tue Jul 30 19:18:59 2019 -0700
Fix truncateStringMax in UnicodeUtil (#334)
Fixes #328, fixes #329.
The index passed to codePointAt should be the char offset computed by offsetByCodePoints, not the code point count.
---
api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java | 6 +++---
.../test/java/org/apache/iceberg/TestMetricsTruncation.java | 11 +++++++++++
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
index 1eaed21..f76ec73 100644
--- a/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
+++ b/api/src/main/java/org/apache/iceberg/util/UnicodeUtil.java
@@ -79,11 +79,11 @@ public class UnicodeUtil {
// Try incrementing the code points from the end
for (int i = length - 1; i >= 0; i--) {
- int nextCodePoint = truncatedStringBuffer.codePointAt(i) + 1;
+ // Get the offset in the truncated string buffer where the number of unicode characters = i
+ int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i);
+ int nextCodePoint = truncatedStringBuffer.codePointAt(offsetByCodePoint) + 1;
// No overflow
if (nextCodePoint != 0 && Character.isValidCodePoint(nextCodePoint)) {
- // Get the offset in the truncated string buffer where the number of unicode characters = i
- int offsetByCodePoint = truncatedStringBuffer.offsetByCodePoints(0, i);
truncatedStringBuffer.setLength(offsetByCodePoint);
// Append next code point to the truncated substring
truncatedStringBuffer.appendCodePoint(nextCodePoint);
diff --git a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
index 7a99904..af304da 100644
--- a/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
+++ b/core/src/test/java/org/apache/iceberg/TestMetricsTruncation.java
@@ -139,6 +139,9 @@ public class TestMetricsTruncation {
String test6 = "\uD800\uDFFF\uD800\uDFFF";
// Increment the previous character
String test6_2_expected = "\uD801\uDC00";
+ String test7 = "\uD83D\uDE02\uD83D\uDE02\uD83D\uDE02";
+ String test7_2_expected = "\uD83D\uDE02\uD83D\uDE03";
+ String test7_1_expected = "\uD83D\uDE03";
Comparator<CharSequence> cmp = Literal.of(test1).comparator();
Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound",
@@ -176,5 +179,13 @@ public class TestMetricsTruncation {
Assert.assertTrue("Test 4 byte UTF-8 character increment. Output must have one character with " +
"the first character incremented", cmp.compare(
truncateStringMax(Literal.of(test6), 1).value(), test6_2_expected) == 0);
+ Assert.assertTrue("Truncated upper bound should be greater than or equal to the actual upper bound",
+ cmp.compare(truncateStringMax(Literal.of(test7), 2).value(), test7) >= 0);
+ Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the second unicode " +
+ "character should be incremented", cmp.compare(
+ truncateStringMax(Literal.of(test7), 2).value(), test7_2_expected) == 0);
+ Assert.assertTrue("Test input with multiple 4 byte UTF-8 character where the first unicode " +
+ "character should be incremented", cmp.compare(
+ truncateStringMax(Literal.of(test7), 1).value(), test7_1_expected) == 0);
}
}