This is an automated email from the ASF dual-hosted git repository.
garydgregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-lang.git
The following commit(s) were added to refs/heads/master by this push:
new 37a2be804 Handle supplementary code points in splitByCharacterType (#1734)
37a2be804 is described below
commit 37a2be80434f5c393e5842c52a95afb607be2eee
Author: alhuda <[email protected]>
AuthorDate: Sun Jun 28 17:17:58 2026 +0530
Handle supplementary code points in splitByCharacterType (#1734)
---
.../java/org/apache/commons/lang3/StringUtils.java | 12 ++++++++----
.../org/apache/commons/lang3/StringUtilsTest.java | 22 ++++++++++++++++++++++
2 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java
index 4d7150fca..c8ef44dbd 100644
--- a/src/main/java/org/apache/commons/lang3/StringUtils.java
+++ b/src/main/java/org/apache/commons/lang3/StringUtils.java
@@ -7259,14 +7259,17 @@ private static String[] splitByCharacterType(final String str, final boolean cam
final char[] c = str.toCharArray();
final List<String> list = new ArrayList<>();
int tokenStart = 0;
- int currentType = Character.getType(c[tokenStart]);
- for (int pos = tokenStart + 1; pos < c.length; pos++) {
- final int type = Character.getType(c[pos]);
+ int currentType = Character.getType(Character.codePointAt(c,
tokenStart));
+ for (int pos = tokenStart +
Character.charCount(Character.codePointAt(c, tokenStart)); pos < c.length;) {
+ final int codePoint = Character.codePointAt(c, pos);
+ final int type = Character.getType(codePoint);
+ final int count = Character.charCount(codePoint);
if (type == currentType) {
+ pos += count;
continue;
}
if (camelCase && type == Character.LOWERCASE_LETTER && currentType
== Character.UPPERCASE_LETTER) {
- final int newTokenStart = pos - 1;
+ final int newTokenStart = pos -
Character.charCount(Character.codePointBefore(c, pos));
if (newTokenStart != tokenStart) {
list.add(new String(c, tokenStart, newTokenStart -
tokenStart));
tokenStart = newTokenStart;
@@ -7276,6 +7279,7 @@ private static String[] splitByCharacterType(final String str, final boolean cam
tokenStart = pos;
}
currentType = type;
+ pos += count;
}
list.add(new String(c, tokenStart, c.length - tokenStart));
return list.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
index fa34c839d..e90b276fb 100644
--- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
+++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
@@ -2355,6 +2355,18 @@ void testSplitByCharacterType() {
assertTrue(Objects.deepEquals(new String[]{"ASFR", "ules"},
StringUtils.splitByCharacterType("ASFRules")));
+
+ // Supplementary code points are classified by their own type, not split apart as surrogates.
+ // U+1D400 MATHEMATICAL BOLD CAPITAL A is an upper-case letter, like ASCII 'A'.
+ final String boldA = new String(Character.toChars(0x1D400));
+ // U+1D7D3 MATHEMATICAL BOLD DIGIT FIVE is a decimal digit, like ASCII '5'.
+ final String boldFive = new String(Character.toChars(0x1D7D3));
+ assertTrue(Objects.deepEquals(new String[]{"A" + boldA},
+ StringUtils.splitByCharacterType("A" + boldA)));
+ assertTrue(Objects.deepEquals(new String[]{"5" + boldFive},
+ StringUtils.splitByCharacterType("5" + boldFive)));
+ assertTrue(Objects.deepEquals(new String[]{boldA, "5" + boldFive, "z"},
+ StringUtils.splitByCharacterType(boldA + "5" + boldFive +
"z")));
}
@Test
@@ -2382,6 +2394,16 @@ void testSplitByCharacterTypeCamelCase() {
assertTrue(Objects.deepEquals(new String[]{"ASF", "Rules"},
StringUtils.splitByCharacterTypeCamelCase("ASFRules")));
+
+ // A supplementary upper-case letter immediately before a lower-case run joins the following token,
+ // exactly as a BMP upper-case letter does. U+1D400 MATHEMATICAL BOLD CAPITAL A is an upper-case letter.
+ final String boldA = new String(Character.toChars(0x1D400));
+ assertTrue(Objects.deepEquals(new String[]{boldA + "bc"},
+ StringUtils.splitByCharacterTypeCamelCase(boldA + "bc")));
+ assertTrue(Objects.deepEquals(new String[]{"AB", boldA + "cd"},
+ StringUtils.splitByCharacterTypeCamelCase("AB" + boldA +
"cd")));
+ assertTrue(Objects.deepEquals(new String[]{"foo", boldA + "bar"},
+ StringUtils.splitByCharacterTypeCamelCase("foo" + boldA +
"bar")));
}
@Test