This is an automated email from the ASF dual-hosted git repository.

garydgregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-lang.git


The following commit(s) were added to refs/heads/master by this push:
     new 37a2be804 Handle supplementary code points in splitByCharacterType (#1734)
37a2be804 is described below

commit 37a2be80434f5c393e5842c52a95afb607be2eee
Author: alhuda <[email protected]>
AuthorDate: Sun Jun 28 17:17:58 2026 +0530

    Handle supplementary code points in splitByCharacterType (#1734)
---
 .../java/org/apache/commons/lang3/StringUtils.java | 12 ++++++++----
 .../org/apache/commons/lang3/StringUtilsTest.java  | 22 ++++++++++++++++++++++
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java
index 4d7150fca..c8ef44dbd 100644
--- a/src/main/java/org/apache/commons/lang3/StringUtils.java
+++ b/src/main/java/org/apache/commons/lang3/StringUtils.java
@@ -7259,14 +7259,17 @@ private static String[] splitByCharacterType(final String str, final boolean cam
         final char[] c = str.toCharArray();
         final List<String> list = new ArrayList<>();
         int tokenStart = 0;
-        int currentType = Character.getType(c[tokenStart]);
-        for (int pos = tokenStart + 1; pos < c.length; pos++) {
-            final int type = Character.getType(c[pos]);
+        int currentType = Character.getType(Character.codePointAt(c, 
tokenStart));
+        for (int pos = tokenStart + 
Character.charCount(Character.codePointAt(c, tokenStart)); pos < c.length;) {
+            final int codePoint = Character.codePointAt(c, pos);
+            final int type = Character.getType(codePoint);
+            final int count = Character.charCount(codePoint);
             if (type == currentType) {
+                pos += count;
                 continue;
             }
             if (camelCase && type == Character.LOWERCASE_LETTER && currentType 
== Character.UPPERCASE_LETTER) {
-                final int newTokenStart = pos - 1;
+                final int newTokenStart = pos - 
Character.charCount(Character.codePointBefore(c, pos));
                 if (newTokenStart != tokenStart) {
                     list.add(new String(c, tokenStart, newTokenStart - 
tokenStart));
                     tokenStart = newTokenStart;
@@ -7276,6 +7279,7 @@ private static String[] splitByCharacterType(final String str, final boolean cam
                 tokenStart = pos;
             }
             currentType = type;
+            pos += count;
         }
         list.add(new String(c, tokenStart, c.length - tokenStart));
         return list.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
index fa34c839d..e90b276fb 100644
--- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
+++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java
@@ -2355,6 +2355,18 @@ void testSplitByCharacterType() {
 
         assertTrue(Objects.deepEquals(new String[]{"ASFR", "ules"},
                 StringUtils.splitByCharacterType("ASFRules")));
+
+        // Supplementary code points are classified by their own type, not split apart as surrogates.
+        // U+1D400 MATHEMATICAL BOLD CAPITAL A is an upper-case letter, like ASCII 'A'.
+        final String boldA = new String(Character.toChars(0x1D400));
+        // U+1D7D3 MATHEMATICAL BOLD DIGIT FIVE is a decimal digit, like ASCII '5'.
+        final String boldFive = new String(Character.toChars(0x1D7D3));
+        assertTrue(Objects.deepEquals(new String[]{"A" + boldA},
+                StringUtils.splitByCharacterType("A" + boldA)));
+        assertTrue(Objects.deepEquals(new String[]{"5" + boldFive},
+                StringUtils.splitByCharacterType("5" + boldFive)));
+        assertTrue(Objects.deepEquals(new String[]{boldA, "5" + boldFive, "z"},
+                StringUtils.splitByCharacterType(boldA + "5" + boldFive + 
"z")));
     }
 
     @Test
@@ -2382,6 +2394,16 @@ void testSplitByCharacterTypeCamelCase() {
 
         assertTrue(Objects.deepEquals(new String[]{"ASF", "Rules"},
                 StringUtils.splitByCharacterTypeCamelCase("ASFRules")));
+
+        // A supplementary upper-case letter immediately before a lower-case run joins the following token,
+        // exactly as a BMP upper-case letter does. U+1D400 MATHEMATICAL BOLD CAPITAL A is an upper-case letter.
+        final String boldA = new String(Character.toChars(0x1D400));
+        assertTrue(Objects.deepEquals(new String[]{boldA + "bc"},
+                StringUtils.splitByCharacterTypeCamelCase(boldA + "bc")));
+        assertTrue(Objects.deepEquals(new String[]{"AB", boldA + "cd"},
+                StringUtils.splitByCharacterTypeCamelCase("AB" + boldA + 
"cd")));
+        assertTrue(Objects.deepEquals(new String[]{"foo", boldA + "bar"},
+                StringUtils.splitByCharacterTypeCamelCase("foo" + boldA + 
"bar")));
     }
 
     @Test

Reply via email to