Hello Gary,
I updated the unit test, and removed the guessing part, I think.
This page shows nicely how the family grapheme cluster is composed:
https://utf-8-visualizer.ardis.lu/?q=%F0%9F%91%A8%F0%9F%8F%BB%E2%80%8D%F0%9F%91%A9%F0%9F%8F%BB%E2%80%8D%F0%9F%91%A6%F0%9F%8F%BB%E2%80%8D%F0%9F%91%A6%F0%9F%8F%BB
Kind regards
Carsten
import org.apache.commons.lang3.StringUtils;
import org.junit.Test;
import static org.junit.Assert.*;
public class AbbreviateTest {
String[] expectedResultsFox = {
"🦊...", // 4
"🦊🦊...",
"🦊🦊🦊...",
"🦊🦊🦊🦊...",
"🦊🦊🦊🦊🦊...",
"🦊🦊🦊🦊🦊🦊...",
"🦊🦊🦊🦊🦊🦊🦊...", // 10
};
String[] expectedResultsFamilyWithCodepoints = {
"👩...", // 4
"👩🏻...",
"👩🏻...", // zero width joiner
"👩🏻👨...",
"👩🏻👨🏻...",
"👩🏻👨🏻...",
"👩🏻👨🏻👦..."
};
String[] expectedResultsFamilyWithGrapheme = {
"👩🏻👨🏻👦🏻👦🏻...", // 4
"👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼...",
"👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽...",
"👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾...",
"👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿...",
"👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻...",
"👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼..."
// 10
};
@Test
public void
abberviate4ByteCharsShouldNotContainBrokenSurrogatePairs() {
String abbreviateResult;
for(var i = 4; i <= 10; i++) {
abbreviateResult =
StringUtils.abbreviate("🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊", i);
String expectedFox =
expectedResultsFox[i - 4];
assertEquals("The abbreviated
String contains a broken surrogate pair", expectedFox, abbreviateResult);
assertEquals("There are not
enough codepoints in the result (2 for each fox, 3 for the dots)",
expectedFox.codePointCount(0, expectedFox.length()),
abbreviateResult.codePointCount(0,abbreviateResult.length()));
}
}
@Test
public void
abbreviateGraphemeClusterShouldNotContainBrokenSurrogatePairs() {
String abbreviateResult;
for(var i = 4; i <= 10; i++) {
abbreviateResult =
StringUtils.abbreviate("👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿",
i);
String expectedFamily =
expectedResultsFamilyWithCodepoints[i - 4];
assertEquals("There are not
enough codepoints in the result (2 for each person + 2 for each skin color + 1
for the zero width joiner, 3 for the dots)", expectedFamily.codePointCount(0,
expectedFamily.length()),
abbreviateResult.codePointCount(0,abbreviateResult.length()));
}
}
@Test
public void
abbreviateGraphemeClusterMayHonorTheGraphemeCluster() {
// if the abbreviate function honors the
grapheme cluster it would cut after each one, not in the middle of them
// but that could bring unwanted behavior.
String abbreviateResult;
for(var i = 4; i <= 10; i++) {
abbreviateResult =
StringUtils.abbreviate("👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿",
i);
String expectedFamily =
expectedResultsFamilyWithGrapheme[i - 4];
assertEquals("There are not
enough codepoints in the result (11 for each family, 3 for the dots)",
expectedFamily.codePointCount(0, expectedFamily.length()),
abbreviateResult.codePointCount(0,abbreviateResult.length()));
}
}
}
On 2025/04/14 12:21:53 Gary Gregory wrote:
> Hi Carsten,
>
> Could you provide a unit test with the expected behavior? The example
> you gave has console output and assertions commented out, both of
> which are undesirable. Instead of me guessing, I'd rather you manage
> expectations and provide a failing/passing set of assertions.
>
> TY!
> Gary
>
>
> On Sun, Apr 13, 2025 at 7:45 PM Gary Gregory
> <[email protected]<mailto:[email protected]>> wrote:
> >
> > I created https://issues.apache.org/jira/browse/LANG-1770 to track this
> > report.
> >
> > Gary
> >
> > On Fri, Apr 11, 2025 at 10:15 AM Carsten Kirschner
> > <[email protected]<mailto:[email protected]>lid> wrote:
> > >
> > > Hello,
> > >
> > > The current commons lang3 StringUtils.abbreviate (3.17.0) implementation
> > > will destroy 4 byte emoji characters and larger grapheme clusters. I know
> > > that handling grapheme correctly before java 20 is not possible, but at
> > > > least a codepoint aware solution with String.offsetByCodePoints could be
> > > > built. I wrote a small test to show the problem.
> > > The zero width joiners in the family emoji are questionable for the
> > > abbreviate, but there should never be a question mark for an invalid char
> > > in the result as there is now.
> > >
> > > The problem is not so much the „doesn’t look nice“ aspect of the broken
> > > emoji, but if that abbreviated string is passed to an XML Writer
> > > (com.ctc.wstx.io.UTF8Writer in my case) it throws an exception on this
> > > broken byte sequence. Like this: Caused by: java.io.IOException: Broken
> > > surrogate pair: first char 0xd83c, second 0x2e; illegal combination
> > > at
> > > com.ctc.wstx.io.UTF8Writer._convertSurrogate(UTF8Writer.java:402)
> > > ~[woodstox-core-7.0.0.jar:7.0.0]
> > >
> > > Thanks,
> > > Carsten
> > >
> > >
> > >
> > > import org.apache.commons.lang3.StringUtils;
> > > import org.junit.Test;
> > > import static org.junit.Assert.*;
> > >
> > > public class AbbreviateTest {
> > >
> > > String[] expectedResultsFox = {
> > > "🦊...", // 4
> > > "🦊🦊...",
> > > "🦊🦊🦊...",
> > > "🦊🦊🦊🦊...",
> > > "🦊🦊🦊🦊🦊...",
> > > "🦊🦊🦊🦊🦊🦊...",
> > > "🦊🦊🦊🦊🦊🦊🦊...", // 10
> > > };
> > >
> > > String[] expectedResultsFamilyWithCodepoints = {
> > > "👩...",
> > > "👩🏻...",
> > > "👩🏻...", // zero width
> > > joiner
> > > "👩🏻👨...",
> > > "👩🏻👨🏻...",
> > > "👩🏻👨🏻...",
> > > "👩🏻👨🏻👦..."
> > > };
> > >
> > > String[] expectedResultsFamilyWithGrapheme = {
> > > "👩🏻👨🏻👦🏻👦🏻...", // 4
> > >
> > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼...",
> > >
> > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽...",
> > >
> > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾...",
> > >
> > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿...",
> > >
> > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻...",
> > >
> > > "👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼..."
> > > // 10
> > > };
> > >
> > > @Test
> > > public void abberviateTest() {
> > > String abbreviateResult;
> > > for(var i = 4; i <= 10; i++) {
> > > abbreviateResult =
> > > StringUtils.abbreviate("🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊🦊", i);
> > >
> > > System.out.println(abbreviateResult);
> > >
> > > //assertEquals(expectedResultsFox[i - 4], abbreviateResult);
> > > }
> > > for(var i = 4; i <= 10; i++) {
> > > abbreviateResult =
> > > StringUtils.abbreviate("👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿👩🏻👨🏻👦🏻👦🏻👩🏼👨🏼👦🏼👦🏼👩🏽👨🏽👦🏽👦🏽👩🏾👨🏾👦🏾👦🏾👩🏿👨🏿👦🏿👦🏿",
> > > i);
> > >
> > > System.out.println(abbreviateResult);
> > >
> > > //assertEquals(expectedResultsFamilyWithCodepoints[i - 4],
> > > abbreviateResult);
> > > }
> > > }
> > > }
> > >
> > >
>