Author: brane
Date: Sun May 24 16:32:57 2026
New Revision: 1934559
Log:
Add grapheme-aware UTF-8 string trimming functions, with tests.
* subversion/include/private/svn_utf_private.h
(svn_utf__cstring_width): New; get display width and string length.
(svn_utf__cstring_trim_right,
svn_utf__cstring_trim_left): New; strip graphemes from a string to trim
it to the given display width.
* subversion/libsvn_subr/utf.c: Include limits.h.
(svn_utf_cstring_utf8_width): Reimplement here, it doesn't need utf8proc.
* subversion/libsvn_subr/utf8proc.c: Remove include of limits.h.
(svn_utf_cstring_utf8_width): Remove.
(svn_utf__cstring_width, skip_graphemes,
svn_utf__cstring_trim_right, svn_utf__cstring_trim_left): Implement.
* subversion/tests/libsvn_subr/utf-test.c
(test_utf8_width): Also test svn_utf__cstring_width.
(test_utf8_trim_right, test_utf8_trim_left): New test functions.
(test_funcs): Register the new test functions.
Modified:
subversion/trunk/subversion/include/private/svn_utf_private.h
subversion/trunk/subversion/libsvn_subr/utf.c
subversion/trunk/subversion/libsvn_subr/utf8proc.c
subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
Modified: subversion/trunk/subversion/include/private/svn_utf_private.h
==============================================================================
--- subversion/trunk/subversion/include/private/svn_utf_private.h Sun May
24 16:03:57 2026 (r1934558)
+++ subversion/trunk/subversion/include/private/svn_utf_private.h Sun May
24 16:32:57 2026 (r1934559)
@@ -320,6 +320,41 @@ svn_utf__cstring_utf8_grapheme_breaks(ap
const char *cstr,
apr_pool_t *pool);
+/* Return the display width of the UTF-8 string CSTR, or -1 if the string is
+ * not valid. If LENGTH is not NULL, set *LENGTH to the byte-wise length
+ * of CSTR; this is the same as the value returned by strlen(CSTR).
+ */
+apr_ssize_t
+svn_utf__cstring_width(apr_size_t *length, const char *cstr);
+
+/* Trims the UTF-8 string CSTR to at most MAX_WIDTH display columns,
+ * removing excess graphemes from the trailing (right) end of the string.
+ * Returns the display width of the trimmed substring, which can be less than
+ * MAX_WIDTH, and sets *STARTP and *ENDP to the start and one-past-the-end
+ * of the trimmed substring of CSTR.
+ *
+ * If CSTR is not a valid UTF-8 string, the returned value will be -1.
+ */
+apr_ssize_t
+svn_utf__cstring_trim_right(const char **startp,
+ const char **endp,
+ const char *cstr,
+ apr_size_t max_width);
+
+/* Trims the UTF-8 string CSTR to at most MAX_WIDTH display columns,
+ * removing excess graphemes from the leading (left) end of the string.
+ * Returns the display width of the trimmed substring, which can be less than
+ * MAX_WIDTH, and sets *STARTP and *ENDP to the start and one-past-the-end
+ * of the trimmed substring of CSTR.
+ *
+ * If CSTR is not a valid UTF-8 string, the returned value will be -1.
+ */
+apr_ssize_t
+svn_utf__cstring_trim_left(const char **startp,
+ const char **endp,
+ const char *cstr,
+ apr_size_t max_width);
+
/* Return a new string with a copy of @a cstr allocated in @a pool aligned to
* the right side with spaces. This function takes UTF-8 multibyte encoding and
* wcwidth into an account. The new string will be have exacly as much
Modified: subversion/trunk/subversion/libsvn_subr/utf.c
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf.c Sun May 24 16:03:57
2026 (r1934558)
+++ subversion/trunk/subversion/libsvn_subr/utf.c Sun May 24 16:32:57
2026 (r1934559)
@@ -26,6 +26,7 @@
#include <stdlib.h>
#include <string.h>
#include <assert.h>
+#include <limits.h>
#include <apr_strings.h>
#include <apr_lib.h>
@@ -1040,6 +1041,18 @@ svn_utf_cstring_from_utf8_string(const c
return err;
}
+int
+svn_utf_cstring_utf8_width(const char *cstr)
+{
+ const apr_ssize_t width = svn_utf__cstring_width(NULL, cstr); /* NULL: the byte length is not needed here */
+
+ /* Check for return value overflow. It's unfortunate that we chose
+ to use 'int' for what is essentially a string length value.
+ A width that does not fit in 'int' is reported as -1, the same
+ value that is used for invalid UTF-8. */
+ if (width > INT_MAX)
+ return -1;
+
+ return (int)width; /* includes the -1 that signals invalid UTF-8 */
+}
/* Insert the given UCS-4 VALUE into BUF at the given OFFSET. */
static void
Modified: subversion/trunk/subversion/libsvn_subr/utf8proc.c
==============================================================================
--- subversion/trunk/subversion/libsvn_subr/utf8proc.c Sun May 24 16:03:57
2026 (r1934558)
+++ subversion/trunk/subversion/libsvn_subr/utf8proc.c Sun May 24 16:32:57
2026 (r1934559)
@@ -23,7 +23,6 @@
-#include <limits.h>
#include <apr_fnmatch.h>
#include "svn_utf.h"
@@ -683,13 +682,18 @@ svn_utf__cstring_utf8_grapheme_breaks(ap
return total_width;
}
-int
-svn_utf_cstring_utf8_width(const char *cstr)
+apr_ssize_t
+svn_utf__cstring_width(apr_size_t *length, const char *cstr)
{
+ const char *const start = cstr;
apr_ssize_t width = 0;
if (*cstr == '\0')
- return 0;
+ {
+ if (length)
+ *length = 0;
+ return 0;
+ }
/* Convert the UTF-8 string to UTF-32 (UCS4) which is the format
* utf8proc_charwidth() expects, and get the width of each character.
@@ -709,12 +713,144 @@ svn_utf_cstring_utf8_width(const char *c
width += utf8proc_charwidth(ucs);
}
- /* Check for return value overflow. It's unfortunate that we chose
- to use 'int' for what is essentially a string length value. */
- if (width > INT_MAX)
+ if (length)
+ *length = cstr - start;
+ return width;
+}
+
+/*
+ * Skip graphemes from the beginning of CSTR until their total width
+ * is MAX_WIDTH, or less if CSTR ends earlier. If the sum of the skipped
+ * grapheme widths is not exactly MAX_WIDTH, then:
+ * if TRIM_RIGHT is TRUE, stop just _before_ MAX_WIDTH;
+ * otherwise, stop just _after_ MAX_WIDTH.
+ * Return the total width of the skipped graphemes and set *ENDP to the
+ * start of the first grapheme in CSTR that was not skipped.
+ *
+ * CSTR may not be empty and MAX_WIDTH may not be 0.
+ * Return -1 if the examined part of CSTR is not valid UTF-8; in that
+ * case *ENDP is not modified.
+ */
+static apr_ssize_t
+skip_graphemes(const char **endp,
+ const char *cstr,
+ apr_size_t max_width,
+ svn_boolean_t trim_right)
+{
+ apr_ssize_t current_width = 0; /* width of the graphemes accepted so far */
+ apr_ssize_t next_width = 0; /* current_width plus the grapheme under examination */
+ utf8proc_int32_t state = 0; /* carried between grapheme-break queries */
+ utf8proc_int32_t codepoint1;
+ utf8proc_int32_t codepoint2;
+
+ const char *grapheme_end = cstr; /* end of the last accepted grapheme */
+ int grapheme_width = 0; /* width accumulated for the grapheme under examination */
+
+ const utf8proc_uint8_t *utf8 = (const utf8proc_uint8_t *)grapheme_end;
+ utf8proc_ssize_t nbytes = utf8proc_iterate(utf8, -1, &codepoint1);
+
+ if (nbytes < 0)
return -1;
- return (int)width;
+ grapheme_width += utf8proc_charwidth(codepoint1);
+ utf8 += nbytes;
+
+ while(*utf8 && current_width < max_width)
+ {
+ nbytes = utf8proc_iterate(utf8, -1, &codepoint2);
+ if (nbytes < 0)
+ return -1;
+
+ if (utf8proc_grapheme_break_stateful(codepoint1, codepoint2, &state)) /* a grapheme ends right before codepoint2 */
+ {
+ next_width = current_width + grapheme_width;
+ if (next_width > max_width)
+ /* Note: current_width < next_width */
+ break;
+
+ current_width = next_width; /* the finished grapheme fits: accept it */
+ grapheme_end = (const char *)utf8;
+ grapheme_width = 0;
+ }
+
+ codepoint1 = codepoint2;
+ grapheme_width += utf8proc_charwidth(codepoint1);
+ utf8 += nbytes;
+ }
+
+ /* Account for the width of the trailing part of the string. */
+ if (next_width == current_width)
+ next_width = current_width + grapheme_width;
+
+ if (current_width == max_width) /* exact fit at a grapheme boundary */
+ {
+ *endp = grapheme_end;
+ return current_width;
+ }
+ else
+ {
+ if (next_width <= max_width) /* CSTR ended first: the trailing grapheme fits too */
+ {
+ *endp = (const char *)utf8;
+ return next_width;
+ }
+ else
+ {
+ if (trim_right) /* leave out the grapheme that crosses MAX_WIDTH */
+ {
+ *endp = grapheme_end;
+ return current_width;
+ }
+ else /* include the grapheme that crosses MAX_WIDTH */
+ {
+ *endp = (const char *)utf8;
+ return next_width;
+ }
+ }
+ }
+}
+
+apr_ssize_t
+svn_utf__cstring_trim_right(const char **startp,
+ const char **endp,
+ const char *cstr,
+ apr_size_t max_width)
+{
+ *startp = cstr; /* trimming on the right never moves the start */
+ if (!*cstr || max_width == 0) /* skip_graphemes() accepts neither an empty CSTR nor a zero MAX_WIDTH */
+ {
+ *endp = cstr;
+ return 0;
+ }
+ return skip_graphemes(endp, cstr, max_width, TRUE); /* -1 on invalid UTF-8; *ENDP is then left unset */
+}
+
+/* Keep at most MAX_WIDTH display columns from the right end of CSTR,
+ * dropping whole graphemes from its left end.
+ *
+ * On success, set *ENDP to the terminating NUL of CSTR and *STARTP to the
+ * first retained grapheme, and return the display width of the retained
+ * substring; this can be less than MAX_WIDTH.
+ *
+ * If CSTR is not valid UTF-8, return -1 and set both *STARTP and *ENDP
+ * to CSTR.
+ */
+apr_ssize_t
+svn_utf__cstring_trim_left(const char **startp,
+                           const char **endp,
+                           const char *cstr,
+                           apr_size_t max_width)
+{
+  apr_ssize_t width;
+  apr_size_t length;
+  apr_ssize_t skipped;
+
+  if (!*cstr || max_width == 0)
+    {
+      *startp = *endp = cstr;
+      return 0;
+    }
+
+  width = svn_utf__cstring_width(&length, cstr);
+  if (width < 0)
+    {
+      /* Invalid UTF-8: LENGTH has not been set, so it must not be read,
+         and the negative width must not reach the unsigned comparison
+         and subtraction below. */
+      *startp = *endp = cstr;
+      return -1;
+    }
+
+  *endp = cstr + length;
+  if ((apr_size_t)width <= max_width)
+    {
+      /* The whole string fits; nothing to trim. */
+      *startp = cstr;
+      return width;
+    }
+
+  /* Skip the excess from the left. With TRIM_RIGHT == FALSE the skip stops
+     just after the requested width when it cannot hit it exactly, which
+     is why the result may be narrower than MAX_WIDTH. */
+  skipped = skip_graphemes(startp, cstr,
+                           (apr_size_t)width - max_width, FALSE);
+  if (skipped < 0)
+    {
+      *startp = *endp = cstr;
+      return -1;
+    }
+  return width - skipped;
}
/* Advances CSTR by N printable UTF-8 characters */
Modified: subversion/trunk/subversion/tests/libsvn_subr/utf-test.c
==============================================================================
--- subversion/trunk/subversion/tests/libsvn_subr/utf-test.c Sun May 24
16:03:57 2026 (r1934558)
+++ subversion/trunk/subversion/tests/libsvn_subr/utf-test.c Sun May 24
16:32:57 2026 (r1934559)
@@ -1000,7 +1000,7 @@ test_utf_xfrm(apr_pool_t *pool)
return SVN_NO_ERROR;
}
-/* Test data for test_utf8_width and test_utf8_grapheme_breaks */
+/* Test data for width and trimming tests. */
static const char *fat_emojis =
"\xf0\x9f\xa5\xba" /* three emojis, each two columns wide */
"\xf0\x9f\x91\x89"
@@ -1022,12 +1022,237 @@ static const char *bom = "\xEF\xBB\xBF"
static svn_error_t *
test_utf8_width(apr_pool_t *pool)
{
+ apr_size_t length = -147; /* Magic number used to check... */
+
+ SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(invalid), -1);
+ SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, invalid), -1);
+ SVN_TEST_INT_ASSERT(length, -147); /* ...that 'length' was not changed. */
+
SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(""), 0);
+ SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, ""), 0);
+ SVN_TEST_INT_ASSERT(length, 0);
+
SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width("abc123"), 6);
+ SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, "abc123"), 6);
+ SVN_TEST_INT_ASSERT(length, 6);
+
SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(fat_emojis), 6);
+ SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, fat_emojis), 6);
+ SVN_TEST_INT_ASSERT(length, strlen(fat_emojis));
+
SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(mixup), 10);
- SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(invalid), -1);
+ SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, mixup), 10);
+ SVN_TEST_INT_ASSERT(length, strlen(mixup));
+
SVN_TEST_INT_ASSERT(svn_utf_cstring_utf8_width(bom), 3);
+ SVN_TEST_INT_ASSERT(svn_utf__cstring_width(&length, bom), 3);
+ SVN_TEST_INT_ASSERT(length, strlen(bom));
+
+ return SVN_NO_ERROR;
+}
+
+static svn_error_t *
+test_utf8_trim_right(apr_pool_t *pool)
+{
+ apr_ssize_t width;
+ const char *start, *end;
+
+ /* Invalid and empty */
+ width = svn_utf__cstring_trim_right(&start, &end, invalid, 1);
+ SVN_TEST_INT_ASSERT(width, -1);
+
+ width = svn_utf__cstring_trim_right(&start, &end, invalid, 0);
+ SVN_TEST_INT_ASSERT(width, 0);
+ SVN_TEST_ASSERT(start == end);
+
+ width = svn_utf__cstring_trim_right(&start, &end, "", 1);
+ SVN_TEST_INT_ASSERT(width, 0);
+ SVN_TEST_ASSERT(start == end);
+
+ /* ASCII */
+ width = svn_utf__cstring_trim_right(&start, &end, "abc123", 10);
+ SVN_TEST_INT_ASSERT(width, 6);
+ SVN_TEST_INT_ASSERT(*start, 'a');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, 6);
+
+ width = svn_utf__cstring_trim_right(&start, &end, "abc123", 6);
+ SVN_TEST_INT_ASSERT(width, 6);
+ SVN_TEST_INT_ASSERT(*start, 'a');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, 6);
+
+ width = svn_utf__cstring_trim_right(&start, &end, "abc123", 3);
+ SVN_TEST_INT_ASSERT(width, 3);
+ SVN_TEST_INT_ASSERT(*start, 'a');
+ SVN_TEST_INT_ASSERT(*end, '1');
+ SVN_TEST_INT_ASSERT(end - start, 3);
+
+ /* Accented Latin */
+ width = svn_utf__cstring_trim_right(&start, &end, mixup, 15);
+ SVN_TEST_INT_ASSERT(width, 10);
+ SVN_TEST_INT_ASSERT(*start, 'S');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(mixup));
+
+ width = svn_utf__cstring_trim_right(&start, &end, mixup, 10);
+ SVN_TEST_INT_ASSERT(width, 10);
+ SVN_TEST_INT_ASSERT(*start, 'S');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(mixup));
+
+ width = svn_utf__cstring_trim_right(&start, &end, mixup, 7);
+ SVN_TEST_INT_ASSERT(width, 7);
+ SVN_TEST_INT_ASSERT(*start, 'S');
+ SVN_TEST_INT_ASSERT(*end, '\xe1');
+ SVN_TEST_INT_ASSERT(end - start, 23);
+
+ /* Emoji (two columns wide glyphs) */
+ width = svn_utf__cstring_trim_right(&start, &end, fat_emojis, 10);
+ SVN_TEST_INT_ASSERT(width, 6);
+ SVN_TEST_INT_ASSERT(*start, '\xf0');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(fat_emojis));
+
+ width = svn_utf__cstring_trim_right(&start, &end, fat_emojis, 6);
+ SVN_TEST_INT_ASSERT(width, 6);
+ SVN_TEST_INT_ASSERT(*start, '\xf0');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(fat_emojis));
+
+ width = svn_utf__cstring_trim_right(&start, &end, fat_emojis, 4);
+ SVN_TEST_INT_ASSERT(width, 4);
+ SVN_TEST_INT_ASSERT(*start, '\xf0');
+ SVN_TEST_INT_ASSERT(*end, '\xf0');
+ SVN_TEST_INT_ASSERT(end - start, 8);
+
+ width = svn_utf__cstring_trim_right(&start, &end, fat_emojis, 3);
+ SVN_TEST_INT_ASSERT(width, 2);
+ SVN_TEST_INT_ASSERT(*start, '\xf0');
+ SVN_TEST_INT_ASSERT(*end, '\xf0');
+ SVN_TEST_INT_ASSERT(end - start, 4);
+
+ /* Byte order mark */
+ width = svn_utf__cstring_trim_right(&start, &end, bom, 5);
+ SVN_TEST_INT_ASSERT(width, 3);
+ SVN_TEST_INT_ASSERT(*start, '\xef');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(bom));
+
+ width = svn_utf__cstring_trim_right(&start, &end, bom, 3);
+ SVN_TEST_INT_ASSERT(width, 3);
+ SVN_TEST_INT_ASSERT(*start, '\xef');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(bom));
+
+ width = svn_utf__cstring_trim_right(&start, &end, bom, 2);
+ SVN_TEST_INT_ASSERT(width, 2);
+ SVN_TEST_INT_ASSERT(*start, '\xef');
+ SVN_TEST_INT_ASSERT(*end, 'c');
+ SVN_TEST_INT_ASSERT(end - start, 5);
+
+ return SVN_NO_ERROR;
+}
+
+static svn_error_t *
+test_utf8_trim_left(apr_pool_t *pool)
+{
+ apr_ssize_t width;
+ const char *start, *end;
+
+ /* Invalid and empty */
+ width = svn_utf__cstring_trim_left(&start, &end, invalid, 1);
+ SVN_TEST_INT_ASSERT(width, -1);
+
+ width = svn_utf__cstring_trim_left(&start, &end, invalid, 0);
+ SVN_TEST_INT_ASSERT(width, 0);
+ SVN_TEST_ASSERT(start == end);
+
+ width = svn_utf__cstring_trim_left(&start, &end, "", 1);
+ SVN_TEST_INT_ASSERT(width, 0);
+ SVN_TEST_ASSERT(start == end);
+
+ /* ASCII */
+ width = svn_utf__cstring_trim_left(&start, &end, "abc123", 10);
+ SVN_TEST_INT_ASSERT(width, 6);
+ SVN_TEST_INT_ASSERT(*start, 'a');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, 6);
+
+ width = svn_utf__cstring_trim_left(&start, &end, "abc123", 6);
+ SVN_TEST_INT_ASSERT(width, 6);
+ SVN_TEST_INT_ASSERT(*start, 'a');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, 6);
+
+ width = svn_utf__cstring_trim_left(&start, &end, "abc123", 3);
+ SVN_TEST_INT_ASSERT(width, 3);
+ SVN_TEST_INT_ASSERT(*start, '1');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, 3);
+
+ /* Accented Latin */
+ width = svn_utf__cstring_trim_left(&start, &end, mixup, 15);
+ SVN_TEST_INT_ASSERT(width, 10);
+ SVN_TEST_INT_ASSERT(*start, 'S');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(mixup));
+
+ width = svn_utf__cstring_trim_left(&start, &end, mixup, 10);
+ SVN_TEST_INT_ASSERT(width, 10);
+ SVN_TEST_INT_ASSERT(*start, 'S');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(mixup));
+
+ width = svn_utf__cstring_trim_left(&start, &end, mixup, 6);
+ SVN_TEST_INT_ASSERT(width, 6);
+ SVN_TEST_INT_ASSERT(*start, 'e');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, 21);
+
+ /* Emoji (two columns wide glyphs) */
+ width = svn_utf__cstring_trim_left(&start, &end, fat_emojis, 10);
+ SVN_TEST_INT_ASSERT(width, 6);
+ SVN_TEST_INT_ASSERT(*start, '\xf0');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(fat_emojis));
+
+ width = svn_utf__cstring_trim_left(&start, &end, fat_emojis, 6);
+ SVN_TEST_INT_ASSERT(width, 6);
+ SVN_TEST_INT_ASSERT(*start, '\xf0');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(fat_emojis));
+
+ width = svn_utf__cstring_trim_left(&start, &end, fat_emojis, 4);
+ SVN_TEST_INT_ASSERT(width, 4);
+ SVN_TEST_INT_ASSERT(*start, '\xf0');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, 8);
+
+ width = svn_utf__cstring_trim_left(&start, &end, fat_emojis, 3);
+ SVN_TEST_INT_ASSERT(width, 2);
+ SVN_TEST_INT_ASSERT(*start, '\xf0');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, 4);
+
+ /* Byte order mark */
+ width = svn_utf__cstring_trim_left(&start, &end, bom, 5);
+ SVN_TEST_INT_ASSERT(width, 3);
+ SVN_TEST_INT_ASSERT(*start, '\xef');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(bom));
+
+ width = svn_utf__cstring_trim_left(&start, &end, bom, 3);
+ SVN_TEST_INT_ASSERT(width, 3);
+ SVN_TEST_INT_ASSERT(*start, '\xef');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, strlen(bom));
+
+ width = svn_utf__cstring_trim_left(&start, &end, bom, 2);
+ SVN_TEST_INT_ASSERT(width, 2);
+ SVN_TEST_INT_ASSERT(*start, 'b');
+ SVN_TEST_INT_ASSERT(*end, '\0');
+ SVN_TEST_INT_ASSERT(end - start, 2);
return SVN_NO_ERROR;
}
@@ -1151,6 +1376,10 @@ static struct svn_test_descriptor_t test
"test svn_utf__xfrm"),
SVN_TEST_PASS2(test_utf8_width,
"test svn_utf_cstring_utf8_width"),
+ SVN_TEST_PASS2(test_utf8_trim_right,
+ "test grapheme-aware right trim"),
+ SVN_TEST_PASS2(test_utf8_trim_left,
+ "test grapheme-aware left trim"),
SVN_TEST_PASS2(test_utf8_grapheme_breaks,
"test utf8 grapheme breaks"),
SVN_TEST_PASS2(test_utf8_align,