The attached patch is from Lukáš Zaoral,
who updated the expand/unexpand implementation in Fedora
to use Collin's mbbuf module
(to fix crash bugs in the original i18n implementation;
see https://bugzilla.redhat.com/2443041).
I've confirmed tests and syntax checks pass with this.
I plan to push this later, and we can iterate
on tests etc. after that.
cheers,
Padraig
From 8baccfd2dd4e1005505aeeb74a34feb6bf48be4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Zaoral?= <[email protected]>
Date: Fri, 6 Mar 2026 14:13:17 +0000
Subject: [PATCH] expand,unexpand: support multi-byte input
* src/expand.c: Use mbbuf to support multi-byte input.
* src/unexpand.c: Likewise.
* tests/expand/mb.sh: New multi-byte test.
* tests/unexpand/mb.sh: Likewise.
* tests/local.mk: Reference new tests.
* NEWS: Mention the improvement.
---
src/expand.c | 38 +++++++---
src/unexpand.c | 55 +++++++++-----
tests/expand/mb.sh | 171 +++++++++++++++++++++++++++++++++++++++++++
tests/local.mk | 2 +
tests/unexpand/mb.sh | 163 +++++++++++++++++++++++++++++++++++++++++
5 files changed, 399 insertions(+), 30 deletions(-)
create mode 100755 tests/expand/mb.sh
create mode 100755 tests/unexpand/mb.sh
diff --git a/src/expand.c b/src/expand.c
index cbf659c17..6d4223c9b 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -37,7 +37,11 @@
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
+
#include "system.h"
+#include "ioblksize.h"
+#include "mcel.h"
+#include "mbbuf.h"
#include "expand-common.h"
/* The official name of this program (e.g., no 'g' prefix). */
@@ -103,10 +107,14 @@ expand (void)
if (!fp)
return;
+ static char line_in[IO_BUFSIZE];
+ mbbuf_t mbbuf;
+ mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
+
while (true)
{
/* Input character, or EOF. */
- int c;
+ mcel_t g;
/* If true, perform translations. */
bool convert = true;
@@ -126,12 +134,16 @@ expand (void)
do
{
- while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
- continue;
+ while ((g = mbbuf_get_char (&mbbuf)).ch == MBBUF_EOF
+ && (fp = next_file (fp)))
+ mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
if (convert)
{
- if (c == '\t')
+ convert &= convert_entire_line
+ || !! (c32isblank (g.ch) && ! c32isnbspace (g.ch));
+
+ if (g.ch == '\t')
{
/* Column the next input tab stop is on. */
bool last_tab;
@@ -142,9 +154,12 @@ expand (void)
if (putchar (' ') < 0)
write_error ();
- c = ' ';
+ if (putchar (' ') < 0)
+ write_error ();
+
+ continue;
}
- else if (c == '\b')
+ else if (g.ch == '\b')
{
/* Go back one column, and force recalculation of the
next tab stop. */
@@ -153,20 +168,21 @@ expand (void)
}
else
{
- if (ckd_add (&column, column, 1))
+ int width = c32width (g.ch);
+ if (ckd_add (&column, column, width < 0 ? 1 : width))
error (EXIT_FAILURE, 0, _("input line is too long"));
}
- convert &= convert_entire_line || !! isblank (c);
}
- if (c < 0)
+ if (g.ch == MBBUF_EOF)
return;
- if (putchar (c) < 0)
+ fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len, stdout);
+ if (ferror (stdout))
write_error ();
}
- while (c != '\n');
+ while (g.ch != '\n');
}
}
diff --git a/src/unexpand.c b/src/unexpand.c
index 54b3ae2fe..16d0f0031 100644
--- a/src/unexpand.c
+++ b/src/unexpand.c
@@ -38,7 +38,11 @@
#include <stdio.h>
#include <getopt.h>
#include <sys/types.h>
+
#include "system.h"
+#include "ioblksize.h"
+#include "mbbuf.h"
+#include "mcel.h"
#include "expand-common.h"
/* The official name of this program (e.g., no 'g' prefix). */
@@ -120,15 +124,19 @@ unexpand (void)
if (!fp)
return;
+ static char line_in[IO_BUFSIZE];
+ mbbuf_t mbbuf;
+ mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
+
/* The worst case is a non-blank character, then one blank, then a
tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so
allocate MAX_COLUMN_WIDTH bytes to store the blanks. */
- pending_blank = ximalloc (max_column_width);
+ pending_blank = ximalloc (max_column_width * sizeof (char) * MB_LEN_MAX);
while (true)
{
/* Input character, or EOF. */
- int c;
+ mcel_t g;
/* If true, perform translations. */
bool convert = true;
@@ -140,6 +148,9 @@ unexpand (void)
/* Column of next input character. */
colno column = 0;
+ /* Column the next input tab stop is on. */
+ colno next_tab_column = 0;
+
/* Index in TAB_LIST of next tab stop to examine. */
idx_t tab_index = 0;
@@ -159,28 +170,27 @@ unexpand (void)
do
{
- while ((c = getc (fp)) < 0 && (fp = next_file (fp)))
- continue;
+ while ((g = mbbuf_get_char (&mbbuf)).ch == MBBUF_EOF
+ && (fp = next_file (fp)))
+ mbbuf_init (&mbbuf, line_in, sizeof line_in, fp);
if (convert)
{
- bool blank = !! isblank (c);
+ bool blank = !! (c32isblank (g.ch) && ! c32isnbspace (g.ch));
if (blank)
{
bool last_tab;
- /* Column the next input tab stop is on. */
- colno next_tab_column = get_next_tab_column (column,
- &tab_index,
- &last_tab);
+ next_tab_column = get_next_tab_column (column, &tab_index,
+ &last_tab);
if (last_tab)
convert = false;
if (convert)
{
- if (c == '\t')
+ if (g.ch == '\t')
{
column = next_tab_column;
@@ -189,7 +199,7 @@ unexpand (void)
}
else
{
- column++;
+ column += c32width (g.ch);
if (! (prev_blank && column == next_tab_column))
{
@@ -197,13 +207,18 @@ unexpand (void)
will be replaced by tabs. */
if (column == next_tab_column)
one_blank_before_tab_stop = true;
- pending_blank[pending++] = c;
+ memcpy (pending_blank + pending,
+ mbbuf_char_offset (&mbbuf, g), g.len);
+ pending += g.len;
prev_blank = true;
continue;
}
/* Replace the pending blanks by a tab or two. */
- pending_blank[0] = c = '\t';
+ g.len = 0;
+ if (putc ('\t', stdout) < 0)
+ write_error ();
+ pending_blank[0] = '\t';
}
/* Discard pending blanks, unless it was a single
@@ -211,17 +226,18 @@ unexpand (void)
pending = one_blank_before_tab_stop;
}
}
- else if (c == '\b')
+ else if (g.ch == '\b')
{
/* Go back one column, and force recalculation of the
next tab stop. */
column -= !!column;
+ next_tab_column = column;
tab_index -= !!tab_index;
}
else
{
- column++;
- if (!column)
+ int width = c32width (g.ch);
+ if (ckd_add (&column, column, width < 0 ? 1 : width))
error (EXIT_FAILURE, 0, _("input line is too long"));
}
@@ -239,16 +255,17 @@ unexpand (void)
convert &= convert_entire_line || blank;
}
- if (c < 0)
+ if (g.ch == MBBUF_EOF)
{
free (pending_blank);
return;
}
- if (putchar (c) < 0)
+ fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len, stdout);
+ if (ferror (stdout))
write_error ();
}
- while (c != '\n');
+ while (g.ch != '\n');
}
}
diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh
new file mode 100755
index 000000000..10ea160f4
--- /dev/null
+++ b/tests/expand/mb.sh
@@ -0,0 +1,171 @@
+#!/bin/sh
+
+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ expand printf
+
+export LC_ALL=en_US.UTF-8
+
+#input containing multibyte characters
+cat <<\EOF > in || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+EOF
+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
+
+cat <<\EOF > exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#multiple files as an input
+cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test characters with display widths != 1
+env printf '12345678
+e\t|ascii(1)
+\u00E9\t|composed(1)
+e\u0301\t|decomposed(1)
+\u3000\t|ideo-space(2)
+\uFF0D\t|full-hypen(2)
+' > in || framework_failure_
+
+env printf '12345678
+e |ascii(1)
+\u00E9 |composed(1)
+e\u0301 |decomposed(1)
+\u3000 |ideo-space(2)
+\uFF0D |full-hypen(2)
+' > exp || framework_failure_
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#shouldn't fail with "input line too long"
+#when a line starts with a control character
+env printf '\n' > in || framework_failure_
+
+expand < in > out || fail=1
+compare in out > /dev/null 2>&1 || fail=1
+
+#non-Unicode characters interspersed between Unicode ones
+env printf '12345678
+\t\xFF|
+\xFF\t|
+\t\xFFä|
+ä\xFF\t|
+\tä\xFF|
+\xFF\tä|
+äbcdef\xFF\t|
+' > in || framework_failure_
+
+env printf '12345678
+ \xFF|
+\xFF |
+ \xFFä|
+ä\xFF |
+ ä\xFF|
+\xFF ä|
+äbcdef\xFF |
+' > exp || framework_failure_
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+
+
+#BOM header test 1
+env printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+EOF
+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_
+
+env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+env printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+EOF
+env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_
+
+
+env printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+env printf '\xEF\xBB\xBF' >> exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+expand in1 in1 > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+Exit $fail
diff --git a/tests/local.mk b/tests/local.mk
index 2d82a0de9..ef263e992 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -342,6 +342,7 @@ all_tests = \
tests/env/env-S-script.sh \
tests/expand/expand.pl \
tests/expand/bounded-memory.sh \
+ tests/expand/mb.sh \
tests/expr/expr.pl \
tests/expr/expr-multibyte.pl \
tests/factor/factor.pl \
@@ -504,6 +505,7 @@ all_tests = \
tests/misc/usage_vs_refs.sh \
tests/unexpand/unexpand.pl \
tests/unexpand/bounded-memory.sh \
+ tests/unexpand/mb.sh \
tests/uniq/uniq.pl \
tests/uniq/uniq-perf.sh \
tests/uniq/uniq-collate.sh \
diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh
new file mode 100755
index 000000000..dde30b594
--- /dev/null
+++ b/tests/unexpand/mb.sh
@@ -0,0 +1,163 @@
+#!/bin/sh
+
+# Copyright (C) 2012-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ unexpand printf
+
+export LC_ALL=en_US.UTF-8
+
+#input containing multibyte characters
+cat > in <<\EOF
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+cat > exp <<\EOF
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+
+#multiple files as an input
+cat >> exp <<\EOF
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+
+unexpand -a ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test characters with a display width larger than 1
+
+env printf '12345678
+e |ascii(1)
+\u00E9 |composed(1)
+e\u0301 |decomposed(1)
+\u3000 |ideo-space(2)
+\u3000\u3000\u3000\u3000|ideo-space(2) * 4
+\uFF0D |full-hypen(2)
+' > in || framework_failure_
+
+env printf '12345678
+e\t|ascii(1)
+\u00E9\t|composed(1)
+e\u0301\t|decomposed(1)
+\t|ideo-space(2)
+\t|ideo-space(2) * 4
+\uFF0D\t|full-hypen(2)
+' > exp || framework_failure_
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#test input where a blank of width > 1 is not being substituted
+in="$(LC_ALL=en_US.UTF-8 env printf ' \u3000 ö ü ß')"
+exp=' ö ü ß'
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#non-Unicode characters interspersed between Unicode ones
+env printf '12345678
+ \xFF|
+\xFF |
+ \xFFä|
+ä\xFF |
+ ä\xFF|
+\xFF ä|
+äbcde\xFF |
+' > in || framework_failure_
+
+env printf '12345678
+\t\xFF|
+\xFF\t|
+\t\xFFä|
+ä\xFF\t|
+\tä\xFF|
+\xFF\tä|
+äbcde\xFF\t|
+' > exp || framework_failure_
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+#BOM header test 1
+env printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+unexpand -a < in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+env printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+env printf "\xEF\xBB\xBF" >> exp; cat <<\EOF >> exp || framework_failure_
+1234567812345678123456781
+. . . .
+a b c d
+. . . .
+ä ö ü ß
+. . . .
+ äöü . öüä. ä xx
+EOF
+
+unexpand -a ./in ./in > out || fail=1
+compare exp out > /dev/null 2>&1 || fail=1
+
+Exit $fail
--
2.53.0