We discussed this patch off-list and are going to leave it for a future
release. But I figured I would post it here so that others can try it and
so that I do not lose it.
The patch handles multi-byte characters when invoking
'uniq --ignore-case' while preserving performance in the case of
LC_ALL=C and the case without --ignore-case.
$ yes abcdefghijklmnopqrstuvwxyz | head -n 10000000 > test.txt
$ export LC_ALL=en_US.UTF-8
$ time ./src/uniq-new test.txt
real 0m0.420s
$ time ./src/uniq-new --ignore-case test.txt
real 0m0.761s
$ export LC_ALL=C
$ time ./src/uniq-new test.txt
real 0m0.425s
$ time ./src/uniq-new --ignore-case test.txt
real 0m0.485s
$ export LC_ALL=en_US.UTF-8
$ time ./src/uniq-old test.txt
real 0m0.420s
$ time ./src/uniq-old --ignore-case test.txt
real 0m0.437s
$ export LC_ALL=C
$ time ./src/uniq-old test.txt
real 0m0.416s
$ time ./src/uniq-old --ignore-case test.txt
real 0m0.626s
Collin
>From d93fda0413336267e1987683ce4f4778265e1b5f Mon Sep 17 00:00:00 2001
Message-ID: <d93fda0413336267e1987683ce4f4778265e1b5f.1757188059.git.collin.fu...@gmail.com>
From: Collin Funk <[email protected]>
Date: Sat, 6 Sep 2025 12:30:20 -0700
Subject: [PATCH] uniq: support multi-byte characters with --ignore-case
* bootstrap.conf (gnulib_modules): Add c32tolower.
* src/uniq.c (different): Use mcel functions to scan the characters and
compare using c32tolower only if MB_CUR_MAX is greater than 1.
* tests/uniq/uniq-ignorecase.sh: New file.
* tests/local.mk (all_tests): Add it.
---
bootstrap.conf | 1 +
src/uniq.c | 23 +++++++--
tests/local.mk | 1 +
tests/uniq/uniq-ignorecase.sh | 89 +++++++++++++++++++++++++++++++++++
4 files changed, 111 insertions(+), 3 deletions(-)
create mode 100755 tests/uniq/uniq-ignorecase.sh
diff --git a/bootstrap.conf b/bootstrap.conf
index 03848e9ea..4453607ef 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -52,6 +52,7 @@ gnulib_modules="
c-strcase
c32iscntrl
c32isspace
+ c32tolower
c32width
canon-host
canonicalize
diff --git a/src/uniq.c b/src/uniq.c
index 9aa780574..6f0a02969 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -284,10 +284,27 @@ find_field (struct linebuffer const *line, idx_t *plen)
static bool
different (char *old, char *new, idx_t oldlen, idx_t newlen)
{
- if (ignore_case)
- return oldlen != newlen || memcasecmp (old, new, oldlen);
+ if (1 < MB_CUR_MAX && ignore_case) /* Multi-byte locale, ignoring case.  */
+ {
+ char *old_lim = old + oldlen;
+ char *new_lim = new + newlen;
+ for (mcel_t g1, g2; old < old_lim && new < new_lim;
+ old += g1.len, new += g2.len)
+ {
+ g1 = mcel_scan (old, old_lim); /* Scan one character from each line.  */
+ g2 = mcel_scan (new, new_lim);
+ if (mcel_tocmp (c32tolower, g1, g2) != 0) /* Compare via c32tolower.  */
+ return true;
+ }
+ return (old < old_lim) != (new < new_lim); /* Only one has bytes left?  */
+ }
else
- return oldlen != newlen || memcmp (old, new, oldlen);
+ {
+ if (ignore_case) /* MB_CUR_MAX is 1 here: compare byte by byte.  */
+ return oldlen != newlen || memcasecmp (old, new, oldlen);
+ else
+ return oldlen != newlen || memcmp (old, new, oldlen);
+ }
}
/* Output the line in linebuffer LINE to standard output
diff --git a/tests/local.mk b/tests/local.mk
index a42a20fbe..ab7a40623 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -473,6 +473,7 @@ all_tests = \
tests/uniq/uniq.pl \
tests/uniq/uniq-perf.sh \
tests/uniq/uniq-collate.sh \
+ tests/uniq/uniq-ignorecase.sh \
tests/misc/xattr.sh \
tests/misc/yes.sh \
tests/tail/wait.sh \
diff --git a/tests/uniq/uniq-ignorecase.sh b/tests/uniq/uniq-ignorecase.sh
new file mode 100755
index 000000000..525334076
--- /dev/null
+++ b/tests/uniq/uniq-ignorecase.sh
@@ -0,0 +1,89 @@
+#!/bin/sh
+# Test uniq --ignore-case
+
+# Copyright (C) 2025 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ uniq printf
+
+# abc
+# ABC
+env printf 'abc\nABC\n' > inp || framework_failure_
+env printf 'abc\n' > exp || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+# ABC
+# abc
+env printf 'ABC\nabc\n' > inp || framework_failure_
+env printf 'ABC\n' > exp || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+test "$LOCALE_FR_UTF8" != none || skip_ "French UTF-8 locale not available"
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+# президент
+# ПРЕЗИДЕНТ
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\u0442\n' > inp \
+ || framework_failure_
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' >> inp \
+ || framework_failure_
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\u0442\n' > exp \
+ || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+# ПРЕЗИДЕНТ
+# президент
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > inp \
+ || framework_failure_
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\u0442\n' >> inp \
+ || framework_failure_
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > exp \
+ || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+# президен
+# ПРЕЗИДЕНТ
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' > inp \
+ || framework_failure_
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' >> inp \
+ || framework_failure_
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' > exp \
+ || framework_failure_
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' >> exp \
+ || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+# ПРЕЗИДЕНТ
+# президен
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > inp \
+ || framework_failure_
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' >> inp \
+ || framework_failure_
+env printf '\u041f\u0420\u0415\u0417\u0418\u0414\u0415\u041d\u0422\n' > exp \
+ || framework_failure_
+env printf '\u043f\u0440\u0435\u0437\u0438\u0434\u0435\u043d\n' >> exp \
+ || framework_failure_
+uniq --ignore-case inp > out || fail=1
+compare exp out || fail=1
+
+Exit $fail
--
2.51.0