On 10/03/2026 00:34, Bruno Haible wrote:
> Pádraig Brady wrote:
>> \u3000 is ideographic space, i.e. a space generally used in east asian text
>> so that alignment is maintained. Since it's a space, and not non breaking space
>> it should be treated as a blank character IMHO.
>
> It should be treated like a space character. Implementations essentially
> agree what this means. See gnulib/tests/test-c32isspace.c.
>
> The "blank" character category has, unfortunately, so much variation among
> implementations that it is not really useful. See
> gnulib/tests/test-c32isblank.c:
OK I'll go with the attached, which avoids direct use of c32isblank(),
instead defining:
ATTRIBUTE_PURE
static inline bool
c32issep (char32_t wc)
{
#if defined __GLIBC__
return !! c32isblank (wc);
#endif
return !! (c32isspace (wc) && ! c32isvertspace (wc) && ! c32isnbspace (wc));
}
Tests pass on GLIBC, musl, Solaris 11.
thanks,
Padraig

From 46e083e0f40f1931d7fef74ad8512514b7adad35 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Mon, 9 Mar 2026 22:23:12 +0000
Subject: [PATCH] all: use more consistent blank character determination
* src/system.h (c32issep): A new function that is essentially
iswblank() on GLIBC platforms, and iswspace() with exceptions elsewhere.
* src/expand.c: Use it instead of c32isblank().
* src/fold.c: Likewise.
* src/join.c: Likewise.
* src/numfmt.c: Likewise.
* src/unexpand.c: Likewise.
* src/uniq.c: Likewise.
* NEWS: Mention the improvement.
---
NEWS | 4 ++++
src/expand.c | 3 +--
src/fold.c | 2 +-
src/join.c | 4 ++--
src/numfmt.c | 5 ++---
src/system.h | 23 +++++++++++++++++++++++
src/unexpand.c | 2 +-
src/uniq.c | 2 +-
8 files changed, 35 insertions(+), 10 deletions(-)
diff --git a/NEWS b/NEWS
index cec03a581..666968c86 100644
--- a/NEWS
+++ b/NEWS
@@ -42,6 +42,10 @@ GNU coreutils NEWS -*- outline -*-
'install' now allows the combination of the --compare and
--preserve-timestamps options.
+ 'fold', 'join', 'numfmt', 'uniq' now use more consistent blank character
+ determination on non GLIBC platforms. For example \u3000 (ideographic space)
+ will be considered a blank character on all platforms.
+
'nl' now supports multi-byte --section-delimiter characters.
'shuf -i' now operates up to two times faster on systems with unlocked stdio
diff --git a/src/expand.c b/src/expand.c
index 6d4223c9b..1d0759079 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -140,8 +140,7 @@ expand (void)
if (convert)
{
- convert &= convert_entire_line
- || !! (c32isblank (g.ch) && ! c32isnbspace (g.ch));
+ convert &= convert_entire_line || c32issep (g.ch);
if (g.ch == '\t')
{
diff --git a/src/fold.c b/src/fold.c
index 666490f95..f49078f01 100644
--- a/src/fold.c
+++ b/src/fold.c
@@ -210,7 +210,7 @@ fold_file (char const *filename, size_t width)
for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
{
g2 = mcel_scan (logical_p, logical_lim);
- if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch))
+ if (c32issep (g2.ch))
{
space_length = g2.len;
logical_end = logical_p - line_out;
diff --git a/src/join.c b/src/join.c
index 883a42005..4346758a6 100644
--- a/src/join.c
+++ b/src/join.c
@@ -308,7 +308,7 @@ eq_tab (mcel_t g)
static bool
newline_or_blank (mcel_t g)
{
- return g.ch == '\n' || c32isblank (g.ch);
+ return g.ch == '\n' || c32issep (g.ch);
}
/* Fill in the 'fields' structure in LINE. */
@@ -918,7 +918,7 @@ decode_field_spec (char const *s, int *file_index, idx_t *field_index)
static bool
comma_or_blank (mcel_t g)
{
- return g.ch == ',' || c32isblank (g.ch);
+ return g.ch == ',' || c32issep (g.ch);
}
/* Add the comma or blank separated field spec(s) in STR to 'outlist'. */
diff --git a/src/numfmt.c b/src/numfmt.c
index fb6cb3396..2436c5487 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -215,8 +215,7 @@ static bool dev_debug = false;
static bool
newline_or_blank (mcel_t g)
{
- return g.ch == '\n'
- || (c32isblank (g.ch) && ! c32isnbspace (g.ch));
+ return g.ch == '\n' || c32issep (g.ch);
}
static inline int
@@ -673,7 +672,7 @@ simple_strtod_human (char const *input_str,
if (!matched_unit_sep)
{
mcel_t g = mcel_scanz (*endptr);
- if (c32isblank (g.ch) || c32isnbspace (g.ch))
+ if (c32issep (g.ch) || c32isnbspace (g.ch))
(*endptr) += g.len;
}
diff --git a/src/system.h b/src/system.h
index 988c7cd9e..79b3e6069 100644
--- a/src/system.h
+++ b/src/system.h
@@ -160,6 +160,29 @@ c32isnbspace (char32_t wc)
return wc == 0x00A0 || wc == 0x2007 || wc == 0x202F || wc == 0x2060;
}
+ATTRIBUTE_PURE
+static inline int
+c32isvertspace (char32_t wc)
+{
+ return wc == 0x000A || wc == 0x000B || wc == 0x000C || wc == 0x000D
+ || wc == 0x2028 || wc == 0x2029;
+}
+
+
+/* c32isblank() is too variable on non GLIBC platforms.
+ E.g., does not include \u3000 ideographic space on musl.
+ E.g., does include non-breaking space on Solaris and NetBSD.
+ This equivalent is more consistent across systems. */
+ATTRIBUTE_PURE
+static inline bool
+c32issep (char32_t wc)
+{
+#if defined __GLIBC__
+ return !! c32isblank (wc);
+#endif
+ return !! (c32isspace (wc) && ! c32isvertspace (wc) && ! c32isnbspace (wc));
+}
+
#include <locale.h>
/* Take care of NLS matters. */
diff --git a/src/unexpand.c b/src/unexpand.c
index 16d0f0031..4fbf9d3f8 100644
--- a/src/unexpand.c
+++ b/src/unexpand.c
@@ -176,7 +176,7 @@ unexpand (void)
if (convert)
{
- bool blank = !! (c32isblank (g.ch) && ! c32isnbspace (g.ch));
+ bool blank = c32issep (g.ch);
if (blank)
{
diff --git a/src/uniq.c b/src/uniq.c
index eebff4b7b..30463598a 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -254,7 +254,7 @@ size_opt (char const *opt, char const *msgid)
static bool
newline_or_blank (mcel_t g)
{
- return g.ch == '\n' || c32isblank (g.ch);
+ return g.ch == '\n' || c32issep (g.ch);
}
/* Given a linebuffer LINE,
--
2.53.0