I noticed that Fedora and OpenSUSE (likely others) add a patch to
Coreutils for 'fold --characters'. Not sure if that option has been
discussed here.

Anyway, I wrote the following patch, which in my opinion is much simpler
than the one they use. The mbfile module from Gnulib provides an
interface similar to stdio's getc. When operating on ASCII characters it
does not call mbrtoc32, etc.

Here is an example of the behavior using 뉐 which has a width of two
columns:

    $ for i in $(seq 10); do printf '\uB250' >> test.txt; done
    $ printf '\n' >> test.txt
    $ cat test.txt
    뉐뉐뉐뉐뉐뉐뉐뉐뉐뉐
    $ ./src/fold -w 5 test.txt 
    뉐뉐
    뉐뉐
    뉐뉐
    뉐뉐
    뉐뉐
    $ ./src/fold --characters -w 5 test.txt 
    뉐뉐뉐뉐뉐
    뉐뉐뉐뉐뉐

What do you think?

It would be nice to improve Unicode support and this was an easy
start. Something like 'tr' is much more difficult.

Collin

From 162d7152bd657cffa21841fbec9c66cb57671999 Mon Sep 17 00:00:00 2001
Message-ID: <162d7152bd657cffa21841fbec9c66cb57671999.1755751867.git.collin.fu...@gmail.com>
From: Collin Funk <[email protected]>
Date: Wed, 20 Aug 2025 21:13:52 -0700
Subject: [PATCH] fold: add the --characters option

* bootstrap.conf (gnulib_modules): Add mbfile.
* src/fold.c: Include mbfile.h.
(count_bytes): Remove variable.
(counting_mode, last_character_width): New variables.
(shortopts, long_options): Add the option.
(adjust_column): If --characters is in use, account for the number of
characters instead of their width.
(fold_file): Use the mbfile equivalents of stdio and comparisons.
(main): Check for the option.
* src/local.mk (src_fold_LDADD): Add $(LIBC32CONV), $(LIBUNISTRING), and
$(MBRTOWC_LIB).
* tests/fold/fold-characters.sh: New file.
* tests/local.mk (all_tests): Add the test.
* doc/coreutils.texi (fold invocation): Mention the option.
---
 bootstrap.conf                |  1 +
 doc/coreutils.texi            |  7 +++
 src/fold.c                    | 95 ++++++++++++++++++++++++-----------
 src/local.mk                  |  3 ++
 tests/fold/fold-characters.sh | 64 +++++++++++++++++++++++
 tests/local.mk                |  1 +
 6 files changed, 142 insertions(+), 29 deletions(-)
 create mode 100755 tests/fold/fold-characters.sh

diff --git a/bootstrap.conf b/bootstrap.conf
index 49fcf30f3..fb9b8b7a6 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -166,6 +166,7 @@ gnulib_modules="
   maintainer-makefile
   malloc-gnu
   manywarnings
+  mbfile
   mbrlen
   mbrtoc32
   mbrtowc
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index c874ffc61..3f0931e1a 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -2964,6 +2964,13 @@ @node fold invocation
 returns are each counted as taking up one column, just like other
 characters.
 
+@item -c
+@itemx --characters
+@opindex -c
+@opindex --characters
+Count characters rather than columns, meaning that lines containing
+characters wider than one column will be visually longer.
+
 @item -s
 @itemx --spaces
 @opindex -s
diff --git a/src/fold.c b/src/fold.c
index b64aad491..69e771611 100644
--- a/src/fold.c
+++ b/src/fold.c
@@ -25,6 +25,7 @@
 
 #include "system.h"
 #include "fadvise.h"
+#include "mbfile.h"
 #include "xdectoint.h"
 
 #define TAB_WIDTH 8
@@ -37,17 +38,26 @@
 /* If nonzero, try to break on whitespace. */
 static bool break_spaces;
 
-/* If nonzero, count bytes, not column positions. */
-static bool count_bytes;
+/* Mode to operate in.  */
+static enum
+  {
+    COUNT_COLUMNS,
+    COUNT_BYTES,
+    COUNT_CHARACTERS
+  } counting_mode = COUNT_COLUMNS;
 
 /* If nonzero, at least one of the files we read was standard input. */
 static bool have_read_stdin;
 
-static char const shortopts[] = "bsw:0::1::2::3::4::5::6::7::8::9::";
+/* Width of last read character.  */
+static int last_character_width = 0;
+
+static char const shortopts[] = "bcsw:0::1::2::3::4::5::6::7::8::9::";
 
 static struct option const longopts[] =
 {
   {"bytes", no_argument, nullptr, 'b'},
+  {"characters", no_argument, nullptr, 'c'},
   {"spaces", no_argument, nullptr, 's'},
   {"width", required_argument, nullptr, 'w'},
   {GETOPT_HELP_OPTION_DECL},
@@ -75,6 +85,7 @@ Wrap input lines in each FILE, writing to standard output.\n\
 
       fputs (_("\
   -b, --bytes         count bytes rather than columns\n\
+  -c, --characters    count characters rather than columns\n\
   -s, --spaces        break at spaces\n\
   -w, --width=WIDTH   use WIDTH columns instead of 80\n\
 "), stdout);
@@ -90,24 +101,28 @@ Wrap input lines in each FILE, writing to standard output.\n\
    The first column is 0. */
 
 static size_t
-adjust_column (size_t column, char c)
+adjust_column (size_t column, mbf_char_t c)
 {
-  if (!count_bytes)
+  if (counting_mode != COUNT_BYTES)
     {
-      if (c == '\b')
+      if (mb_iseq (c, '\b'))
         {
           if (column > 0)
-            column--;
+            column -= last_character_width;
         }
-      else if (c == '\r')
+      else if (mb_iseq (c, '\r'))
         column = 0;
-      else if (c == '\t')
+      else if (mb_iseq (c, '\t'))
         column += TAB_WIDTH - column % TAB_WIDTH;
-      else /* if (isprint (c)) */
-        column++;
+      else /* if (mb_isprint (c)) */
+        {
+          last_character_width = (counting_mode == COUNT_CHARACTERS
+                                  ? 1 : mb_width (c));
+          column += last_character_width;
+        }
     }
   else
-    column++;
+    column += c.bytes;
   return column;
 }
 
@@ -119,10 +134,11 @@ static bool
 fold_file (char const *filename, size_t width)
 {
   FILE *istream;
-  int c;
+  mb_file_t mb_istream;
+  mbf_char_t c;
   size_t column = 0;		/* Screen column where next char will go. */
   idx_t offset_out = 0;		/* Index in 'line_out' for next char. */
-  static char *line_out = nullptr;
+  static mbf_char_t *line_out = nullptr;
   static idx_t allocated_out = 0;
   int saved_errno;
 
@@ -141,16 +157,22 @@ fold_file (char const *filename, size_t width)
     }
 
   fadvise (istream, FADVISE_SEQUENTIAL);
+  mbf_init (mb_istream, istream);
 
-  while ((c = getc (istream)) != EOF)
+  while (true)
     {
+      mbf_getc (c, mb_istream);
+      if (mb_iseof (c))
+        break;
+
       if (allocated_out - offset_out <= 1)
         line_out = xpalloc (line_out, &allocated_out, 1, -1, sizeof *line_out);
 
-      if (c == '\n')
+      if (mb_iseq (c, '\n'))
         {
-          line_out[offset_out++] = c;
-          fwrite (line_out, sizeof (char), offset_out, stdout);
+          mb_copy (&line_out[offset_out++], &c);
+          for (idx_t i = 0; i < offset_out; ++i)
+            mb_putc (line_out[i], stdout);
           column = offset_out = 0;
           continue;
         }
@@ -172,7 +194,7 @@ fold_file (char const *filename, size_t width)
               while (logical_end)
                 {
                   --logical_end;
-                  if (isblank (to_uchar (line_out[logical_end])))
+                  if (mb_isblank (line_out[logical_end]))
                     {
                       found_blank = true;
                       break;
@@ -183,12 +205,20 @@ fold_file (char const *filename, size_t width)
                 {
                   /* Found a blank.  Don't output the part after it. */
                   logical_end++;
-                  fwrite (line_out, sizeof (char), logical_end, stdout);
+                  for (idx_t i = 0; i < logical_end; ++i)
+                    mb_putc (line_out[i], stdout);
                   putchar ('\n');
                   /* Move the remainder to the beginning of the next line.
                      The areas being copied here might overlap. */
-                  memmove (line_out, line_out + logical_end,
-                           offset_out - logical_end);
+                  for (idx_t i = 0; i < offset_out - logical_end; ++i)
+                    {
+                      mbf_char_t c1;
+                      mbf_char_t c2;
+                      mb_copy (&c1, &line_out[i]);
+                      mb_copy (&c2, &line_out[logical_end + i]);
+                      mb_copy (&line_out[i], &c2);
+                      mb_copy (&line_out[logical_end + i], &c1);
+                    }
                   offset_out -= logical_end;
                   column = 0;
                   for (idx_t i = 0; i < offset_out; i++)
@@ -199,17 +229,19 @@ fold_file (char const *filename, size_t width)
 
           if (offset_out == 0)
             {
-              line_out[offset_out++] = c;
+              mb_copy (&line_out[offset_out++], &c);
               continue;
             }
 
-          line_out[offset_out++] = '\n';
-          fwrite (line_out, sizeof (char), offset_out, stdout);
+          mb_setascii (&line_out[offset_out], '\n');
+          offset_out++;
+          for (idx_t i = 0; i < offset_out; ++i)
+            mb_putc (line_out[i], stdout);
           column = offset_out = 0;
           goto rescan;
         }
 
-      line_out[offset_out++] = c;
+      mb_copy (&line_out[offset_out++], &c);
     }
 
   saved_errno = errno;
@@ -217,7 +249,8 @@ fold_file (char const *filename, size_t width)
     saved_errno = 0;
 
   if (offset_out)
-    fwrite (line_out, sizeof (char), offset_out, stdout);
+    for (idx_t i = 0; i < offset_out; ++i)
+      mb_putc (line_out[i], stdout);
 
   if (STREQ (filename, "-"))
     clearerr (istream);
@@ -249,7 +282,7 @@ main (int argc, char **argv)
 
   atexit (close_stdout);
 
-  break_spaces = count_bytes = have_read_stdin = false;
+  break_spaces = have_read_stdin = false;
 
   while ((optc = getopt_long (argc, argv, shortopts, longopts, nullptr)) != -1)
     {
@@ -258,7 +291,11 @@ main (int argc, char **argv)
       switch (optc)
         {
         case 'b':		/* Count bytes rather than columns. */
-          count_bytes = true;
+          counting_mode = COUNT_BYTES;
+          break;
+
+        case 'c':               /* Count characters rather than columns. */
+          counting_mode = COUNT_CHARACTERS;
           break;
 
         case 's':		/* Break at word boundaries. */
diff --git a/src/local.mk b/src/local.mk
index c7c77a7c9..3f93a7507 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -336,6 +336,9 @@ src_sort_LDADD += $(LIBPMULTITHREAD)
 # for pthread_sigmask
 src_sort_LDADD += $(PTHREAD_SIGMASK_LIB)
 
+# for mbrtowc, mbfile
+src_fold_LDADD += $(LIBC32CONV) $(LIBUNISTRING) $(MBRTOWC_LIB)
+
 # Get the release year from lib/version-etc.c.
 RELEASE_YEAR = \
   `sed -n '/.*COPYRIGHT_YEAR = \([0-9][0-9][0-9][0-9]\) };/s//\1/p' \
diff --git a/tests/fold/fold-characters.sh b/tests/fold/fold-characters.sh
new file mode 100755
index 000000000..ed528e7ff
--- /dev/null
+++ b/tests/fold/fold-characters.sh
@@ -0,0 +1,64 @@
+#!/bin/sh
+# Test fold --characters.
+
+# Copyright (C) 2025 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ fold printf
+
+export LC_ALL=en_US.UTF-8
+
+if test "$(locale charmap 2>/dev/null)" != UTF-8; then
+  skip_ "English UTF-8 locale not available"
+fi
+
+# The string "뉐뉐뉐" is 3 characters, but occupies 6 columns.
+env printf '\uB250\uB250\uB250\n' > input1 || framework_failure_
+env printf '\uB250\uB250\n\uB250\n' > column-exp1 || framework_failure_
+
+fold -w 5 input1 > column-out1 || fail=1
+compare column-exp1 column-out1 || fail=1
+
+# Should be the same as the input.
+fold --characters -w 5 input1 > characters-out1 || fail=1
+compare input1 characters-out1 || fail=1
+
+# Test with 50 2 column wide characters.
+for i in $(seq 50); do
+  env printf '\uFF1A' >> input2 || framework_failure_
+  env printf '\uFF1A' >> column-exp2 || framework_failure_
+  env printf '\uFF1A' >> character-exp2 || framework_failure_
+  if test $i -ne 1; then
+    if test $(($i % 5)) -eq 0 || test $i -eq 50; then
+      env printf '\n' >> column-exp2 || framework_failure_
+    fi
+    if test $(($i % 10)) -eq 0; then
+      env printf '\n' >> character-exp2 || framework_failure_
+    fi
+  fi
+done
+
+env printf '\n' >> input2 || framework_failure_
+
+# 5 characters per line.
+fold -w 10 input2 > column-out2 || fail=1
+compare column-exp2 column-out2 || fail=1
+
+# 10 characters per line; check against the character-count expectation.
+fold --characters -w 10 input2 > character-out2 || fail=1
+compare character-exp2 character-out2 || fail=1
+
+Exit $fail
diff --git a/tests/local.mk b/tests/local.mk
index 3fbf442ee..4d5868a88 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -326,6 +326,7 @@ all_tests =					\
   tests/factor/factor.pl			\
   tests/factor/factor-parallel.sh		\
   tests/misc/false-status.sh			\
+  tests/fold/fold-characters.sh			\
   tests/misc/fold.pl				\
   tests/groups/groups-dash.sh			\
   tests/groups/groups-process-all.sh		\
-- 
2.50.1

Reply via email to