On 08/01/16 22:07, Pádraig Brady wrote:
> On 08/01/16 19:04, Assaf Gordon wrote:

>> an open question:
>> With -z, do embedded newlines count as whitespace/field delimiters ?
>> (not sure if this applies to other programs).
>>
>> For example:
>>
>>     $ printf "A B\tC\nD 1000\x00"
>>
>> Should the newline count as whitespace/field delimiter (since numfmt 
>> defaults to whitespace delimiters) ?
>> If so, the "1000" should be the fifth field.
>> If not, the "1000" should be in the fourth field (and "C\nD" counts as one
>> field).
>>
>> Currently, because the numfmt code uses "isblank()", newlines DO NOT count 
>> as whitespace:
>>
>>      $ printf "A B\tC\nD 1000\x00" | ./src/numfmt -z --to=si --field=4 | od -a
>>      0000000   A  sp   B  sp   C  nl   D  sp   1   .   0   K nul
>>      0000015
> 
> A very good point.
> This is not an issue for the utils in my current patch set I think,
> but is for field processing utils like numfmt, sort, join, uniq
> (cut delimits fields with a char rather than a class).
> I.E. should these utils use isspace() rather than isblank()
> when -z is specified? More conservatively they probably
> should use isblank(c) || c=='\n'.

Attached patch implements this for join, sort, uniq.

thanks,
Pádraig.

>From 827952057e697e3d72b9549ae1a6344df3e70dcf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <[email protected]>
Date: Tue, 12 Jan 2016 16:29:32 +0000
Subject: [PATCH] join,sort,uniq: with -z, treat '\n' as a field separator

* NEWS: Mention the change in behavior.
* doc/coreutils.texi (newlineFieldSeparator): A new description,
referenced from ({join,sort,uniq} invocation).
* src/system.h (field_sep): A new inline function to determine
if a character is a field separator.
* src/join.c (usage): s/whitespace/blank/ to be more accurate
wrt which characters are field separators.
(xfields): s/isblank/field_sep/.
* src/sort.c (inittables): Likewise.
* src/uniq.c (find_field): Likewise.
* tests/misc/join.pl: Adjust -z test, and add a test/example
for processing the whole record with field processing.
* tests/misc/sort.pl: Add -z test cases, including case with '\n'.
* tests/misc/uniq.pl: Add -z -f test case with \n.
---
 NEWS               | 2 ++
 doc/coreutils.texi | 5 +++++
 src/join.c         | 8 ++++----
 src/sort.c         | 4 ++--
 src/system.h       | 7 +++++++
 src/uniq.c         | 4 ++--
 tests/misc/join.pl | 9 ++++++---
 tests/misc/sort.pl | 5 +++++
 tests/misc/uniq.pl | 1 +
 9 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/NEWS b/NEWS
index 6e48a53..192d8fa 100644
--- a/NEWS
+++ b/NEWS
@@ -64,6 +64,8 @@ GNU coreutils NEWS                                    -*- outline -*-
   ls now quotes file names unambiguously and appropriate for use in a shell,
   when outputting to a terminal.
 
+  join, sort, uniq with --zero-terminated, now treat '\n' as a field delimiter.
+
 ** Improvements
 
   All utilities now quote user supplied arguments in error strings,
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 2538062..80e9a03 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -4493,6 +4493,9 @@ numeric string when checking for uniqueness, whereas @code{sort -n |
 uniq} inspects the entire line.  @xref{uniq invocation}.
 
 @optZeroTerminated
+@macro newlineFieldSeparator
+Note with @option{-z} the newline character is treated as a field separator.
+@end macro
 
 @end table
 
@@ -5034,6 +5037,7 @@ fields and characters).  By default the entire rest of the lines are
 compared.
 
 @optZeroTerminated
+@newlineFieldSeparator
 
 @end table
 
@@ -6157,6 +6161,7 @@ Print a line for each unpairable line in file @var{file-number}
 (either @samp{1} or @samp{2}), instead of the normal output.
 
 @optZeroTerminated
+@newlineFieldSeparator
 
 @end table
 
diff --git a/src/join.c b/src/join.c
index 8686428..9b25da6 100644
--- a/src/join.c
+++ b/src/join.c
@@ -194,7 +194,7 @@ Usage: %s [OPTION]... FILE1 FILE2\n\
               program_name);
       fputs (_("\
 For each pair of input lines with identical join fields, write a line to\n\
-standard output.  The default join field is the first, delimited by whitespace.\
+standard output.  The default join field is the first, delimited by blanks.\
 \n\
 "), stdout);
       fputs (_("\
@@ -284,19 +284,19 @@ xfields (struct line *line)
   else if (tab < 0)
     {
       /* Skip leading blanks before the first field.  */
-      while (isblank (to_uchar (*ptr)))
+      while (field_sep (*ptr))
         if (++ptr == lim)
           return;
 
       do
         {
           char *sep;
-          for (sep = ptr + 1; sep != lim && ! isblank (to_uchar (*sep)); sep++)
+          for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
             continue;
           extract_field (line, ptr, sep - ptr);
           if (sep == lim)
             return;
-          for (ptr = sep + 1; ptr != lim && isblank (to_uchar (*ptr)); ptr++)
+          for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
             continue;
         }
       while (ptr != lim);
diff --git a/src/sort.c b/src/sort.c
index a3a9b15..575877d 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -1275,9 +1275,9 @@ inittables (void)
 
   for (i = 0; i < UCHAR_LIM; ++i)
     {
-      blanks[i] = isblank (i) || i == '\n';
+      blanks[i] = field_sep (i);
       nonprinting[i] = ! isprint (i);
-      nondictionary[i] = ! isalnum (i) && ! isblank (i) && i != '\n';
+      nondictionary[i] = ! isalnum (i) && ! field_sep (i);
       fold_toupper[i] = toupper (i);
     }
 
diff --git a/src/system.h b/src/system.h
index c1c4a18..9898bc7 100644
--- a/src/system.h
+++ b/src/system.h
@@ -155,6 +155,13 @@ enum
    errors that the cast doesn't.  */
 static inline unsigned char to_uchar (char ch) { return ch; }
 
+/* '\n' is considered a field separator with  --zero-terminated.  */
+static inline bool
+field_sep (unsigned char ch)
+{
+  return isblank (ch) || ch == '\n';
+}
+
 #include <locale.h>
 
 /* Take care of NLS matters.  */
diff --git a/src/uniq.c b/src/uniq.c
index 6f8cd4a..0e118da 100644
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -261,9 +261,9 @@ find_field (struct linebuffer const *line)
 
   for (count = 0; count < skip_fields && i < size; count++)
     {
-      while (i < size && isblank (to_uchar (lp[i])))
+      while (i < size && field_sep (lp[i]))
         i++;
-      while (i < size && !isblank (to_uchar (lp[i])))
+      while (i < size && !field_sep (lp[i]))
         i++;
     }
 
diff --git a/tests/misc/join.pl b/tests/misc/join.pl
index 2a40f00..1540e82 100755
--- a/tests/misc/join.pl
+++ b/tests/misc/join.pl
@@ -290,10 +290,13 @@ my @tv = (
 # missing last NUL at the end of the last line (=end of file)
 ['z4', '-z',
  ["a\0c\0e", "a\0b\0c"], "a\0c\0", 0],
-# edge-case: the embedded newlines should treated as
-# part of the nul-terminated line
+# With -z, embedded newlines are treated as field separators.
+# Note '\n' are converted to ' ' in this case.
 ['z5', '-z -a1 -a2',
- ["a\n1\0c 3\0","b\n8\0c 9\0"], "a\n1\0b\n8\0c 3 9\0"],
+ ["a\n\n1\0c 3\0", "a 2\0b\n8\0c 9\0"], "a 1 2\0b 8\0c 3 9\0"],
+# One can avoid field processing like:
+['z6', '-z -t ""',
+ ["a\n1\n\0", "a\n1\n\0"], "a\n1\n\0"],
 
 );
 
diff --git a/tests/misc/sort.pl b/tests/misc/sort.pl
index c9bcce1..c3e7f8e 100755
--- a/tests/misc/sort.pl
+++ b/tests/misc/sort.pl
@@ -406,6 +406,11 @@ my @Tests =
 ["output-is-input-3", '-m -o f', {OUT=>''},
  {IN=> {g=> "a\n"}}, {IN=> {h=> "b\n"}}, {IN=> {f=> "c\n"}},
  {CMP=> ["a\nb\nc\n", {'f'=> undef}]} ],
+
+# --zero-terminated
+['zero-1', '-z', {IN=>"2\0001\000"}, {OUT=>"1\0002\000"}],
+['zero-2', '-z -k2,2', {IN=>"1\n2\0002\n1\000"}, {OUT=>"2\n1\0001\n2\000"}],
+['zero-3', '-zb -k2,2', {IN=>"1\n\n2\0002\n1\0"}, {OUT=>"2\n1\0001\n\n2\0"}],
 );
 
 # Add _POSIX2_VERSION=199209 to the environment of each test
diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl
index 2bc06b9..f028036 100755
--- a/tests/misc/uniq.pl
+++ b/tests/misc/uniq.pl
@@ -95,6 +95,7 @@ my @Tests =
  ['3z', '-z', {IN=>"a\na"}, {OUT=>"a\na\0"}],
  ['4z', '-z', {IN=>"a\nb"}, {OUT=>"a\nb\0"}],
  ['5z', '-z', {IN=>"a\na\nb"}, {OUT=>"a\na\nb\0"}],
+ ['10z', '-z -f1', {IN=>"a\nb\n\0c\nb\n\0"}, {OUT=>"a\nb\n\0"}],
  ['20z', '-dz', {IN=>"a\na\n"}, {OUT=>""}],
 
  # Make sure that eight bit characters work
-- 
2.5.0

Reply via email to