readlinebuffer_delim is used internally by many major coreutils
programs, like uniq, comm, join, nl, and shuf. The previous
implementation read input one byte at a time via getc(). Although getc
is remapped to getc_unlocked (so we avoid stdio locking), the per-byte
function call overhead is still significant. For example, with a 200 MB
file with 5M lines, there are 200 million getc_unlocked calls, and those
clearly dominate perf profiles with 91% of uniq runtime and 78% of comm
runtime.

I think we can do this more efficiently. First, we can use freadptr() to
peek at the stdio internal read buffer without copying, which resolves
to a direct struct member access on glibc (_IO_read_ptr/_IO_read_end),
BSD (_p/_r), and most other platforms.

Second, we can use memchr() to scan the available buffered data for the
delimiter. On modern glibc memchr() is SIMD optimised (even without
-march=native since it's called as IFUNC), so we do a significantly
fewer number of vectorised scans.

Third, once the delimiter position is known, we can use fread() (which
is remapped to fread_unlocked() on glibc) to consume exactly those bytes
from the stream into our linebuffer. Since the data is already in the
stdio buffer, this is effectively a memcpy.

On platforms where freadptr() is not supported, we can fall back to the
original per-byte getc loop. Performance on those platforms is therefore
essentially unchanged from before this patch.

Using the following input data:

  LC_ALL=C tr -dc "A-Za-z0-9 \t\n" < /dev/urandom | head -c 80000000 \
    | fold -w 80 > random_lines
  LC_ALL=C sort random_lines > sorted_lines
  LC_ALL=C sort -u random_lines > uniq_sorted_lines
  awk "{for(i=1;i<=5;i++) print}" sorted_lines \
    | head -5000000 > repeated_sorted
  awk 'NR<=800000 {printf "%07d %s\n", NR,          $0}' \
    sorted_lines > join_file1
  awk 'NR<=800000 {printf "%07d %s\n", int(NR*1.5), $0}' \
    sorted_lines > join_file2
  awk 'NR<=800000 {printf "%07d %s\n", NR*2,        $0}' \
    sorted_lines > join_nooverlap1
  awk 'NR<=800000 {printf "%07d %s\n", NR*2-1,      $0}' \
    sorted_lines > join_nooverlap2

And the following benchmarks:

  hyperfine --warmup 5 --runs 30 \
    "uniq_before repeated_sorted > /dev/null" \
    "uniq_after repeated_sorted > /dev/null"

  hyperfine --warmup 5 --runs 30 \
    "uniq_before uniq_sorted_lines > /dev/null" \
    "uniq_after uniq_sorted_lines > /dev/null"

  hyperfine --warmup 5 --runs 30 \
    "LC_ALL=C comm_before sorted_lines sorted_lines > /dev/null" \
    "LC_ALL=C comm_after sorted_lines sorted_lines > /dev/null"

  hyperfine --warmup 5 --runs 30 \
    "LC_ALL=C comm_before sorted_lines uniq_sorted_lines > /dev/null" \
    "LC_ALL=C comm_after sorted_lines uniq_sorted_lines > /dev/null"

  hyperfine --warmup 5 --runs 30 \
    "LC_ALL=C join_before join_file1 join_file2 > /dev/null" \
    "LC_ALL=C join_after join_file1 join_file2 > /dev/null"

  hyperfine --warmup 5 --runs 30 \
    "LC_ALL=C join_before join_nooverlap1 join_nooverlap2 > /dev/null" \
    "LC_ALL=C join_after join_nooverlap1 join_nooverlap2 > /dev/null"

  hyperfine --warmup 5 --runs 30 \
    "nl_before random_lines > /dev/null" \
    "nl_after random_lines > /dev/null"

  hyperfine --warmup 5 --runs 30 \
    "shuf_before random_lines > /dev/null" \
    "shuf_after random_lines > /dev/null"

The results on i9-14900HX x86_64 with -O2:

  uniq (5M repeated sorted lines, high repetition):
    Before: 328.2 ms    After: 148.8 ms    (-54.7%)

  uniq (1.7M unique sorted lines, typical):
    Before: 131.3 ms    After:  71.1 ms    (-45.8%)

  comm (1.8M sorted lines, self-compare):
    Before: 284.9 ms    After: 143.0 ms    (-49.8%)

  comm (1.8M lines, different sorted files, typical):
    Before: 329.9 ms    After: 171.1 ms    (-48.2%)

  join (800K lines, ~2/3 key overlap):
    Before: 342.6 ms    After: 264.8 ms    (-22.7%)

  join (800K lines, no key overlap):
    Before: 341.4 ms    After: 264.7 ms    (-22.5%)

  nl (1.8M lines):
    Before: 237.6 ms    After: 174.9 ms    (-26.4%)

  shuf (1.8M lines):
    Before: 207.1 ms    After: 207.9 ms    (no change)

And on M1 Pro aarch64 with -O2:

  uniq (5M repeated sorted lines, high repetition):
    Before: 536.8 ms    After: 219.5 ms    (-59.1%)

  uniq (1.7M unique sorted lines, typical):
    Before: 202.5 ms    After:  85.8 ms    (-57.6%)

  comm (1.8M sorted lines, self-compare):
    Before: 438.9 ms    After: 179.1 ms    (-59.2%)

  comm (1.8M lines, different sorted files, typical):
    Before: 418.0 ms    After: 214.1 ms    (-48.8%)

  join (800K lines, ~2/3 key overlap):
    Before: 759.5 ms    After: 622.1 ms    (-18.1%)

  join (800K lines, no key overlap):
    Before: 706.1 ms    After: 577.6 ms    (-18.2%)

  nl (1.8M lines):
    Before: 332.8 ms    After: 215.4 ms    (-35.3%)

  shuf (1.8M lines):
    Before: 282.9 ms    After: 282.9 ms    (no change)

You might wonder why there's no change in shuf. Profiling shows shuf
spends its time almost entirely in randperm_new() and randint_genmax(),
so I/O is not the bottleneck.
---
 lib/linebuffer.c   | 80 ++++++++++++++++++++++++++++++++++------------
 modules/linebuffer |  1 +
 2 files changed, 61 insertions(+), 20 deletions(-)

diff --git a/lib/linebuffer.c b/lib/linebuffer.c
index 36642a1db1..814bd0a84d 100644
--- a/lib/linebuffer.c
+++ b/lib/linebuffer.c
@@ -25,6 +25,7 @@
 #include <string.h>
 #include <sys/types.h>
 #include "linebuffer.h"
+#include "freadptr.h"
 #include "xalloc.h"
 
 #if USE_UNLOCKED_IO
@@ -62,35 +63,74 @@ readlinebuffer_delim (struct linebuffer *linebuffer, FILE 
*stream,
   if (feof (stream))
     return NULL;
 
+  idx_t pos = 0;
   char *buffer = linebuffer->buffer;
-  char *p = linebuffer->buffer;
-  char *end = buffer + linebuffer->size; /* Sentinel. */
-  int c;
+  idx_t alloc = linebuffer->size;
 
-  do
+  for (;;)
     {
-      c = getc (stream);
-      if (c == EOF)
+      size_t avail;
+      const char *bufp = freadptr (stream, &avail);
+      if (bufp)
         {
-          if (p == buffer || ferror (stream))
-            return NULL;
-          if (p[-1] == delimiter)
-            break;
-          c = delimiter;
+          const char *found = memchr (bufp, delimiter, avail);
+          size_t nbytes = found ? (size_t) (found - bufp + 1) : avail;
+
+          if (alloc - pos < (idx_t) nbytes)
+            {
+              buffer = xpalloc (buffer, &alloc,
+                                nbytes - (alloc - pos), -1, 1);
+              linebuffer->buffer = buffer;
+              linebuffer->size = alloc;
+            }
+
+          size_t nread = fread (buffer + pos, 1, nbytes, stream);
+          pos += nread;
+
+          if (nread < nbytes)
+            {
+              /* Short read despite freadptr indicating data was available.
+                 Return immediately on error.  Otherwise fall through to the
+                 getc path, which will handle EOF and synthesise a trailing
+                 delimiter if needed.  */
+              if (ferror (stream))
+                return NULL;
+              continue;
+            }
+
+          if (found)
+            {
+              linebuffer->length = pos;
+              return linebuffer;
+            }
         }
-      if (p == end)
+      else
         {
-          idx_t oldsize = linebuffer->size;
-          buffer = xpalloc (buffer, &linebuffer->size, 1, -1, 1);
-          p = buffer + oldsize;
-          linebuffer->buffer = buffer;
-          end = buffer + linebuffer->size;
+          /* Buffer empty or not accessible.  Consume one byte via getc to
+             trigger a refill, then loop back to use freadptr on the newly
+             populated buffer.  */
+          int c = getc (stream);
+          if (c == EOF)
+            {
+              if (pos == 0 || ferror (stream))
+                return NULL;
+              if (buffer[pos - 1] == delimiter)
+                break;
+              c = delimiter;
+            }
+          if (pos >= alloc)
+            {
+              buffer = xpalloc (buffer, &alloc, 1, -1, 1);
+              linebuffer->buffer = buffer;
+              linebuffer->size = alloc;
+            }
+          buffer[pos++] = c;
+          if (c == delimiter)
+            break;
         }
-      *p++ = c;
     }
-  while (c != delimiter);
 
-  linebuffer->length = p - buffer;
+  linebuffer->length = pos;
   return linebuffer;
 }
 
diff --git a/modules/linebuffer b/modules/linebuffer
index b21af67ce5..91cde9cd6f 100644
--- a/modules/linebuffer
+++ b/modules/linebuffer
@@ -6,6 +6,7 @@ lib/linebuffer.h
 lib/linebuffer.c
 
 Depends-on:
+freadptr
 idx
 xalloc
 

base-commit: 0cca364da074782231685f955da2f0ce67d8c155
-- 
2.53.0


Reply via email to