Hi,

I was looking around the coreutils wc codebase and noticed there was an AVX2 
line counting implementation, but no AVX512 version. I took a stab at adding 
one.

On my machine running a Ryzen 9 7900, this results in a 10% speedup compared to 
AVX2 when counting lines in a file with a billion lines, see attached terminal 
output.

Kind regards,
Mathieu


From c5b4603de3bbdbfa2006238902383fc18266e96f Mon Sep 17 00:00:00 2001
From: Mathieu Bordere <[email protected]>
Date: Wed, 24 Sep 2025 12:41:06 +0200
Subject: [PATCH] wc: Add avx512 function for line counting

* configure.ac: Add detection of AVX512 intrinsics for wc.
* src/local.mk: Build AVX512 wc libraries.
* src/wc.c: Add runtime detection of AVX512 intrinsics and call
appropriate function when detected.
* src/wc.h: Declare wc_lines_avx512 function.
* tests/wc/wc-cpu.sh: Add a test that disables AVX512 intrinsics.
* src/wc_avx512.c: New file containing the wc -l implementation using
AVX512. The logic and code are reused from the AVX2 implementation with
slight adaptations. Replaced __builtin_popcount by __builtin_popcountll
and the combination of _mm256_cmpeq_epi8 and _mm256_movemask_epi8 by a
single call to _mm512_cmpeq_epi8_mask.
---
 configure.ac       | 31 +++++++++++++++++++++++++
 src/local.mk       |  7 ++++++
 src/wc.c           | 30 ++++++++++++++++++++----
 src/wc.h           |  1 +
 src/wc_avx512.c    | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 tests/wc/wc-cpu.sh | 12 ++++++++--
 6 files changed, 133 insertions(+), 6 deletions(-)
 create mode 100644 src/wc_avx512.c

diff --git a/configure.ac b/configure.ac
index 274eff42f..09485306c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -742,6 +742,37 @@ AM_CONDITIONAL([USE_AVX2_WC_LINECOUNT],
 
 CFLAGS=$ac_save_CFLAGS
 
+CFLAGS="-mavx512bw -mavx512f $CFLAGS"
+AC_MSG_CHECKING([for avx512 intrinsics])
+AC_CACHE_VAL([utils_cv_avx512_intrinsic_exists],[
+AC_LINK_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <x86intrin.h>
+
+    int
+    main (void)
+    {
+      __m512i matches = _mm512_setzero_si512 ();
+      __mmask64 mask = _mm512_movepi8_mask (matches);
+      int lines = __builtin_popcountll (mask);  /* as used by wc_avx512.c */
+      return (__builtin_cpu_supports ("avx512bw")
+              && __builtin_cpu_supports ("avx512f"));
+    }
+  ]])
+  ],[
+    utils_cv_avx512_intrinsic_exists=yes
+  ],[
+    utils_cv_avx512_intrinsic_exists=no
+  ])])
+AC_MSG_RESULT([$utils_cv_avx512_intrinsic_exists])
+if test $utils_cv_avx512_intrinsic_exists = yes; then
+  AC_DEFINE([USE_AVX512_WC_LINECOUNT], [1],
+            [Counting lines with AVX512 enabled])
+fi
+AM_CONDITIONAL([USE_AVX512_WC_LINECOUNT],
+               [test $utils_cv_avx512_intrinsic_exists = yes])
+
+CFLAGS=$ac_save_CFLAGS
 ############################################################################
 
 dnl Autogenerated by the 'gen-lists-of-programs.sh' auxiliary script.
diff --git a/src/local.mk b/src/local.mk
index 8f6d9a5d7..7fb1c071a 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -483,6 +483,13 @@ src_expand_SOURCES = src/expand.c src/expand-common.c
 src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
 
 src_wc_SOURCES = src/wc.c
+if USE_AVX512_WC_LINECOUNT
+noinst_LIBRARIES += src/libwc_avx512.a
+src_libwc_avx512_a_SOURCES = src/wc_avx512.c
+wc_avx512_ldadd = src/libwc_avx512.a
+src_wc_LDADD += $(wc_avx512_ldadd)
+src_libwc_avx512_a_CFLAGS = -mavx512bw -mavx512f  $(AM_CFLAGS)
+endif
 if USE_AVX2_WC_LINECOUNT
 noinst_LIBRARIES += src/libwc_avx2.a
 src_libwc_avx2_a_SOURCES = src/wc_avx2.c
diff --git a/src/wc.c b/src/wc.c
index 777277f23..243399393 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -134,14 +134,29 @@ static enum total_type total_mode = total_auto;
 static bool
 avx2_supported (void)
 {
-  bool avx_enabled = cpu_supports ("avx2");
-
+  bool avx2_enabled = cpu_supports ("avx2");
   if (debug)
-    error (0, 0, (avx_enabled
+    error (0, 0, (avx2_enabled
                   ? _("using avx2 hardware support")
                   : _("avx2 support not detected")));
 
-  return avx_enabled;
+  return avx2_enabled;
+}
+#endif
+
+#ifdef USE_AVX512_WC_LINECOUNT
+static bool
+avx512_supported (void)
+{
+  bool avx512_enabled = (cpu_supports ("avx512f")
+                         && cpu_supports ("avx512bw"));
+
+  if (debug)
+    error (0, 0, (avx512_enabled
+                  ? _("using avx512 hardware support")
+                  : _("avx512 support not detected")));
+
+  return avx512_enabled;
 }
 #endif
 
@@ -246,6 +261,13 @@ write_counts (uintmax_t lines,
 static struct wc_lines
 wc_lines (int fd)
 {
+#ifdef USE_AVX512_WC_LINECOUNT
+  static signed char use_avx512;
+  if (!use_avx512)
+    use_avx512 = avx512_supported () ? 1 : -1;
+  if (0 < use_avx512)
+    return wc_lines_avx512 (fd);
+#endif
 #ifdef USE_AVX2_WC_LINECOUNT
   static signed char use_avx2;
   if (!use_avx2)
diff --git a/src/wc.h b/src/wc.h
index a6b4c9e84..f151e92f2 100644
--- a/src/wc.h
+++ b/src/wc.h
@@ -1,3 +1,4 @@
 #include <stdint.h>
 struct wc_lines { int err; intmax_t lines; intmax_t bytes; };
 struct wc_lines wc_lines_avx2 (int);
+struct wc_lines wc_lines_avx512 (int);
diff --git a/src/wc_avx512.c b/src/wc_avx512.c
new file mode 100644
index 000000000..41faea646
--- /dev/null
+++ b/src/wc_avx512.c
@@ -0,0 +1,58 @@
+/* wc_avx512 - Count the number of newlines with avx512 instructions.
+   Copyright (C) 2021-2025 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include "wc.h"
+#include "system.h"
+#include "ioblksize.h"
+
+#include <x86intrin.h>
+
+/* Read FD until end of file or a read error, counting bytes and newlines.
+   Return { ERR, LINES, BYTES } where ERR is 0 at EOF, else the read errno.  */
+extern struct wc_lines
+wc_lines_avx512 (int fd)
+{
+  intmax_t lines = 0;
+  intmax_t bytes = 0;
+  __m512i endlines = _mm512_set1_epi8 ('\n');
+
+  while (true)
+    {
+      __m512i avx_buf[IO_BUFSIZE / sizeof (__m512i)];
+      ssize_t bytes_read = read (fd, avx_buf, sizeof avx_buf);
+      if (bytes_read <= 0)
+        return (struct wc_lines) { bytes_read == 0 ? 0 : errno, lines, bytes };
+
+      bytes += bytes_read;
+      __m512i *datap = avx_buf;
+
+      while (bytes_read >= 64)
+        {
+          __m512i to_match = _mm512_load_si512 (datap);
+          __mmask64 matches = _mm512_cmpeq_epi8_mask (to_match, endlines);
+          lines += __builtin_popcountll (matches);
+          datap += 1;
+          bytes_read -= 64;
+        }
+
+      /* Count newlines in the remaining tail of fewer than 64 bytes.  */
+      char *end = (char *) datap + bytes_read;
+      for (char *p = (char *) datap; p < end; p++)
+        lines += *p == '\n';
+    }
+}
diff --git a/tests/wc/wc-cpu.sh b/tests/wc/wc-cpu.sh
index 725817a7c..1a341c527 100755
--- a/tests/wc/wc-cpu.sh
+++ b/tests/wc/wc-cpu.sh
@@ -19,7 +19,7 @@
 . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
 print_ver_ wc
 
-GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2' \
+GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2,-AVX512BW,-AVX512F' \
  wc --debug /dev/null 2>debug || fail=1
 grep 'using.*hardware support' debug && fail=1
 
@@ -27,8 +27,16 @@ lines=$(shuf -i 0-1000 | head -n1)  || framework_failure_
 seq 1000 | head -n "$lines" > lines || framework_failure_
 
 wc_accelerated=$(wc -l < lines) || fail=1
-wc_base=$(GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2' wc -l < lines) || fail=1
+wc_accelerated_no_avx512=$(
+          GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F' \
+          wc -l < lines
+         ) || fail=1  # AVX512 masked off; AVX2 may still be used
+wc_base=$(
+          GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2,-AVX512BW,-AVX512F' \
+          wc -l < lines
+         ) || fail=1  # all vector paths masked off
 
 test "$wc_accelerated" = "$wc_base" || fail=1
+test "$wc_accelerated_no_avx512" = "$wc_base" || fail=1
 
 Exit $fail
-- 
2.51.0

mathieu@sophia:coreutils $ time ./src/wc -l --debug 
../1brc_data/measurements_1B.txt
wc: using avx512 hardware support
1000000000 ../1brc_data/measurements_1B.txt
./src/wc -l --debug ../1brc_data/measurements_1B.txt  0,10s user 0,65s system 
98% cpu 0,757 total
mathieu@sophia:coreutils $ time ./src/wc -l --debug 
../1brc_data/measurements_1B.txt
mathieu@sophia:coreutils $ time GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2' 
./src/wc --debug -l ../1brc_data/measurements_1B.txt
wc: using avx512 hardware support
1000000000 ../1brc_data/measurements_1B.txt
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2' ./src/wc --debug -l   0,09s user 0,65s 
system 98% cpu 0,746 total
mathieu@sophia:coreutils $ time GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2' 
./src/wc --debug -l ../1brc_data/measurements_1B.txt
wc: using avx512 hardware support
1000000000 ../1brc_data/measurements_1B.txt
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX2' ./src/wc --debug -l   0,10s user 0,64s 
system 99% cpu 0,744 total
mathieu@sophia:coreutils $ time 
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F' ./src/wc --debug -l 
../1brc_data/measurements_1B.txt
wc: avx512 support not detected
wc: using avx2 hardware support
1000000000 ../1brc_data/measurements_1B.txt
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F' ./src/wc --debug -l   
0,18s user 0,63s system 99% cpu 0,810 total
mathieu@sophia:coreutils $ time 
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F' ./src/wc --debug -l 
../1brc_data/measurements_1B.txt
wc: avx512 support not detected
wc: using avx2 hardware support
1000000000 ../1brc_data/measurements_1B.txt
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F' ./src/wc --debug -l   
0,16s user 0,65s system 98% cpu 0,815 total
mathieu@sophia:coreutils $ time 
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F' ./src/wc --debug -l 
../1brc_data/measurements_1B.txt
wc: avx512 support not detected
wc: using avx2 hardware support
1000000000 ../1brc_data/measurements_1B.txt
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F' ./src/wc --debug -l   
0,18s user 0,63s system 99% cpu 0,815 total
mathieu@sophia:coreutils $ time 
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F' ./src/wc --debug -l 
../1brc_data/measurements_1B.txt
wc: avx512 support not detected
wc: using avx2 hardware support
1000000000 ../1brc_data/measurements_1B.txt
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F' ./src/wc --debug -l   
0,16s user 0,64s system 99% cpu 0,813 total
mathieu@sophia:coreutils $ time 
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F,-AVX2' ./src/wc --debug -l 
../1brc_data/measurements_1B.txt
wc: avx512 support not detected
wc: avx2 support not detected
1000000000 ../1brc_data/measurements_1B.txt
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F,-AVX2' ./src/wc --debug -  
1,30s user 0,58s system 98% cpu 1,907 total
mathieu@sophia:coreutils $ time 
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F,-AVX2' ./src/wc --debug -l 
../1brc_data/measurements_1B.txt
wc: avx512 support not detected
wc: avx2 support not detected
1000000000 ../1brc_data/measurements_1B.txt
GLIBC_TUNABLES='glibc.cpu.hwcaps=-AVX512BW,-AVX512F,-AVX2' ./src/wc --debug -  
1,30s user 0,61s system 99% cpu 1,931 total

Reply via email to