Hello,

On 2021-03-29 7:21 a.m., Pádraig Brady wrote:

On 28/03/2021 18:29, Kristoffer Brånemyr via GNU coreutils General
I wanted to practice some more using vector intrinsics, so I made a small AVX2 optimization for wc -l. Depending on line length it is about 2-5x faster than previous version. (Well, only looking at user time it is much faster than that even.)

Excellent results.
I'll review this very soon.


I'm attaching the patch (copied from the Github's pull-request),
hopefully we can continue the discussion here on the mailing list.

-assaf
>From 462386ea5aad1b1673f7c1bc51983374aad325a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristoffer=20Br=C3=A5nemyr?= <zti...@yahoo.se>
Date: Sat, 20 Feb 2021 12:27:17 +0100
Subject: [PATCH] wc: Add AVX2 optimization when counting only lines

---
 configure.ac   |  46 ++++++++++++++
 po/POTFILES.in |   1 +
 src/local.mk   |   9 +++
 src/wc.c       | 162 ++++++++++++++++++++++++++++++++++++-------------
 src/wc_avx2.c  | 115 +++++++++++++++++++++++++++++++++++
 5 files changed, 290 insertions(+), 43 deletions(-)
 create mode 100644 src/wc_avx2.c

diff --git a/configure.ac b/configure.ac
index 7fbecbf8d..8186b88f1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -575,6 +575,52 @@ AM_CONDITIONAL([USE_PCLMUL_CRC32],
                 test "x$pclmul_intrinsic_exists" = "xyes"])
 CFLAGS=$ac_save_CFLAGS
 
+AC_MSG_CHECKING([if __get_cpuid_count exists])
+AC_COMPILE_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <cpuid.h>
+
+    int main(void)
+    {
+      unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
+      __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+      return 1;
+    }
+  ]])
+  ],[
+    AC_MSG_RESULT([yes])
+    get_cpuid_count_exists=yes
+  ],[
+    AC_MSG_RESULT([no])
+  ])
+
+CFLAGS="-mavx2 $CFLAGS"
+AC_MSG_CHECKING([if avx2 intrinstics exists])
+AC_COMPILE_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <x86intrin.h>
+
+    int main(void)
+    {
+      __m256i a, b;
+      a = _mm256_sad_epu8(a, b);
+      return 1;
+    }
+  ]])
+  ],[
+    AC_MSG_RESULT([yes])
+    AC_DEFINE([HAVE_AVX2_INTRINSIC], [1], [avx2 intrinsics exists])
+    avx2_intrinsic_exists=yes
+  ],[
+    AC_MSG_RESULT([no])
+  ])
+if test "x$get_cpuid_count_exists" = "xyes" && test "x$avx2_intrinsic_exists" = "xyes"; then
+  AC_DEFINE([USE_AVX2_WC_LINECOUNT], [1], [Counting lines with AVX2 enabled])
+fi
+AM_CONDITIONAL([USE_AVX2_WC_LINECOUNT], [test "x$get_cpuid_count_exists" = "xyes" && test "x$avx2_intrinsic_exists" = "xyes"])
+
+CFLAGS=$ac_save_CFLAGS
+
 ############################################################################
 
 dnl Autogenerated by the 'gen-lists-of-programs.sh' auxiliary script.
diff --git a/po/POTFILES.in b/po/POTFILES.in
index b5f5bbff1..dc80762db 100644
--- a/po/POTFILES.in
+++ b/po/POTFILES.in
@@ -142,6 +142,7 @@ src/unlink.c
 src/uptime.c
 src/users.c
 src/wc.c
+src/wc_avx2.c
 src/who.c
 src/whoami.c
 src/yes.c
diff --git a/src/local.mk b/src/local.mk
index 8c8479a53..c6555dafb 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -427,6 +427,15 @@ src_basenc_CPPFLAGS = -DBASE_TYPE=42 $(AM_CPPFLAGS)
 src_expand_SOURCES = src/expand.c src/expand-common.c
 src_unexpand_SOURCES = src/unexpand.c src/expand-common.c
 
+src_wc_SOURCES = src/wc.c
+if USE_AVX2_WC_LINECOUNT
+noinst_LIBRARIES += src/libwc_avx2.a
+src_libwc_avx2_a_SOURCES = src/wc_avx2.c
+wc_avx2_ldadd = src/libwc_avx2.a
+src_wc_LDADD += $(wc_avx2_ldadd)
+src_libwc_avx2_a_CFLAGS = -mavx2 $(AM_CFLAGS)
+endif
+
 # Ensure we don't link against libcoreutils.a as that lib is
 # not compiled with -fPIC which causes issues on 64 bit at least
 src_libstdbuf_so_LDADD = $(LIBINTL)
diff --git a/src/wc.c b/src/wc.c
index 5216db189..1ecec0d83 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -37,6 +37,9 @@
 #include "safe-read.h"
 #include "stat-size.h"
 #include "xbinary-io.h"
+#ifdef USE_AVX2_WC_LINECOUNT
+#include <cpuid.h>
+#endif
 
 #if !defined iswspace && !HAVE_ISWSPACE
 # define iswspace(wc) \
@@ -53,6 +56,15 @@
 /* Size of atomic reads. */
 #define BUFFER_SIZE (16 * 1024)
 
+static
+bool wc_lines(const char *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out);
+#ifdef USE_AVX2_WC_LINECOUNT
+/* From wc_avx2.c */
+bool wc_lines_avx2(const char *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out);
+#endif
+bool (*wc_lines_p)(const char *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out) = wc_lines;
+
+
 /* Cumulative number of lines, words, chars and bytes in all files so far.
    max_line_length is the maximum over all files processed so far.  */
 static uintmax_t total_lines;
@@ -108,6 +120,41 @@ static struct option const longopts[] =
   {NULL, 0, NULL, 0}
 };
 
+#ifdef USE_AVX2_WC_LINECOUNT
+static bool
+avx2_supported(void)
+{
+  unsigned int eax = 0;
+  unsigned int ebx = 0;
+  unsigned int ecx = 0;
+  unsigned int edx = 0;
+
+  if (! __get_cpuid(1, &eax, &ebx, &ecx, &edx))
+    {
+      return false;
+    }
+
+  if (! (ecx & bit_OSXSAVE))
+    {
+      return false;
+    }
+
+  eax = ebx = ecx = edx = 0;
+
+  if (! __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
+    {
+      return false;
+    }
+
+  if (! (ebx & bit_AVX2))
+    {
+      return false;
+    }
+
+  return true;
+}
+#endif
+
 void
 usage (int status)
 {
@@ -208,6 +255,70 @@ write_counts (uintmax_t lines,
   putchar ('\n');
 }
 
+static
+bool wc_lines(const char *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
+{
+  size_t bytes_read;
+  uintmax_t lines, bytes;
+  char buf[BUFFER_SIZE + 1];
+  bool long_lines = false;
+
+  if (!lines_out || !bytes_out)
+    {
+      return false;
+    }
+
+  lines = bytes = 0;
+
+  while ((bytes_read = safe_read (fd, buf, BUFSIZ)) > 0)
+    {
+
+      if (bytes_read == SAFE_READ_ERROR)
+        {
+          error (0, errno, "%s", quotef (file));
+          return false;
+        }
+
+      bytes += bytes_read;
+
+      char *p = buf;
+      char *end = buf + bytes_read;
+      uintmax_t plines = lines;
+
+      if (! long_lines)
+        {
+          /* Avoid function call overhead for shorter lines.  */
+          while (p != end)
+            lines += *p++ == '\n';
+        }
+      else
+        {
+          /* memchr is more efficient with longer lines.  */
+          while ((p = memchr (p, '\n', end - p)))
+            {
+              ++p;
+              ++lines;
+            }
+        }
+
+      /* If the average line length in the block is >= 15, then use
+          memchr for the next block, where system specific optimizations
+          may outweigh function call overhead.
+          FIXME: This line length was determined in 2015, on both
+          x86_64 and ppc64, but it's worth re-evaluating in future with
+          newer compilers, CPUs, or memchr() implementations etc.  */
+      if (lines - plines <= bytes_read / 15)
+        long_lines = true;
+      else
+        long_lines = false;
+    }
+
+  *bytes_out = bytes;
+  *lines_out = lines;
+
+  return true;
+}
+
 /* Count words.  FILE_X is the name of the file (or NULL for standard
    input) that is open on descriptor FD.  *FSTATUS is its status.
    CURRENT_POS is the current file offset if known, negative if unknown.
@@ -312,49 +423,7 @@ wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos)
     {
       /* Use a separate loop when counting only lines or lines and bytes --
          but not chars or words.  */
-      bool long_lines = false;
-      while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0)
-        {
-          if (bytes_read == SAFE_READ_ERROR)
-            {
-              error (0, errno, "%s", quotef (file));
-              ok = false;
-              break;
-            }
-
-          bytes += bytes_read;
-
-          char *p = buf;
-          char *end = p + bytes_read;
-          uintmax_t plines = lines;
-
-          if (! long_lines)
-            {
-              /* Avoid function call overhead for shorter lines.  */
-              while (p != end)
-                lines += *p++ == '\n';
-            }
-          else
-            {
-              /* memchr is more efficient with longer lines.  */
-              while ((p = memchr (p, '\n', end - p)))
-                {
-                  ++p;
-                  ++lines;
-                }
-            }
-
-          /* If the average line length in the block is >= 15, then use
-             memchr for the next block, where system specific optimizations
-             may outweigh function call overhead.
-             FIXME: This line length was determined in 2015, on both
-             x86_64 and ppc64, but it's worth re-evaluating in future with
-             newer compilers, CPUs, or memchr() implementations etc.  */
-          if (lines - plines <= bytes_read / 15)
-            long_lines = true;
-          else
-            long_lines = false;
-        }
+      ok = wc_lines_p(file, fd, &lines, &bytes);
     }
 #if MB_LEN_MAX > 1
 # define SUPPORT_OLD_MBRTOWC 1
@@ -706,6 +775,13 @@ main (int argc, char **argv)
   print_linelength = false;
   total_lines = total_words = total_chars = total_bytes = max_line_length = 0;
 
+#ifdef USE_AVX2_WC_LINECOUNT
+  if (avx2_supported())
+    {
+      wc_lines_p = wc_lines_avx2;
+    }
+#endif
+
   while ((optc = getopt_long (argc, argv, "clLmw", longopts, NULL)) != -1)
     switch (optc)
       {
diff --git a/src/wc_avx2.c b/src/wc_avx2.c
new file mode 100644
index 000000000..d49ad17b4
--- /dev/null
+++ b/src/wc_avx2.c
@@ -0,0 +1,115 @@
+/* wc - print the number of lines, words, and bytes in files
+   Copyright (C) 1985-2021 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include "system.h"
+#include "die.h"
+#include "safe-read.h"
+
+#include <x86intrin.h>
+
+/* This must be below 16 KB (16384) or else the accumulators can theoretically overflow,
+   producing wrong result. This is 2*32 bytes below, so there is no single bytes in the
+   optimal case. */
+#define BUFSIZE (16320)
+
+bool wc_lines_avx2(const char *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out);
+
+bool wc_lines_avx2(const char *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out)
+{
+  __m256i accumulator;
+  __m256i accumulator2;
+  __m256i zeroes;
+  __m256i endlines;
+  __m256i avx_buf[BUFSIZE / sizeof(__m256i)];
+  __m256i *datap;
+  uintmax_t lines = 0;
+  uintmax_t bytes = 0;
+  size_t bytes_read = 0;
+
+
+  if (!lines_out || !bytes_out)
+    {
+      return false;
+    }
+
+  /* Using two parallel accumulators gave a good performance increase.
+     Adding a third gave no additional benefit, at least on an Intel Xeon E3-1231v3.
+     Maybe on a newer CPU with additional vector execution engines it would be a win. */
+  accumulator = _mm256_setzero_si256();
+  accumulator2 = _mm256_setzero_si256();
+  zeroes = _mm256_setzero_si256();
+  endlines = _mm256_set1_epi8('\n');
+
+  while ((bytes_read = safe_read (fd, avx_buf, sizeof(avx_buf))) > 0)
+    {
+      __m256i to_match;
+      __m256i to_match2;
+      __m256i matches;
+      __m256i matches2;
+
+      if (bytes_read == SAFE_READ_ERROR)
+        {
+          error (0, errno, "%s", quotef (file));
+
+          return false;
+        }
+
+      bytes += bytes_read;
+
+      datap = avx_buf;
+      char *end = ((char *)avx_buf) + bytes_read;
+
+      while (bytes_read >= 64)
+        {
+          to_match = _mm256_load_si256(datap);
+          to_match2 = _mm256_load_si256(datap + 1);
+
+          matches = _mm256_cmpeq_epi8(to_match, endlines);
+          matches2 = _mm256_cmpeq_epi8(to_match2, endlines);
+          /* Compare will set each 8 bit integer in the register to 0xFF on match.
+             When we subtract it the 8 bit accumulators will underflow, so this is equal to adding 1. */
+          accumulator = _mm256_sub_epi8(accumulator, matches);
+          accumulator2 = _mm256_sub_epi8(accumulator2, matches2);
+
+          datap += 2;
+          bytes_read -= 64;
+        }
+
+      /* Horizontally add all 8 bit integers in the register, and then reset it */
+      accumulator = _mm256_sad_epu8(accumulator, zeroes);
+      lines += _mm256_extract_epi16(accumulator, 0) + _mm256_extract_epi16(accumulator, 4) +
+                _mm256_extract_epi16(accumulator, 8) + _mm256_extract_epi16(accumulator, 12);
+      accumulator = _mm256_setzero_si256();
+
+      accumulator2 = _mm256_sad_epu8(accumulator2, zeroes);
+      lines += _mm256_extract_epi16(accumulator2, 0) + _mm256_extract_epi16(accumulator2, 4) +
+                _mm256_extract_epi16(accumulator2, 8) + _mm256_extract_epi16(accumulator2, 12);
+      accumulator2 = _mm256_setzero_si256();
+
+      /* Finish up any left over bytes */
+      char *p = (char *)datap;
+      while (p != end)
+        lines += *p++ == '\n';
+
+    }
+
+  *lines_out = lines;
+  *bytes_out = bytes;
+
+  return true;
+}

Reply via email to