From 7856df9ecd26102b525d8140425f14cf9fbcea77 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@fb.com>
Date: Mon, 21 Sep 2020 13:06:15 -0700
Subject: [PATCH 2/2] tests: test for many-regexp N^2 RSS regression

* tests/many-regex-performance: New test for this performance
regression.
* tests/Makefile.am: Add it.
* NEWS (Bug fixes): Describe it.
---
 NEWS                         |  8 ++++
 tests/Makefile.am            |  1 +
 tests/many-regex-performance | 79 ++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+)
 create mode 100755 tests/many-regex-performance

diff --git a/NEWS b/NEWS
index 79b9db0..442d4d2 100644
--- a/NEWS
+++ b/NEWS
@@ -39,6 +39,14 @@ GNU grep NEWS                                    -*- outline -*-
   A performance regression with many duplicate patterns has been fixed.
   [Bug#43040 introduced in grep 3.4]

+  An N^2 RSS performance regression with many patterns has been fixed
+  in common cases (no backref, and no use of -o or --color).
+  With only 80,000 lines of /usr/share/dict/linux.words, the following
+  would use 100GB of RSS and take 3 minutes. With the fix, it used less
+  than 400MB and took less than one second:
+    head -80000 /usr/share/dict/linux.words > w; grep -vf w w
+  [Bug#43527 introduced in grep 3.4]
+
 ** Build-related

   "make dist" builds .tar.gz files again, as they are still used in
diff --git a/tests/Makefile.am b/tests/Makefile.am
index 9d49e40..83e7087 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -120,6 +120,7 @@ TESTS =						\
   kwset-abuse					\
   long-line-vs-2GiB-read			\
   long-pattern-perf				\
+  many-regex-performance			\
   match-lines					\
   max-count-overread				\
   max-count-vs-context				\
diff --git a/tests/many-regex-performance b/tests/many-regex-performance
new file mode 100755
index 0000000..ee68a04
--- /dev/null
+++ b/tests/many-regex-performance
@@ -0,0 +1,79 @@
+#!/bin/sh
+# Test for this performance regression:
+# grep-3.4 would require O(N^2) RSS for N regexps
+# grep-3.5 requires O(N) in the most common cases.
+
+# Copyright 2020 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/init.sh"; path_prepend_ ../src
+
+fail=0
+
+# This test is susceptible to failure due to differences in
+# system load during the two test runs, so we'll mark it as
+# "expensive", making it less likely to be run by regular users.
+expensive_
+
+# Make the quick/small input large enough so that even on high-end
+# systems this first invocation takes at least 10ms of user time.
+word_list=/usr/share/dict/linux.words
+
+# If $word_list does not exist, generate an input that exhibits
+# similar performance characteristics.
+if ! test -f $word_list; then
+  # Generate data comparable to that word list.
+  # Note how all "words" start with "a", and that there is
+  # a small percentage of lines with at least one "." metachar.
+  # This requires /dev/urandom, so if it's not present, skip
+  # this test. If desperate, we could fall back to using
+  # tar+compressed lib/*.c as the data source.
+  test -r /dev/urandom \
+    || skip_ 'this system has neither word list nor working /dev/urandom'
+  word_list=word_list
+  ( echo a; cat /dev/urandom		\
+    | LC_ALL=C tr -dc 'a-zA-Z0-9_'	\
+    | head -c500000			\
+    | sed 's/\(........\)/\1\n/g'	\
+    | sed s/rs/./			\
+    | sed s/./a/			\
+    | sort				\
+  ) > $word_list
+fi
+
+n_lines=2000
+while :; do
+  sed ${n_lines}q < $word_list > in || framework_failure_
+  small_ms=$(LC_ALL=C user_time_ 1 grep --file=in -v in) || fail=1
+  test $small_ms -ge 10 && break
+  n_lines=$(expr $n_lines + 2000)
+done
+
+# Now, run it again, but with 20 times as many lines.
+n_lines=$(expr $n_lines \* 20)
+sed ${n_lines}q < $word_list > in || framework_failure_
+large_ms=$(LC_ALL=C user_time_ 1 grep --file=in -v in) || fail=1
+
+# Deliberately recording in an unused variable so it
+# shows up in set -x output, in case this test fails.
+ratio=$(expr "$large_ms" / "$small_ms")
+
+# The duration of the larger run must be no more than 60 times
+# that of the small one.  Using recent versions prior to this fix,
+# this test would fail due to ratios larger than 300.  Using the
+# fixed version, it's common to see a ratio of 20-30.
+returns_ 1 expr $small_ms '<' $large_ms / 60 || fail=1
+
+Exit $fail
-- 
2.28.0.546.g385c171a01

