This patch tunes class best_match's cutoff for rejecting meaningless
spelling suggestions.

Previously, we allowed an edit distance of up to half of the length of the
longer of the goal string and closest candidate strings, rounded down.

With this patch, we now allow only up to a third - with some tuning of
rounding (and for very short strings), to ensure that:
(a) everything that worked before still works (with the removal of a
couple of cases that shouldn't), and that
(b) the new threshold is always at least as conservative as the old
threshold and thus shouldn't offer new nonsensical suggestions (with
the possible exception of cases where transposition has helped; see
r261521 aka Damerau-Levenshtein; PR other/69968).

In particular, all of the bogus suggestions from PR c/82967 are now
no longer offered.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu;
adds 4 PASS results to gcc.sum.

OK for trunk?

OK for backporting to gcc 8? (with the caveat that gcc 8's
edit distance doesn't do transpositions)

gcc/ChangeLog:
        PR c/82967
        * spellcheck.c (get_edit_distance_cutoff): New function.
        (selftest::test_edit_distance_unit_test_oneway): Rename to...
        (selftest::test_get_edit_distance_one_way): ...this.
        (selftest::test_get_edit_distance_unit): Rename to...
        (selftest::test_get_edit_distance_both_ways): ...this.
        (selftest::test_edit_distances): Move tests to this new function,
        and test some more pairs of strings.  Update for above renaming.
        (selftest::get_old_cutoff): New function.
        (selftest::test_get_edit_distance_cutoff): New function.
        (selftest::assert_suggested_for): New function.
        (ASSERT_SUGGESTED_FOR): New macro.
        (selftest::assert_not_suggested_for): New function.
        (ASSERT_NOT_SUGGESTED_FOR): New macro.
        (selftest::test_suggestions): New function.
        (selftest::spellcheck_c_tests): Move test_get_edit_distance_unit
        tests to selftest::test_edit_distances and call it.  Add calls to
        selftest::test_get_edit_distance_cutoff and
        selftest::test_suggestions.
        * spellcheck.h (get_edit_distance_cutoff): New function declaration.
        (best_match::consider): Replace hard-coded cutoff calculation with
        a call to...
        (best_match::get_cutoff): New declaration.
        (best_match::get_best_meaningful_candidate): Likewise.

gcc/testsuite/ChangeLog:
        PR c/82967
        * c-c++-common/attributes-1.c: Remove bogus suggestion from
        dg-prune-output.
        * gcc.dg/diagnostic-token-ranges.c (undeclared_identifier): Remove
        bogus suggestion.
        * gcc.dg/spellcheck-identifiers-4.c: New test.
---
 gcc/spellcheck.c                                | 231 ++++++++++++++++++++----
 gcc/spellcheck.h                                |  19 +-
 gcc/testsuite/c-c++-common/attributes-1.c       |   2 +-
 gcc/testsuite/gcc.dg/diagnostic-token-ranges.c  |   3 +-
 gcc/testsuite/gcc.dg/spellcheck-identifiers-4.c |  10 +
 5 files changed, 224 insertions(+), 41 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/spellcheck-identifiers-4.c

diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
index e2a8b92..690e6fa 100644
--- a/gcc/spellcheck.c
+++ b/gcc/spellcheck.c
@@ -162,6 +162,36 @@ find_closest_string (const char *target,
   return bm.get_best_meaningful_candidate ();
 }
 
+/* Generate the maximum edit distance for which we consider a suggestion
+   to be meaningful, given a goal of length GOAL_LEN and a candidate of
+   length CANDIDATE_LEN.
+
+   This is a third of the the length of the candidate or of the goal,
+   whichever is bigger.  */
+
+edit_distance_t
+get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
+{
+  size_t max_length = MAX (goal_len, candidate_len);
+  size_t min_length = MIN (goal_len, candidate_len);
+
+  gcc_assert (max_length >= min_length);
+
+  /* Special case: don't offer suggestions for a pair of
+     length == 1 strings (or empty strings).  */
+  if (max_length <= 1)
+    return 0;
+
+  /* If the lengths are close, then round down.  */
+  if (max_length - min_length <= 1)
+    /* ...but allow an edit distance of at least 1.  */
+    return MAX (max_length / 3, 1);
+
+  /* Otherwise, round up (thus giving a little extra leeway to some cases
+     involving insertions/deletions).  */
+  return (max_length + 2) / 3;
+}
+
 #if CHECKING_P
 
 namespace selftest {
@@ -171,8 +201,8 @@ namespace selftest {
 /* Verify that get_edit_distance (A, B) equals the expected value.  */
 
 static void
-test_edit_distance_unit_test_oneway (const char *a, const char *b,
-                                   edit_distance_t expected)
+test_get_edit_distance_one_way (const char *a, const char *b,
+                               edit_distance_t expected)
 {
   edit_distance_t actual = get_edit_distance (a, b);
   ASSERT_EQ (actual, expected);
@@ -185,11 +215,169 @@ test_edit_distance_unit_test_oneway (const char *a, 
const char *b,
    equal the expected value, to ensure that the function is symmetric.  */
 
 static void
-test_get_edit_distance_unit (const char *a, const char *b,
+test_get_edit_distance_both_ways (const char *a, const char *b,
                             edit_distance_t expected)
 {
-  test_edit_distance_unit_test_oneway (a, b, expected);
-  test_edit_distance_unit_test_oneway (b, a, expected);
+  test_get_edit_distance_one_way (a, b, expected);
+  test_get_edit_distance_one_way (b, a, expected);
+}
+
+/* Verify get_edit_distance for a variety of pairs of pre-canned
+   inputs, comparing against known-good values.  */
+
+static void
+test_edit_distances ()
+{
+  test_get_edit_distance_both_ways ("", "nonempty", strlen ("nonempty"));
+  test_get_edit_distance_both_ways ("saturday", "sunday", 3);
+  test_get_edit_distance_both_ways ("foo", "m_foo", 2);
+  test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 3);
+  test_get_edit_distance_both_ways
+    ("the quick brown fox jumps over the lazy dog", "dog", 40);
+  test_get_edit_distance_both_ways
+    ("the quick brown fox jumps over the lazy dog",
+     "the quick brown dog jumps over the lazy fox",
+     4);
+  test_get_edit_distance_both_ways
+    ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
+     "All your base are belong to us",
+     44);
+  test_get_edit_distance_both_ways ("foo", "FOO", 3);
+  test_get_edit_distance_both_ways ("fee", "deed", 2);
+  test_get_edit_distance_both_ways ("coorzd1", "coordx1", 2);
+  test_get_edit_distance_both_ways ("assert", "sqrt", 3);
+  test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", 3);
+  test_get_edit_distance_both_ways ("time", "nice", 2);
+  test_get_edit_distance_both_ways ("bar", "carg", 2);
+  test_get_edit_distance_both_ways ("gtk_widget_show_all",
+                                   "GtkWidgetShowAll", 7);
+  test_get_edit_distance_both_ways ("m_bar", "bar", 2);
+  test_get_edit_distance_both_ways ("MACRO", "MACRAME", 3);
+  test_get_edit_distance_both_ways ("ab", "ac", 1);
+  test_get_edit_distance_both_ways ("ab", "a", 1);
+  test_get_edit_distance_both_ways ("a", "b", 1);
+  test_get_edit_distance_both_ways ("nanl", "name", 2);
+  test_get_edit_distance_both_ways ("char", "bar", 2);
+  test_get_edit_distance_both_ways ("-optimize", "fsanitize", 5);
+  test_get_edit_distance_both_ways ("__DATE__", "__i386__", 4);
+
+  /* Examples where transposition helps.  */
+  test_get_edit_distance_both_ways ("ab", "ba", 1);
+  test_get_edit_distance_both_ways ("ba", "abc", 2);
+  test_get_edit_distance_both_ways ("coorzd1", "coordz1", 1);
+  test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
+                                   "bacdefghijklmnopqrstuvwxzy", 2);
+  test_get_edit_distance_both_ways ("saturday", "sundya", 4);
+  test_get_edit_distance_both_ways ("signed", "singed", 1);
+}
+
+/* Subroutine of test_get_edit_distance_cutoff, for emulating the
+   spellchecking cutoff in up to GCC 8.  */
+
+static edit_distance_t
+get_old_cutoff (size_t goal_len, size_t candidate_len)
+{
+  return MAX (goal_len, candidate_len) / 2;
+}
+
+/* Verify that the cutoff for "meaningfulness" of suggestions is at least as
+   conservative as in older GCC releases.
+
+   This should ensure that we don't offer additional meaningless
+   suggestions (apart from those for which transposition has helped).  */
+
+static void
+test_get_edit_distance_cutoff ()
+{
+  for (size_t goal_len = 0; goal_len < 30; goal_len++)
+    for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
+      ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
+                  <= get_old_cutoff (goal_len, candidate_len));
+}
+
+/* Assert that CANDIDATE is offered as a suggestion for TARGET.  */
+
+static void
+assert_suggested_for (const location &loc, const char *candidate,
+                     const char *target)
+{
+  auto_vec<const char *> candidates;
+  candidates.safe_push (candidate);
+  ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
+}
+
+/* Assert that CANDIDATE is offered as a suggestion for TARGET.  */
+
+#define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET)                        \
+  SELFTEST_BEGIN_STMT                                                  \
+    assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET);       \
+  SELFTEST_END_STMT
+
+/* Assert that CANDIDATE is not offered as a suggestion for TARGET.  */
+
+static void
+assert_not_suggested_for (const location &loc, const char *candidate,
+                         const char *target)
+{
+  auto_vec<const char *> candidates;
+  candidates.safe_push (candidate);
+  ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates));
+}
+
+/* Assert that CANDIDATE is not offered as a suggestion for TARGET.  */
+
+#define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET)                    \
+  SELFTEST_BEGIN_STMT                                                  \
+    assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET);   \
+  SELFTEST_END_STMT
+
+/* Verify that we offer varous suggestions that are meaningful,
+   and that we don't offer various other ones that aren't (PR c/82967).  */
+
+static void
+test_suggestions ()
+{
+  /* Good suggestions.  */
+
+  ASSERT_SUGGESTED_FOR ("m_bar", "bar");
+  // dist == 2, max_length == 5, min_length == 3
+
+  ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
+  // dist == 3, max_length == 7, min_length == 5
+
+  ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
+  // dist == 7, max_length == 16, min_length = 19
+
+  ASSERT_SUGGESTED_FOR ("ab", "ac");
+  // dist == 1, max_length == min_length = 2
+
+  ASSERT_SUGGESTED_FOR ("ab", "a");
+  // dist == 1, max_length == 2, min_length = 1
+
+  /* Bad suggestions.  */
+
+  ASSERT_NOT_SUGGESTED_FOR ("a", "b");
+  // dist == 1, max_length == min_length = 1
+
+  ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
+  // dist == 3, max_length 6, min_length == 4
+
+  ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
+  // dist == 3, max_length == min_length == 8
+
+  ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
+  ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
+  // dist == 2, max_length == min_length == 4
+
+  ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
+  ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
+  // dist == 2, max_length == 4, min_length == 3
+
+  ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
+  // dist == 5, max_length == min_length == 9
+
+  ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
+  // dist == 4, max_length == min_length == 8
 }
 
 /* Verify that find_closest_string is sane.  */
@@ -291,39 +479,14 @@ test_metric_conditions ()
     }
 }
 
-/* Verify get_edit_distance for a variety of pairs of pre-canned
-   inputs, comparing against known-good values.  */
+/* Run all of the selftests within this file.  */
 
 void
 spellcheck_c_tests ()
 {
-  test_get_edit_distance_unit ("", "nonempty", strlen ("nonempty"));
-  test_get_edit_distance_unit ("saturday", "sunday", 3);
-  test_get_edit_distance_unit ("foo", "m_foo", 2);
-  test_get_edit_distance_unit ("hello_world", "HelloWorld", 3);
-  test_get_edit_distance_unit
-    ("the quick brown fox jumps over the lazy dog", "dog", 40);
-  test_get_edit_distance_unit
-    ("the quick brown fox jumps over the lazy dog",
-     "the quick brown dog jumps over the lazy fox",
-     4);
-  test_get_edit_distance_unit
-    ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
-     "All your base are belong to us",
-     44);
-  test_get_edit_distance_unit ("foo", "FOO", 3);
-  test_get_edit_distance_unit ("fee", "deed", 2);
-  test_get_edit_distance_unit ("coorzd1", "coordx1", 2);
-
-  /* Examples where transposition helps.  */
-  test_get_edit_distance_unit ("ab", "ba", 1);
-  test_get_edit_distance_unit ("ba", "abc", 2);
-  test_get_edit_distance_unit ("coorzd1", "coordz1", 1);
-  test_get_edit_distance_unit ("abcdefghijklmnopqrstuvwxyz",
-                              "bacdefghijklmnopqrstuvwxzy", 2);
-  test_get_edit_distance_unit ("saturday", "sundya", 4);
-  test_get_edit_distance_unit ("signed", "singed", 1);
-
+  test_edit_distances ();
+  test_get_edit_distance_cutoff ();
+  test_suggestions ();
   test_find_closest_string ();
   test_metric_conditions ();
 }
diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h
index 41ac16f..e8fa77c 100644
--- a/gcc/spellcheck.h
+++ b/gcc/spellcheck.h
@@ -66,6 +66,9 @@ struct edit_distance_traits<const char *>
   }
 };
 
+extern edit_distance_t get_edit_distance_cutoff (size_t goal_len,
+                                                size_t candidate_len);
+
 /* A type for use when determining the best match against a string,
    expressed as a template so that we can match against various
    string-like types (const char *, frontend identifiers, and preprocessor
@@ -119,7 +122,7 @@ class best_match
     /* If the candidate will be unable to beat the criterion in
        get_best_meaningful_candidate, reject it without computing
        the exact distance.  */
-    unsigned int cutoff = MAX (m_goal_len, candidate_len) / 2;
+    edit_distance_t cutoff = get_cutoff (candidate_len);
     if (min_candidate_distance > cutoff)
       return;
 
@@ -151,17 +154,25 @@ class best_match
     m_best_candidate_len = best_candidate_len;
   }
 
+  /* Generate the maximum edit distance for which we consider a suggestion
+     to be meaningful, given a candidate of length CANDIDATE_LEN.  */
+
+  edit_distance_t get_cutoff (size_t candidate_len) const
+  {
+    return ::get_edit_distance_cutoff (m_goal_len, candidate_len);
+  }
+
   /* Get the best candidate so far, but applying a filter to ensure
      that we return NULL if none of the candidates are close to the goal,
      to avoid offering nonsensical suggestions to the user.  */
 
   candidate_t get_best_meaningful_candidate () const
   {
-    /* If more than half of the letters were misspelled, the suggestion is
-       likely to be meaningless.  */
+    /* If the edit distance is too high, the suggestion is likely to be
+       meaningless.  */
     if (m_best_candidate)
       {
-       unsigned int cutoff = MAX (m_goal_len, m_best_candidate_len) / 2;
+       edit_distance_t cutoff = get_cutoff (m_best_candidate_len);
        if (m_best_distance > cutoff)
          return NULL;
     }
diff --git a/gcc/testsuite/c-c++-common/attributes-1.c 
b/gcc/testsuite/c-c++-common/attributes-1.c
index c76339f..1657da1 100644
--- a/gcc/testsuite/c-c++-common/attributes-1.c
+++ b/gcc/testsuite/c-c++-common/attributes-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-prune-output "undeclared here \\(not in a function\\); did you mean 
.char..|\[^\n\r\]* was not declared in this scope" } */
+/* { dg-prune-output "undeclared here \\(not in a function\\)|\[^\n\r\]* was 
not declared in this scope" } */
 
 void* my_calloc(unsigned, unsigned) __attribute__((alloc_size(1,bar))); /* { 
dg-warning "outside range" } */
 void* my_realloc(void*, unsigned) __attribute__((alloc_size(bar))); /* { 
dg-warning "outside range" } */
diff --git a/gcc/testsuite/gcc.dg/diagnostic-token-ranges.c 
b/gcc/testsuite/gcc.dg/diagnostic-token-ranges.c
index 2ef7a01..406bd32 100644
--- a/gcc/testsuite/gcc.dg/diagnostic-token-ranges.c
+++ b/gcc/testsuite/gcc.dg/diagnostic-token-ranges.c
@@ -8,12 +8,11 @@ long double nanl (const char *);
 
 void undeclared_identifier (void)
 {
-  name; /* { dg-error "'name' undeclared .first use in this function.; did you 
mean .nanl." } */
+  name; /* { dg-error "'name' undeclared" } */
 /*
 { dg-begin-multiline-output "" }
    name;
    ^~~~
-   nanl
 { dg-end-multiline-output "" }
 */
 }
diff --git a/gcc/testsuite/gcc.dg/spellcheck-identifiers-4.c 
b/gcc/testsuite/gcc.dg/spellcheck-identifiers-4.c
new file mode 100644
index 0000000..f9b7d8d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/spellcheck-identifiers-4.c
@@ -0,0 +1,10 @@
+/* { dg-options "-Wimplicit-function-declaration" } */
+
+extern double sqrt (double);
+
+void test (float pf, float inff)
+{
+  assert (pf == inff); /* { dg-bogus "sqrt" } */
+  /* { dg-warning "implicit declaration of function 'assert'" "" { target 
*-*-* } .-1 } */
+  /* { dg-message "header '<assert.h>'" "" { target *-*-* } .-2 } */
+}
-- 
1.8.5.3

Reply via email to