diff gcc/spellcheck.c @ 131:84e7813d76e9

gcc-8.2
author mir3636
date Thu, 25 Oct 2018 07:37:49 +0900
parents 04ced10e8804
children 1830386684a0
line wrap: on
line diff
--- a/gcc/spellcheck.c	Fri Oct 27 22:46:09 2017 +0900
+++ b/gcc/spellcheck.c	Thu Oct 25 07:37:49 2018 +0900
@@ -1,5 +1,5 @@
 /* Find near-matches for strings.
-   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   Copyright (C) 2015-2018 Free Software Foundation, Inc.
 
 This file is part of GCC.
 
@@ -25,15 +25,18 @@
 #include "spellcheck.h"
 #include "selftest.h"
 
-/* The Levenshtein distance is an "edit-distance": the minimal
-   number of one-character insertions, removals or substitutions
-   that are needed to change one string into another.
+/* Get the edit distance between the two strings: the minimal
+   number of edits that are needed to change one string into another,
+   where edits can be one-character insertions, removals, or substitutions,
+   or transpositions of two adjacent characters (counting as one "edit").
 
-   This implementation uses the Wagner-Fischer algorithm.  */
+   This implementation uses the Wagner-Fischer algorithm for the
+   Damerau-Levenshtein distance; specifically, the "optimal string alignment
+   distance" or "restricted edit distance" variant.  */
 
 edit_distance_t
-levenshtein_distance (const char *s, int len_s,
-		      const char *t, int len_t)
+get_edit_distance (const char *s, int len_s,
+		   const char *t, int len_t)
 {
   const bool debug = false;
 
@@ -49,76 +52,86 @@
     return len_s;
 
   /* We effectively build a matrix where each (i, j) contains the
-     Levenshtein distance between the prefix strings s[0:j]
-     and t[0:i].
+     distance between the prefix strings s[0:j] and t[0:i].
      Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
-     we simply keep track of the last row, v0 and a new row, v1,
-     which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
-     in favor of two (len_s + 1) allocations.  These could potentially be
+     we simply keep track of the last two rows, v_one_ago and v_two_ago,
+     and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
+     allocation and memory accesses in favor of three (len_s + 1)
+     allocations.  These could potentially be
      statically-allocated if we impose a maximum length on the
      strings of interest.  */
-  edit_distance_t *v0 = new edit_distance_t[len_s + 1];
-  edit_distance_t *v1 = new edit_distance_t[len_s + 1];
+  edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
+  edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
+  edit_distance_t *v_next = new edit_distance_t[len_s + 1];
 
   /* The first row is for the case of an empty target string, which
      we can reach by deleting every character in the source string.  */
   for (int i = 0; i < len_s + 1; i++)
-    v0[i] = i;
+    v_one_ago[i] = i;
 
   /* Build successive rows.  */
   for (int i = 0; i < len_t; i++)
     {
       if (debug)
 	{
-	  printf ("i:%i v0 = ", i);
+	  printf ("i:%i v_one_ago = ", i);
 	  for (int j = 0; j < len_s + 1; j++)
-	    printf ("%i ", v0[j]);
+	    printf ("%i ", v_one_ago[j]);
 	  printf ("\n");
 	}
 
       /* The initial column is for the case of an empty source string; we
 	 can reach prefixes of the target string of length i
 	 by inserting i characters.  */
-      v1[0] = i + 1;
+      v_next[0] = i + 1;
 
       /* Build the rest of the row by considering neighbors to
 	 the north, west and northwest.  */
       for (int j = 0; j < len_s; j++)
 	{
 	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
-	  edit_distance_t deletion     = v1[j] + 1;
-	  edit_distance_t insertion    = v0[j + 1] + 1;
-	  edit_distance_t substitution = v0[j] + cost;
+	  edit_distance_t deletion     = v_next[j] + 1;
+	  edit_distance_t insertion    = v_one_ago[j + 1] + 1;
+	  edit_distance_t substitution = v_one_ago[j] + cost;
 	  edit_distance_t cheapest = MIN (deletion, insertion);
 	  cheapest = MIN (cheapest, substitution);
-	  v1[j + 1] = cheapest;
+	  if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
+	    {
+	      edit_distance_t transposition = v_two_ago[j - 1] + 1;
+	      cheapest = MIN (cheapest, transposition);
+	    }
+	  v_next[j + 1] = cheapest;
 	}
 
       /* Prepare to move on to next row.  */
       for (int j = 0; j < len_s + 1; j++)
-	v0[j] = v1[j];
+	{
+	  v_two_ago[j] = v_one_ago[j];
+	  v_one_ago[j] = v_next[j];
+	}
     }
 
   if (debug)
     {
-      printf ("final v1 = ");
+      printf ("final v_next = ");
       for (int j = 0; j < len_s + 1; j++)
-	printf ("%i ", v1[j]);
+	printf ("%i ", v_next[j]);
       printf ("\n");
     }
 
-  edit_distance_t result = v1[len_s];
-  delete[] v0;
-  delete[] v1;
+  edit_distance_t result = v_next[len_s];
+  delete[] v_two_ago;
+  delete[] v_one_ago;
+  delete[] v_next;
   return result;
 }
 
-/* Calculate Levenshtein distance between two nil-terminated strings.  */
+/* Get the edit distance between two nil-terminated strings.  */
 
 edit_distance_t
-levenshtein_distance (const char *s, const char *t)
+get_edit_distance (const char *s, const char *t)
 {
-  return levenshtein_distance (s, strlen (s), t, strlen (t));
+  return get_edit_distance (s, strlen (s), t, strlen (t));
 }
 
 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
@@ -149,35 +162,222 @@
   return bm.get_best_meaningful_candidate ();
 }
 
+/* Generate the maximum edit distance for which we consider a suggestion
+   to be meaningful, given a goal of length GOAL_LEN and a candidate of
+   length CANDIDATE_LEN.
+
+   This is a third of the the length of the candidate or of the goal,
+   whichever is bigger.  */
+
+edit_distance_t
+get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
+{
+  size_t max_length = MAX (goal_len, candidate_len);
+  size_t min_length = MIN (goal_len, candidate_len);
+
+  gcc_assert (max_length >= min_length);
+
+  /* Special case: don't offer suggestions for a pair of
+     length == 1 strings (or empty strings).  */
+  if (max_length <= 1)
+    return 0;
+
+  /* If the lengths are close, then round down.  */
+  if (max_length - min_length <= 1)
+    /* ...but allow an edit distance of at least 1.  */
+    return MAX (max_length / 3, 1);
+
+  /* Otherwise, round up (thus giving a little extra leeway to some cases
+     involving insertions/deletions).  */
+  return (max_length + 2) / 3;
+}
+
 #if CHECKING_P
 
 namespace selftest {
 
 /* Selftests.  */
 
-/* Verify that the levenshtein_distance (A, B) equals the expected
-   value.  */
+/* Verify that get_edit_distance (A, B) equals the expected value.  */
 
 static void
-levenshtein_distance_unit_test_oneway (const char *a, const char *b,
-				       edit_distance_t expected)
+test_get_edit_distance_one_way (const char *a, const char *b,
+				edit_distance_t expected)
 {
-  edit_distance_t actual = levenshtein_distance (a, b);
+  edit_distance_t actual = get_edit_distance (a, b);
   ASSERT_EQ (actual, expected);
 }
 
 /* Verify that both
-     levenshtein_distance (A, B)
+     get_edit_distance (A, B)
    and
-     levenshtein_distance (B, A)
+     get_edit_distance (B, A)
    equal the expected value, to ensure that the function is symmetric.  */
 
 static void
-levenshtein_distance_unit_test (const char *a, const char *b,
-				edit_distance_t expected)
+test_get_edit_distance_both_ways (const char *a, const char *b,
+			     edit_distance_t expected)
+{
+  test_get_edit_distance_one_way (a, b, expected);
+  test_get_edit_distance_one_way (b, a, expected);
+}
+
+/* Verify get_edit_distance for a variety of pairs of pre-canned
+   inputs, comparing against known-good values.  */
+
+static void
+test_edit_distances ()
+{
+  test_get_edit_distance_both_ways ("", "nonempty", strlen ("nonempty"));
+  test_get_edit_distance_both_ways ("saturday", "sunday", 3);
+  test_get_edit_distance_both_ways ("foo", "m_foo", 2);
+  test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 3);
+  test_get_edit_distance_both_ways
+    ("the quick brown fox jumps over the lazy dog", "dog", 40);
+  test_get_edit_distance_both_ways
+    ("the quick brown fox jumps over the lazy dog",
+     "the quick brown dog jumps over the lazy fox",
+     4);
+  test_get_edit_distance_both_ways
+    ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
+     "All your base are belong to us",
+     44);
+  test_get_edit_distance_both_ways ("foo", "FOO", 3);
+  test_get_edit_distance_both_ways ("fee", "deed", 2);
+  test_get_edit_distance_both_ways ("coorzd1", "coordx1", 2);
+  test_get_edit_distance_both_ways ("assert", "sqrt", 3);
+  test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", 3);
+  test_get_edit_distance_both_ways ("time", "nice", 2);
+  test_get_edit_distance_both_ways ("bar", "carg", 2);
+  test_get_edit_distance_both_ways ("gtk_widget_show_all",
+				    "GtkWidgetShowAll", 7);
+  test_get_edit_distance_both_ways ("m_bar", "bar", 2);
+  test_get_edit_distance_both_ways ("MACRO", "MACRAME", 3);
+  test_get_edit_distance_both_ways ("ab", "ac", 1);
+  test_get_edit_distance_both_ways ("ab", "a", 1);
+  test_get_edit_distance_both_ways ("a", "b", 1);
+  test_get_edit_distance_both_ways ("nanl", "name", 2);
+  test_get_edit_distance_both_ways ("char", "bar", 2);
+  test_get_edit_distance_both_ways ("-optimize", "fsanitize", 5);
+  test_get_edit_distance_both_ways ("__DATE__", "__i386__", 4);
+
+  /* Examples where transposition helps.  */
+  test_get_edit_distance_both_ways ("ab", "ba", 1);
+  test_get_edit_distance_both_ways ("ba", "abc", 2);
+  test_get_edit_distance_both_ways ("coorzd1", "coordz1", 1);
+  test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
+				    "bacdefghijklmnopqrstuvwxzy", 2);
+  test_get_edit_distance_both_ways ("saturday", "sundya", 4);
+  test_get_edit_distance_both_ways ("signed", "singed", 1);
+}
+
+/* Subroutine of test_get_edit_distance_cutoff, for emulating the
+   spellchecking cutoff in up to GCC 8.  */
+
+static edit_distance_t
+get_old_cutoff (size_t goal_len, size_t candidate_len)
+{
+  return MAX (goal_len, candidate_len) / 2;
+}
+
+/* Verify that the cutoff for "meaningfulness" of suggestions is at least as
+   conservative as in older GCC releases.
+
+   This should ensure that we don't offer additional meaningless
+   suggestions (apart from those for which transposition has helped).  */
+
+static void
+test_get_edit_distance_cutoff ()
 {
-  levenshtein_distance_unit_test_oneway (a, b, expected);
-  levenshtein_distance_unit_test_oneway (b, a, expected);
+  for (size_t goal_len = 0; goal_len < 30; goal_len++)
+    for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
+      ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
+		   <= get_old_cutoff (goal_len, candidate_len));
+}
+
+/* Assert that CANDIDATE is offered as a suggestion for TARGET.  */
+
+static void
+assert_suggested_for (const location &loc, const char *candidate,
+		      const char *target)
+{
+  auto_vec<const char *> candidates;
+  candidates.safe_push (candidate);
+  ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
+}
+
+/* Assert that CANDIDATE is offered as a suggestion for TARGET.  */
+
+#define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET)			\
+  SELFTEST_BEGIN_STMT							\
+    assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET);	\
+  SELFTEST_END_STMT
+
+/* Assert that CANDIDATE is not offered as a suggestion for TARGET.  */
+
+static void
+assert_not_suggested_for (const location &loc, const char *candidate,
+			  const char *target)
+{
+  auto_vec<const char *> candidates;
+  candidates.safe_push (candidate);
+  ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates));
+}
+
+/* Assert that CANDIDATE is not offered as a suggestion for TARGET.  */
+
+#define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET)			\
+  SELFTEST_BEGIN_STMT							\
+    assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET);	\
+  SELFTEST_END_STMT
+
+/* Verify that we offer varous suggestions that are meaningful,
+   and that we don't offer various other ones that aren't (PR c/82967).  */
+
+static void
+test_suggestions ()
+{
+  /* Good suggestions.  */
+
+  ASSERT_SUGGESTED_FOR ("m_bar", "bar");
+  // dist == 2, max_length == 5, min_length == 3
+
+  ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
+  // dist == 3, max_length == 7, min_length == 5
+
+  ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
+  // dist == 7, max_length == 16, min_length = 19
+
+  ASSERT_SUGGESTED_FOR ("ab", "ac");
+  // dist == 1, max_length == min_length = 2
+
+  ASSERT_SUGGESTED_FOR ("ab", "a");
+  // dist == 1, max_length == 2, min_length = 1
+
+  /* Bad suggestions.  */
+
+  ASSERT_NOT_SUGGESTED_FOR ("a", "b");
+  // dist == 1, max_length == min_length = 1
+
+  ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
+  // dist == 3, max_length 6, min_length == 4
+
+  ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
+  // dist == 3, max_length == min_length == 8
+
+  ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
+  ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
+  // dist == 2, max_length == min_length == 4
+
+  ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
+  ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
+  // dist == 2, max_length == 4, min_length == 3
+
+  ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
+  // dist == 5, max_length == min_length == 9
+
+  ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
+  // dist == 4, max_length == min_length == 8
 }
 
 /* Verify that find_closest_string is sane.  */
@@ -215,6 +415,16 @@
      it as a suggestion will be nonsensical.  Verify that we don't offer such
      suggestions.  */
   ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
+
+  /* Example from PR 69968 where transposition helps.  */
+  candidates.truncate (0);
+  candidates.safe_push("coordx");
+  candidates.safe_push("coordy");
+  candidates.safe_push("coordz");
+  candidates.safe_push("coordx1");
+  candidates.safe_push("coordy1");
+  candidates.safe_push("coordz1");
+  ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
 }
 
 /* Test data for test_metric_conditions.  */
@@ -227,7 +437,7 @@
   "1234567890123456789012345678901234567890123456789012345678901234567890"
 };
 
-/* Verify that levenshtein_distance appears to be a sane distance function,
+/* Verify that get_edit_distance appears to be a sane distance function,
    i.e. the conditions for being a metric.  This is done directly for a
    small set of examples, using test_data above.  This is O(N^3) in the size
    of the array, due to the test for the triangle inequality, so we keep the
@@ -243,7 +453,7 @@
       for (int j = 0; j < num_test_cases; j++)
 	{
 	  edit_distance_t dist_ij
-	    = levenshtein_distance (test_data[i], test_data[j]);
+	    = get_edit_distance (test_data[i], test_data[j]);
 
 	  /* Identity of indiscernibles: d(i, j) > 0 iff i == j.  */
 	  if (i == j)
@@ -253,44 +463,30 @@
 
 	  /* Symmetry: d(i, j) == d(j, i).  */
 	  edit_distance_t dist_ji
-	    = levenshtein_distance (test_data[j], test_data[i]);
+	    = get_edit_distance (test_data[j], test_data[i]);
 	  ASSERT_EQ (dist_ij, dist_ji);
 
 	  /* Triangle inequality.  */
 	  for (int k = 0; k < num_test_cases; k++)
 	    {
 	      edit_distance_t dist_ik
-		= levenshtein_distance (test_data[i], test_data[k]);
+		= get_edit_distance (test_data[i], test_data[k]);
 	      edit_distance_t dist_jk
-		= levenshtein_distance (test_data[j], test_data[k]);
+		= get_edit_distance (test_data[j], test_data[k]);
 	      ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
 	    }
 	}
     }
 }
 
-/* Verify levenshtein_distance for a variety of pairs of pre-canned
-   inputs, comparing against known-good values.  */
+/* Run all of the selftests within this file.  */
 
 void
 spellcheck_c_tests ()
 {
-  levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
-  levenshtein_distance_unit_test ("saturday", "sunday", 3);
-  levenshtein_distance_unit_test ("foo", "m_foo", 2);
-  levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
-  levenshtein_distance_unit_test
-    ("the quick brown fox jumps over the lazy dog", "dog", 40);
-  levenshtein_distance_unit_test
-    ("the quick brown fox jumps over the lazy dog",
-     "the quick brown dog jumps over the lazy fox",
-     4);
-  levenshtein_distance_unit_test
-    ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
-     "All your base are belong to us",
-     44);
-  levenshtein_distance_unit_test ("foo", "FOO", 3);
-
+  test_edit_distances ();
+  test_get_edit_distance_cutoff ();
+  test_suggestions ();
   test_find_closest_string ();
   test_metric_conditions ();
 }