Mercurial > hg > CbC > CbC_gcc
diff gcc/spellcheck.c @ 131:84e7813d76e9
gcc-8.2
author | mir3636 |
---|---|
date | Thu, 25 Oct 2018 07:37:49 +0900 |
parents | 04ced10e8804 |
children | 1830386684a0 |
line wrap: on
line diff
--- a/gcc/spellcheck.c Fri Oct 27 22:46:09 2017 +0900 +++ b/gcc/spellcheck.c Thu Oct 25 07:37:49 2018 +0900 @@ -1,5 +1,5 @@ /* Find near-matches for strings. - Copyright (C) 2015-2017 Free Software Foundation, Inc. + Copyright (C) 2015-2018 Free Software Foundation, Inc. This file is part of GCC. @@ -25,15 +25,18 @@ #include "spellcheck.h" #include "selftest.h" -/* The Levenshtein distance is an "edit-distance": the minimal - number of one-character insertions, removals or substitutions - that are needed to change one string into another. +/* Get the edit distance between the two strings: the minimal + number of edits that are needed to change one string into another, + where edits can be one-character insertions, removals, or substitutions, + or transpositions of two adjacent characters (counting as one "edit"). - This implementation uses the Wagner-Fischer algorithm. */ + This implementation uses the Wagner-Fischer algorithm for the + Damerau-Levenshtein distance; specifically, the "optimal string alignment + distance" or "restricted edit distance" variant. */ edit_distance_t -levenshtein_distance (const char *s, int len_s, - const char *t, int len_t) +get_edit_distance (const char *s, int len_s, + const char *t, int len_t) { const bool debug = false; @@ -49,76 +52,86 @@ return len_s; /* We effectively build a matrix where each (i, j) contains the - Levenshtein distance between the prefix strings s[0:j] - and t[0:i]. + distance between the prefix strings s[0:j] and t[0:i]. Rather than actually build an (len_t + 1) * (len_s + 1) matrix, - we simply keep track of the last row, v0 and a new row, v1, - which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses - in favor of two (len_s + 1) allocations. These could potentially be + we simply keep track of the last two rows, v_one_ago and v_two_ago, + and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1) + allocation and memory accesses in favor of three (len_s + 1) + allocations. These could potentially be statically-allocated if we impose a maximum length on the strings of interest. */ - edit_distance_t *v0 = new edit_distance_t[len_s + 1]; - edit_distance_t *v1 = new edit_distance_t[len_s + 1]; + edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1]; + edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1]; + edit_distance_t *v_next = new edit_distance_t[len_s + 1]; /* The first row is for the case of an empty target string, which we can reach by deleting every character in the source string. */ for (int i = 0; i < len_s + 1; i++) - v0[i] = i; + v_one_ago[i] = i; /* Build successive rows. */ for (int i = 0; i < len_t; i++) { if (debug) { - printf ("i:%i v0 = ", i); + printf ("i:%i v_one_ago = ", i); for (int j = 0; j < len_s + 1; j++) - printf ("%i ", v0[j]); + printf ("%i ", v_one_ago[j]); printf ("\n"); } /* The initial column is for the case of an empty source string; we can reach prefixes of the target string of length i by inserting i characters. */ - v1[0] = i + 1; + v_next[0] = i + 1; /* Build the rest of the row by considering neighbors to the north, west and northwest. */ for (int j = 0; j < len_s; j++) { edit_distance_t cost = (s[j] == t[i] ? 0 : 1); - edit_distance_t deletion = v1[j] + 1; - edit_distance_t insertion = v0[j + 1] + 1; - edit_distance_t substitution = v0[j] + cost; + edit_distance_t deletion = v_next[j] + 1; + edit_distance_t insertion = v_one_ago[j + 1] + 1; + edit_distance_t substitution = v_one_ago[j] + cost; edit_distance_t cheapest = MIN (deletion, insertion); cheapest = MIN (cheapest, substitution); - v1[j + 1] = cheapest; + if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i]) + { + edit_distance_t transposition = v_two_ago[j - 1] + 1; + cheapest = MIN (cheapest, transposition); + } + v_next[j + 1] = cheapest; } /* Prepare to move on to next row. */ for (int j = 0; j < len_s + 1; j++) - v0[j] = v1[j]; + { + v_two_ago[j] = v_one_ago[j]; + v_one_ago[j] = v_next[j]; + } } if (debug) { - printf ("final v1 = "); + printf ("final v_next = "); for (int j = 0; j < len_s + 1; j++) - printf ("%i ", v1[j]); + printf ("%i ", v_next[j]); printf ("\n"); } - edit_distance_t result = v1[len_s]; - delete[] v0; - delete[] v1; + edit_distance_t result = v_next[len_s]; + delete[] v_two_ago; + delete[] v_one_ago; + delete[] v_next; return result; } -/* Calculate Levenshtein distance between two nil-terminated strings. */ +/* Get the edit distance between two nil-terminated strings. */ edit_distance_t -levenshtein_distance (const char *s, const char *t) +get_edit_distance (const char *s, const char *t) { - return levenshtein_distance (s, strlen (s), t, strlen (t)); + return get_edit_distance (s, strlen (s), t, strlen (t)); } /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to @@ -149,35 +162,222 @@ return bm.get_best_meaningful_candidate (); } +/* Generate the maximum edit distance for which we consider a suggestion + to be meaningful, given a goal of length GOAL_LEN and a candidate of + length CANDIDATE_LEN. + + This is a third of the the length of the candidate or of the goal, + whichever is bigger. */ + +edit_distance_t +get_edit_distance_cutoff (size_t goal_len, size_t candidate_len) +{ + size_t max_length = MAX (goal_len, candidate_len); + size_t min_length = MIN (goal_len, candidate_len); + + gcc_assert (max_length >= min_length); + + /* Special case: don't offer suggestions for a pair of + length == 1 strings (or empty strings). */ + if (max_length <= 1) + return 0; + + /* If the lengths are close, then round down. */ + if (max_length - min_length <= 1) + /* ...but allow an edit distance of at least 1. */ + return MAX (max_length / 3, 1); + + /* Otherwise, round up (thus giving a little extra leeway to some cases + involving insertions/deletions). */ + return (max_length + 2) / 3; +} + #if CHECKING_P namespace selftest { /* Selftests. */ -/* Verify that the levenshtein_distance (A, B) equals the expected - value. */ +/* Verify that get_edit_distance (A, B) equals the expected value. */ static void -levenshtein_distance_unit_test_oneway (const char *a, const char *b, - edit_distance_t expected) +test_get_edit_distance_one_way (const char *a, const char *b, + edit_distance_t expected) { - edit_distance_t actual = levenshtein_distance (a, b); + edit_distance_t actual = get_edit_distance (a, b); ASSERT_EQ (actual, expected); } /* Verify that both - levenshtein_distance (A, B) + get_edit_distance (A, B) and - levenshtein_distance (B, A) + get_edit_distance (B, A) equal the expected value, to ensure that the function is symmetric. */ static void -levenshtein_distance_unit_test (const char *a, const char *b, - edit_distance_t expected) +test_get_edit_distance_both_ways (const char *a, const char *b, + edit_distance_t expected) +{ + test_get_edit_distance_one_way (a, b, expected); + test_get_edit_distance_one_way (b, a, expected); +} + +/* Verify get_edit_distance for a variety of pairs of pre-canned + inputs, comparing against known-good values. */ + +static void +test_edit_distances () +{ + test_get_edit_distance_both_ways ("", "nonempty", strlen ("nonempty")); + test_get_edit_distance_both_ways ("saturday", "sunday", 3); + test_get_edit_distance_both_ways ("foo", "m_foo", 2); + test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 3); + test_get_edit_distance_both_ways + ("the quick brown fox jumps over the lazy dog", "dog", 40); + test_get_edit_distance_both_ways + ("the quick brown fox jumps over the lazy dog", + "the quick brown dog jumps over the lazy fox", + 4); + test_get_edit_distance_both_ways + ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,", + "All your base are belong to us", + 44); + test_get_edit_distance_both_ways ("foo", "FOO", 3); + test_get_edit_distance_both_ways ("fee", "deed", 2); + test_get_edit_distance_both_ways ("coorzd1", "coordx1", 2); + test_get_edit_distance_both_ways ("assert", "sqrt", 3); + test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", 3); + test_get_edit_distance_both_ways ("time", "nice", 2); + test_get_edit_distance_both_ways ("bar", "carg", 2); + test_get_edit_distance_both_ways ("gtk_widget_show_all", + "GtkWidgetShowAll", 7); + test_get_edit_distance_both_ways ("m_bar", "bar", 2); + test_get_edit_distance_both_ways ("MACRO", "MACRAME", 3); + test_get_edit_distance_both_ways ("ab", "ac", 1); + test_get_edit_distance_both_ways ("ab", "a", 1); + test_get_edit_distance_both_ways ("a", "b", 1); + test_get_edit_distance_both_ways ("nanl", "name", 2); + test_get_edit_distance_both_ways ("char", "bar", 2); + test_get_edit_distance_both_ways ("-optimize", "fsanitize", 5); + test_get_edit_distance_both_ways ("__DATE__", "__i386__", 4); + + /* Examples where transposition helps. */ + test_get_edit_distance_both_ways ("ab", "ba", 1); + test_get_edit_distance_both_ways ("ba", "abc", 2); + test_get_edit_distance_both_ways ("coorzd1", "coordz1", 1); + test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz", + "bacdefghijklmnopqrstuvwxzy", 2); + test_get_edit_distance_both_ways ("saturday", "sundya", 4); + test_get_edit_distance_both_ways ("signed", "singed", 1); +} + +/* Subroutine of test_get_edit_distance_cutoff, for emulating the + spellchecking cutoff in up to GCC 8. */ + +static edit_distance_t +get_old_cutoff (size_t goal_len, size_t candidate_len) +{ + return MAX (goal_len, candidate_len) / 2; +} + +/* Verify that the cutoff for "meaningfulness" of suggestions is at least as + conservative as in older GCC releases. + + This should ensure that we don't offer additional meaningless + suggestions (apart from those for which transposition has helped). */ + +static void +test_get_edit_distance_cutoff () { - levenshtein_distance_unit_test_oneway (a, b, expected); - levenshtein_distance_unit_test_oneway (b, a, expected); + for (size_t goal_len = 0; goal_len < 30; goal_len++) + for (size_t candidate_len = 0; candidate_len < 30; candidate_len++) + ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len) + <= get_old_cutoff (goal_len, candidate_len)); +} + +/* Assert that CANDIDATE is offered as a suggestion for TARGET. */ + +static void +assert_suggested_for (const location &loc, const char *candidate, + const char *target) +{ + auto_vec<const char *> candidates; + candidates.safe_push (candidate); + ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates)); +} + +/* Assert that CANDIDATE is offered as a suggestion for TARGET. */ + +#define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET) \ + SELFTEST_BEGIN_STMT \ + assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \ + SELFTEST_END_STMT + +/* Assert that CANDIDATE is not offered as a suggestion for TARGET. */ + +static void +assert_not_suggested_for (const location &loc, const char *candidate, + const char *target) +{ + auto_vec<const char *> candidates; + candidates.safe_push (candidate); + ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates)); +} + +/* Assert that CANDIDATE is not offered as a suggestion for TARGET. */ + +#define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET) \ + SELFTEST_BEGIN_STMT \ + assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \ + SELFTEST_END_STMT + +/* Verify that we offer varous suggestions that are meaningful, + and that we don't offer various other ones that aren't (PR c/82967). */ + +static void +test_suggestions () +{ + /* Good suggestions. */ + + ASSERT_SUGGESTED_FOR ("m_bar", "bar"); + // dist == 2, max_length == 5, min_length == 3 + + ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME"); + // dist == 3, max_length == 7, min_length == 5 + + ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll"); + // dist == 7, max_length == 16, min_length = 19 + + ASSERT_SUGGESTED_FOR ("ab", "ac"); + // dist == 1, max_length == min_length = 2 + + ASSERT_SUGGESTED_FOR ("ab", "a"); + // dist == 1, max_length == 2, min_length = 1 + + /* Bad suggestions. */ + + ASSERT_NOT_SUGGESTED_FOR ("a", "b"); + // dist == 1, max_length == min_length = 1 + + ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert"); + // dist == 3, max_length 6, min_length == 4 + + ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX"); + // dist == 3, max_length == min_length == 8 + + ASSERT_NOT_SUGGESTED_FOR ("nice", "time"); + ASSERT_NOT_SUGGESTED_FOR ("nanl", "name"); + // dist == 2, max_length == min_length == 4 + + ASSERT_NOT_SUGGESTED_FOR ("carg", "bar"); + ASSERT_NOT_SUGGESTED_FOR ("char", "bar"); + // dist == 2, max_length == 4, min_length == 3 + + ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize"); + // dist == 5, max_length == min_length == 9 + + ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__"); + // dist == 4, max_length == min_length == 8 } /* Verify that find_closest_string is sane. */ @@ -215,6 +415,16 @@ it as a suggestion will be nonsensical. Verify that we don't offer such suggestions. */ ASSERT_EQ (NULL, find_closest_string ("banana", &candidates)); + + /* Example from PR 69968 where transposition helps. */ + candidates.truncate (0); + candidates.safe_push("coordx"); + candidates.safe_push("coordy"); + candidates.safe_push("coordz"); + candidates.safe_push("coordx1"); + candidates.safe_push("coordy1"); + candidates.safe_push("coordz1"); + ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates)); } /* Test data for test_metric_conditions. */ @@ -227,7 +437,7 @@ "1234567890123456789012345678901234567890123456789012345678901234567890" }; -/* Verify that levenshtein_distance appears to be a sane distance function, +/* Verify that get_edit_distance appears to be a sane distance function, i.e. the conditions for being a metric. This is done directly for a small set of examples, using test_data above. This is O(N^3) in the size of the array, due to the test for the triangle inequality, so we keep the @@ -243,7 +453,7 @@ for (int j = 0; j < num_test_cases; j++) { edit_distance_t dist_ij - = levenshtein_distance (test_data[i], test_data[j]); + = get_edit_distance (test_data[i], test_data[j]); /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */ if (i == j) @@ -253,44 +463,30 @@ /* Symmetry: d(i, j) == d(j, i). */ edit_distance_t dist_ji - = levenshtein_distance (test_data[j], test_data[i]); + = get_edit_distance (test_data[j], test_data[i]); ASSERT_EQ (dist_ij, dist_ji); /* Triangle inequality. */ for (int k = 0; k < num_test_cases; k++) { edit_distance_t dist_ik - = levenshtein_distance (test_data[i], test_data[k]); + = get_edit_distance (test_data[i], test_data[k]); edit_distance_t dist_jk - = levenshtein_distance (test_data[j], test_data[k]); + = get_edit_distance (test_data[j], test_data[k]); ASSERT_TRUE (dist_ik <= dist_ij + dist_jk); } } } } -/* Verify levenshtein_distance for a variety of pairs of pre-canned - inputs, comparing against known-good values. */ +/* Run all of the selftests within this file. */ void spellcheck_c_tests () { - levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty")); - levenshtein_distance_unit_test ("saturday", "sunday", 3); - levenshtein_distance_unit_test ("foo", "m_foo", 2); - levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3); - levenshtein_distance_unit_test - ("the quick brown fox jumps over the lazy dog", "dog", 40); - levenshtein_distance_unit_test - ("the quick brown fox jumps over the lazy dog", - "the quick brown dog jumps over the lazy fox", - 4); - levenshtein_distance_unit_test - ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,", - "All your base are belong to us", - 44); - levenshtein_distance_unit_test ("foo", "FOO", 3); - + test_edit_distances (); + test_get_edit_distance_cutoff (); + test_suggestions (); test_find_closest_string (); test_metric_conditions (); }