Mercurial > hg > CbC > CbC_gcc
diff gcc/spellcheck.c @ 111:04ced10e8804
gcc 7
author | kono |
---|---|
date | Fri, 27 Oct 2017 22:46:09 +0900 |
parents | |
children | 84e7813d76e9 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gcc/spellcheck.c Fri Oct 27 22:46:09 2017 +0900 @@ -0,0 +1,300 @@ +/* Find near-matches for strings. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tree.h" +#include "spellcheck.h" +#include "selftest.h" + +/* The Levenshtein distance is an "edit-distance": the minimal + number of one-character insertions, removals or substitutions + that are needed to change one string into another. + + This implementation uses the Wagner-Fischer algorithm. */ + +edit_distance_t +levenshtein_distance (const char *s, int len_s, + const char *t, int len_t) +{ + const bool debug = false; + + if (debug) + { + printf ("s: \"%s\" (len_s=%i)\n", s, len_s); + printf ("t: \"%s\" (len_t=%i)\n", t, len_t); + } + + if (len_s == 0) + return len_t; + if (len_t == 0) + return len_s; + + /* We effectively build a matrix where each (i, j) contains the + Levenshtein distance between the prefix strings s[0:j] + and t[0:i]. + Rather than actually build an (len_t + 1) * (len_s + 1) matrix, + we simply keep track of the last row, v0 and a new row, v1, + which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses + in favor of two (len_s + 1) allocations. These could potentially be + statically-allocated if we impose a maximum length on the + strings of interest. */ + edit_distance_t *v0 = new edit_distance_t[len_s + 1]; + edit_distance_t *v1 = new edit_distance_t[len_s + 1]; + + /* The first row is for the case of an empty target string, which + we can reach by deleting every character in the source string. */ + for (int i = 0; i < len_s + 1; i++) + v0[i] = i; + + /* Build successive rows. */ + for (int i = 0; i < len_t; i++) + { + if (debug) + { + printf ("i:%i v0 = ", i); + for (int j = 0; j < len_s + 1; j++) + printf ("%i ", v0[j]); + printf ("\n"); + } + + /* The initial column is for the case of an empty source string; we + can reach prefixes of the target string of length i + by inserting i characters. */ + v1[0] = i + 1; + + /* Build the rest of the row by considering neighbors to + the north, west and northwest. */ + for (int j = 0; j < len_s; j++) + { + edit_distance_t cost = (s[j] == t[i] ? 0 : 1); + edit_distance_t deletion = v1[j] + 1; + edit_distance_t insertion = v0[j + 1] + 1; + edit_distance_t substitution = v0[j] + cost; + edit_distance_t cheapest = MIN (deletion, insertion); + cheapest = MIN (cheapest, substitution); + v1[j + 1] = cheapest; + } + + /* Prepare to move on to next row. */ + for (int j = 0; j < len_s + 1; j++) + v0[j] = v1[j]; + } + + if (debug) + { + printf ("final v1 = "); + for (int j = 0; j < len_s + 1; j++) + printf ("%i ", v1[j]); + printf ("\n"); + } + + edit_distance_t result = v1[len_s]; + delete[] v0; + delete[] v1; + return result; +} + +/* Calculate Levenshtein distance between two nil-terminated strings. */ + +edit_distance_t +levenshtein_distance (const char *s, const char *t) +{ + return levenshtein_distance (s, strlen (s), t, strlen (t)); +} + +/* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to + an autovec of non-NULL strings, determine which element within + CANDIDATES has the lowest edit distance to TARGET. If there are + multiple elements with the same minimal distance, the first in the + vector wins. + + If more than half of the letters were misspelled, the suggestion is + likely to be meaningless, so return NULL for this case. */ + +const char * +find_closest_string (const char *target, + const auto_vec<const char *> *candidates) +{ + gcc_assert (target); + gcc_assert (candidates); + + int i; + const char *candidate; + best_match<const char *, const char *> bm (target); + FOR_EACH_VEC_ELT (*candidates, i, candidate) + { + gcc_assert (candidate); + bm.consider (candidate); + } + + return bm.get_best_meaningful_candidate (); +} + +#if CHECKING_P + +namespace selftest { + +/* Selftests. */ + +/* Verify that the levenshtein_distance (A, B) equals the expected + value. */ + +static void +levenshtein_distance_unit_test_oneway (const char *a, const char *b, + edit_distance_t expected) +{ + edit_distance_t actual = levenshtein_distance (a, b); + ASSERT_EQ (actual, expected); +} + +/* Verify that both + levenshtein_distance (A, B) + and + levenshtein_distance (B, A) + equal the expected value, to ensure that the function is symmetric. */ + +static void +levenshtein_distance_unit_test (const char *a, const char *b, + edit_distance_t expected) +{ + levenshtein_distance_unit_test_oneway (a, b, expected); + levenshtein_distance_unit_test_oneway (b, a, expected); +} + +/* Verify that find_closest_string is sane. */ + +static void +test_find_closest_string () +{ + auto_vec<const char *> candidates; + + /* Verify that it can handle an empty vec. */ + ASSERT_EQ (NULL, find_closest_string ("", &candidates)); + + /* Verify that it works sanely for non-empty vecs. */ + candidates.safe_push ("apple"); + candidates.safe_push ("banana"); + candidates.safe_push ("cherry"); + + ASSERT_STREQ ("apple", find_closest_string ("app", &candidates)); + ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates)); + ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates)); + ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates)); + + /* The order of the vec can matter, but it should not matter for these + inputs. */ + candidates.truncate (0); + candidates.safe_push ("cherry"); + candidates.safe_push ("banana"); + candidates.safe_push ("apple"); + ASSERT_STREQ ("apple", find_closest_string ("app", &candidates)); + ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates)); + ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates)); + ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates)); + + /* If the goal string somehow makes it into the candidate list, offering + it as a suggestion will be nonsensical. Verify that we don't offer such + suggestions. */ + ASSERT_EQ (NULL, find_closest_string ("banana", &candidates)); +} + +/* Test data for test_metric_conditions. */ + +static const char * const test_data[] = { + "", + "foo", + "food", + "boo", + "1234567890123456789012345678901234567890123456789012345678901234567890" +}; + +/* Verify that levenshtein_distance appears to be a sane distance function, + i.e. the conditions for being a metric. This is done directly for a + small set of examples, using test_data above. This is O(N^3) in the size + of the array, due to the test for the triangle inequality, so we keep the + array small. */ + +static void +test_metric_conditions () +{ + const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]); + + for (int i = 0; i < num_test_cases; i++) + { + for (int j = 0; j < num_test_cases; j++) + { + edit_distance_t dist_ij + = levenshtein_distance (test_data[i], test_data[j]); + + /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */ + if (i == j) + ASSERT_EQ (dist_ij, 0); + else + ASSERT_TRUE (dist_ij > 0); + + /* Symmetry: d(i, j) == d(j, i). */ + edit_distance_t dist_ji + = levenshtein_distance (test_data[j], test_data[i]); + ASSERT_EQ (dist_ij, dist_ji); + + /* Triangle inequality. */ + for (int k = 0; k < num_test_cases; k++) + { + edit_distance_t dist_ik + = levenshtein_distance (test_data[i], test_data[k]); + edit_distance_t dist_jk + = levenshtein_distance (test_data[j], test_data[k]); + ASSERT_TRUE (dist_ik <= dist_ij + dist_jk); + } + } + } +} + +/* Verify levenshtein_distance for a variety of pairs of pre-canned + inputs, comparing against known-good values. */ + +void +spellcheck_c_tests () +{ + levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty")); + levenshtein_distance_unit_test ("saturday", "sunday", 3); + levenshtein_distance_unit_test ("foo", "m_foo", 2); + levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3); + levenshtein_distance_unit_test + ("the quick brown fox jumps over the lazy dog", "dog", 40); + levenshtein_distance_unit_test + ("the quick brown fox jumps over the lazy dog", + "the quick brown dog jumps over the lazy fox", + 4); + levenshtein_distance_unit_test + ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,", + "All your base are belong to us", + 44); + levenshtein_distance_unit_test ("foo", "FOO", 3); + + test_find_closest_string (); + test_metric_conditions (); +} + +} // namespace selftest + +#endif /* #if CHECKING_P */