111
|
1 /* Find near-matches for strings.
|
|
2 Copyright (C) 2015-2017 Free Software Foundation, Inc.
|
|
3
|
|
4 This file is part of GCC.
|
|
5
|
|
6 GCC is free software; you can redistribute it and/or modify it under
|
|
7 the terms of the GNU General Public License as published by the Free
|
|
8 Software Foundation; either version 3, or (at your option) any later
|
|
9 version.
|
|
10
|
|
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
14 for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with GCC; see the file COPYING3. If not see
|
|
18 <http://www.gnu.org/licenses/>. */
|
|
19
|
|
20 #include "config.h"
|
|
21 #include "system.h"
|
|
22 #include "coretypes.h"
|
|
23 #include "tm.h"
|
|
24 #include "tree.h"
|
|
25 #include "spellcheck.h"
|
|
26 #include "selftest.h"
|
|
27
|
|
28 /* The Levenshtein distance is an "edit-distance": the minimal
|
|
29 number of one-character insertions, removals or substitutions
|
|
30 that are needed to change one string into another.
|
|
31
|
|
32 This implementation uses the Wagner-Fischer algorithm. */
|
|
33
|
|
34 edit_distance_t
|
|
35 levenshtein_distance (const char *s, int len_s,
|
|
36 const char *t, int len_t)
|
|
37 {
|
|
38 const bool debug = false;
|
|
39
|
|
40 if (debug)
|
|
41 {
|
|
42 printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
|
|
43 printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
|
|
44 }
|
|
45
|
|
46 if (len_s == 0)
|
|
47 return len_t;
|
|
48 if (len_t == 0)
|
|
49 return len_s;
|
|
50
|
|
51 /* We effectively build a matrix where each (i, j) contains the
|
|
52 Levenshtein distance between the prefix strings s[0:j]
|
|
53 and t[0:i].
|
|
54 Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
|
|
55 we simply keep track of the last row, v0 and a new row, v1,
|
|
56 which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
|
|
57 in favor of two (len_s + 1) allocations. These could potentially be
|
|
58 statically-allocated if we impose a maximum length on the
|
|
59 strings of interest. */
|
|
60 edit_distance_t *v0 = new edit_distance_t[len_s + 1];
|
|
61 edit_distance_t *v1 = new edit_distance_t[len_s + 1];
|
|
62
|
|
63 /* The first row is for the case of an empty target string, which
|
|
64 we can reach by deleting every character in the source string. */
|
|
65 for (int i = 0; i < len_s + 1; i++)
|
|
66 v0[i] = i;
|
|
67
|
|
68 /* Build successive rows. */
|
|
69 for (int i = 0; i < len_t; i++)
|
|
70 {
|
|
71 if (debug)
|
|
72 {
|
|
73 printf ("i:%i v0 = ", i);
|
|
74 for (int j = 0; j < len_s + 1; j++)
|
|
75 printf ("%i ", v0[j]);
|
|
76 printf ("\n");
|
|
77 }
|
|
78
|
|
79 /* The initial column is for the case of an empty source string; we
|
|
80 can reach prefixes of the target string of length i
|
|
81 by inserting i characters. */
|
|
82 v1[0] = i + 1;
|
|
83
|
|
84 /* Build the rest of the row by considering neighbors to
|
|
85 the north, west and northwest. */
|
|
86 for (int j = 0; j < len_s; j++)
|
|
87 {
|
|
88 edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
|
|
89 edit_distance_t deletion = v1[j] + 1;
|
|
90 edit_distance_t insertion = v0[j + 1] + 1;
|
|
91 edit_distance_t substitution = v0[j] + cost;
|
|
92 edit_distance_t cheapest = MIN (deletion, insertion);
|
|
93 cheapest = MIN (cheapest, substitution);
|
|
94 v1[j + 1] = cheapest;
|
|
95 }
|
|
96
|
|
97 /* Prepare to move on to next row. */
|
|
98 for (int j = 0; j < len_s + 1; j++)
|
|
99 v0[j] = v1[j];
|
|
100 }
|
|
101
|
|
102 if (debug)
|
|
103 {
|
|
104 printf ("final v1 = ");
|
|
105 for (int j = 0; j < len_s + 1; j++)
|
|
106 printf ("%i ", v1[j]);
|
|
107 printf ("\n");
|
|
108 }
|
|
109
|
|
110 edit_distance_t result = v1[len_s];
|
|
111 delete[] v0;
|
|
112 delete[] v1;
|
|
113 return result;
|
|
114 }
|
|
115
|
|
116 /* Calculate Levenshtein distance between two nil-terminated strings. */
|
|
117
|
|
118 edit_distance_t
|
|
119 levenshtein_distance (const char *s, const char *t)
|
|
120 {
|
|
121 return levenshtein_distance (s, strlen (s), t, strlen (t));
|
|
122 }
|
|
123
|
|
124 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
|
|
125 an autovec of non-NULL strings, determine which element within
|
|
126 CANDIDATES has the lowest edit distance to TARGET. If there are
|
|
127 multiple elements with the same minimal distance, the first in the
|
|
128 vector wins.
|
|
129
|
|
130 If more than half of the letters were misspelled, the suggestion is
|
|
131 likely to be meaningless, so return NULL for this case. */
|
|
132
|
|
133 const char *
|
|
134 find_closest_string (const char *target,
|
|
135 const auto_vec<const char *> *candidates)
|
|
136 {
|
|
137 gcc_assert (target);
|
|
138 gcc_assert (candidates);
|
|
139
|
|
140 int i;
|
|
141 const char *candidate;
|
|
142 best_match<const char *, const char *> bm (target);
|
|
143 FOR_EACH_VEC_ELT (*candidates, i, candidate)
|
|
144 {
|
|
145 gcc_assert (candidate);
|
|
146 bm.consider (candidate);
|
|
147 }
|
|
148
|
|
149 return bm.get_best_meaningful_candidate ();
|
|
150 }
|
|
151
|
|
152 #if CHECKING_P
|
|
153
|
|
154 namespace selftest {
|
|
155
|
|
156 /* Selftests. */
|
|
157
|
|
158 /* Verify that the levenshtein_distance (A, B) equals the expected
|
|
159 value. */
|
|
160
|
|
161 static void
|
|
162 levenshtein_distance_unit_test_oneway (const char *a, const char *b,
|
|
163 edit_distance_t expected)
|
|
164 {
|
|
165 edit_distance_t actual = levenshtein_distance (a, b);
|
|
166 ASSERT_EQ (actual, expected);
|
|
167 }
|
|
168
|
|
169 /* Verify that both
|
|
170 levenshtein_distance (A, B)
|
|
171 and
|
|
172 levenshtein_distance (B, A)
|
|
173 equal the expected value, to ensure that the function is symmetric. */
|
|
174
|
|
175 static void
|
|
176 levenshtein_distance_unit_test (const char *a, const char *b,
|
|
177 edit_distance_t expected)
|
|
178 {
|
|
179 levenshtein_distance_unit_test_oneway (a, b, expected);
|
|
180 levenshtein_distance_unit_test_oneway (b, a, expected);
|
|
181 }
|
|
182
|
|
183 /* Verify that find_closest_string is sane. */
|
|
184
|
|
185 static void
|
|
186 test_find_closest_string ()
|
|
187 {
|
|
188 auto_vec<const char *> candidates;
|
|
189
|
|
190 /* Verify that it can handle an empty vec. */
|
|
191 ASSERT_EQ (NULL, find_closest_string ("", &candidates));
|
|
192
|
|
193 /* Verify that it works sanely for non-empty vecs. */
|
|
194 candidates.safe_push ("apple");
|
|
195 candidates.safe_push ("banana");
|
|
196 candidates.safe_push ("cherry");
|
|
197
|
|
198 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
|
|
199 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
|
|
200 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
|
|
201 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
|
|
202
|
|
203 /* The order of the vec can matter, but it should not matter for these
|
|
204 inputs. */
|
|
205 candidates.truncate (0);
|
|
206 candidates.safe_push ("cherry");
|
|
207 candidates.safe_push ("banana");
|
|
208 candidates.safe_push ("apple");
|
|
209 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
|
|
210 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
|
|
211 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
|
|
212 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
|
|
213
|
|
214 /* If the goal string somehow makes it into the candidate list, offering
|
|
215 it as a suggestion will be nonsensical. Verify that we don't offer such
|
|
216 suggestions. */
|
|
217 ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
|
|
218 }
|
|
219
|
|
/* Strings used by test_metric_conditions.  Every pair of entries is
   compared, so the entries must all be distinct.  */

static const char * const test_data[] = {
  /* The empty string.  */
  "",
  /* Three short strings that are close to one another.  */
  "foo",
  "food",
  "boo",
  /* A string much longer than the others.  */
  "1234567890123456789012345678901234567890123456789012345678901234567890"
};
|
|
229
|
|
230 /* Verify that levenshtein_distance appears to be a sane distance function,
|
|
231 i.e. the conditions for being a metric. This is done directly for a
|
|
232 small set of examples, using test_data above. This is O(N^3) in the size
|
|
233 of the array, due to the test for the triangle inequality, so we keep the
|
|
234 array small. */
|
|
235
|
|
236 static void
|
|
237 test_metric_conditions ()
|
|
238 {
|
|
239 const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
|
|
240
|
|
241 for (int i = 0; i < num_test_cases; i++)
|
|
242 {
|
|
243 for (int j = 0; j < num_test_cases; j++)
|
|
244 {
|
|
245 edit_distance_t dist_ij
|
|
246 = levenshtein_distance (test_data[i], test_data[j]);
|
|
247
|
|
248 /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
|
|
249 if (i == j)
|
|
250 ASSERT_EQ (dist_ij, 0);
|
|
251 else
|
|
252 ASSERT_TRUE (dist_ij > 0);
|
|
253
|
|
254 /* Symmetry: d(i, j) == d(j, i). */
|
|
255 edit_distance_t dist_ji
|
|
256 = levenshtein_distance (test_data[j], test_data[i]);
|
|
257 ASSERT_EQ (dist_ij, dist_ji);
|
|
258
|
|
259 /* Triangle inequality. */
|
|
260 for (int k = 0; k < num_test_cases; k++)
|
|
261 {
|
|
262 edit_distance_t dist_ik
|
|
263 = levenshtein_distance (test_data[i], test_data[k]);
|
|
264 edit_distance_t dist_jk
|
|
265 = levenshtein_distance (test_data[j], test_data[k]);
|
|
266 ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
|
|
267 }
|
|
268 }
|
|
269 }
|
|
270 }
|
|
271
|
|
272 /* Verify levenshtein_distance for a variety of pairs of pre-canned
|
|
273 inputs, comparing against known-good values. */
|
|
274
|
|
275 void
|
|
276 spellcheck_c_tests ()
|
|
277 {
|
|
278 levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
|
|
279 levenshtein_distance_unit_test ("saturday", "sunday", 3);
|
|
280 levenshtein_distance_unit_test ("foo", "m_foo", 2);
|
|
281 levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
|
|
282 levenshtein_distance_unit_test
|
|
283 ("the quick brown fox jumps over the lazy dog", "dog", 40);
|
|
284 levenshtein_distance_unit_test
|
|
285 ("the quick brown fox jumps over the lazy dog",
|
|
286 "the quick brown dog jumps over the lazy fox",
|
|
287 4);
|
|
288 levenshtein_distance_unit_test
|
|
289 ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
|
|
290 "All your base are belong to us",
|
|
291 44);
|
|
292 levenshtein_distance_unit_test ("foo", "FOO", 3);
|
|
293
|
|
294 test_find_closest_string ();
|
|
295 test_metric_conditions ();
|
|
296 }
|
|
297
|
|
298 } // namespace selftest
|
|
299
|
|
300 #endif /* #if CHECKING_P */
|