111
|
1 /* Find near-matches for strings.
|
145
|
2 Copyright (C) 2015-2020 Free Software Foundation, Inc.
|
111
|
3
|
|
4 This file is part of GCC.
|
|
5
|
|
6 GCC is free software; you can redistribute it and/or modify it under
|
|
7 the terms of the GNU General Public License as published by the Free
|
|
8 Software Foundation; either version 3, or (at your option) any later
|
|
9 version.
|
|
10
|
|
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
14 for more details.
|
|
15
|
|
16 You should have received a copy of the GNU General Public License
|
|
17 along with GCC; see the file COPYING3. If not see
|
|
18 <http://www.gnu.org/licenses/>. */
|
|
19
|
|
20 #include "config.h"
|
|
21 #include "system.h"
|
|
22 #include "coretypes.h"
|
|
23 #include "tm.h"
|
|
24 #include "tree.h"
|
|
25 #include "spellcheck.h"
|
|
26 #include "selftest.h"
|
|
27
|
131
|
28 /* Get the edit distance between the two strings: the minimal
|
|
29 number of edits that are needed to change one string into another,
|
|
30 where edits can be one-character insertions, removals, or substitutions,
|
|
31 or transpositions of two adjacent characters (counting as one "edit").
|
111
|
32
|
131
|
33 This implementation uses the Wagner-Fischer algorithm for the
|
|
34 Damerau-Levenshtein distance; specifically, the "optimal string alignment
|
|
35 distance" or "restricted edit distance" variant. */
|
111
|
36
|
|
37 edit_distance_t
|
131
|
38 get_edit_distance (const char *s, int len_s,
|
|
39 const char *t, int len_t)
|
111
|
40 {
|
|
41 const bool debug = false;
|
|
42
|
|
43 if (debug)
|
|
44 {
|
|
45 printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
|
|
46 printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
|
|
47 }
|
|
48
|
|
49 if (len_s == 0)
|
|
50 return len_t;
|
|
51 if (len_t == 0)
|
|
52 return len_s;
|
|
53
|
|
54 /* We effectively build a matrix where each (i, j) contains the
|
131
|
55 distance between the prefix strings s[0:j] and t[0:i].
|
111
|
56 Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
|
131
|
57 we simply keep track of the last two rows, v_one_ago and v_two_ago,
|
|
58 and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
|
|
59 allocation and memory accesses in favor of three (len_s + 1)
|
|
60 allocations. These could potentially be
|
111
|
61 statically-allocated if we impose a maximum length on the
|
|
62 strings of interest. */
|
131
|
63 edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
|
|
64 edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
|
|
65 edit_distance_t *v_next = new edit_distance_t[len_s + 1];
|
111
|
66
|
|
67 /* The first row is for the case of an empty target string, which
|
|
68 we can reach by deleting every character in the source string. */
|
|
69 for (int i = 0; i < len_s + 1; i++)
|
131
|
70 v_one_ago[i] = i;
|
111
|
71
|
|
72 /* Build successive rows. */
|
|
73 for (int i = 0; i < len_t; i++)
|
|
74 {
|
|
75 if (debug)
|
|
76 {
|
131
|
77 printf ("i:%i v_one_ago = ", i);
|
111
|
78 for (int j = 0; j < len_s + 1; j++)
|
131
|
79 printf ("%i ", v_one_ago[j]);
|
111
|
80 printf ("\n");
|
|
81 }
|
|
82
|
|
83 /* The initial column is for the case of an empty source string; we
|
|
84 can reach prefixes of the target string of length i
|
|
85 by inserting i characters. */
|
131
|
86 v_next[0] = i + 1;
|
111
|
87
|
|
88 /* Build the rest of the row by considering neighbors to
|
|
89 the north, west and northwest. */
|
|
90 for (int j = 0; j < len_s; j++)
|
|
91 {
|
|
92 edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
|
131
|
93 edit_distance_t deletion = v_next[j] + 1;
|
|
94 edit_distance_t insertion = v_one_ago[j + 1] + 1;
|
|
95 edit_distance_t substitution = v_one_ago[j] + cost;
|
111
|
96 edit_distance_t cheapest = MIN (deletion, insertion);
|
|
97 cheapest = MIN (cheapest, substitution);
|
131
|
98 if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
|
|
99 {
|
|
100 edit_distance_t transposition = v_two_ago[j - 1] + 1;
|
|
101 cheapest = MIN (cheapest, transposition);
|
|
102 }
|
|
103 v_next[j + 1] = cheapest;
|
111
|
104 }
|
|
105
|
|
106 /* Prepare to move on to next row. */
|
|
107 for (int j = 0; j < len_s + 1; j++)
|
131
|
108 {
|
|
109 v_two_ago[j] = v_one_ago[j];
|
|
110 v_one_ago[j] = v_next[j];
|
|
111 }
|
111
|
112 }
|
|
113
|
|
114 if (debug)
|
|
115 {
|
131
|
116 printf ("final v_next = ");
|
111
|
117 for (int j = 0; j < len_s + 1; j++)
|
131
|
118 printf ("%i ", v_next[j]);
|
111
|
119 printf ("\n");
|
|
120 }
|
|
121
|
131
|
122 edit_distance_t result = v_next[len_s];
|
|
123 delete[] v_two_ago;
|
|
124 delete[] v_one_ago;
|
|
125 delete[] v_next;
|
111
|
126 return result;
|
|
127 }
|
|
128
|
131
|
129 /* Get the edit distance between two nil-terminated strings. */
|
111
|
130
|
|
131 edit_distance_t
|
131
|
132 get_edit_distance (const char *s, const char *t)
|
111
|
133 {
|
131
|
134 return get_edit_distance (s, strlen (s), t, strlen (t));
|
111
|
135 }
|
|
136
|
|
137 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
|
|
138 an autovec of non-NULL strings, determine which element within
|
|
139 CANDIDATES has the lowest edit distance to TARGET. If there are
|
|
140 multiple elements with the same minimal distance, the first in the
|
|
141 vector wins.
|
|
142
|
|
143 If more than half of the letters were misspelled, the suggestion is
|
|
144 likely to be meaningless, so return NULL for this case. */
|
|
145
|
|
146 const char *
|
|
147 find_closest_string (const char *target,
|
|
148 const auto_vec<const char *> *candidates)
|
|
149 {
|
|
150 gcc_assert (target);
|
|
151 gcc_assert (candidates);
|
|
152
|
|
153 int i;
|
|
154 const char *candidate;
|
|
155 best_match<const char *, const char *> bm (target);
|
|
156 FOR_EACH_VEC_ELT (*candidates, i, candidate)
|
|
157 {
|
|
158 gcc_assert (candidate);
|
|
159 bm.consider (candidate);
|
|
160 }
|
|
161
|
|
162 return bm.get_best_meaningful_candidate ();
|
|
163 }
|
|
164
|
131
|
165 /* Generate the maximum edit distance for which we consider a suggestion
|
|
166 to be meaningful, given a goal of length GOAL_LEN and a candidate of
|
|
167 length CANDIDATE_LEN.
|
|
168
|
|
169 This is a third of the the length of the candidate or of the goal,
|
|
170 whichever is bigger. */
|
|
171
|
|
172 edit_distance_t
|
|
173 get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
|
|
174 {
|
|
175 size_t max_length = MAX (goal_len, candidate_len);
|
|
176 size_t min_length = MIN (goal_len, candidate_len);
|
|
177
|
|
178 gcc_assert (max_length >= min_length);
|
|
179
|
|
180 /* Special case: don't offer suggestions for a pair of
|
|
181 length == 1 strings (or empty strings). */
|
|
182 if (max_length <= 1)
|
|
183 return 0;
|
|
184
|
|
185 /* If the lengths are close, then round down. */
|
|
186 if (max_length - min_length <= 1)
|
|
187 /* ...but allow an edit distance of at least 1. */
|
|
188 return MAX (max_length / 3, 1);
|
|
189
|
|
190 /* Otherwise, round up (thus giving a little extra leeway to some cases
|
|
191 involving insertions/deletions). */
|
|
192 return (max_length + 2) / 3;
|
|
193 }
|
|
194
|
111
|
195 #if CHECKING_P
|
|
196
|
|
197 namespace selftest {
|
|
198
|
|
199 /* Selftests. */
|
|
200
|
131
|
201 /* Verify that get_edit_distance (A, B) equals the expected value. */
|
111
|
202
|
|
203 static void
|
131
|
204 test_get_edit_distance_one_way (const char *a, const char *b,
|
|
205 edit_distance_t expected)
|
111
|
206 {
|
131
|
207 edit_distance_t actual = get_edit_distance (a, b);
|
111
|
208 ASSERT_EQ (actual, expected);
|
|
209 }
|
|
210
|
|
211 /* Verify that both
|
131
|
212 get_edit_distance (A, B)
|
111
|
213 and
|
131
|
214 get_edit_distance (B, A)
|
111
|
215 equal the expected value, to ensure that the function is symmetric. */
|
|
216
|
|
217 static void
|
131
|
218 test_get_edit_distance_both_ways (const char *a, const char *b,
|
|
219 edit_distance_t expected)
|
|
220 {
|
|
221 test_get_edit_distance_one_way (a, b, expected);
|
|
222 test_get_edit_distance_one_way (b, a, expected);
|
|
223 }
|
|
224
|
|
225 /* Verify get_edit_distance for a variety of pairs of pre-canned
|
|
226 inputs, comparing against known-good values. */
|
|
227
|
|
228 static void
|
|
229 test_edit_distances ()
|
|
230 {
|
|
231 test_get_edit_distance_both_ways ("", "nonempty", strlen ("nonempty"));
|
|
232 test_get_edit_distance_both_ways ("saturday", "sunday", 3);
|
|
233 test_get_edit_distance_both_ways ("foo", "m_foo", 2);
|
|
234 test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 3);
|
|
235 test_get_edit_distance_both_ways
|
|
236 ("the quick brown fox jumps over the lazy dog", "dog", 40);
|
|
237 test_get_edit_distance_both_ways
|
|
238 ("the quick brown fox jumps over the lazy dog",
|
|
239 "the quick brown dog jumps over the lazy fox",
|
|
240 4);
|
|
241 test_get_edit_distance_both_ways
|
|
242 ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
|
|
243 "All your base are belong to us",
|
|
244 44);
|
|
245 test_get_edit_distance_both_ways ("foo", "FOO", 3);
|
|
246 test_get_edit_distance_both_ways ("fee", "deed", 2);
|
|
247 test_get_edit_distance_both_ways ("coorzd1", "coordx1", 2);
|
|
248 test_get_edit_distance_both_ways ("assert", "sqrt", 3);
|
|
249 test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", 3);
|
|
250 test_get_edit_distance_both_ways ("time", "nice", 2);
|
|
251 test_get_edit_distance_both_ways ("bar", "carg", 2);
|
|
252 test_get_edit_distance_both_ways ("gtk_widget_show_all",
|
|
253 "GtkWidgetShowAll", 7);
|
|
254 test_get_edit_distance_both_ways ("m_bar", "bar", 2);
|
|
255 test_get_edit_distance_both_ways ("MACRO", "MACRAME", 3);
|
|
256 test_get_edit_distance_both_ways ("ab", "ac", 1);
|
|
257 test_get_edit_distance_both_ways ("ab", "a", 1);
|
|
258 test_get_edit_distance_both_ways ("a", "b", 1);
|
|
259 test_get_edit_distance_both_ways ("nanl", "name", 2);
|
|
260 test_get_edit_distance_both_ways ("char", "bar", 2);
|
|
261 test_get_edit_distance_both_ways ("-optimize", "fsanitize", 5);
|
|
262 test_get_edit_distance_both_ways ("__DATE__", "__i386__", 4);
|
|
263
|
|
264 /* Examples where transposition helps. */
|
|
265 test_get_edit_distance_both_ways ("ab", "ba", 1);
|
|
266 test_get_edit_distance_both_ways ("ba", "abc", 2);
|
|
267 test_get_edit_distance_both_ways ("coorzd1", "coordz1", 1);
|
|
268 test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
|
|
269 "bacdefghijklmnopqrstuvwxzy", 2);
|
|
270 test_get_edit_distance_both_ways ("saturday", "sundya", 4);
|
|
271 test_get_edit_distance_both_ways ("signed", "singed", 1);
|
|
272 }
|
|
273
|
|
274 /* Subroutine of test_get_edit_distance_cutoff, for emulating the
|
|
275 spellchecking cutoff in up to GCC 8. */
|
|
276
|
|
277 static edit_distance_t
|
|
278 get_old_cutoff (size_t goal_len, size_t candidate_len)
|
|
279 {
|
|
280 return MAX (goal_len, candidate_len) / 2;
|
|
281 }
|
|
282
|
|
283 /* Verify that the cutoff for "meaningfulness" of suggestions is at least as
|
|
284 conservative as in older GCC releases.
|
|
285
|
|
286 This should ensure that we don't offer additional meaningless
|
|
287 suggestions (apart from those for which transposition has helped). */
|
|
288
|
|
289 static void
|
|
290 test_get_edit_distance_cutoff ()
|
111
|
291 {
|
131
|
292 for (size_t goal_len = 0; goal_len < 30; goal_len++)
|
|
293 for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
|
|
294 ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
|
|
295 <= get_old_cutoff (goal_len, candidate_len));
|
|
296 }
|
|
297
|
|
298 /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
|
|
299
|
|
300 static void
|
|
301 assert_suggested_for (const location &loc, const char *candidate,
|
|
302 const char *target)
|
|
303 {
|
|
304 auto_vec<const char *> candidates;
|
|
305 candidates.safe_push (candidate);
|
|
306 ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
|
|
307 }
|
|
308
|
|
309 /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
|
|
310
|
|
311 #define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET) \
|
|
312 SELFTEST_BEGIN_STMT \
|
|
313 assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
|
|
314 SELFTEST_END_STMT
|
|
315
|
|
316 /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
|
|
317
|
|
318 static void
|
|
319 assert_not_suggested_for (const location &loc, const char *candidate,
|
|
320 const char *target)
|
|
321 {
|
|
322 auto_vec<const char *> candidates;
|
|
323 candidates.safe_push (candidate);
|
|
324 ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates));
|
|
325 }
|
|
326
|
|
327 /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
|
|
328
|
|
329 #define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET) \
|
|
330 SELFTEST_BEGIN_STMT \
|
|
331 assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
|
|
332 SELFTEST_END_STMT
|
|
333
|
|
334 /* Verify that we offer varous suggestions that are meaningful,
|
|
335 and that we don't offer various other ones that aren't (PR c/82967). */
|
|
336
|
|
337 static void
|
|
338 test_suggestions ()
|
|
339 {
|
|
340 /* Good suggestions. */
|
|
341
|
|
342 ASSERT_SUGGESTED_FOR ("m_bar", "bar");
|
|
343 // dist == 2, max_length == 5, min_length == 3
|
|
344
|
|
345 ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
|
|
346 // dist == 3, max_length == 7, min_length == 5
|
|
347
|
|
348 ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
|
|
349 // dist == 7, max_length == 16, min_length = 19
|
|
350
|
|
351 ASSERT_SUGGESTED_FOR ("ab", "ac");
|
|
352 // dist == 1, max_length == min_length = 2
|
|
353
|
|
354 ASSERT_SUGGESTED_FOR ("ab", "a");
|
|
355 // dist == 1, max_length == 2, min_length = 1
|
|
356
|
|
357 /* Bad suggestions. */
|
|
358
|
|
359 ASSERT_NOT_SUGGESTED_FOR ("a", "b");
|
|
360 // dist == 1, max_length == min_length = 1
|
|
361
|
|
362 ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
|
|
363 // dist == 3, max_length 6, min_length == 4
|
|
364
|
|
365 ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
|
|
366 // dist == 3, max_length == min_length == 8
|
|
367
|
|
368 ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
|
|
369 ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
|
|
370 // dist == 2, max_length == min_length == 4
|
|
371
|
|
372 ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
|
|
373 ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
|
|
374 // dist == 2, max_length == 4, min_length == 3
|
|
375
|
|
376 ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
|
|
377 // dist == 5, max_length == min_length == 9
|
|
378
|
|
379 ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
|
|
380 // dist == 4, max_length == min_length == 8
|
145
|
381
|
|
382 ASSERT_NOT_SUGGESTED_FOR ("start_input_device", "InputDevice");
|
|
383 // dist == 9, max_length == 18, min_length == 11
|
111
|
384 }
|
|
385
|
|
386 /* Verify that find_closest_string is sane. */
|
|
387
|
|
388 static void
|
|
389 test_find_closest_string ()
|
|
390 {
|
|
391 auto_vec<const char *> candidates;
|
|
392
|
|
393 /* Verify that it can handle an empty vec. */
|
|
394 ASSERT_EQ (NULL, find_closest_string ("", &candidates));
|
|
395
|
|
396 /* Verify that it works sanely for non-empty vecs. */
|
|
397 candidates.safe_push ("apple");
|
|
398 candidates.safe_push ("banana");
|
|
399 candidates.safe_push ("cherry");
|
|
400
|
|
401 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
|
|
402 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
|
|
403 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
|
|
404 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
|
|
405
|
|
406 /* The order of the vec can matter, but it should not matter for these
|
|
407 inputs. */
|
|
408 candidates.truncate (0);
|
|
409 candidates.safe_push ("cherry");
|
|
410 candidates.safe_push ("banana");
|
|
411 candidates.safe_push ("apple");
|
|
412 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
|
|
413 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
|
|
414 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
|
|
415 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
|
|
416
|
|
417 /* If the goal string somehow makes it into the candidate list, offering
|
|
418 it as a suggestion will be nonsensical. Verify that we don't offer such
|
|
419 suggestions. */
|
|
420 ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
|
131
|
421
|
|
422 /* Example from PR 69968 where transposition helps. */
|
|
423 candidates.truncate (0);
|
|
424 candidates.safe_push("coordx");
|
|
425 candidates.safe_push("coordy");
|
|
426 candidates.safe_push("coordz");
|
|
427 candidates.safe_push("coordx1");
|
|
428 candidates.safe_push("coordy1");
|
|
429 candidates.safe_push("coordz1");
|
|
430 ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
|
111
|
431 }
|
|
432
|
|
433 /* Test data for test_metric_conditions. */
|
|
434
|
|
435 static const char * const test_data[] = {
|
|
436 "",
|
|
437 "foo",
|
|
438 "food",
|
|
439 "boo",
|
|
440 "1234567890123456789012345678901234567890123456789012345678901234567890"
|
|
441 };
|
|
442
|
131
|
443 /* Verify that get_edit_distance appears to be a sane distance function,
|
111
|
444 i.e. the conditions for being a metric. This is done directly for a
|
|
445 small set of examples, using test_data above. This is O(N^3) in the size
|
|
446 of the array, due to the test for the triangle inequality, so we keep the
|
|
447 array small. */
|
|
448
|
|
449 static void
|
|
450 test_metric_conditions ()
|
|
451 {
|
|
452 const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
|
|
453
|
|
454 for (int i = 0; i < num_test_cases; i++)
|
|
455 {
|
|
456 for (int j = 0; j < num_test_cases; j++)
|
|
457 {
|
|
458 edit_distance_t dist_ij
|
131
|
459 = get_edit_distance (test_data[i], test_data[j]);
|
111
|
460
|
|
461 /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
|
|
462 if (i == j)
|
|
463 ASSERT_EQ (dist_ij, 0);
|
|
464 else
|
|
465 ASSERT_TRUE (dist_ij > 0);
|
|
466
|
|
467 /* Symmetry: d(i, j) == d(j, i). */
|
|
468 edit_distance_t dist_ji
|
131
|
469 = get_edit_distance (test_data[j], test_data[i]);
|
111
|
470 ASSERT_EQ (dist_ij, dist_ji);
|
|
471
|
|
472 /* Triangle inequality. */
|
|
473 for (int k = 0; k < num_test_cases; k++)
|
|
474 {
|
|
475 edit_distance_t dist_ik
|
131
|
476 = get_edit_distance (test_data[i], test_data[k]);
|
111
|
477 edit_distance_t dist_jk
|
131
|
478 = get_edit_distance (test_data[j], test_data[k]);
|
111
|
479 ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
|
|
480 }
|
|
481 }
|
|
482 }
|
|
483 }
|
|
484
|
131
|
485 /* Run all of the selftests within this file. */
|
111
|
486
|
|
487 void
|
|
488 spellcheck_c_tests ()
|
|
489 {
|
131
|
490 test_edit_distances ();
|
|
491 test_get_edit_distance_cutoff ();
|
|
492 test_suggestions ();
|
111
|
493 test_find_closest_string ();
|
|
494 test_metric_conditions ();
|
|
495 }
|
|
496
|
|
497 } // namespace selftest
|
|
498
|
|
499 #endif /* #if CHECKING_P */
|