annotate gcc/spellcheck.c @ 111:04ced10e8804

gcc 7
author kono
date Fri, 27 Oct 2017 22:46:09 +0900
parents
children 84e7813d76e9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
111
kono
parents:
diff changeset
1 /* Find near-matches for strings.
kono
parents:
diff changeset
2 Copyright (C) 2015-2017 Free Software Foundation, Inc.
kono
parents:
diff changeset
3
kono
parents:
diff changeset
4 This file is part of GCC.
kono
parents:
diff changeset
5
kono
parents:
diff changeset
6 GCC is free software; you can redistribute it and/or modify it under
kono
parents:
diff changeset
7 the terms of the GNU General Public License as published by the Free
kono
parents:
diff changeset
8 Software Foundation; either version 3, or (at your option) any later
kono
parents:
diff changeset
9 version.
kono
parents:
diff changeset
10
kono
parents:
diff changeset
11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
kono
parents:
diff changeset
12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
kono
parents:
diff changeset
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
kono
parents:
diff changeset
14 for more details.
kono
parents:
diff changeset
15
kono
parents:
diff changeset
16 You should have received a copy of the GNU General Public License
kono
parents:
diff changeset
17 along with GCC; see the file COPYING3. If not see
kono
parents:
diff changeset
18 <http://www.gnu.org/licenses/>. */
kono
parents:
diff changeset
19
kono
parents:
diff changeset
20 #include "config.h"
kono
parents:
diff changeset
21 #include "system.h"
kono
parents:
diff changeset
22 #include "coretypes.h"
kono
parents:
diff changeset
23 #include "tm.h"
kono
parents:
diff changeset
24 #include "tree.h"
kono
parents:
diff changeset
25 #include "spellcheck.h"
kono
parents:
diff changeset
26 #include "selftest.h"
kono
parents:
diff changeset
27
kono
parents:
diff changeset
28 /* The Levenshtein distance is an "edit-distance": the minimal
kono
parents:
diff changeset
29 number of one-character insertions, removals or substitutions
kono
parents:
diff changeset
30 that are needed to change one string into another.
kono
parents:
diff changeset
31
kono
parents:
diff changeset
32 This implementation uses the Wagner-Fischer algorithm. */
kono
parents:
diff changeset
33
kono
parents:
diff changeset
34 edit_distance_t
kono
parents:
diff changeset
35 levenshtein_distance (const char *s, int len_s,
kono
parents:
diff changeset
36 const char *t, int len_t)
kono
parents:
diff changeset
37 {
kono
parents:
diff changeset
38 const bool debug = false;
kono
parents:
diff changeset
39
kono
parents:
diff changeset
40 if (debug)
kono
parents:
diff changeset
41 {
kono
parents:
diff changeset
42 printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
kono
parents:
diff changeset
43 printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
kono
parents:
diff changeset
44 }
kono
parents:
diff changeset
45
kono
parents:
diff changeset
46 if (len_s == 0)
kono
parents:
diff changeset
47 return len_t;
kono
parents:
diff changeset
48 if (len_t == 0)
kono
parents:
diff changeset
49 return len_s;
kono
parents:
diff changeset
50
kono
parents:
diff changeset
51 /* We effectively build a matrix where each (i, j) contains the
kono
parents:
diff changeset
52 Levenshtein distance between the prefix strings s[0:j]
kono
parents:
diff changeset
53 and t[0:i].
kono
parents:
diff changeset
54 Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
kono
parents:
diff changeset
55 we simply keep track of the last row, v0 and a new row, v1,
kono
parents:
diff changeset
56 which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
kono
parents:
diff changeset
57 in favor of two (len_s + 1) allocations. These could potentially be
kono
parents:
diff changeset
58 statically-allocated if we impose a maximum length on the
kono
parents:
diff changeset
59 strings of interest. */
kono
parents:
diff changeset
60 edit_distance_t *v0 = new edit_distance_t[len_s + 1];
kono
parents:
diff changeset
61 edit_distance_t *v1 = new edit_distance_t[len_s + 1];
kono
parents:
diff changeset
62
kono
parents:
diff changeset
63 /* The first row is for the case of an empty target string, which
kono
parents:
diff changeset
64 we can reach by deleting every character in the source string. */
kono
parents:
diff changeset
65 for (int i = 0; i < len_s + 1; i++)
kono
parents:
diff changeset
66 v0[i] = i;
kono
parents:
diff changeset
67
kono
parents:
diff changeset
68 /* Build successive rows. */
kono
parents:
diff changeset
69 for (int i = 0; i < len_t; i++)
kono
parents:
diff changeset
70 {
kono
parents:
diff changeset
71 if (debug)
kono
parents:
diff changeset
72 {
kono
parents:
diff changeset
73 printf ("i:%i v0 = ", i);
kono
parents:
diff changeset
74 for (int j = 0; j < len_s + 1; j++)
kono
parents:
diff changeset
75 printf ("%i ", v0[j]);
kono
parents:
diff changeset
76 printf ("\n");
kono
parents:
diff changeset
77 }
kono
parents:
diff changeset
78
kono
parents:
diff changeset
79 /* The initial column is for the case of an empty source string; we
kono
parents:
diff changeset
80 can reach prefixes of the target string of length i
kono
parents:
diff changeset
81 by inserting i characters. */
kono
parents:
diff changeset
82 v1[0] = i + 1;
kono
parents:
diff changeset
83
kono
parents:
diff changeset
84 /* Build the rest of the row by considering neighbors to
kono
parents:
diff changeset
85 the north, west and northwest. */
kono
parents:
diff changeset
86 for (int j = 0; j < len_s; j++)
kono
parents:
diff changeset
87 {
kono
parents:
diff changeset
88 edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
kono
parents:
diff changeset
89 edit_distance_t deletion = v1[j] + 1;
kono
parents:
diff changeset
90 edit_distance_t insertion = v0[j + 1] + 1;
kono
parents:
diff changeset
91 edit_distance_t substitution = v0[j] + cost;
kono
parents:
diff changeset
92 edit_distance_t cheapest = MIN (deletion, insertion);
kono
parents:
diff changeset
93 cheapest = MIN (cheapest, substitution);
kono
parents:
diff changeset
94 v1[j + 1] = cheapest;
kono
parents:
diff changeset
95 }
kono
parents:
diff changeset
96
kono
parents:
diff changeset
97 /* Prepare to move on to next row. */
kono
parents:
diff changeset
98 for (int j = 0; j < len_s + 1; j++)
kono
parents:
diff changeset
99 v0[j] = v1[j];
kono
parents:
diff changeset
100 }
kono
parents:
diff changeset
101
kono
parents:
diff changeset
102 if (debug)
kono
parents:
diff changeset
103 {
kono
parents:
diff changeset
104 printf ("final v1 = ");
kono
parents:
diff changeset
105 for (int j = 0; j < len_s + 1; j++)
kono
parents:
diff changeset
106 printf ("%i ", v1[j]);
kono
parents:
diff changeset
107 printf ("\n");
kono
parents:
diff changeset
108 }
kono
parents:
diff changeset
109
kono
parents:
diff changeset
110 edit_distance_t result = v1[len_s];
kono
parents:
diff changeset
111 delete[] v0;
kono
parents:
diff changeset
112 delete[] v1;
kono
parents:
diff changeset
113 return result;
kono
parents:
diff changeset
114 }
kono
parents:
diff changeset
115
kono
parents:
diff changeset
116 /* Calculate Levenshtein distance between two nil-terminated strings. */
kono
parents:
diff changeset
117
kono
parents:
diff changeset
118 edit_distance_t
kono
parents:
diff changeset
119 levenshtein_distance (const char *s, const char *t)
kono
parents:
diff changeset
120 {
kono
parents:
diff changeset
121 return levenshtein_distance (s, strlen (s), t, strlen (t));
kono
parents:
diff changeset
122 }
kono
parents:
diff changeset
123
kono
parents:
diff changeset
124 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
kono
parents:
diff changeset
125 an autovec of non-NULL strings, determine which element within
kono
parents:
diff changeset
126 CANDIDATES has the lowest edit distance to TARGET. If there are
kono
parents:
diff changeset
127 multiple elements with the same minimal distance, the first in the
kono
parents:
diff changeset
128 vector wins.
kono
parents:
diff changeset
129
kono
parents:
diff changeset
130 If more than half of the letters were misspelled, the suggestion is
kono
parents:
diff changeset
131 likely to be meaningless, so return NULL for this case. */
kono
parents:
diff changeset
132
kono
parents:
diff changeset
133 const char *
kono
parents:
diff changeset
134 find_closest_string (const char *target,
kono
parents:
diff changeset
135 const auto_vec<const char *> *candidates)
kono
parents:
diff changeset
136 {
kono
parents:
diff changeset
137 gcc_assert (target);
kono
parents:
diff changeset
138 gcc_assert (candidates);
kono
parents:
diff changeset
139
kono
parents:
diff changeset
140 int i;
kono
parents:
diff changeset
141 const char *candidate;
kono
parents:
diff changeset
142 best_match<const char *, const char *> bm (target);
kono
parents:
diff changeset
143 FOR_EACH_VEC_ELT (*candidates, i, candidate)
kono
parents:
diff changeset
144 {
kono
parents:
diff changeset
145 gcc_assert (candidate);
kono
parents:
diff changeset
146 bm.consider (candidate);
kono
parents:
diff changeset
147 }
kono
parents:
diff changeset
148
kono
parents:
diff changeset
149 return bm.get_best_meaningful_candidate ();
kono
parents:
diff changeset
150 }
kono
parents:
diff changeset
151
kono
parents:
diff changeset
152 #if CHECKING_P
kono
parents:
diff changeset
153
kono
parents:
diff changeset
154 namespace selftest {
kono
parents:
diff changeset
155
kono
parents:
diff changeset
156 /* Selftests. */
kono
parents:
diff changeset
157
kono
parents:
diff changeset
158 /* Verify that the levenshtein_distance (A, B) equals the expected
kono
parents:
diff changeset
159 value. */
kono
parents:
diff changeset
160
kono
parents:
diff changeset
161 static void
kono
parents:
diff changeset
162 levenshtein_distance_unit_test_oneway (const char *a, const char *b,
kono
parents:
diff changeset
163 edit_distance_t expected)
kono
parents:
diff changeset
164 {
kono
parents:
diff changeset
165 edit_distance_t actual = levenshtein_distance (a, b);
kono
parents:
diff changeset
166 ASSERT_EQ (actual, expected);
kono
parents:
diff changeset
167 }
kono
parents:
diff changeset
168
kono
parents:
diff changeset
169 /* Verify that both
kono
parents:
diff changeset
170 levenshtein_distance (A, B)
kono
parents:
diff changeset
171 and
kono
parents:
diff changeset
172 levenshtein_distance (B, A)
kono
parents:
diff changeset
173 equal the expected value, to ensure that the function is symmetric. */
kono
parents:
diff changeset
174
kono
parents:
diff changeset
175 static void
kono
parents:
diff changeset
176 levenshtein_distance_unit_test (const char *a, const char *b,
kono
parents:
diff changeset
177 edit_distance_t expected)
kono
parents:
diff changeset
178 {
kono
parents:
diff changeset
179 levenshtein_distance_unit_test_oneway (a, b, expected);
kono
parents:
diff changeset
180 levenshtein_distance_unit_test_oneway (b, a, expected);
kono
parents:
diff changeset
181 }
kono
parents:
diff changeset
182
kono
parents:
diff changeset
183 /* Verify that find_closest_string is sane. */
kono
parents:
diff changeset
184
kono
parents:
diff changeset
185 static void
kono
parents:
diff changeset
186 test_find_closest_string ()
kono
parents:
diff changeset
187 {
kono
parents:
diff changeset
188 auto_vec<const char *> candidates;
kono
parents:
diff changeset
189
kono
parents:
diff changeset
190 /* Verify that it can handle an empty vec. */
kono
parents:
diff changeset
191 ASSERT_EQ (NULL, find_closest_string ("", &candidates));
kono
parents:
diff changeset
192
kono
parents:
diff changeset
193 /* Verify that it works sanely for non-empty vecs. */
kono
parents:
diff changeset
194 candidates.safe_push ("apple");
kono
parents:
diff changeset
195 candidates.safe_push ("banana");
kono
parents:
diff changeset
196 candidates.safe_push ("cherry");
kono
parents:
diff changeset
197
kono
parents:
diff changeset
198 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
kono
parents:
diff changeset
199 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
kono
parents:
diff changeset
200 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
kono
parents:
diff changeset
201 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
kono
parents:
diff changeset
202
kono
parents:
diff changeset
203 /* The order of the vec can matter, but it should not matter for these
kono
parents:
diff changeset
204 inputs. */
kono
parents:
diff changeset
205 candidates.truncate (0);
kono
parents:
diff changeset
206 candidates.safe_push ("cherry");
kono
parents:
diff changeset
207 candidates.safe_push ("banana");
kono
parents:
diff changeset
208 candidates.safe_push ("apple");
kono
parents:
diff changeset
209 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
kono
parents:
diff changeset
210 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
kono
parents:
diff changeset
211 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
kono
parents:
diff changeset
212 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
kono
parents:
diff changeset
213
kono
parents:
diff changeset
214 /* If the goal string somehow makes it into the candidate list, offering
kono
parents:
diff changeset
215 it as a suggestion will be nonsensical. Verify that we don't offer such
kono
parents:
diff changeset
216 suggestions. */
kono
parents:
diff changeset
217 ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
kono
parents:
diff changeset
218 }
kono
parents:
diff changeset
219
kono
parents:
diff changeset
220 /* Test data for test_metric_conditions. */
kono
parents:
diff changeset
221
kono
parents:
diff changeset
222 static const char * const test_data[] = {
kono
parents:
diff changeset
223 "",
kono
parents:
diff changeset
224 "foo",
kono
parents:
diff changeset
225 "food",
kono
parents:
diff changeset
226 "boo",
kono
parents:
diff changeset
227 "1234567890123456789012345678901234567890123456789012345678901234567890"
kono
parents:
diff changeset
228 };
kono
parents:
diff changeset
229
kono
parents:
diff changeset
230 /* Verify that levenshtein_distance appears to be a sane distance function,
kono
parents:
diff changeset
231 i.e. the conditions for being a metric. This is done directly for a
kono
parents:
diff changeset
232 small set of examples, using test_data above. This is O(N^3) in the size
kono
parents:
diff changeset
233 of the array, due to the test for the triangle inequality, so we keep the
kono
parents:
diff changeset
234 array small. */
kono
parents:
diff changeset
235
kono
parents:
diff changeset
236 static void
kono
parents:
diff changeset
237 test_metric_conditions ()
kono
parents:
diff changeset
238 {
kono
parents:
diff changeset
239 const int num_test_cases = sizeof (test_data) / sizeof (test_data[0]);
kono
parents:
diff changeset
240
kono
parents:
diff changeset
241 for (int i = 0; i < num_test_cases; i++)
kono
parents:
diff changeset
242 {
kono
parents:
diff changeset
243 for (int j = 0; j < num_test_cases; j++)
kono
parents:
diff changeset
244 {
kono
parents:
diff changeset
245 edit_distance_t dist_ij
kono
parents:
diff changeset
246 = levenshtein_distance (test_data[i], test_data[j]);
kono
parents:
diff changeset
247
kono
parents:
diff changeset
248 /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
kono
parents:
diff changeset
249 if (i == j)
kono
parents:
diff changeset
250 ASSERT_EQ (dist_ij, 0);
kono
parents:
diff changeset
251 else
kono
parents:
diff changeset
252 ASSERT_TRUE (dist_ij > 0);
kono
parents:
diff changeset
253
kono
parents:
diff changeset
254 /* Symmetry: d(i, j) == d(j, i). */
kono
parents:
diff changeset
255 edit_distance_t dist_ji
kono
parents:
diff changeset
256 = levenshtein_distance (test_data[j], test_data[i]);
kono
parents:
diff changeset
257 ASSERT_EQ (dist_ij, dist_ji);
kono
parents:
diff changeset
258
kono
parents:
diff changeset
259 /* Triangle inequality. */
kono
parents:
diff changeset
260 for (int k = 0; k < num_test_cases; k++)
kono
parents:
diff changeset
261 {
kono
parents:
diff changeset
262 edit_distance_t dist_ik
kono
parents:
diff changeset
263 = levenshtein_distance (test_data[i], test_data[k]);
kono
parents:
diff changeset
264 edit_distance_t dist_jk
kono
parents:
diff changeset
265 = levenshtein_distance (test_data[j], test_data[k]);
kono
parents:
diff changeset
266 ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
kono
parents:
diff changeset
267 }
kono
parents:
diff changeset
268 }
kono
parents:
diff changeset
269 }
kono
parents:
diff changeset
270 }
kono
parents:
diff changeset
271
kono
parents:
diff changeset
272 /* Verify levenshtein_distance for a variety of pairs of pre-canned
kono
parents:
diff changeset
273 inputs, comparing against known-good values. */
kono
parents:
diff changeset
274
kono
parents:
diff changeset
275 void
kono
parents:
diff changeset
276 spellcheck_c_tests ()
kono
parents:
diff changeset
277 {
kono
parents:
diff changeset
278 levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
kono
parents:
diff changeset
279 levenshtein_distance_unit_test ("saturday", "sunday", 3);
kono
parents:
diff changeset
280 levenshtein_distance_unit_test ("foo", "m_foo", 2);
kono
parents:
diff changeset
281 levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
kono
parents:
diff changeset
282 levenshtein_distance_unit_test
kono
parents:
diff changeset
283 ("the quick brown fox jumps over the lazy dog", "dog", 40);
kono
parents:
diff changeset
284 levenshtein_distance_unit_test
kono
parents:
diff changeset
285 ("the quick brown fox jumps over the lazy dog",
kono
parents:
diff changeset
286 "the quick brown dog jumps over the lazy fox",
kono
parents:
diff changeset
287 4);
kono
parents:
diff changeset
288 levenshtein_distance_unit_test
kono
parents:
diff changeset
289 ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
kono
parents:
diff changeset
290 "All your base are belong to us",
kono
parents:
diff changeset
291 44);
kono
parents:
diff changeset
292 levenshtein_distance_unit_test ("foo", "FOO", 3);
kono
parents:
diff changeset
293
kono
parents:
diff changeset
294 test_find_closest_string ();
kono
parents:
diff changeset
295 test_metric_conditions ();
kono
parents:
diff changeset
296 }
kono
parents:
diff changeset
297
kono
parents:
diff changeset
298 } // namespace selftest
kono
parents:
diff changeset
299
kono
parents:
diff changeset
300 #endif /* #if CHECKING_P */