Line data Source code
1 : /* Find near-matches for strings.
2 : Copyright (C) 2015-2026 Free Software Foundation, Inc.
3 :
4 : This file is part of GCC.
5 :
6 : GCC is free software; you can redistribute it and/or modify it under
7 : the terms of the GNU General Public License as published by the Free
8 : Software Foundation; either version 3, or (at your option) any later
9 : version.
10 :
11 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with GCC; see the file COPYING3. If not see
18 : <http://www.gnu.org/licenses/>. */
19 :
20 : #define INCLUDE_ALGORITHM
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "tm.h"
25 : #include "tree.h"
26 : #include "spellcheck.h"
27 : #include "selftest.h"
28 :
29 : namespace {
30 :
31 : /* Cost of a case transformation. */
32 : const edit_distance_t case_cost = 1;
33 :
34 : /* Cost of another kind of edit. */
35 : const edit_distance_t base_cost = 2;
36 :
37 : } // anonymous namespace
38 :
39 : /* Get the edit distance between the two strings: the minimal
40 : number of edits that are needed to change one string into another,
41 : where edits can be one-character insertions, removals, or substitutions,
42 : or transpositions of two adjacent characters (counting as one "edit").
43 :
44 : This implementation uses a modified variant of the Wagner-Fischer
45 : algorithm for the Damerau-Levenshtein distance; specifically, the
46 : "optimal string alignment distance" or "restricted edit distance"
47 : variant. This implementation has been further modified to take
48 : case into account. */
49 :
50 : edit_distance_t
51 5950467 : get_edit_distance (const char *s, int len_s,
52 : const char *t, int len_t)
53 : {
54 5950467 : const bool debug = false;
55 :
56 5950467 : if (debug)
57 : {
58 : printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
59 : printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
60 : }
61 :
62 5950467 : if (len_s == 0)
63 92 : return base_cost * len_t;
64 5950375 : if (len_t == 0)
65 60 : return base_cost * len_s;
66 :
67 : /* We effectively build a matrix where each (i, j) contains the
68 : distance between the prefix strings s[0:j] and t[0:i].
69 : Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
70 : we simply keep track of the last two rows, v_one_ago and v_two_ago,
71 : and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
72 : allocation and memory accesses in favor of three (len_s + 1)
73 : allocations. These could potentially be
74 : statically-allocated if we impose a maximum length on the
75 : strings of interest. */
76 5950315 : edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
77 5950315 : edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
78 5950315 : edit_distance_t *v_next = new edit_distance_t[len_s + 1];
79 :
80 : /* The first row is for the case of an empty target string, which
81 : we can reach by deleting every character in the source string. */
82 111728874 : for (int i = 0; i < len_s + 1; i++)
83 105778559 : v_one_ago[i] = i * base_cost;
84 :
85 : /* Build successive rows. */
86 108781374 : for (int i = 0; i < len_t; i++)
87 : {
88 102831059 : if (debug)
89 : {
90 : printf ("i:%i v_one_ago = ", i);
91 : for (int j = 0; j < len_s + 1; j++)
92 : printf ("%i ", v_one_ago[j]);
93 : printf ("\n");
94 : }
95 :
96 : /* The initial column is for the case of an empty source string; we
97 : can reach prefixes of the target string of length i
98 : by inserting i characters. */
99 102831059 : v_next[0] = (i + 1) * base_cost;
100 :
101 : /* Build the rest of the row by considering neighbors to
102 : the north, west and northwest. */
103 1878380105 : for (int j = 0; j < len_s; j++)
104 : {
105 1775549046 : edit_distance_t cost;
106 :
107 1775549046 : if (s[j] == t[i])
108 : cost = 0;
109 1669063789 : else if (TOLOWER (s[j]) == TOLOWER (t[i]))
110 : cost = case_cost;
111 : else
112 1664991509 : cost = base_cost;
113 1775549046 : edit_distance_t deletion = v_next[j] + base_cost;
114 1775549046 : edit_distance_t insertion = v_one_ago[j + 1] + base_cost;
115 1775549046 : edit_distance_t substitution = v_one_ago[j] + cost;
116 1775549046 : edit_distance_t cheapest = std::min (deletion, insertion);
117 1775549046 : cheapest = std::min (cheapest, substitution);
118 1775549046 : if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
119 : {
120 6797157 : edit_distance_t transposition = v_two_ago[j - 1] + base_cost;
121 6797157 : cheapest = std::min (cheapest, transposition);
122 : }
123 1775549046 : v_next[j + 1] = cheapest;
124 : }
125 :
126 : /* Prepare to move on to next row. */
127 1981211164 : for (int j = 0; j < len_s + 1; j++)
128 : {
129 1878380105 : v_two_ago[j] = v_one_ago[j];
130 1878380105 : v_one_ago[j] = v_next[j];
131 : }
132 : }
133 :
134 5950315 : if (debug)
135 : {
136 : printf ("final v_next = ");
137 : for (int j = 0; j < len_s + 1; j++)
138 : printf ("%i ", v_next[j]);
139 : printf ("\n");
140 : }
141 :
142 5950315 : edit_distance_t result = v_next[len_s];
143 5950315 : delete[] v_two_ago;
144 5950315 : delete[] v_one_ago;
145 5950315 : delete[] v_next;
146 5950315 : return result;
147 : }
148 :
149 : /* Get the edit distance between two nil-terminated strings. */
150 :
151 : edit_distance_t
152 752 : get_edit_distance (const char *s, const char *t)
153 : {
154 752 : return get_edit_distance (s, strlen (s), t, strlen (t));
155 : }
156 :
157 : /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
158 : an autovec of non-NULL strings, determine which element within
159 : CANDIDATES has the lowest edit distance to TARGET. If there are
160 : multiple elements with the same minimal distance, the first in the
161 : vector wins.
162 :
163 : If more than half of the letters were misspelled, the suggestion is
164 : likely to be meaningless, so return NULL for this case. */
165 :
166 : const char *
167 1179 : find_closest_string (const char *target,
168 : const auto_vec<const char *> *candidates)
169 : {
170 1179 : gcc_assert (target);
171 1179 : gcc_assert (candidates);
172 :
173 1179 : int i;
174 1179 : const char *candidate;
175 1179 : best_match<const char *, const char *> bm (target);
176 6261757 : FOR_EACH_VEC_ELT (*candidates, i, candidate)
177 : {
178 6259399 : gcc_assert (candidate);
179 6259399 : bm.consider (candidate);
180 : }
181 :
182 1179 : return bm.get_best_meaningful_candidate ();
183 : }
184 :
185 : /* Generate the maximum edit distance for which we consider a suggestion
186 : to be meaningful, given a goal of length GOAL_LEN and a candidate of
187 : length CANDIDATE_LEN.
188 :
189 : This is a third of the length of the candidate or of the goal,
190 : whichever is bigger. */
191 :
192 : edit_distance_t
193 6080369 : get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
194 : {
195 6080369 : size_t max_length = std::max (goal_len, candidate_len);
196 6080369 : size_t min_length = std::min (goal_len, candidate_len);
197 :
198 6080369 : gcc_assert (max_length >= min_length);
199 :
200 : /* Special case: don't offer suggestions for a pair of
201 : length == 1 strings (or empty strings). */
202 6080369 : if (max_length <= 1)
203 : return 0;
204 :
205 : /* If the lengths are close, then round down. */
206 6071858 : if (max_length - min_length <= 1)
207 : /* ...but allow an edit distance of at least 1. */
208 1922718 : return base_cost * std::max (max_length / 3, size_t {1});
209 :
210 : /* Otherwise, round up (thus giving a little extra leeway to some cases
211 : involving insertions/deletions). */
212 5108485 : return base_cost * (max_length + 2) / 3;
213 : }
214 :
215 : #if CHECKING_P
216 :
217 : namespace selftest {
218 :
219 : /* Selftests. */
220 :
221 : /* Verify that get_edit_distance (A, B) equals the expected value. */
222 :
223 : static void
224 240 : test_get_edit_distance_one_way (const char *a, const char *b,
225 : edit_distance_t expected)
226 : {
227 240 : edit_distance_t actual = get_edit_distance (a, b);
228 240 : ASSERT_EQ (actual, expected);
229 240 : }
230 :
231 : /* Verify that both
232 : get_edit_distance (A, B)
233 : and
234 : get_edit_distance (B, A)
235 : equal the expected value, to ensure that the function is symmetric. */
236 :
237 : static void
238 120 : test_get_edit_distance_both_ways (const char *a, const char *b,
239 : edit_distance_t expected)
240 : {
241 120 : test_get_edit_distance_one_way (a, b, expected);
242 120 : test_get_edit_distance_one_way (b, a, expected);
243 120 : }
244 :
245 : /* Verify get_edit_distance for a variety of pairs of pre-canned
246 : inputs, comparing against known-good values. */
247 :
248 : static void
249 4 : test_edit_distances ()
250 : {
251 4 : test_get_edit_distance_both_ways ("", "nonempty",
252 : base_cost * strlen ("nonempty"));
253 4 : test_get_edit_distance_both_ways ("saturday", "sunday",
254 : base_cost * 3);
255 4 : test_get_edit_distance_both_ways ("foo", "m_foo", base_cost * 2);
256 4 : test_get_edit_distance_both_ways ("hello_world", "HelloWorld", 4);
257 4 : test_get_edit_distance_both_ways
258 4 : ("the quick brown fox jumps over the lazy dog", "dog", base_cost * 40);
259 4 : test_get_edit_distance_both_ways
260 4 : ("the quick brown fox jumps over the lazy dog",
261 : "the quick brown dog jumps over the lazy fox",
262 : base_cost * 4);
263 4 : test_get_edit_distance_both_ways
264 4 : ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
265 : "All your base are belong to us",
266 : base_cost * 44);
267 4 : test_get_edit_distance_both_ways ("foo", "FOO", 3);
268 4 : test_get_edit_distance_both_ways ("fee", "deed", base_cost * 2);
269 4 : test_get_edit_distance_both_ways ("coorzd1", "coordx1", base_cost * 2);
270 4 : test_get_edit_distance_both_ways ("assert", "sqrt", base_cost * 3);
271 4 : test_get_edit_distance_both_ways ("PATH_MAX", "INT8_MAX", base_cost * 3);
272 4 : test_get_edit_distance_both_ways ("time", "nice", base_cost * 2);
273 4 : test_get_edit_distance_both_ways ("bar", "carg", base_cost * 2);
274 4 : test_get_edit_distance_both_ways ("gtk_widget_show_all",
275 : "GtkWidgetShowAll", 10);
276 4 : test_get_edit_distance_both_ways ("m_bar", "bar", base_cost * 2);
277 4 : test_get_edit_distance_both_ways ("MACRO", "MACRAME", base_cost * 3);
278 4 : test_get_edit_distance_both_ways ("ab", "ac", base_cost * 1);
279 4 : test_get_edit_distance_both_ways ("ab", "a", base_cost * 1);
280 4 : test_get_edit_distance_both_ways ("a", "b", base_cost * 1);
281 4 : test_get_edit_distance_both_ways ("nanl", "name", base_cost * 2);
282 4 : test_get_edit_distance_both_ways ("char", "bar", base_cost * 2);
283 4 : test_get_edit_distance_both_ways ("-optimize", "fsanitize", base_cost * 5);
284 4 : test_get_edit_distance_both_ways ("__DATE__", "__i386__", base_cost * 4);
285 :
286 : /* Examples where transposition helps. */
287 4 : test_get_edit_distance_both_ways ("ab", "ba", base_cost * 1);
288 4 : test_get_edit_distance_both_ways ("ba", "abc", base_cost * 2);
289 4 : test_get_edit_distance_both_ways ("coorzd1", "coordz1", base_cost * 1);
290 4 : test_get_edit_distance_both_ways ("abcdefghijklmnopqrstuvwxyz",
291 : "bacdefghijklmnopqrstuvwxzy",
292 : base_cost * 2);
293 4 : test_get_edit_distance_both_ways ("saturday", "sundya", base_cost * 4);
294 4 : test_get_edit_distance_both_ways ("signed", "singed", base_cost * 1);
295 4 : }
296 :
297 : /* Subroutine of test_get_edit_distance_cutoff, for emulating the
298 : spellchecking cutoff in up to GCC 8. */
299 :
300 : static edit_distance_t
301 3600 : get_old_cutoff (size_t goal_len, size_t candidate_len)
302 : {
303 3600 : return base_cost * std::max (goal_len, candidate_len) / 2;
304 : }
305 :
306 : /* Verify that the cutoff for "meaningfulness" of suggestions is at least as
307 : conservative as in older GCC releases.
308 :
309 : This should ensure that we don't offer additional meaningless
310 : suggestions (apart from those for which transposition has helped). */
311 :
312 : static void
313 4 : test_get_edit_distance_cutoff ()
314 : {
315 124 : for (size_t goal_len = 0; goal_len < 30; goal_len++)
316 3720 : for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
317 3600 : ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
318 : <= get_old_cutoff (goal_len, candidate_len));
319 4 : }
320 :
321 : /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
322 :
323 : static void
324 20 : assert_suggested_for (const location &loc, const char *candidate,
325 : const char *target)
326 : {
327 20 : auto_vec<const char *> candidates;
328 20 : candidates.safe_push (candidate);
329 20 : ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
330 20 : }
331 :
332 : /* Assert that CANDIDATE is offered as a suggestion for TARGET. */
333 :
334 : #define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET) \
335 : SELFTEST_BEGIN_STMT \
336 : assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
337 : SELFTEST_END_STMT
338 :
339 : /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
340 :
341 : static void
342 40 : assert_not_suggested_for (const location &loc, const char *candidate,
343 : const char *target)
344 : {
345 40 : auto_vec<const char *> candidates;
346 40 : candidates.safe_push (candidate);
347 40 : ASSERT_EQ_AT (loc, nullptr, find_closest_string (target, &candidates));
348 40 : }
349 :
350 : /* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
351 :
352 : #define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET) \
353 : SELFTEST_BEGIN_STMT \
354 : assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
355 : SELFTEST_END_STMT
356 :
357 : /* Verify that we offer varous suggestions that are meaningful,
358 : and that we don't offer various other ones that aren't (PR c/82967). */
359 :
360 : static void
361 4 : test_suggestions ()
362 : {
363 : /* Good suggestions. */
364 :
365 4 : ASSERT_SUGGESTED_FOR ("m_bar", "bar");
366 : // dist == 2, max_length == 5, min_length == 3
367 :
368 4 : ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
369 : // dist == 3, max_length == 7, min_length == 5
370 :
371 4 : ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
372 : // dist == 7, max_length == 16, min_length = 19
373 :
374 4 : ASSERT_SUGGESTED_FOR ("ab", "ac");
375 : // dist == 1, max_length == min_length = 2
376 :
377 4 : ASSERT_SUGGESTED_FOR ("ab", "a");
378 : // dist == 1, max_length == 2, min_length = 1
379 :
380 : /* Bad suggestions. */
381 :
382 4 : ASSERT_NOT_SUGGESTED_FOR ("a", "b");
383 : // dist == 1, max_length == min_length = 1
384 :
385 4 : ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
386 : // dist == 3, max_length 6, min_length == 4
387 :
388 4 : ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
389 : // dist == 3, max_length == min_length == 8
390 :
391 4 : ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
392 4 : ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
393 : // dist == 2, max_length == min_length == 4
394 :
395 4 : ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
396 4 : ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
397 : // dist == 2, max_length == 4, min_length == 3
398 :
399 4 : ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
400 : // dist == 5, max_length == min_length == 9
401 :
402 4 : ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
403 : // dist == 4, max_length == min_length == 8
404 :
405 4 : ASSERT_NOT_SUGGESTED_FOR ("start_input_device", "InputDevice");
406 : // dist == 9, max_length == 18, min_length == 11
407 4 : }
408 :
409 : /* Verify that find_closest_string is sane. */
410 :
411 : static void
412 4 : test_find_closest_string ()
413 : {
414 4 : auto_vec<const char *> candidates;
415 :
416 : /* Verify that it can handle an empty vec. */
417 4 : ASSERT_EQ (nullptr, find_closest_string ("", &candidates));
418 :
419 : /* Verify that it works sanely for non-empty vecs. */
420 4 : candidates.safe_push ("apple");
421 4 : candidates.safe_push ("banana");
422 4 : candidates.safe_push ("cherry");
423 :
424 4 : ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
425 4 : ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
426 4 : ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
427 4 : ASSERT_EQ (nullptr, find_closest_string ("not like the others", &candidates));
428 :
429 : /* The order of the vec can matter, but it should not matter for these
430 : inputs. */
431 4 : candidates.truncate (0);
432 4 : candidates.safe_push ("cherry");
433 4 : candidates.safe_push ("banana");
434 4 : candidates.safe_push ("apple");
435 4 : ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
436 4 : ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
437 4 : ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
438 4 : ASSERT_EQ (nullptr, find_closest_string ("not like the others", &candidates));
439 :
440 : /* If the goal string somehow makes it into the candidate list, offering
441 : it as a suggestion will be nonsensical. Verify that we don't offer such
442 : suggestions. */
443 4 : ASSERT_EQ (nullptr, find_closest_string ("banana", &candidates));
444 :
445 : /* Example from PR 69968 where transposition helps. */
446 4 : candidates.truncate (0);
447 4 : candidates.safe_push("coordx");
448 4 : candidates.safe_push("coordy");
449 4 : candidates.safe_push("coordz");
450 4 : candidates.safe_push("coordx1");
451 4 : candidates.safe_push("coordy1");
452 4 : candidates.safe_push("coordz1");
453 4 : ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
454 :
455 4 : candidates.truncate (0);
456 4 : candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
457 4 : candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
458 4 : candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
459 4 : ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
460 : find_closest_string ("DWARF_GNAT_ENCODINGS_all",
461 : &candidates));
462 :
463 : /* The same as the previous test, but with a different order of
464 : candidates. */
465 4 : candidates.truncate (0);
466 4 : candidates.safe_push ("DWARF_GNAT_ENCODINGS_ALL");
467 4 : candidates.safe_push ("DWARF_GNAT_ENCODINGS_GDB");
468 4 : candidates.safe_push ("DWARF_GNAT_ENCODINGS_MINIMAL");
469 4 : ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
470 : find_closest_string ("DWARF_GNAT_ENCODINGS_all",
471 : &candidates));
472 :
473 : /* Example from PR 105564 where option name with missing equal
474 : sign should win. */
475 4 : candidates.truncate (0);
476 4 : candidates.safe_push ("-Wtrivial-auto-var-init");
477 4 : candidates.safe_push ("-ftrivial-auto-var-init=");
478 4 : ASSERT_STREQ ("-ftrivial-auto-var-init=",
479 : find_closest_string ("-ftrivial-auto-var-init",
480 : &candidates));
481 4 : }
482 :
483 : /* Test data for test_metric_conditions. */
484 :
485 : static const char * const test_data[] = {
486 : "",
487 : "foo",
488 : "food",
489 : "boo",
490 : "1234567890123456789012345678901234567890123456789012345678901234567890",
491 : "abc",
492 : "ac",
493 : "ca",
494 : };
495 :
496 : /* Verify that get_edit_distance appears to be a sane distance function,
497 : even though it doesn't satisfy the conditions for being a metric. (This
498 : is because the triangle inequality fails to hold: the distance between
499 : "ca" and "ac" is 1, and so is the distance between "abc" and "ac", but
500 : the distance between "abc" and "ca" is 3. Algorithms that calculate the
501 : true Levenshtein-Damerau metric are much more expensive.) */
502 :
503 : static void
504 4 : test_metric_conditions ()
505 : {
506 4 : const int num_test_cases = ARRAY_SIZE (test_data);
507 :
508 36 : for (int i = 0; i < num_test_cases; i++)
509 : {
510 288 : for (int j = 0; j < num_test_cases; j++)
511 : {
512 256 : edit_distance_t dist_ij
513 256 : = get_edit_distance (test_data[i], test_data[j]);
514 :
515 : /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
516 256 : if (i == j)
517 32 : ASSERT_EQ (dist_ij, 0);
518 : else
519 224 : ASSERT_TRUE (dist_ij > 0);
520 :
521 : /* Symmetry: d(i, j) == d(j, i). */
522 256 : edit_distance_t dist_ji
523 256 : = get_edit_distance (test_data[j], test_data[i]);
524 256 : ASSERT_EQ (dist_ij, dist_ji);
525 : }
526 : }
527 4 : }
528 :
529 : /* Run all of the selftests within this file. */
530 :
531 : void
532 4 : spellcheck_cc_tests ()
533 : {
534 4 : test_edit_distances ();
535 4 : test_get_edit_distance_cutoff ();
536 4 : test_suggestions ();
537 4 : test_find_closest_string ();
538 4 : test_metric_conditions ();
539 4 : }
540 :
541 : } // namespace selftest
542 :
543 : #endif /* #if CHECKING_P */
|