Mercurial > gnulib
comparison lib/fstrcmp.c @ 30128:c66969b563a8
New function fstrcmp_bounded.
author | Ralf Wildenhues <Ralf.Wildenhues@gmx.de> |
---|---|
date | Sun, 14 Sep 2008 21:08:57 +0200 |
parents | 896026426559 |
children | 1a57c9d644f2 |
comparison
equal
deleted
inserted
replaced
30127:8bc04be64aed | 30128:c66969b563a8 |
---|---|
95 | 95 |
96 /* Ensure that keys_init is called once only. */ | 96 /* Ensure that keys_init is called once only. */ |
97 gl_once_define(static, keys_init_once) | 97 gl_once_define(static, keys_init_once) |
98 | 98 |
99 | 99 |
100 /* NAME | |
101 fstrcmp - fuzzy string compare | |
102 | |
103 SYNOPSIS | |
104 double fstrcmp(const char *, const char *); | |
105 | |
106 DESCRIPTION | |
107 The fstrcmp function may be used to compare two string for | |
108 similarity. It is very useful in reducing "cascade" or | |
109 "secondary" errors in compilers or other situations where | |
110 symbol tables occur. | |
111 | |
112 RETURNS | |
113 double; 0 if the strings are entirly dissimilar, 1 if the | |
114 strings are identical, and a number in between if they are | |
115 similar. */ | |
116 | |
117 double | 100 double |
118 fstrcmp (const char *string1, const char *string2) | 101 fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) |
119 { | 102 { |
120 struct context ctxt; | 103 struct context ctxt; |
121 int xvec_length; | 104 int xvec_length = strlen (string1); |
122 int yvec_length; | 105 int yvec_length = strlen (string2); |
123 int i; | 106 int i; |
124 | 107 |
125 size_t fdiag_len; | 108 size_t fdiag_len; |
126 int *buffer; | 109 int *buffer; |
127 size_t bufmax; | 110 size_t bufmax; |
128 | 111 |
112 /* short-circuit obvious comparisons */ | |
113 if (xvec_length == 0 || yvec_length == 0) | |
114 return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0); | |
115 | |
116 if (lower_bound > 0) | |
117 { | |
118 /* Compute a quick upper bound. | |
119 Each edit is an insertion or deletion of an element, hence modifies | |
120 the length of the sequence by at most 1. | |
121 Therefore, when starting from a sequence X and ending at a sequence Y, | |
122 with N edits, | yvec_length - xvec_length | <= N. (Proof by | |
123 induction over N.) | |
124 So, at the end, we will have | |
125 xvec_edit_count + yvec_edit_count >= | xvec_length - yvec_length |. | |
126 and hence | |
127 result | |
128 = (xvec_length + yvec_length - (xvec_edit_count + yvec_edit_count)) | |
129 / (xvec_length + yvec_length) | |
130 <= (xvec_length + yvec_length - | yvec_length - xvec_length |) | |
131 / (xvec_length + yvec_length) | |
132 = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). | |
133 */ | |
134 volatile double upper_bound = | |
135 (double) (2 * MIN (xvec_length, yvec_length)) | |
136 / (xvec_length + yvec_length); | |
137 | |
138 if (upper_bound < lower_bound) | |
139 /* Return an arbitrary value < LOWER_BOUND. */ | |
140 return 0.0; | |
141 } | |
142 | |
129 /* set the info for each string. */ | 143 /* set the info for each string. */ |
130 ctxt.xvec = string1; | 144 ctxt.xvec = string1; |
131 xvec_length = strlen (string1); | |
132 ctxt.yvec = string2; | 145 ctxt.yvec = string2; |
133 yvec_length = strlen (string2); | |
134 | |
135 /* short-circuit obvious comparisons */ | |
136 if (xvec_length == 0 && yvec_length == 0) | |
137 return 1.0; | |
138 if (xvec_length == 0 || yvec_length == 0) | |
139 return 0.0; | |
140 | 146 |
141 /* Set TOO_EXPENSIVE to be approximate square root of input size, | 147 /* Set TOO_EXPENSIVE to be approximate square root of input size, |
142 bounded below by 256. */ | 148 bounded below by 256. */ |
143 ctxt.too_expensive = 1; | 149 ctxt.too_expensive = 1; |
144 for (i = xvec_length + yvec_length; | 150 for (i = xvec_length + yvec_length; |