comparison lib/fstrcmp.c @ 30128:c66969b563a8

New function fstrcmp_bounded.
author Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
date Sun, 14 Sep 2008 21:08:57 +0200
parents 896026426559
children 1a57c9d644f2
comparison
equal deleted inserted replaced
30127:8bc04be64aed 30128:c66969b563a8
95 95
96 /* Ensure that keys_init is called once only. */ 96 /* Ensure that keys_init is called once only. */
97 gl_once_define(static, keys_init_once) 97 gl_once_define(static, keys_init_once)
98 98
99 99
100 /* NAME
101 fstrcmp - fuzzy string compare
102
103 SYNOPSIS
104 double fstrcmp(const char *, const char *);
105
106 DESCRIPTION
107 The fstrcmp function may be used to compare two string for
108 similarity. It is very useful in reducing "cascade" or
109 "secondary" errors in compilers or other situations where
110 symbol tables occur.
111
112 RETURNS
113 double; 0 if the strings are entirly dissimilar, 1 if the
114 strings are identical, and a number in between if they are
115 similar. */
116
117 double 100 double
118 fstrcmp (const char *string1, const char *string2) 101 fstrcmp_bounded (const char *string1, const char *string2, double lower_bound)
119 { 102 {
120 struct context ctxt; 103 struct context ctxt;
121 int xvec_length; 104 int xvec_length = strlen (string1);
122 int yvec_length; 105 int yvec_length = strlen (string2);
123 int i; 106 int i;
124 107
125 size_t fdiag_len; 108 size_t fdiag_len;
126 int *buffer; 109 int *buffer;
127 size_t bufmax; 110 size_t bufmax;
128 111
112 /* short-circuit obvious comparisons */
113 if (xvec_length == 0 || yvec_length == 0)
114 return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0);
115
116 if (lower_bound > 0)
117 {
118 /* Compute a quick upper bound.
119 Each edit is an insertion or deletion of an element, hence modifies
120 the length of the sequence by at most 1.
121 Therefore, when starting from a sequence X and ending at a sequence Y,
122 with N edits, | yvec_length - xvec_length | <= N. (Proof by
123 induction over N.)
124 So, at the end, we will have
125 xvec_edit_count + yvec_edit_count >= | xvec_length - yvec_length |.
126 and hence
127 result
128 = (xvec_length + yvec_length - (xvec_edit_count + yvec_edit_count))
129 / (xvec_length + yvec_length)
130 <= (xvec_length + yvec_length - | yvec_length - xvec_length |)
131 / (xvec_length + yvec_length)
132 = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length).
133 */
134 volatile double upper_bound =
135 (double) (2 * MIN (xvec_length, yvec_length))
136 / (xvec_length + yvec_length);
137
138 if (upper_bound < lower_bound)
139 /* Return an arbitrary value < LOWER_BOUND. */
140 return 0.0;
141 }
142
129 /* set the info for each string. */ 143 /* set the info for each string. */
130 ctxt.xvec = string1; 144 ctxt.xvec = string1;
131 xvec_length = strlen (string1);
132 ctxt.yvec = string2; 145 ctxt.yvec = string2;
133 yvec_length = strlen (string2);
134
135 /* short-circuit obvious comparisons */
136 if (xvec_length == 0 && yvec_length == 0)
137 return 1.0;
138 if (xvec_length == 0 || yvec_length == 0)
139 return 0.0;
140 146
141 /* Set TOO_EXPENSIVE to be approximate square root of input size, 147 /* Set TOO_EXPENSIVE to be approximate square root of input size,
142 bounded below by 256. */ 148 bounded below by 256. */
143 ctxt.too_expensive = 1; 149 ctxt.too_expensive = 1;
144 for (i = xvec_length + yvec_length; 150 for (i = xvec_length + yvec_length;