changeset 30128:c66969b563a8

New function fstrcmp_bounded.
author Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
date Sun, 14 Sep 2008 21:08:57 +0200
parents 8bc04be64aed
children fcb5098c37dd
files ChangeLog lib/fstrcmp.c lib/fstrcmp.h tests/test-fstrcmp.c
diffstat 4 files changed, 82 insertions(+), 32 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sun Sep 14 18:34:59 2008 +0200
+++ b/ChangeLog	Sun Sep 14 21:08:57 2008 +0200
@@ -1,3 +1,11 @@
+2008-09-14  Ralf Wildenhues  <Ralf.Wildenhues@gmx.de>
+
+	* lib/fstrcmp.h (fstrcmp_bounded): New declaration.
+	(fstrcmp): Define in terms of fstrcmp_bounded.
+	* lib/fstrcmp.c (fstrcmp_bounded): Renamed from fstrcmp. Add lower_bound argument.
+	Return quickly if the result is certainly < lower_bound.
+	* tests/test-fstrcmp.c (check_fstrcmp): Test also fstrcmp_bounded.
+
 2008-09-14  Ralf Wildenhues  <Ralf.Wildenhues@gmx.de>
 
 	* lib/diffseq.h (EARLY_ABORT): New macro.
--- a/lib/fstrcmp.c	Sun Sep 14 18:34:59 2008 +0200
+++ b/lib/fstrcmp.c	Sun Sep 14 21:08:57 2008 +0200
@@ -97,46 +97,52 @@
 gl_once_define(static, keys_init_once)
 
 
-/* NAME
-	fstrcmp - fuzzy string compare
-
-   SYNOPSIS
-	double fstrcmp(const char *, const char *);
-
-   DESCRIPTION
-	The fstrcmp function may be used to compare two string for
-	similarity.  It is very useful in reducing "cascade" or
-	"secondary" errors in compilers or other situations where
-	symbol tables occur.
-
-   RETURNS
-	double; 0 if the strings are entirly dissimilar, 1 if the
-	strings are identical, and a number in between if they are
-	similar.  */
-
 double
-fstrcmp (const char *string1, const char *string2)
+fstrcmp_bounded (const char *string1, const char *string2, double lower_bound)
 {
   struct context ctxt;
-  int xvec_length;
-  int yvec_length;
+  int xvec_length = strlen (string1);
+  int yvec_length = strlen (string2);
   int i;
 
   size_t fdiag_len;
   int *buffer;
   size_t bufmax;
 
+  /* short-circuit obvious comparisons */
+  if (xvec_length == 0 || yvec_length == 0)
+    return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0);
+
+  if (lower_bound > 0)
+    {
+      /* Compute a quick upper bound.
+	 Each edit is an insertion or deletion of an element, hence modifies
+	 the length of the sequence by at most 1.
+	 Therefore, when starting from a sequence X and ending at a sequence Y,
+	 with N edits,  | yvec_length - xvec_length | <= N.  (Proof by
+	 induction over N.)
+	 So, at the end, we will have
+	   xvec_edit_count + yvec_edit_count >= | xvec_length - yvec_length |.
+	 and hence
+	   result
+	     = (xvec_length + yvec_length - (xvec_edit_count + yvec_edit_count))
+	       / (xvec_length + yvec_length)
+	     <= (xvec_length + yvec_length - | yvec_length - xvec_length |)
+		/ (xvec_length + yvec_length)
+	     = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length).
+       */
+      volatile double upper_bound =
+	(double) (2 * MIN (xvec_length, yvec_length))
+	/ (xvec_length + yvec_length);
+
+      if (upper_bound < lower_bound)
+	/* Return an arbitrary value < LOWER_BOUND.  */
+	return 0.0;
+    }
+
   /* set the info for each string.  */
   ctxt.xvec = string1;
-  xvec_length = strlen (string1);
   ctxt.yvec = string2;
-  yvec_length = strlen (string2);
-
-  /* short-circuit obvious comparisons */
-  if (xvec_length == 0 && yvec_length == 0)
-    return 1.0;
-  if (xvec_length == 0 || yvec_length == 0)
-    return 0.0;
 
   /* Set TOO_EXPENSIVE to be approximate square root of input size,
      bounded below by 256.  */
--- a/lib/fstrcmp.h	Sun Sep 14 18:34:59 2008 +0200
+++ b/lib/fstrcmp.h	Sun Sep 14 21:08:57 2008 +0200
@@ -1,5 +1,6 @@
 /* Fuzzy string comparison.
-   Copyright (C) 1995, 2000, 2002-2003, 2006 Free Software Foundation, Inc.
+   Copyright (C) 1995, 2000, 2002-2003, 2006, 2008 Free Software
+   Foundation, Inc.
 
    This file was written by Peter Miller <pmiller@agso.gov.au>
 
@@ -24,9 +25,19 @@
 #endif
 
 /* Fuzzy compare of S1 and S2.  Return a measure for the similarity of S1
-   and S1.  The higher the result, the more similar the strings are.  */
+   and S1.  The higher the result, the more similar the strings are.
+   The result is bounded between 0 (meaning very dissimilar strings) and
+   1 (meaning identical strings).  */
 extern double fstrcmp (const char *s1, const char *s2);
 
+/* Like fstrcmp (S1, S2), except that if the result is < LOWER_BOUND, an
+   arbitrary other value < LOWER_BOUND can be returned.  */
+extern double fstrcmp_bounded (const char *s1, const char *s2,
+			       double lower_bound);
+
+/* A shortcut for fstrcmp.  Avoids a function call.  */
+#define fstrcmp(s1,s2) fstrcmp_bounded (s1, s2, 0.0)
+
 #ifdef __cplusplus
 }
 #endif
--- a/tests/test-fstrcmp.c	Sun Sep 14 18:34:59 2008 +0200
+++ b/tests/test-fstrcmp.c	Sun Sep 14 21:08:57 2008 +0200
@@ -45,8 +45,33 @@
      compliant by default, to avoid that msgmerge results become platform and
      compiler option dependent.  'volatile' is a portable alternative to gcc's
      -ffloat-store option.  */
-  volatile double result = fstrcmp (string1, string2);
-  return (result == expected);
+  {
+    volatile double result = fstrcmp (string1, string2);
+    if (!(result == expected))
+      return false;
+  }
+  {
+    volatile double result = fstrcmp_bounded (string1, string2, expected);
+    if (!(result == expected))
+      return false;
+  }
+  {
+    double bound = expected * 0.5; /* implies bound <= expected */
+    volatile double result = fstrcmp_bounded (string1, string2, bound);
+    if (!(result == expected))
+      return false;
+  }
+  {
+    double bound = (1 + expected) * 0.5;
+    if (expected < bound)
+      {
+	volatile double result = fstrcmp_bounded (string1, string2, bound);
+	if (!(result < bound))
+	  return false;
+      }
+  }
+
+  return true;
 }
 
 int