changeset 27124:f291e7f1f63e

Update comments. Talk about vectors instead of files/strings, and about elements instead of lines/characters.
author Bruno Haible <bruno@clisp.org>
date Sat, 07 Oct 2006 16:25:52 +0000
parents 0887e66a91e4
children b484f3df2d7b
files lib/diffseq.h lib/fstrcmp.c
diffstat 2 files changed, 63 insertions(+), 58 deletions(-) [+]
line wrap: on
line diff
--- a/lib/diffseq.h	Sat Oct 07 16:07:31 2006 +0000
+++ b/lib/diffseq.h	Sat Oct 07 16:25:52 2006 +0000
@@ -1,4 +1,4 @@
-/* Analyze differences between two sequences.
+/* Analyze differences between two vectors.
    Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006 Free Software Foundation, Inc.
 
    This program is free software; you can redistribute it and/or modify
@@ -16,21 +16,30 @@
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.  */
 
 
-/* The basic algorithm is described in:
+/* The basic idea is to consider two vectors as similar if, when
+   transforming the first vector into the second vector through a
+   sequence of edits (inserts and deletes of one character each),
+   this sequence is short - or equivalently, if the ordered list
+   of elements that are untouched by these edits is long.  For a
+   good introduction to the subject, read about the "Levenshtein
+   distance" in Wikipedia.
+
+   The basic algorithm is described in:
    "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
    Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
    see especially section 4.2, which describes the variation used below.
-   Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
-   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
-   at the price of producing suboptimal output for large inputs with
-   many differences.
 
    The basic algorithm was independently discovered as described in:
    "Algorithms for Approximate String Matching", E. Ukkonen,
-   Information and Control Vol. 64, 1985, pp. 100-118.  */
+   Information and Control Vol. 64, 1985, pp. 100-118.
+
+   Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
+   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
+   at the price of producing suboptimal output for large inputs with
+   many differences.  */
 
 /* Before including this file, you need to define:
-     ELEMENT                 The element type of the sequences being compared.
+     ELEMENT                 The element type of the vectors being compared.
      EQUAL                   A two-argument macro that tests two elements for
                              equality.
      OFFSET                  A signed integer type sufficient to hold the
@@ -70,10 +79,10 @@
   bool hi_minimal;
 };
 
-/* Find the midpoint of the shortest edit script for a specified
-   portion of the two files.
+/* Find the midpoint of the shortest edit script for a specified portion
+   of the two vectors.
 
-   Scan from the beginnings of the files, and simultaneously from the ends,
+   Scan from the beginnings of the vectors, and simultaneously from the ends,
    doing a breadth-first search through the space of edit-sequence.
    When the two searches meet, we have found the midpoint of the shortest
    edit sequence.
@@ -83,20 +92,19 @@
    heuristics to stop the search and report a suboptimal answer.
 
    Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
-   XMID - YMID equals the number of inserted lines minus the number
-   of deleted lines (counting only lines before the midpoint).
+   XMID - YMID equals the number of inserted elements minus the number
+   of deleted elements (counting only elements before the midpoint).
 
    Set PART->lo_minimal to true iff the minimal edit script for the
    left half of the partition is known; similarly for PART->hi_minimal.
 
-   This function assumes that the first lines of the specified portions
-   of the two files do not match, and likewise that the last lines do not
-   match.  The caller must trim matching lines from the beginning and end
+   This function assumes that the first elements of the specified portions
+   of the two vectors do not match, and likewise that the last elements do not
+   match.  The caller must trim matching elements from the beginning and end
    of the portions it is going to specify.
 
-   If we return the "wrong" partitions,
-   the worst this can do is cause suboptimal diff output.
-   It cannot cause incorrect diff output.  */
+   If we return the "wrong" partitions, the worst this can do is cause
+   suboptimal diff output.  It cannot cause incorrect diff output.  */
 
 static void
 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
@@ -204,8 +212,8 @@
 	 If we have any such, find the one that has made the most
 	 progress and return it as if it had succeeded.
 
-	 With this heuristic, for files with a constant small density
-	 of changes, the algorithm is linear in the file size.  */
+	 With this heuristic, for vectors with a constant small density
+	 of changes, the algorithm is linear in the vector size.  */
 
       if (c > 200 && big_snake && speed_large_files)
 	{
@@ -340,16 +348,16 @@
     }
 }
 
-/* Compare in detail contiguous subsequences of the two files
+/* Compare in detail contiguous subsequences of the two vectors
    which are known, as a whole, to match each other.
 
    The results are recorded in the vectors files[N].changed, by
    storing 1 in the element for each line that is an insertion or deletion.
 
-   The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
+   The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
 
    Note that XLIM, YLIM are exclusive bounds.
-   All line numbers are origin-0 and discarded lines are not counted.
+   All line numbers are origin-0 and discarded elements are not counted.
 
    If FIND_MINIMAL, find a minimal difference no matter how
    expensive it is.  */
@@ -384,7 +392,7 @@
     {
       struct partition part IF_LINT (= {0});
 
-      /* Find a point of correspondence in the middle of the files.  */
+      /* Find a point of correspondence in the middle of the vectors.  */
       diag (xoff, xlim, yoff, ylim, find_minimal, &part);
 
       /* Use the partitions to split this problem into subproblems.  */
--- a/lib/fstrcmp.c	Sat Oct 07 16:07:31 2006 +0000
+++ b/lib/fstrcmp.c	Sat Oct 07 16:25:52 2006 +0000
@@ -18,11 +18,11 @@
 
    Derived from GNU diff 2.7, analyze.c et al.
 
-   The basic idea is to consider two strings as similar if, when
-   transforming the first string into the second string through a
+   The basic idea is to consider two sequences as similar if, when
+   transforming the first sequence into the second sequence through a
    sequence of edits (inserts and deletes of one character each),
    this sequence is short - or equivalently, if the ordered list
-   of characters that are untouched by these edits is long.  For a
+   of elements that are untouched by these edits is long.  For a
    good introduction to the subject, read about the "Levenshtein
    distance" in Wikipedia.
 
@@ -35,13 +35,10 @@
    "Algorithms for Approximate String Matching", E. Ukkonen,
    Information and Control Vol. 64, 1985, pp. 100-118.
 
-   Unless the 'minimal' flag is set, this code uses the TOO_EXPENSIVE
+   Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
    heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
    at the price of producing suboptimal output for large inputs with
-   many differences.
-
-   Modified to work on strings rather than files
-   by Peter Miller <pmiller@agso.gov.au>, October 1995 */
+   many differences.  */
 
 #include <config.h>
 
@@ -67,10 +64,6 @@
 #define EQUAL(x,y) ((x) == (y))
 #define OFFSET int
 
-/* Maximum value of type OFFSET.  */
-#define OFFSET_MAX \
-  ((((OFFSET)1 << (sizeof (OFFSET_MAX) * CHAR_BIT - 2)) - 1) * 2 + 1)
-
 /* Before including this file, you need to define:
      ELEMENT                 The element type of the sequences being compared.
      EQUAL                   A two-argument macro that tests two elements for
@@ -79,6 +72,10 @@
                              difference between two indices. Usually
                              something like ssize_t.  */
 
+/* Maximum value of type OFFSET.  */
+#define OFFSET_MAX \
+  ((((OFFSET)1 << (sizeof (OFFSET_MAX) * CHAR_BIT - 2)) - 1) * 2 + 1)
+
 /*
  * Context of comparison operation.
  */
@@ -95,7 +92,7 @@
     /* The length of the string to be compared. */
     int data_length;
 
-    /* The number of characters inserted or deleted. */
+    /* The number of elements inserted or deleted. */
     int edit_count;
   }
   string[2];
@@ -103,8 +100,8 @@
   #ifdef MINUS_H_FLAG
 
   /* This corresponds to the diff -H flag.  With this heuristic, for
-     strings with a constant small density of changes, the algorithm is
-     linear in the strings size.  This is unlikely in typical uses of
+     vectors with a constant small density of changes, the algorithm is
+     linear in the vectors size.  This is unlikely in typical uses of
      fstrcmp, and so is usually compiled out.  Besides, there is no
      interface to set it true.  */
   int heuristic;
@@ -150,9 +147,9 @@
 
    DESCRIPTION
 	Find the midpoint of the shortest edit script for a specified
-	portion of the two strings.
+	portion of the two vectors.
 
-	Scan from the beginnings of the strings, and simultaneously from
+	Scan from the beginnings of the vectors, and simultaneously from
 	the ends, doing a breadth-first search through the space of
 	edit-sequence.  When the two searches meet, we have found the
 	midpoint of the shortest edit sequence.
@@ -163,22 +160,22 @@
 
    RETURNS
 	Set PART->(XMID,YMID) to the midpoint (XMID,YMID).  The diagonal
-	number XMID - YMID equals the number of inserted characters
-	minus the number of deleted characters (counting only characters
+	number XMID - YMID equals the number of inserted elements
+	minus the number of deleted elements (counting only elements
 	before the midpoint).  Return the approximate edit cost; this is
-	the total number of characters inserted or deleted (counting
-	only characters before the midpoint), unless a heuristic is used
+	the total number of elements inserted or deleted (counting
+	only elements before the midpoint), unless a heuristic is used
 	to terminate the search prematurely.
 
-	Set PART->LEFT_MINIMAL to nonzero iff the minimal edit script
+	Set PART->lo_minimal to nonzero iff the minimal edit script
 	for the left half of the partition is known; similarly for
-	PART->RIGHT_MINIMAL.
+	PART->hi_minimal.
 
    CAVEAT
-	This function assumes that the first characters of the specified
-	portions of the two strings do not match, and likewise that the
-	last characters do not match.  The caller must trim matching
-	characters from the beginning and end of the portions it is
+	This function assumes that the first elements of the specified
+	portions of the two vectors do not match, and likewise that the
+	last elements do not match.  The caller must trim matching
+	elements from the beginning and end of the portions it is
 	going to specify.
 
 	If we return the "wrong" partitions, the worst this can do is
@@ -310,8 +307,8 @@
          such, find the one that has made the most progress and return
          it as if it had succeeded.
 
-         With this heuristic, for strings with a constant small density
-         of changes, the algorithm is linear in the strings size.  */
+         With this heuristic, for vectors with a constant small density
+         of changes, the algorithm is linear in the vector size.  */
       if (c > 200 && big_snake && ctxt->heuristic)
 	{
 	  OFFSET best;
@@ -487,11 +484,11 @@
 			struct context *ctxt);
 
    DESCRIPTION
-	Compare in detail contiguous subsequences of the two strings
+	Compare in detail contiguous subsequences of the two vectors
 	which are known, as a whole, to match each other.
 
-	The subsequence of string 0 is [XOFF, XLIM) and likewise for
-	string 1.
+	The subsequence of vector 0 is [XOFF, XLIM) and likewise for
+	vector 1.
 
 	Note that XLIM, YLIM are exclusive bounds.  All character
 	numbers are origin-0.
@@ -542,7 +539,7 @@
       OFFSET c;
       struct partition part;
 
-      /* Find a point of correspondence in the middle of the strings.  */
+      /* Find a point of correspondence in the middle of the vectors.  */
       c = diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
       if (c == 1)
 	{