changeset 27173:0ca8a2a1cd25

Big performance improvement for fts-based tools that use FTS_NOSTAT. Avoid spurious inode-mismatch problems on non-POSIX file systems. Details: http://article.gmane.org/gmane.comp.lib.gnulib.bugs/7416 * lib/fts_.h (FTS_DEFER_STAT): Define new flag. (FTS_OPTIONMASK): Extend the mask to reflect this addition. * lib/fts.c (DT_IS_KNOWN, DT_MUST_BE): Define. (FTS_NO_STAT_REQUIRED, FTS_STAT_REQUIRED): Define. (fts_set_stat_required): New function. (fts_open): Defer the calls to fts_stat, if possible or requested. Move the code that maps a command-line fts_info value FTS_DOT to FTS_D into fts_stat itself. (fts_read): Perform any required (deferred) fts_stat call. (fts_build): Likewise, for the directory we're about to open and read. In the readdir loop, carefully decide whether each entry will require an eventual call to fts_stat, using dirent.d_type info if available. (fts_stat): Move the test for whether to honor FTS_COMFOLLOW on a command line argument into this function. Update all callers. Map a return value of FTS_DOT to FTS_D for a command line argument. * modules/fts (Depends-on): Add d-type. Alphabetize. Thanks to Miklos Szeredi for his tenacity and for the initial bug report about "find" failing on a FUSE-based file system.
author Jim Meyering <jim@meyering.net>
date Thu, 12 Oct 2006 10:32:32 +0000
parents 1f17423add4d
children 48b6f30eae9e
files ChangeLog lib/fts.c lib/fts_.h modules/fts
diffstat 4 files changed, 146 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Thu Oct 12 08:01:18 2006 +0000
+++ b/ChangeLog	Thu Oct 12 10:32:32 2006 +0000
@@ -1,3 +1,27 @@
+2006-10-12  Jim Meyering  <jim@meyering.net>
+
+	Big performance improvement for fts-based tools that use FTS_NOSTAT.
+	Avoid spurious inode-mismatch problems on non-POSIX file systems.
+	Details: http://article.gmane.org/gmane.comp.lib.gnulib.bugs/7416
+	* lib/fts_.h (FTS_DEFER_STAT): Define new flag.
+	(FTS_OPTIONMASK): Extend the mask to reflect this addition.
+	* lib/fts.c (DT_IS_KNOWN, DT_MUST_BE): Define.
+	(FTS_NO_STAT_REQUIRED, FTS_STAT_REQUIRED): Define.
+	(fts_set_stat_required): New function.
+	(fts_open): Defer the calls to fts_stat, if possible or requested.
+	Move the code that maps a command-line fts_info value FTS_DOT to FTS_D
+	into fts_stat itself.
+	(fts_read): Perform any required (deferred) fts_stat call.
+	(fts_build): Likewise, for the directory we're about to open and read.
+	In the readdir loop, carefully decide whether each entry will require
+	an eventual call to fts_stat, using dirent.d_type info if available.
+	(fts_stat): Move the test for whether to honor FTS_COMFOLLOW on
+	a command line argument into this function.  Update all callers.
+	Map a return value of FTS_DOT to FTS_D for a command line argument.
+	* modules/fts (Depends-on): Add d-type.  Alphabetize.
+	Thanks to Miklos Szeredi for his tenacity and for the initial
+	bug report about "find" failing on a FUSE-based file system.
+
 2006-10-12  Paul Eggert  <eggert@cs.ucla.edu>
 
 	* m4/extensions.m4 (AC_USE_SYSTEM_EXTENSIONS): Renamed from
--- a/lib/fts.c	Thu Oct 12 08:01:18 2006 +0000
+++ b/lib/fts.c	Thu Oct 12 10:32:32 2006 +0000
@@ -81,6 +81,22 @@
 # define _D_EXACT_NAMLEN(dirent) strlen ((dirent)->d_name)
 #endif
 
+#if HAVE_STRUCT_DIRENT_D_TYPE
+/* True if the type of the directory entry D is known.  */
+# define DT_IS_KNOWN(d) ((d)->d_type != DT_UNKNOWN)
+/* True if the type of the directory entry D must be T.  */
+# define DT_MUST_BE(d, t) ((d)->d_type == (t))
+#else
+# define DT_IS_KNOWN(d) false
+# define DT_MUST_BE(d, t) false
+#endif
+
+enum
+{
+  FTS_NO_STAT_REQUIRED = 1,
+  FTS_STAT_REQUIRED = 2
+};
+
 #ifdef _LIBC
 # undef close
 # define close __close
@@ -195,6 +211,19 @@
     }								\
   while (false)
 
+/* Overload the fts_statp->st_size member (otherwise unused, when
+   fts_info is FTS_NSOK) to indicate whether fts_read should stat
+   this entry or not.  */
+static void
+fts_set_stat_required (FTSENT *p, bool required)
+{
+  if (p->fts_info != FTS_NSOK)
+    abort ();
+  p->fts_statp->st_size = (required
+			   ? FTS_STAT_REQUIRED
+			   : FTS_NO_STAT_REQUIRED);
+}
+
 /* file-descriptor-relative opendir.  */
 /* FIXME: if others need this function, move it into lib/openat.c */
 static inline DIR *
@@ -258,6 +287,7 @@
 	FTSENT *parent = NULL;
 	FTSENT *tmp = NULL;	/* pacify gcc */
 	size_t len;
+	bool defer_stat;
 
 	/* Options check. */
 	if (options & ~FTS_OPTIONMASK) {
@@ -340,6 +370,19 @@
 		parent->fts_level = FTS_ROOTPARENTLEVEL;
 	  }
 
+	/* The classic fts implementation would call fts_stat with
+	   a new entry for each iteration of the loop below.
+	   If the comparison function is not specified or if the
+	   FTS_DEFER_STAT option is in effect, don't stat any entry
+	   in this loop.  This is an attempt to minimize the interval
+	   between the initial stat/lstat/fstatat and the point at which
+	   a directory argument is first opened.  This matters for any
+	   directory command line argument that resides on a file system
+	   without genuine i-nodes.  If you specify FTS_DEFER_STAT along
+	   with a comparison function, that function must not access any
+	   data via the fts_statp pointer.  */
+	defer_stat = (compar == NULL || ISSET(FTS_DEFER_STAT));
+
 	/* Allocate/initialize root(s). */
 	for (root = NULL, nitems = 0; *argv != NULL; ++argv, ++nitems) {
 		/* Don't allow zero-length file names. */
@@ -353,11 +396,12 @@
 		p->fts_level = FTS_ROOTLEVEL;
 		p->fts_parent = parent;
 		p->fts_accpath = p->fts_name;
-		p->fts_info = fts_stat(sp, p, ISSET(FTS_COMFOLLOW) != 0);
-
-		/* Command-line "." and ".." are real directories. */
-		if (p->fts_info == FTS_DOT)
-			p->fts_info = FTS_D;
+		if (defer_stat) {
+			p->fts_info = FTS_NSOK;
+			fts_set_stat_required(p, true);
+		} else {
+			p->fts_info = fts_stat(sp, p, false);
+		}
 
 		/*
 		 * If comparison routine supplied, traverse in sorted
@@ -645,6 +689,19 @@
 		*t++ = '/';
 		memmove(t, p->fts_name, p->fts_namelen + 1);
 check_for_dir:
+		if (p->fts_info == FTS_NSOK)
+		  {
+		    switch (p->fts_statp->st_size)
+		      {
+		      case FTS_STAT_REQUIRED:
+			p->fts_info = fts_stat(sp, p, false);
+			break;
+		      case FTS_NO_STAT_REQUIRED:
+			break;
+		      default:
+			abort ();
+		      }
+		  }
 		sp->fts_cur = p;
 		if (p->fts_info == FTS_D)
 		  {
@@ -672,6 +729,9 @@
 		return (sp->fts_cur = NULL);
 	}
 
+	if (p->fts_info == FTS_NSOK)
+	  abort ();
+
 	/* NUL terminate the file name.  */
 	sp->fts_path[p->fts_pathlen] = '\0';
 
@@ -862,6 +922,11 @@
 		}
 		return (NULL);
 	}
+       /* Rather than calling fts_stat for each and every entry encountered
+	  in the readdir loop (below), stat each directory only right after
+	  opening it.  */
+       if (cur->fts_info == FTS_NSOK)
+	 cur->fts_info = fts_stat(sp, cur, false);
 
 	/*
 	 * Nlinks is the number of possible entries of type directory in the
@@ -1004,12 +1069,38 @@
 			memmove(cp, p->fts_name, p->fts_namelen + 1);
 		} else
 			p->fts_accpath = p->fts_name;
-		/* Stat it. */
-		p->fts_info = fts_stat(sp, p, false);
+
+		bool is_dir;
+		if (sp->fts_compar == NULL || ISSET(FTS_DEFER_STAT)) {
+			/* Record what fts_read will have to do with this
+			   entry. In many cases, it will simply fts_stat it,
+			   but we can take advantage of any d_type information
+			   to optimize away the unnecessary stat calls.  I.e.,
+			   if FTS_NOSTAT is in effect and we're not following
+			   symlinks (FTS_PHYSICAL) and d_type indicates this
+			   is *not* a directory, then we won't have to stat it
+			   at all.  If it *is* a directory, then (currently)
+			   we stat it regardless, in order to get device and
+			   inode numbers.  Some day we might optimize that
+			   away, too, for directories where d_ino is known to
+			   be valid.  */
+			bool skip_stat = (ISSET(FTS_PHYSICAL)
+					  && ISSET(FTS_NOSTAT)
+					  && DT_IS_KNOWN(dp)
+					  && ! DT_MUST_BE(dp, DT_DIR));
+			p->fts_info = FTS_NSOK;
+			fts_set_stat_required(p, !skip_stat);
+			is_dir = (ISSET(FTS_PHYSICAL) && ISSET(FTS_NOSTAT)
+				  && DT_MUST_BE(dp, DT_DIR));
+		} else {
+			p->fts_info = fts_stat(sp, p, false);
+			is_dir = (p->fts_info == FTS_D
+				  || p->fts_info == FTS_DC
+				  || p->fts_info == FTS_DOT);
+		}
 
 		/* Decrement link count if applicable. */
-		if (nlinks > 0 && (p->fts_info == FTS_D ||
-		    p->fts_info == FTS_DC || p->fts_info == FTS_DOT))
+		if (nlinks > 0 && is_dir)
 			nlinks -= nostat;
 
 		/* We walk in directory order so "ls -f" doesn't get upset. */
@@ -1143,6 +1234,9 @@
 	struct stat *sbp = p->fts_statp;
 	int saved_errno;
 
+	if (p->fts_level == FTS_ROOTLEVEL && ISSET(FTS_COMFOLLOW))
+		follow = true;
+
 #if defined FTS_WHITEOUT && 0
 	/* check for whiteout */
 	if (p->fts_flags & FTS_ISW) {
@@ -1176,8 +1270,10 @@
 	}
 
 	if (S_ISDIR(sbp->st_mode)) {
-		if (ISDOT(p->fts_name))
-			return (FTS_DOT);
+		if (ISDOT(p->fts_name)) {
+			/* Command-line "." and ".." are real directories. */
+			return (p->fts_level == FTS_ROOTLEVEL ? FTS_D : FTS_DOT);
+		}
 
 #if _LGPL_PACKAGE
 		{
--- a/lib/fts_.h	Thu Oct 12 08:01:18 2006 +0000
+++ b/lib/fts_.h	Thu Oct 12 10:32:32 2006 +0000
@@ -123,7 +123,19 @@
      through the file descriptor member, fts_cwd_fd.  */
 # define FTS_CWDFD		0x0200
 
-# define FTS_OPTIONMASK	0x03ff		/* valid user option mask */
+  /* Historically, for each directory that fts initially encounters, it would
+     open it, read all entries, and stat each entry, storing the results, and
+     then it would process the first entry.  But that behavior is bad for
+     locality of reference, and also causes trouble with inode-simulating
+     file systems like FAT, CIFS, FUSE-based ones, etc., when entries from
+     their name/inode cache are flushed too early.
+     Use this flag to make fts_open and fts_read defer the stat/lstat/fststat
+     of each entry until it actually processed.  However, note that if you use
+     this option and also specify a comparison function, that function may not
+     examine any data via fts_statp.  */
+# define FTS_DEFER_STAT		0x0400
+
+# define FTS_OPTIONMASK	0x07ff		/* valid user option mask */
 
 # define FTS_NAMEONLY	0x1000		/* (private) child names only */
 # define FTS_STOP	0x2000		/* (private) unrecoverable error */
--- a/modules/fts	Thu Oct 12 08:01:18 2006 +0000
+++ b/modules/fts	Thu Oct 12 10:32:32 2006 +0000
@@ -9,13 +9,14 @@
 
 Depends-on:
 cycle-check
+d-type
 dirfd
 fcntl
+fcntl-safer
 hash
 lstat
 openat
 stdbool
-fcntl-safer
 unistd-safer
 
 configure.ac: