changeset 27163:47c0b11da31a

hist.m: Improve performance with large numbers of bins (bug #55909). * hist.m: Determine whether bins are equally spaced in input validation. For low bin counts (< 26), use the existing algorithm. For higher bin counts, use a new algorithm based on accumarray and sometimes lookup. Update BIST tests.
author Michael Leitner <michael.leitner@frm2.tum.de>
date Sat, 08 Jun 2019 16:34:27 -0700
parents eab1c573c4fc
children e5f248d614f5
files scripts/plot/draw/hist.m
diffstat 1 files changed, 52 insertions(+), 23 deletions(-) [+]
line wrap: on
line diff
--- a/scripts/plot/draw/hist.m	Fri Jun 07 16:53:09 2019 -0400
+++ b/scripts/plot/draw/hist.m	Sat Jun 08 16:34:27 2019 -0700
@@ -104,6 +104,9 @@
     min_val = double (min_val);
   endif
 
+  ## Equidistant entries allow much more efficient algorithms.
+  equal_bin_spacing = true;
+
   ## Process possible second argument
   if (nargin == 1 || ischar (varargin{iarg}))
     n = 10;
@@ -141,6 +144,13 @@
       endif
       x = x.';  # Convert to matrix;
     elseif (isvector (x))
+      equal_bin_spacing = strcmp (typeinfo (x), "range");
+      if (! equal_bin_spacing)
+        diffs = diff (x);
+        if (all (diffs == diffs(1)))
+          equal_bin_spacing = true;
+        endif
+      endif
       x = x(:);
       if (! issorted (x))
         warning ("hist: bin values X not sorted on input");
@@ -162,35 +172,54 @@
 
   ## Perform histogram calculation
   cutoff = (x(1:end-1,:) + x(2:end,:)) / 2;
-  if (isinteger (y))
-    cutoff = floor (cutoff);
-  endif
 
   n = rows (x);
   y_nc = columns (y);
-  if (n < 30 && columns (x) == 1)
-    ## The following algorithm works fastest for n less than about 30.
+
+  if (n < 11 * (1 + (! equal_bin_spacing)))
+    ## The following algorithm works fastest for small n.
+    nanidx = isnan (y);
     chist = zeros (n+1, y_nc);
     for i = 1:n-1
       chist(i+1,:) = sum (y <= cutoff(i));
     endfor
-    chist(n+1,:) = sum (! isnan (y));
+    chist(n+1,:) = sum (! nanidx);
+
+    freq = diff (chist);
   else
-    ## The following algorithm works fastest for n greater than about 30.
-    ## Put cutoff elements between boundaries, integrate over all
-    ## elements, keep totals at boundaries.
-    m = (nthargout (2, @sort, [y; repmat(cutoff, 1, y_nc)]) <= rows (y));
-    chist = cumsum (m);
-    chist = [(zeros (1, y_nc));
-             (reshape (chist(! m), rows (cutoff), y_nc));
-             (chist(end,:) - sum (isnan (y)))];
+    ## Lookup is more efficient if y is sorted, but sorting costs.
+    if (! equal_bin_spacing && n > sqrt (rows (y) * 1e4))
+      y = sort (y);
+    end
+
+    nanidx = isnan (y);
+    y(nanidx) = 0;
+    freq = zeros (n, y_nc);
+    if (equal_bin_spacing)
+      if (n < 3)
+        d = 1;
+      else
+        d = (x(end) - x(1)) / (length (x) - 1);
+      end
+      cutlen = length (cutoff);
+      for j = 1:y_nc
+        freq(:,j) = accumarray (1 + max (0, min (cutlen, ceil ((double (y(:,j))
+                                                         - cutoff(1)) / d))),
+                                double (! nanidx(:,j)),
+                                [n, 1]);
+      end
+    else
+      for j = 1:y_nc
+        i = lookup (cutoff, y(:,j));
+        i = 1 + i - (cutoff(max (i, 1)) == y(:,j));
+        freq(:,j) = accumarray (i, double (! nanidx(:,j)), [n, 1]);
+      end
+    end
   endif
 
-  freq = diff (chist);
-
   if (norm)
     ## Normalize the histogram
-    freq = freq .* norm ./ sum (! isnan (y));
+    freq .*= norm ./ sum (! nanidx);
   endif
 
   if (nargout == 0)
@@ -257,9 +286,9 @@
 %! assert (nn, n);
 %! assert (xx, x);
 %!
-%! ## test again with N > 30 because there's a special case for it
-%! [n, x] = hist (y, 35);
-%! [nn, xx] = hist (uint8 (y), 35);
+%! ## test again with N > 26 because there's a special case for it
+%! [n, x] = hist (y, 30);
+%! [nn, xx] = hist (uint8 (y), 30);
 %! assert (nn, n);
 %! assert (xx, x);
 
@@ -271,9 +300,9 @@
 %! assert (nn, n);
 %! assert (xx, x);
 %!
-%! ## test again with N > 30 because there's a special case for it
-%! [n, x] = hist (y, 35);
-%! [nn, xx] = hist (logical (y), 35);
+%! ## test again with N > 26 because there's a special case for it
+%! [n, x] = hist (y, 30);
+%! [nn, xx] = hist (logical (y), 30);
 %! assert (nn, n);
 %! assert (xx, x);