Mercurial > octave
changeset 27163:47c0b11da31a
hist.m: Improve performance with large numbers of bins (bug #55909).
* hist.m: Determine whether bins are equally spaced in input validation.
For low bin counts (< 26), use the existing algorithm. For higher bin counts,
use a new algorithm based on accumarray and sometimes lookup.
Update BIST tests.
author | Michael Leitner <michael.leitner@frm2.tum.de> |
---|---|
date | Sat, 08 Jun 2019 16:34:27 -0700 |
parents | eab1c573c4fc |
children | e5f248d614f5 |
files | scripts/plot/draw/hist.m |
diffstat | 1 files changed, 52 insertions(+), 23 deletions(-) [+] |
line wrap: on
line diff
--- a/scripts/plot/draw/hist.m Fri Jun 07 16:53:09 2019 -0400 +++ b/scripts/plot/draw/hist.m Sat Jun 08 16:34:27 2019 -0700 @@ -104,6 +104,9 @@ min_val = double (min_val); endif + ## Equidistant entries allow much more efficient algorithms. + equal_bin_spacing = true; + ## Process possible second argument if (nargin == 1 || ischar (varargin{iarg})) n = 10; @@ -141,6 +144,13 @@ endif x = x.'; # Convert to matrix; elseif (isvector (x)) + equal_bin_spacing = strcmp (typeinfo (x), "range"); + if (! equal_bin_spacing) + diffs = diff (x); + if (all (diffs == diffs(1))) + equal_bin_spacing = true; + endif + endif x = x(:); if (! issorted (x)) warning ("hist: bin values X not sorted on input"); @@ -162,35 +172,54 @@ ## Perform histogram calculation cutoff = (x(1:end-1,:) + x(2:end,:)) / 2; - if (isinteger (y)) - cutoff = floor (cutoff); - endif n = rows (x); y_nc = columns (y); - if (n < 30 && columns (x) == 1) - ## The following algorithm works fastest for n less than about 30. + + if (n < 11 * (1 + (! equal_bin_spacing))) + ## The following algorithm works fastest for small n. + nanidx = isnan (y); chist = zeros (n+1, y_nc); for i = 1:n-1 chist(i+1,:) = sum (y <= cutoff(i)); endfor - chist(n+1,:) = sum (! isnan (y)); + chist(n+1,:) = sum (! nanidx); + + freq = diff (chist); else - ## The following algorithm works fastest for n greater than about 30. - ## Put cutoff elements between boundaries, integrate over all - ## elements, keep totals at boundaries. - m = (nthargout (2, @sort, [y; repmat(cutoff, 1, y_nc)]) <= rows (y)); - chist = cumsum (m); - chist = [(zeros (1, y_nc)); - (reshape (chist(! m), rows (cutoff), y_nc)); - (chist(end,:) - sum (isnan (y)))]; + ## Lookup is more efficient if y is sorted, but sorting costs. + if (! equal_bin_spacing && n > sqrt (rows (y) * 1e4)) + y = sort (y); + end + + nanidx = isnan (y); + y(nanidx) = 0; + freq = zeros (n, y_nc); + if (equal_bin_spacing) + if (n < 3) + d = 1; + else + d = (x(end) - x(1)) / (length (x) - 1); + end + cutlen = length (cutoff); + for j = 1:y_nc + freq(:,j) = accumarray (1 + max (0, min (cutlen, ceil ((double (y(:,j)) + - cutoff(1)) / d))), + double (! nanidx(:,j)), + [n, 1]); + end + else + for j = 1:y_nc + i = lookup (cutoff, y(:,j)); + i = 1 + i - (cutoff(max (i, 1)) == y(:,j)); + freq(:,j) = accumarray (i, double (! nanidx(:,j)), [n, 1]); + end + end endif - freq = diff (chist); - if (norm) ## Normalize the histogram - freq = freq .* norm ./ sum (! isnan (y)); + freq .*= norm ./ sum (! nanidx); endif if (nargout == 0) @@ -257,9 +286,9 @@ %! assert (nn, n); %! assert (xx, x); %! -%! ## test again with N > 30 because there's a special case for it -%! [n, x] = hist (y, 35); -%! [nn, xx] = hist (uint8 (y), 35); +%! ## test again with N > 26 because there's a special case for it +%! [n, x] = hist (y, 30); +%! [nn, xx] = hist (uint8 (y), 30); %! assert (nn, n); %! assert (xx, x); @@ -271,9 +300,9 @@ %! assert (nn, n); %! assert (xx, x); %! -%! ## test again with N > 30 because there's a special case for it -%! [n, x] = hist (y, 35); -%! [nn, xx] = hist (logical (y), 35); +%! ## test again with N > 26 because there's a special case for it +%! [n, x] = hist (y, 30); +%! [nn, xx] = hist (logical (y), 30); %! assert (nn, n); %! assert (xx, x);