changeset 31395:068342cc93b8

factor.m: Speed up for large inputs. __isprimelarge.cc__: Add implementation of Pollard Rho technique. New local functions localgcd, pollardrho, and new DEFUN __pollardrho__. factor.m: Call __pollardrho__ for inputs larger than 1e10. Change if-else ladder to favor faster technique for each value range. Remove long explanation of previous technique, now no longer applicable. Performance data: For inputs <= 1e10, performance is unchanged. For inputs around 1e12, it's about 33% faster, sampled over 1e4 inputs. For inputs around 1e15, it's about 13X faster, sampled over 1e4 inputs. For inputs around 1e18, it's about 400X faster, sampled over 100 inputs. For inputs beyond 2^63, it's 1000X faster, sampled over 100 inputs.
author Arun Giridhar <arungiridhar@gmail.com>
date Sat, 05 Nov 2022 00:10:11 -0400
parents 7781b1e77406
children 7e60506a5428
files libinterp/corefcn/__isprimelarge__.cc scripts/specfun/factor.m
diffstat 2 files changed, 118 insertions(+), 95 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/corefcn/__isprimelarge__.cc	Thu Nov 03 17:50:24 2022 -0400
+++ b/libinterp/corefcn/__isprimelarge__.cc	Sat Nov 05 00:10:11 2022 -0400
@@ -198,4 +198,86 @@
 %!error <unable to convert input> (__isprimelarge__ ({'foo'; 'bar'}))
 */
 
+
+// This function implements a fast, private GCD function
+// optimized for uint64_t.  No input validation by design.
+inline
+uint64_t
+localgcd (uint64_t a, uint64_t b)
+{
+  return (a <= b) ? ( (b % a == 0) ? a : localgcd (a, b % a) )
+                  : ( (a % b == 0) ? b : localgcd (a % b, b) );
+}
+
+// This function implements a textbook version of the Pollard Rho
+// factorization algorithm with Brent update.
+// The code is short and simple, but the math behind it is complicated.
+uint64_t
+pollardrho (uint64_t n, uint64_t c = 1ULL)
+{
+  uint64_t i = 1ULL, j = 2ULL;  // cycle index values
+  uint64_t x = (c+2) % n;       // can also be rand () % n
+  uint64_t y = x;               // other value in the chain
+  uint64_t g = 0;               // GCD
+
+  while (true)
+    {
+      i++;
+
+      // Calculate x = mod (x^2 - c, n) without overflow.
+      x = safemultiply (x, x, n);
+      x = (x >= c) ? (x - c) : (x + n - c);
+
+      // Calculate GCD (abs (x-y), n).
+      g = (x == y) ? 0 : (x > y) ? localgcd (x - y, n) : localgcd (y - x, n);
+
+      if (i == j)  // cycle detected ==> double j
+        {
+          y = x;
+          j <<= 1;
+        }
+
+      if (g == n)  // restart with a different c
+        return pollardrho (n, c + 2);
+
+      if (g > 1ULL)  // found GCD ==> exit loop properly
+        {
+          if (n % g)   // nonzero remainder ==> not a factor
+            return 0;  // rare but theoretically possible case.
+          else         // normal case
+            return g;  // GCD is a divisor of n by definition.
+        }
+    }
+}
+
+DEFUN (__pollardrho__, args, ,
+       doc: /* -*- texinfo -*-
+@deftypefn {} {@var{x} =} __pollardrho__ (@var{n})
+Private function.  Use the Pollard Rho test to find a factor of @var{n}.
+The input @var{n} is required to be a composite 64-bit integer.
+Do not pass it a prime input!  You should call factor(@var{n}) instead
+of directly calling this function.
+
+@seealso{isprime, factor}
+@end deftypefn */)
+{
+  int nargin = args.length ();
+  if (nargin != 1)
+    print_usage ();
+
+  octave_uint64 inp = args(0).xuint64_scalar_value
+    ("__pollardrho__: unable to convert input. Call factor() instead.");
+
+  uint64_t n = inp;
+  octave_uint64 retval = pollardrho (n);
+
+  return ovl (retval);
+}
+
+/*
+%!assert (__pollardrho__ (uint64 (78567695338254293)), uint64 (443363))
+
+%!error <unable to convert input> (__pollardrho__ ({'foo'; 'bar'}))
+*/
+
 OCTAVE_NAMESPACE_END
--- a/scripts/specfun/factor.m	Thu Nov 03 17:50:24 2022 -0400
+++ b/scripts/specfun/factor.m	Sat Nov 05 00:10:11 2022 -0400
@@ -67,77 +67,19 @@
     error ("factor: Q too large to factor (> flintmax)");
   endif
 
-  ## The basic idea is to divide by the prime numbers from 1 to sqrt(q).
-  ## But primes(sqrt(q)) can be very time-consuming to compute for q > 1e16,
-  ## so we divide by smaller primes first.
-  ##
-  ## This won't make a difference for prime q, but it makes a big (100x)
-  ## difference for large composite q.  Since there are many more composites
-  ## than primes, this leads overall to a speedup.
-  ##
-  ## There is at most one prime greater than sqrt(q), and if it exists,
-  ## it has multiplicity 1, so no need to consider any factors greater
-  ## than sqrt(q) directly.  If there were two factors p1, p2 > sqrt(q), then
-  ##
-  ##   q >= p1*p2 > sqrt(q)*sqrt(q) == q,
-  ##
-  ## which is a contradiction.
-  ##
-  ## The following calculation of transition and number of divisors to use
-  ## was determined empirically.  As of now (October 2021) it gives the best
-  ## overall performance over the range of 1 <= q <= intmax ("uint64").
-  ##
-  ## For future programmers: check periodically for performance improvements
-  ## and tune this transition as required.  Trials that didn't yield success
-  ## in (October 2021):
-  ##
-  ## 1.) persistent smallprimes = primes (FOO)
-  ##
-  ##     For various fixed FOO in the range 10 <= FOO <= 10e6.
-  ##     (FOO is independent of q.)  The thought had been that making it
-  ##     persistent would cache it so it didn't need to be recomputed for
-  ##     subsequent calls, but it slowed it down overall.  It seems calling
-  ##     primes twice with smaller q is still faster than one persistent
-  ##     call for a large q.
-  ##
-  ## 2.) smallprimes = primes (q ^ FOO)
-  ##
-  ##     For various values of FOO.  For FOO >= 0.25 or FOO <= 0.16, the
-  ##     performance is very poor.  FOO needs to be in the 0.17 to 0.24 range,
-  ##     somewhat.  Benchmark experiments indicate it should increase gently
-  ##     from 0.18 to 0.21 as q goes from 10^11 to 10^18.
-  ##
-  ##     But putting in such an expression would require calculating the log
-  ##     of q, which defeats any performance improvement.  Or a step-wise
-  ##     approximation like:
-  ##
-  ##     foo = 0.18 + 0.01 * (q > 1e12) + 0.01 * (q > 1e14) ...
-  ##                                    + 0.01 * (q > 1e16);
-  ##     smallprimes = primes (feval (cls, q^foo));
-  ##
-  ##     where the RHS of foo would go from 0.18 to 0.21 over several orders
-  ##     of magnitude without calling the log.  Obviously that is overly
-  ##     empirical, so putting in q^0.2 seems to be the most robust overall
-  ##     for 64-bit q.
+  ## The overall flow is this:
+  ## 1. Divide by small primes smaller than q^0.2, if any.
+  ## 2. Use Pollard Rho to reduce the value below 1e10 if possible.
+  ## 3. Divide by primes smaller than sqrt (q), if any.
+  ## 4. At all stages, stop if the remaining value is prime.
 
-  ## Lookup table for sufficiently small values for q.
-  if (q < 10e9)
-    ## Lookup, rather calling up to primes(100) is about 3% faster, than the
-    ## previous value of primes(30).  Same for very small q < 1e6.
-    ##
-    ## For 1e9 < q < 10e9 the lookup approach is about 7% faster.
-
+  ## First divide by primes (q ^ 0.2).
+  ## For q < 1e10, we can hard-code the primes.
+  if (q < 1e10)
     smallprimes = feval (cls, ...
       [2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 71 73 79 83 89 97]);
-
-    ## Only for really small values of q, statements like
-    ##
-    ##   smallprimes(smallprimes > q) = [];
-    ##
-    ## are relevant and slow down significantly for large values of q.
   else
-    ## For sufficiently large q, go up to the 5th root of q for now.
-    smallprimes = primes (feval (cls, q^0.2));
+    smallprimes = primes (feval (cls, q ^ 0.2));
   endif
 
   ## pf is the list of prime factors returned with type of input class.
@@ -147,37 +89,41 @@
   ## pf now contains all prime factors of q within smallprimes, including
   ## repetitions, in ascending order.
   ##
-  ## q itself will be divided by those prime factors to become smaller,
+  ## q itself has been divided by those prime factors to become smaller,
   ## unless q was prime to begin with.
 
+  sortflag = false;
   if (isprime (q))
-    ## This is an optimization for numbers like 18446744073709551566
-    ## == 2 * 9223372036854775783, where the small factors can be pulled
-    ## out easily and the remaining is prime.  This optimization reduces
-    ## 14.3 s to 1.8 ms (8000X faster) for such cases.
     pf(end+1) = q;
   else
-    ## Now go all the way to sqrt(q), where q is smaller than the original q in
-    ## most cases.
-    ##
-    ## Note: Do not try to weed out the smallprimes inside largeprimes, whether
-    ## using length(smallprimes) or max(smallprimes) -- it slows it down!
-    largeprimes = primes (sqrt (q));
-    [pf, q] = reducefactors (q, pf, largeprimes);
+    ## Use Pollard Rho technique to pull factors one at a time.
+    while (q > 1e10 && ! isprime (q))
+      pr = feval (cls, __pollardrho__ (q));  # pr is a prime factor.
+      [pf, q] = reducefactors (q, pf, pr);
+      ## q is now divided by all occurrences of factor pr.
+      sortflag = true;
+    endwhile
 
-    ## At this point, all prime factors <= the sqrt of the original q have been
-    ## pulled out in ascending order.
-    ##
-    ## If q = 1, then no further primes are left.
-    ## If q > 1, then q itself must be prime, and it must be the single prime
-    ## factor that was larger than the sqrt of the original q.
-    if (q > 1)
+    if (isprime (q))
       pf(end+1) = q;
-    endif
-  end
+    else
+      ## If we are here, then q is composite but less than 1e10,
+      ## and that is fast enough to test by division.
+      largeprimes = primes (feval (cls, sqrt (q)));
+      [pf, q] = reducefactors (q, pf, largeprimes);
 
-  ## At this point, all prime factors have been pulled out of q in ascending
-  ## order.  There is no need to sort(pf).
+      ## If q is still not 1, then it must be a prime of power 1.
+      if (q > 1)
+        pf(end+1) = q;
+      endif
+    endif
+  endif
+
+  ## The Pollard Rho technique can give factors in arbitrary order,
+  ## so we need to sort pf if that was used.
+  if (sortflag)
+    pf = sort (pf);
+  endif
 
   ## Determine multiplicity.
   if (nargout > 1)
@@ -192,16 +138,11 @@
 
   pf = pfin;
   q = qin;
-  ## The following line is a few milliseconds faster than
-  ## divisors (mod (q, divisors) ~= 0) = [];
   divisors = divisors (mod (q, divisors) == 0);
 
   for pp = divisors  # for each factor in turn
     ## Keep extracting all occurrences of that factor before going to larger
     ## factors.
-    ##
-    ## Note: mod() was marginally faster than rem(), when assessed over 10e6
-    ##       trials of the whole factor() function.
     while (mod (q, pp) == 0)
       pf(end+1) = pp;
       q /= pp;