## Copyright (C) 2008, 2009 VZLU Prague, a.s.
##
## This file is part of Octave.
##
## Octave is free software; you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or (at
## your option) any later version.
##
## Octave is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING.  If not, see
## <http://www.gnu.org/licenses/>.
##
## Author: Jaroslav Hajek <highegg@gmail.com>

## -*- texinfo -*-
## @deftypefn{Function File} {} fsolve (@var{fcn}, @var{x0}, @var{options})
## @deftypefnx{Function File} {[@var{x}, @var{fvec}, @var{info}, @var{output}, @var{fjac}] =} fsolve (@var{fcn}, @dots{})
## Solve a system of nonlinear equations defined by the function @var{fcn}.
## @var{fcn} should accept a vector (array) defining the unknown variables,
## and return a vector of left-hand sides of the equations.  Right-hand
## sides are defined to be zeros.
## In other words, this function attempts to determine a vector @var{x} such 
## that @code{@var{fcn} (@var{x})} gives (approximately) all zeros.
## @var{x0} determines a starting guess. The shape of @var{x0} is preserved
## in all calls to @var{fcn}, but otherwise it is treated as a column vector.
## @var{options} is a structure specifying additional options.
## Currently, @code{fsolve} recognizes these options:
## @code{"FunValCheck"}, @code{"OutputFcn"}, @code{"TolX"},
## @code{"TolFun"}, @code{"MaxIter"}, @code{"MaxFunEvals"}, 
## @code{"Jacobian"} and @code{"Updating"}. 
##
## If @code{"Jacobian"} is @code{"on"}, it specifies that @var{fcn},
## called with 2 output arguments, also returns the Jacobian matrix
## of right-hand sides at the requested point.  @code{"TolX"} specifies
## the termination tolerance in the unknown variables, while
## @code{"TolFun"} is a tolerance for equations.  The default for both
## @code{"TolX"} and @code{"TolFun"} is @code{sqrt (eps (class (@var{x0})))},
## i.e., about @code{1.5e-8} for double precision.
## If @code{"Updating"} is @code{"on"}, the function will attempt to use
## Broyden updates of the Jacobian, in order to reduce the number of
## Jacobian evaluations.
## If your user function always computes the Jacobian (regardless of the
## number of output arguments), this option provides no advantage and
## should be set to @code{"off"}.
## 
## For description of the other options, see @code{optimset}.
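##
## As an illustration, a call supplying the Jacobian might look like the
## sketch below (the function and numbers here are invented for
## illustration):
##
## @example
## @group
## function [f, j] = user_fcn (x)
##   f = [x(1)^2 + x(2)^2 - 2; x(1) - x(2)];
##   j = [2*x(1), 2*x(2); 1, -1];
## endfunction
## opt = optimset ("Jacobian", "on", "TolFun", 1e-10);
## x = fsolve (@@user_fcn, [0.5; 0.1], opt)
## @end group
## @end example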
##
## On return, @var{fvec} contains the value of the function @var{fcn}
## evaluated at @var{x}, and @var{info} may be one of the following values:
## 
## @table @asis
## @item 1
## Converged to a solution point.  Relative residual error is less than
## specified by @code{TolFun}.
## @item 2
## Last relative step size was less than @code{TolX}.
## @item 3
## Last relative decrease in residual was less than @code{TolFun}.
## @item 0
## Iteration limit exceeded.
## @item -3
## The trust region radius became excessively small.
## @end table
##
## Note: If you only have a single nonlinear equation of one variable, using 
## @code{fzero} is usually a much better idea.
## @seealso{fzero, optimset}
## @end deftypefn

function [x, fvec, info, output, fjac] = fsolve (fcn, x0, options = struct ())

  ## Get default options if requested.
  if (nargin == 1 && ischar (fcn) && strcmp (fcn, "defaults"))
    x = optimset ("MaxIter", 400, "MaxFunEvals", Inf,
                  "Jacobian", "off", "TolX", 1.5e-8, "TolFun", 1.5e-8,
                  "OutputFcn", [], "Updating", "on", "FunValCheck", "off");
    return;
  endif

  if (nargin < 2 || nargin > 3 || ! ismatrix (x0))
    print_usage ();
  endif    

  if (ischar (fcn))
    fcn = str2func (fcn);
  endif

  xsiz = size (x0);
  n = numel (x0);

  has_jac = strcmpi (optimget (options, "Jacobian", "off"), "on");
  maxiter = optimget (options, "MaxIter", 400);
  maxfev = optimget (options, "MaxFunEvals", Inf);
  outfcn = optimget (options, "OutputFcn");
  updating = strcmpi (optimget (options, "Updating", "on"), "on");

  funvalchk = strcmpi (optimget (options, "FunValCheck", "off"), "on");

  if (funvalchk)
    ## Replace fcn with a guarded version.
    fcn = @(x) guarded_eval (fcn, x);
  endif

  ## These defaults are rather stringent.  Normally, a user prefers
  ## accuracy to performance.

  macheps = eps (class (x0));

  tolx = optimget (options, "TolX", sqrt (macheps));
  tolf = optimget (options, "TolFun", sqrt (macheps));
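  ## For single-precision x0, eps ("single") is about 1.2e-7, so these
  ## defaults relax to roughly 3.5e-4, in line with the reduced precision.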

  factor = 100;
  ## FIXME: TypicalX corresponds to user scaling (???)
  autodg = true;

  niter = 1;
  nfev = 0;

  x = x0(:);
  info = 0;

  ## Outer loop.
  while (niter < maxiter && nfev < maxfev && ! info)

    ## Calculate function value and Jacobian (possibly via FD).
    ## Handle arbitrary shapes of x and f and remember them.
    if (has_jac)
      [fvec, fjac] = fcn (reshape (x, xsiz));
      ## If the jacobian is sparse, disable Broyden updating.
      if (issparse (fjac))
        updating = false;
      endif
      nfev ++;
    else
      [fvec, fjac] = __fdjac__ (fcn, reshape (x, xsiz));
      nfev += 1 + length (x);
    endif
    fsiz = size (fvec);
    fvec = fvec(:);
    fn = norm (fvec);
    m = length (fvec);
    n = length (x);

    ## For square and overdetermined systems, we update a (pivoted) QR
    ## factorization of the jacobian to avoid solving a full system in each
    ## step. In this case, we pass a triangular matrix to __dogleg__.
    ## Pivoted QR is used for slightly better robustness and invariance
    ## w.r.t. permutations of variables.
    useqr = updating && m >= n && n > 10;

    if (useqr)
      ## Get QR factorization of the jacobian, optionally with column pivoting.
      ## TODO: pivoting is only done in the first step, to get invariance
      ## w.r.t. permutations of variables.  Maybe it could be beneficial
      ## to repivot occasionally?
      if (niter == 1)
        [q, r, p] = qr (fjac, 0);
        ## p is a column vector. Blame Matlab for the inconsistency.
        ## Octave can handle permutation matrices gracefully.
        p = eye (n)(:, p);
      else
        [q, r] = qr (fjac*p, 0);
      endif
    endif

    ## Get column norms, use them as scaling factors.
    jcn = norm (fjac, 'columns').';
    if (niter == 1)
      if (autodg)
        dg = jcn;
        dg(dg == 0) = 1;
      endif
      xn = norm (dg .* x);
      ## FIXME: something better?
      delta = max (factor * xn, 1);
    endif

    ## Rescale if necessary.
    if (autodg)
      ## FIXME: the original minpack used the following rescaling strategy:
      ##   dg = max (dg, jcn);
      ## but that seems bad if we start with a poor guess yielding Jacobian
      ## columns with large norms that later decrease, because the
      ## corresponding variables will still be overscaled.  So instead, we
      ## only give the old scaling a small momentum, but do not honor it.

      dg = max (0.1*dg, jcn);

      ## It also seems that in the case of fast (and inhomogeneously) changing
      ## jacobian, the Broyden updates are of little use, so maybe we could
      ## skip them if a big disproportional change is expected. The question is,
      ## of course, how to define the above terms :)
    endif

    nfail = 0;
    nsuc = 0;
    ## Inner loop.
    while (niter <= maxiter && nfev < maxfev && ! info)

      ## Get trust-region model (dogleg) minimizer.
      if (useqr)
        qtf = q'*fvec;
        s = - __dogleg__ (r, qtf, dg, delta);
        w = qtf + r * s;
        s = p * s;
      else
        s = - __dogleg__ (fjac, fvec, dg, delta);
        w = fvec + fjac * s;
      endif

      sn = norm (dg .* s);
      if (niter == 1)
        delta = min (delta, sn);
      endif

      fvec1 = fcn (reshape (x + s, xsiz)) (:);
      fn1 = norm (fvec1);
      nfev ++;

      if (fn1 < fn)
        ## Scaled actual reduction.
        actred = 1 - (fn1/fn)^2;
      else
        actred = -1;
      endif

      ## Scaled predicted reduction, and ratio.
      t = norm (w);
      if (t < fn)
        prered = 1 - (t/fn)^2;
        ratio = actred / prered;
      else
        prered = 0;
        ratio = 0;
      endif
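      ## The ratio of actual to predicted reduction is the standard
      ## trust-region quality measure: values near 1 mean the local model
      ## is trustworthy and the region may grow; small or negative values
      ## mean it should shrink.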

      ## Update delta.
      if (ratio < 0.1)
        nsuc = 0;
        nfail ++;
        delta *= 0.5;
        if (delta <= 1e1*macheps*xn)
          ## Trust region became uselessly small.
          info = -3;
          break;
        endif
      else
        nfail = 0;
        nsuc ++;
        if (abs (1-ratio) <= 0.1)
          delta = 2*sn;
        elseif (ratio >= 0.5 || nsuc > 1)
          delta = max (delta, 2*sn);
        endif
      endif

      if (ratio >= 1e-4)
        ## Successful iteration.
        x += s;
        xn = norm (dg .* x);
        fvec = fvec1;
        fn = fn1;
        niter ++;
      endif

      ## FIXME: should outputfcn be called only after a successful iteration?
      if (! isempty (outfcn))
        optimvalues.iter = niter;
        optimvalues.funccount = nfev;
        optimvalues.fval = fn;
        optimvalues.searchdirection = s;
        state = 'iter';
        stop = outfcn (x, optimvalues, state);
        if (stop)
          info = -1;
          break;
        endif
      endif

      ## Tests for termination conditions. A mysterious place, anything
      ## can happen if you change something here...
      
      ## The rule of thumb (which I'm not sure M*b is quite following)
      ## is that for a tolerance that depends on scaling, only 0 makes
      ## sense as a default value. But 0 usually means uselessly long
      ## iterations, so we need scaling-independent tolerances wherever
      ## possible.

      ## FIXME -- why tolf*n*xn? If abs (e) ~ abs(x) * eps is a vector
      ## of perturbations of x, then norm (fjac*e) <= eps*n*xn, i.e. by
      ## tolf ~ eps we demand as much accuracy as we can expect.
      if (fn <= tolf*n*xn)
        info = 1;
        ## The following tests are done only after a successful step.
      elseif (actred > 0)
        ## This one is classic.  Note that we use scaled variables again,
        ## but compare to the scaled step, so nothing bad happens.
        if (sn <= tolx*xn)
          info = 2;
          ## Again a classic one.  It seems weird to use the same tolf
          ## for two different tests, but that's what the M*b manual
          ## appears to say.
        elseif (actred < tolf)
          info = 3;
        endif
      endif

      ## Criterion for recalculating jacobian.
      if (! updating || nfail == 2)
        break;
      endif

      ## Compute the scaled Broyden update.
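      ## (This is the rank-1 "good Broyden" secant correction in the
      ## scaled variables: fjac += (df - fjac*s) * (D'*D*s)' / norm (D*s)^2
      ## with D = diag (dg); in the QR case, the correction is applied to
      ## the factorization via qrupdate.)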
      if (useqr)
        u = (fvec1 - q*w) / sn; 
        v = dg .* ((dg .* s) / sn);
        v = p' * v;

        ## Update the QR factorization.
        [q, r] = qrupdate (q, r, u, v);
      else
        u = (fvec1 - w) / sn;
        v = dg .* ((dg .* s) / sn);

        ## Update the Jacobian directly.
        fjac += u * v';
      endif
    endwhile
  endwhile

  ## Restore original shapes.
  x = reshape (x, xsiz);
  fvec = reshape (fvec, fsiz);

  output.iterations = niter;
  output.funcCount = nfev;

endfunction

## An assistant function that evaluates a function handle and checks for
## bad results.
function fx = guarded_eval (fun, x)
  fx = fun (x);
  if (! all (isreal (fx)))
    error ("fsolve:notreal", "fsolve: non-real value encountered"); 
  elseif (any (isnan (fx)))
    error ("fsolve:isnan", "fsolve: NaN value encountered"); 
  endif
endfunction
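
## A minimal usage sketch (demo, not part of the test suite): solve a
## symmetric 2x2 system whose root is x = [1; 1].
%!demo
%! fcn = @(x) [x(1)^2 + x(2)^2 - 2; x(1) - x(2)];
%! [x, fvec, info] = fsolve (fcn, [0.5; 0.1])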

%!function retval = f (p) 
%!  x = p(1);
%!  y = p(2);
%!  z = p(3);
%!  retval = zeros (3, 1);
%!  retval(1) = sin(x) + y**2 + log(z) - 7;
%!  retval(2) = 3*x + 2**y -z**3 + 1;
%!  retval(3) = x + y + z - 5;
%!test
%! x_opt = [ 0.599054;
%! 2.395931;
%! 2.005014 ];
%! tol = 1.0e-5;
%! [x, fval, info] = fsolve (@f, [ 0.5; 2.0; 2.5 ]);
%! assert (info > 0);
%! assert (norm (x - x_opt, Inf) < tol);
%! assert (norm (fval) < tol);

%!function retval = f (p)
%!  x = p(1);
%!  y = p(2);
%!  z = p(3);
%!  w = p(4);
%!  retval = zeros (4, 1);
%!  retval(1) = 3*x + 4*y + exp (z + w) - 1.007;
%!  retval(2) = 6*x - 4*y + exp (3*z + w) - 11;
%!  retval(3) = x^4 - 4*y^2 + 6*z - 8*w - 20;
%!  retval(4) = x^2 + 2*y^3 + z - w - 4;
%!test
%! x_opt = [ -0.767297326653401, 0.590671081117440, 1.47190018629642, -1.52719341133957 ];
%! tol = 1.0e-5;
%! [x, fval, info] = fsolve (@f, [-1, 1, 2, -1]);
%! assert (info > 0);
%! assert (norm (x - x_opt, Inf) < tol);
%! assert (norm (fval) < tol);

%!function retval = f (p) 
%!  x = p(1);
%!  y = p(2);
%!  z = p(3);
%!  retval = zeros (4, 1);
%!  retval(1) = sin(x) + y**2 + log(z) - 7;
%!  retval(2) = 3*x + 2**y -z**3 + 1;
%!  retval(3) = x + y + z - 5;
%!  retval(4) = x*x + y - z*log(z) - 1.36;
%!test
%! x_opt = [ 0.599054;
%! 2.395931;
%! 2.005014 ];
%! tol = 1.0e-5;
%! [x, fval, info] = fsolve (@f, [ 0.5; 2.0; 2.5 ]);
%! assert (info > 0);
%! assert (norm (x - x_opt, Inf) < tol);
%! assert (norm (fval) < tol);

%!function retval = f (p) 
%!  x = p(1);
%!  y = p(2);
%!  z = p(3);
%!  retval = zeros (3, 1);
%!  retval(1) = sin(x) + y**2 + log(z) - 7;
%!  retval(2) = 3*x + 2**y -z**3 + 1;
%!  retval(3) = x + y + z - 5;
%!test
%! x_opt = [ 0.599054;
%! 2.395931;
%! 2.005014 ];
%! tol = 1.0e-5;
%! opt = optimset ("Updating", "qrp");
%! [x, fval, info] = fsolve (@f, [ 0.5; 2.0; 2.5 ], opt);
%! assert (info > 0);
%! assert (norm (x - x_opt, Inf) < tol);
%! assert (norm (fval) < tol);
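
## The final test below exercises the overdetermined branch: 11 residuals
## in 2 unknowns, so fsolve minimizes the residual norm in a least-squares
## sense.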

%!test
%! b0 = 3;
%! a0 = 0.2;
%! x = 0:.5:5;
%! noise = 1e-5 * sin (100*x);
%! y = exp (-a0*x) + b0 + noise;
%! c_opt = [a0, b0];
%! tol = 1e-5;
%! 
%! [c, fval, info, output] = fsolve (@(c) (exp(-c(1)*x) + c(2) - y), [0, 0]);
%! assert (info > 0);
%! assert (norm (c - c_opt, Inf) < tol);
%! assert (norm (fval) < norm (noise));