main/optim/src/__bfgsmin.cc @ 9930:d30cfca46e8a (octave-forge)
optim: upgrade license to GPLv3+ and mention on DESCRIPTION the other package licenses
author:   carandraug
date:     Fri, 30 Mar 2012 15:14:48 +0000
parents:  c76dd0b09f9e
children: 07a36f1d7316
// Copyright (C) 2004,2005,2006,2007,2010 Michael Creel <michael.creel@uab.es>
//
// This program is free software; you can redistribute it and/or modify it under
// the terms of the GNU General Public License as published by the Free Software
// Foundation; either version 3 of the License, or (at your option) any later
// version.
//
// This program is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General Public License along with
// this program; if not, see <http://www.gnu.org/licenses/>.

// the functions defined in this file are:
// __bfgsmin_obj: bulletproofed objective function that allows checking for availability of analytic gradient
// __numgradient: numeric gradient, used only if analytic not supplied
// __bisectionstep: fallback stepsize algorithm
// __newtonstep: default stepsize algorithm
// __bfgsmin: the DLD function that does the minimization, to be called from bfgsmin.m

#include <oct.h>
#include <octave/parse.h>
#include <octave/Cell.h>
#include <octave/lo-mappers.h>
#include <float.h>
#include "error.h"

int __bfgsmin_obj(double &obj, const std::string f, const octave_value_list f_args,
                  const ColumnVector theta, const int minarg)
{
  octave_value_list f_return, f_args_new;
  int success = 1;
  f_args_new = f_args;
  f_args_new(minarg - 1) = theta;
  f_return = feval(f, f_args_new);
  obj = f_return(0).double_value();
  // bullet-proof the objective function
  if (error_state)
  {
    warning("__bfgsmin_obj: objective function could not be evaluated - setting to DBL_MAX");
    obj = DBL_MAX;
    success = 0;
  }
  return success;
}
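// For reference, a sketch of the central-difference scheme that __numgradient
// below implements; the notation here is illustrative and not part of the code:
//
//   delta_j = max( (|x_j| + sqrt(eps)) * sqrt(eps), eps^(1/3) )
//   g_j     = ( f(x + delta_j * e_j) - f(x - delta_j * e_j) ) / (2 * delta_j)
//
// where e_j is the j-th unit vector and eps is DBL_EPSILON.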
// __numgradient: numeric central difference gradient for bfgs.
// This is the same as numgradient, except the derivative is known to be a vector, it's defined as a column,
// and the finite difference delta is incorporated directly rather than called from a function
int __numgradient(ColumnVector &derivative, const std::string f,
                  const octave_value_list f_args, const int minarg)
{
  double SQRT_EPS, diff, delta, obj_left, obj_right, p;
  int j, test, success;
  ColumnVector parameter = f_args(minarg - 1).column_vector_value();
  int k = parameter.rows();
  ColumnVector g(k);
  SQRT_EPS = sqrt(DBL_EPSILON);
  diff = exp(log(DBL_EPSILON)/3.0);
  // get 1st derivative by central difference
  for (j=0; j<k; j++)
  {
    p = parameter(j);
    // determine delta for finite differencing
    test = (fabs(p) + SQRT_EPS) * SQRT_EPS > diff;
    if (test) delta = (fabs(p) + SQRT_EPS) * SQRT_EPS;
    else delta = diff;
    // right side
    parameter(j) = p + delta;
    success = __bfgsmin_obj(obj_right, f, f_args, parameter, minarg);
    if (!success) error("__numgradient: objective function failed, can't compute numeric gradient");
    // left side
    parameter(j) = p - delta;
    success = __bfgsmin_obj(obj_left, f, f_args, parameter, minarg);
    if (!success) error("__numgradient: objective function failed, can't compute numeric gradient");
    parameter(j) = p;  // restore original parameter for next round
    g(j) = (obj_right - obj_left) / (2.0*delta);
  }
  derivative = g;
  return success;
}

int __bfgsmin_gradient(ColumnVector &derivative, const std::string f, octave_value_list f_args,
                       const ColumnVector theta, const int minarg,
                       int try_analytic_gradient, int &have_analytic_gradient)
{
  octave_value_list f_return;
  int k = theta.rows();
  int success;
  ColumnVector g(k);
  Matrix check_gradient(k,1);
  if (have_analytic_gradient)
  {
    f_args(minarg - 1) = theta;
    f_return = feval(f, f_args);
    g = f_return(1).column_vector_value();
  }
  else if (try_analytic_gradient)
  {
    f_args(minarg - 1) = theta;
    f_return = feval(f, f_args);
    if (f_return.length() > 1)
    {
      if (f_return(1).is_real_matrix())
      {
        if ((f_return(1).rows() == k) & (f_return(1).columns() == 1))
        {
          g = f_return(1).column_vector_value();
          have_analytic_gradient = 1;
        }
        else have_analytic_gradient = 0;
      }
      else have_analytic_gradient = 0;
    }
    else have_analytic_gradient = 0;
    if (!have_analytic_gradient) __numgradient(g, f, f_args, minarg);
  }
  else __numgradient(g, f, f_args, minarg);
  // check that gradient is ok
  check_gradient.column(0) = g;
  if (check_gradient.any_element_is_inf_or_nan())
  {
    error("__bfgsmin_gradient: gradient contains NaNs or Inf");
    success = 0;
  }
  else success = 1;
  derivative = g;
  return success;
}

// this is the lbfgs direction, used if control has 5 elements
ColumnVector lbfgs_recursion(const int memory, const Matrix sigmas, const Matrix gammas, ColumnVector d)
{
  if (memory == 0)
  {
    const int n = sigmas.columns();
    ColumnVector sig = sigmas.column(n-1);
    ColumnVector gam = gammas.column(n-1);
    // do conditioning if there is any memory
    double cond = gam.transpose()*gam;
    if (cond > 0)
    {
      cond = (sig.transpose()*gam) / cond;
      d = cond*d;
    }
    return d;
  }
  else
  {
    const int k = d.rows();
    const int n = sigmas.columns();
    int i, j;
    ColumnVector sig = sigmas.column(memory-1);
    ColumnVector gam = gammas.column(memory-1);
    double rho;
    rho = 1.0 / (gam.transpose() * sig);
    double alpha;
    alpha = rho * (sig.transpose() * d);
    d = d - alpha*gam;
    d = lbfgs_recursion(memory - 1, sigmas, gammas, d);
    d = d + (alpha - rho * gam.transpose() * d) * sig;
  }
  return d;
}
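// For reference, lbfgs_recursion above is a recursive form of the L-BFGS
// two-loop update (illustrative notation): with s = change in parameters,
// y = change in gradient and rho = 1 / (y's), each level computes
//
//   alpha = rho * s'd ;  d <- d - alpha*y          (on the way down)
//   d <- ((s'y) / (y'y)) * d                       (base case: initial scaling)
//   d <- d + (alpha - rho * y'd) * s               (on the way back up)
//
// so the main loop can use -lbfgs_recursion(memory, sigmas, gammas, g) as an
// approximation to -H*g without storing the full inverse Hessian H.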
// __bisectionstep: fallback stepsize method if __newtonstep fails
int __bisectionstep(double &step, double &obj, const std::string f, const octave_value_list f_args,
                    const ColumnVector x, const ColumnVector dx, const int minarg, const int verbose)
{
  double best_obj, improvement, improvement_0;
  int found_improvement;
  ColumnVector trial;
  // initial values
  best_obj = obj;
  improvement_0 = 0.0;
  step = 1.0;
  trial = x + step*dx;
  __bfgsmin_obj(obj, f, f_args, trial, minarg);
  if (verbose) printf("bisectionstep: trial step: %g obj value: %g\n", step, obj);
  // this first loop goes until an improvement is found
  while (obj >= best_obj)
  {
    if (step < 2.0*DBL_EPSILON)
    {
      if (verbose) warning("bisectionstep: unable to find improvement, setting step to zero");
      step = 0.0;
      return 0;
    }
    step = 0.5*step;
    trial = x + step*dx;
    __bfgsmin_obj(obj, f, f_args, trial, minarg);
    if (verbose) printf("bisectionstep: trial step: %g obj value: %g best_value: %g\n", step, obj, best_obj);
  }
  // now keep going until rate of improvement is too low, or reach max trials
  best_obj = obj;
  while (step > 2.0*DBL_EPSILON)
  {
    step = 0.5*step;
    trial = x + step*dx;
    __bfgsmin_obj(obj, f, f_args, trial, minarg);
    if (verbose) printf("bisectionstep: trial step: %g obj value: %g\n", step, obj);
    // if improved, record new best and try another step
    if (obj < best_obj)
    {
      improvement = best_obj - obj;
      best_obj = obj;
      if (improvement > 0.5*improvement_0)
      {
        improvement_0 = improvement;
      }
      else break;
    }
    else
    {
      step = 2.0*step;  // put it back to best found
      obj = best_obj;
      break;
    }
  }
  return 1;
}

// __newtonstep: default stepsize algorithm
int __newtonstep(double &step, double &obj, const std::string f, const octave_value_list f_args,
                 const ColumnVector x, const ColumnVector dx, const int minarg, const int verbose)
{
  double obj_0, obj_left, obj_right, delta, inv_delta_sq, gradient, hessian;
  int found_improvement = 0;
  obj_0 = obj;
  delta = 0.001;  // experimentation shows that this is a good choice
  inv_delta_sq = 1.0 / (delta*delta);
  ColumnVector x_right = x + delta*dx;
  ColumnVector x_left = x - delta*dx;
  // right
  __bfgsmin_obj(obj_right, f, f_args, x_right, minarg);
  // left
  __bfgsmin_obj(obj_left, f, f_args, x_left, minarg);
  gradient = (obj_right - obj_left) / (2.0*delta);  // take central difference
  hessian = inv_delta_sq*(obj_right - 2.0*obj_0 + obj_left);
  hessian = fabs(hessian);  // ensures we're going in a decreasing direction
  if (hessian < 2.0*DBL_EPSILON) hessian = 1.0;  // avoid div by zero
  step = - gradient / hessian;  // hessian inverse gradient: the Newton step
  // step = (step < 1.0)*step + 1.0*(step >= 1.0); // maximum stepsize is 1.0 - conservative
  // ensure that this is improvement, and if not, fall back to bisection
  __bfgsmin_obj(obj, f, f_args, x + step*dx, minarg);
  if (verbose) printf("newtonstep: trial step: %g obj value: %g\n", step, obj);
  if (obj > obj_0)
  {
    obj = obj_0;
    if (verbose) warning("__stepsize: no improvement with Newton step, falling back to bisection");
    found_improvement = __bisectionstep(step, obj, f, f_args, x, dx, minarg, verbose);
  }
  else found_improvement = 1;
  if (xisnan(obj))
  {
    obj = obj_0;
    if (verbose) warning("__stepsize: objective function crash in Newton step, falling back to bisection");
    found_improvement = __bisectionstep(step, obj, f, f_args, x, dx, minarg, verbose);
  }
  else found_improvement = 1;
  return found_improvement;
}
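// For reference, a sketch of what __newtonstep above computes (illustrative
// notation, not part of the code): with phi(a) = f(x + a*dx) and delta = 0.001,
//
//   phi'(0)  ~ ( phi(delta) - phi(-delta) ) / (2*delta)
//   phi''(0) ~ ( phi(delta) - 2*phi(0) + phi(-delta) ) / delta^2
//   step     = -phi'(0) / |phi''(0)|
//
// i.e. a Newton step on the one-dimensional line search, with __bisectionstep
// as the fallback when the quadratic model does not yield an improvement.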
DEFUN_DLD(__bfgsmin, args, ,"__bfgsmin: backend for bfgs minimization\n\
Users should not use this directly. Use bfgsmin.m instead")
{
  std::string f (args(0).string_value());
  Cell f_args_cell (args(1).cell_value());
  octave_value_list f_args, f_return;  // holder for return items

  int max_iters, verbosity, criterion, minarg, convergence, iter, memory,
      gradient_ok, i, j, k, conv_fun, conv_param, conv_grad, have_gradient,
      try_gradient, warnings;
  double func_tol, param_tol, gradient_tol, stepsize, obj_value, obj_in,
      last_obj_value, obj_value2, denominator, test;
  Matrix H, H1, H2;
  ColumnVector thetain, d, g, g_new, p, q, sig, gam;

  // controls
  Cell control (args(2).cell_value());
  max_iters = control(0).int_value();
  if (max_iters == -1) max_iters = INT_MAX;
  verbosity = control(1).int_value();
  criterion = control(2).int_value();
  minarg = control(3).int_value();
  memory = control(4).int_value();
  func_tol = control(5).double_value();
  param_tol = control(6).double_value();
  gradient_tol = control(7).double_value();
  // want to see warnings?
  warnings = 0;
  if (verbosity == 3) warnings = 1;
  // copy cell contents over to octave_value_list to use feval()
  k = f_args_cell.length();
  f_args(k);  // resize only once
  for (i = 0; i<k; i++) f_args(i) = f_args_cell(i);
  // get the minimization argument
  ColumnVector theta = f_args(minarg - 1).column_vector_value();
  k = theta.rows();

  // containers for items in limited memory version
  Matrix sigmas(k, memory);
  Matrix gammas(k, memory);
  sigmas.fill(0.0);
  gammas.fill(0.0);

  // initialize things
  have_gradient = 0;  // have analytic gradient
  try_gradient = 1;   // try to get analytic gradient
  convergence = -1;   // if this doesn't change, it means that maxiters were exceeded
  thetain = theta;
  H = identity_matrix(k,k);

  // Initial obj_value
  __bfgsmin_obj(obj_in, f, f_args, theta, minarg);
  if (warnings) printf("initial obj_value %g\n", obj_in);
  // Initial gradient (try analytic, and use it if it's close enough to numeric)
  __bfgsmin_gradient(g, f, f_args, theta, minarg, 1, have_gradient);  // try analytic
  if (have_gradient)
  {
    // check equality if analytic available
    if (warnings) printf("function claims to provide analytic gradient\n");
    have_gradient = 0;  // force numeric
    __bfgsmin_gradient(g_new, f, f_args, theta, minarg, 0, have_gradient);
    p = g - g_new;
    have_gradient = sqrt(p.transpose() * p) < gradient_tol;
    if (have_gradient && warnings) printf("function claims to provide analytic gradient, and it agrees with numeric - using analytic\n");
    if (!have_gradient && warnings) printf("function claims to provide analytic gradient, but it does not agree with numeric - using numeric\n");
  }
  last_obj_value = obj_in;  // initialize, is updated after each iteration

  // MAIN LOOP STARTS HERE
  for (iter = 0; iter < max_iters; iter++)
  {
    if(memory > 0)
    {  // lbfgs
      if (iter < memory) d = lbfgs_recursion(iter, sigmas, gammas, g);
      else d = lbfgs_recursion(memory, sigmas, gammas, g);
      d = -d;
    }
    else d = -H*g;  // ordinary bfgs
    // convergence tests
    conv_fun = 0;
    conv_param = 0;
    conv_grad = 0;
    // function convergence
    p = theta+d;
    __bfgsmin_obj(obj_value, f, f_args, p, minarg);
    if (fabs(last_obj_value) > 1.0) conv_fun = (fabs((obj_value/last_obj_value - 1))) < func_tol;
    else conv_fun = fabs(obj_value - last_obj_value) < func_tol;
    // parameter change convergence
    test = sqrt(theta.transpose() * theta);
    if (test > 1.0) conv_param = sqrt(d.transpose() * d) / test < param_tol;
    else conv_param = sqrt(d.transpose() * d) < param_tol;
    // gradient convergence
    conv_grad = sqrt(g.transpose() * g) < gradient_tol;
    // Are we done?
    if (criterion == 1)
    {
      if (conv_fun && conv_param && conv_grad)
      {
        convergence = 1;
        break;
      }
    }
    else if (conv_fun)
    {
      convergence = 1;
      break;
    }
    // if not done, then take a step
    // stepsize: try (l)bfgs direction, then steepest descent if it fails
    f_args(minarg - 1) = theta;
    obj_value = last_obj_value;
    __newtonstep(stepsize, obj_value, f, f_args, theta, d, minarg, warnings);
    if (stepsize == 0.0)
    {  // fall back to steepest descent
      if (warnings) warning("bfgsmin: BFGS direction fails, switch to steepest descent");
      d = -g;  // try steepest descent
      H = identity_matrix(k,k);  // accompany with Hessian reset, for good measure
      obj_value = last_obj_value;
      __newtonstep(stepsize, obj_value, f, f_args, theta, d, minarg, warnings);
      if (stepsize == 0.0)
      {  // if true, exit, we can't find a direction of descent
        warning("bfgsmin: failure, exiting. Try different start values?");
        f_return(0) = theta;
        f_return(1) = obj_value;
        f_return(2) = -1;
        f_return(3) = iter;
        return octave_value_list(f_return);
      }
    }
    p = stepsize*d;

    // Want intermediate results?
    if (verbosity > 1)
    {
      printf("------------------------------------------------\n");
      printf("bfgsmin iteration %d convergence (f g p): %d %d %d\n", iter, conv_fun, conv_grad, conv_param);
      if (warnings)
      {
        if (memory > 0) printf("Using LBFGS, memory is last %d iterations\n",memory);
      }
      printf("\nfunction value: %g stepsize: %g \n\n", last_obj_value, stepsize);
      if (have_gradient) printf("used analytic gradient\n");
      else printf("used numeric gradient\n");
      for (j = 0; j<k; j++) printf("%15.5f %15.5f %15.5f\n",theta(j), g(j), p(j));
    }
    //--------------------------------------------------
    // // Are we done?
    // if (criterion == 1) {
    //   if (conv_fun && conv_param && conv_grad) {
    //     convergence = 1;
    //     break;
    //   }
    // }
    // else if (conv_fun) {
    //   convergence = 1;
    //   break;
    // }
    //--------------------------------------------------
    last_obj_value = obj_value;
    theta = theta + p;
    // new gradient
    gradient_ok = __bfgsmin_gradient(g_new, f, f_args, theta, minarg, try_gradient, have_gradient);
    if (memory == 0)
    {  // bfgs?
      // Hessian update if gradient ok
      if (gradient_ok)
      {
        q = g_new-g;
        g = g_new;
        denominator = q.transpose()*p;
        if ((fabs(denominator) < DBL_EPSILON))
        {  // reset Hessian if necessary
          if (verbosity == 1) printf("bfgsmin: Hessian reset\n");
          H = identity_matrix(k,k);
        }
        else
        {
          H1 = (1.0+(q.transpose() * H * q) / denominator) / denominator
               * (p * p.transpose());
          H2 = (p * q.transpose() * H + H*q*p.transpose());
          H2 = H2 / denominator;
          H = H + H1 - H2;
        }
      }
      else H = identity_matrix(k,k);  // reset hessian if gradient fails
                                      // then try to start again with steepest descent
    }
    else
    {  // otherwise lbfgs
      // save components for Hessian if gradient ok
      if (gradient_ok)
      {
        sig = p;          // change in parameter
        gam = g_new - g;  // change in gradient
        g = g_new;
        // shift remembered vectors to the right (forget last)
        for(j = memory - 1; j > 0; j--)
        {
          for(i = 0; i < k; i++)
          {
            sigmas(i,j) = sigmas(i,j-1);
            gammas(i,j) = gammas(i,j-1);
          }
        }
        // insert new vectors in left-most column
        for(i = 0; i < k; i++)
        {
          sigmas(i, 0) = sig(i);
          gammas(i, 0) = gam(i);
        }
      }
      else
      {  // failed gradient - lose memory and use previous theta
        sigmas.fill(0.0);
        gammas.fill(0.0);
        theta = theta - p;
      }
    }
  }
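  // For reference, the inverse-Hessian update applied in the loop above (the
  // H1/H2 terms) is the standard BFGS update, with p the parameter change and
  // q the gradient change over the iteration:
  //
  //   H <- H + (1 + q'Hq / q'p) * (p p') / q'p  -  (p q' H + H q p') / q'p
  //
  // H is reset to the identity whenever q'p is numerically zero or the
  // gradient cannot be evaluated.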
  // Want last iteration results?
  if (verbosity > 0)
  {
    printf("------------------------------------------------\n");
    printf("bfgsmin final results: %d iterations\n", iter);
    if (warnings)
    {
      if (memory > 0) printf("Used LBFGS, memory is last %d iterations\n",memory);
    }
    printf("\nfunction value: %g\n\n", obj_value);
    if (convergence == -1) printf("NO CONVERGENCE: max iters exceeded\n");
    if ((convergence == 1) & (criterion == 1)) printf("STRONG CONVERGENCE\n");
    if ((convergence == 1) & !(criterion == 1)) printf("WEAK CONVERGENCE\n");
    if (convergence == 2) printf("NO CONVERGENCE: algorithm failed\n");
    printf("Function conv %d Param conv %d Gradient conv %d\n\n", conv_fun, conv_param, conv_grad);
    if (have_gradient) printf("used analytic gradient\n");
    else printf("used numeric gradient\n");
    printf("          param    gradient (n)          change\n");
    for (j = 0; j<k; j++) printf("%15.5f %15.5f %15.5f\n",theta(j), g(j), d(j));
  }
  f_return(3) = iter;
  f_return(2) = convergence;
  f_return(1) = obj_value;
  f_return(0) = theta;
  return f_return;
}
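// For reference only, a sketch of the call interface this DLD function expects,
// as wrapped by bfgsmin.m.  The objective name and control values below are
// illustrative assumptions, not defaults taken from bfgsmin.m:
//
//   ## f returns the objective (and optionally the gradient) of its minarg-th argument
//   f = "my_objective";                # hypothetical user-supplied function
//   f_args = {[0; 0]};                 # cell array of arguments passed to f
//   ## control = {max_iters, verbosity, criterion, minarg, memory,
//   ##            func_tol, param_tol, gradient_tol}
//   control = {100, 0, 1, 1, 0, 1e-10, 1e-6, 1e-5};
//   [theta, obj_value, convergence, iters] = __bfgsmin(f, f_args, control);
//
// The return values are the minimizer, the objective value at the minimizer,
// a convergence code (1: converged, -1: not converged) and the iteration count.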