Yannick Estève / ONTRAC-Kaldi

Blame view

src/matrix/optimization.cc 21.9 KB
  // matrix/optimization.cc
  
  // Copyright 2012  Johns Hopkins University (author: Daniel Povey)
  
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  //
  // (*) incorporates, with permission, FFT code from his book
  // "Signal Processing with Lapped Transforms", Artech, 1992.
  
  #include <algorithm>
  
  #include "matrix/optimization.h"
  #include "matrix/sp-matrix.h"
  
  namespace kaldi {
  
  
  // Below, N&W refers to Nocedal and Wright, "Numerical Optimization", 2nd Ed.
  
  template<typename Real>
  OptimizeLbfgs<Real>::OptimizeLbfgs(const VectorBase<Real> &x,
                                     const LbfgsOptions &opts):
      opts_(opts), k_(0), computation_state_(kBeforeStep), H_was_set_(false) {
    KALDI_ASSERT(opts.m > 0); // dimension.
    MatrixIndexT dim = x.Dim();
    KALDI_ASSERT(dim > 0);
    x_ = x; // this is the value of x_k
    new_x_ = x;  // this is where we'll evaluate the function next.
    deriv_.Resize(dim);
    temp_.Resize(dim);
    data_.Resize(2 * opts.m, dim);
    rho_.Resize(opts.m);
    // Just set f_ to some invalid value, as we haven't yet set it.
    f_ = (opts.minimize ? 1 : -1 ) * std::numeric_limits<Real>::infinity();
    best_f_ = f_;
    best_x_ = x_;
  }
  
  
  template<typename Real>
  Real OptimizeLbfgs<Real>::RecentStepLength() const {
    size_t n = step_lengths_.size();
    if (n == 0) return std::numeric_limits<Real>::infinity();
    else {
      if (n >= 2 && step_lengths_[n-1] == 0.0 && step_lengths_[n-2] == 0.0)
        return 0.0; // two zeros in a row means repeated restarts, which is
      // a loop.  Short-circuit this by returning zero.
      Real avg = 0.0;
      for (size_t i = 0; i < n; i++)
        avg += step_lengths_[i] / n;
      return avg;
    }
  }
  
  template<typename Real>
  void OptimizeLbfgs<Real>::ComputeHifNeeded(const VectorBase<Real> &gradient) {
    if (k_ == 0) {
      if (H_.Dim() == 0) {
        // H was never set up.  Set it up for the first time.
        Real learning_rate;
        if (opts_.first_step_length > 0.0) { // this takes
          // precedence over first_step_learning_rate, if set.
          // We are setting up H for the first time.
          Real gradient_length = gradient.Norm(2.0);
          learning_rate = (gradient_length > 0.0 ?
                           opts_.first_step_length / gradient_length :
                           1.0);
        } else if (opts_.first_step_impr > 0.0) {
          Real gradient_length = gradient.Norm(2.0);
          learning_rate = (gradient_length > 0.0 ?
                    opts_.first_step_impr / (gradient_length * gradient_length) :
                    1.0);
        } else {
          learning_rate = opts_.first_step_learning_rate;
        }
        H_.Resize(x_.Dim());
        KALDI_ASSERT(learning_rate > 0.0);
        H_.Set(opts_.minimize ? learning_rate : -learning_rate);
      }
    } else { // k_ > 0
      if (!H_was_set_) { // The user never specified an approximate
        // diagonal inverse Hessian.
        // Set it using formula 7.20: H_k^{(0)} = \gamma_k I, where
        // \gamma_k = s_{k-1}^T y_{k-1} / y_{k-1}^T y_{k-1}
        SubVector<Real> y_km1 = Y(k_-1);
        double gamma_k = VecVec(S(k_-1), y_km1) / VecVec(y_km1, y_km1);
        if (KALDI_ISNAN(gamma_k) || KALDI_ISINF(gamma_k)) {
          KALDI_WARN << "NaN encountered in L-BFGS (already converged?)";
          gamma_k = (opts_.minimize ? 1.0 : -1.0);
        }
        H_.Set(gamma_k);
      }
    }
  }  
  
  // This represents the first 2 lines of Algorithm 7.5 (N&W), which
  // in fact is mostly a call to Algorithm 7.4.
  // Note: this is valid whether we are minimizing or maximizing.
  template<typename Real>
  void OptimizeLbfgs<Real>::ComputeNewDirection(Real function_value,
                                                const VectorBase<Real> &gradient) {
    KALDI_ASSERT(computation_state_ == kBeforeStep);
    SignedMatrixIndexT m = M(), k = k_;
    ComputeHifNeeded(gradient);
    // The rest of this is computing p_k <-- - H_k 
  abla f_k using Algorithm
    // 7.4 of N&W.
    Vector<Real> &q(deriv_), &r(new_x_); // Use deriv_ as a temporary place to put
    // q, and new_x_ as a temporay place to put r.
    // The if-statement below is just to get rid of spurious warnings from
    // valgrind about memcpy source and destination overlap, since sometimes q and
    // gradient are the same variable.
    if (&q != &gradient)
      q.CopyFromVec(gradient); // q <-- 
  abla f_k.
    Vector<Real> alpha(m);
    // for i = k - 1, k - 2, ... k - m
    for (SignedMatrixIndexT i = k - 1;
         i >= std::max(k - m, static_cast<SignedMatrixIndexT>(0));
         i--) { 
      alpha(i % m) = rho_(i % m) * VecVec(S(i), q); // \alpha_i <-- \rho_i s_i^T q.
      q.AddVec(-alpha(i % m), Y(i)); // q <-- q - \alpha_i y_i
    }
    r.SetZero();
    r.AddVecVec(1.0, H_, q, 0.0); // r <-- H_k^{(0)} q.
    // for k = k - m, k - m + 1, ... , k - 1
    for (SignedMatrixIndexT i = std::max(k - m, static_cast<SignedMatrixIndexT>(0));
         i < k;
         i++) {
      Real beta = rho_(i % m) * VecVec(Y(i), r); // \beta <-- \rho_i y_i^T r
      r.AddVec(alpha(i % m) - beta, S(i)); // r <-- r + s_i (\alpha_i - \beta)
    }
  
    { // TEST.  Note, -r will be the direction.
      Real dot = VecVec(gradient, r);
      if ((opts_.minimize && dot < 0) || (!opts_.minimize && dot > 0))
        KALDI_WARN << "Step direction has the wrong sign!  Routine will fail.";
    }
    
    // Now we're out of Alg. 7.4 and back into Alg. 7.5.
    // Alg. 7.4 returned r (using new_x_ as the location), and with \alpha_k = 1
    // as the initial guess, we're setting x_{k+1} = x_k + \alpha_k p_k, with
    // p_k = -r [hence the statement new_x_.Scale(-1.0)]., and \alpha_k = 1.
    // This is the first place we'll get the user to evaluate the function;
    // any backtracking (or acceptance of that step) occurs inside StepSizeIteration.
    // We're still within iteration k; we haven't yet finalized the step size.
    new_x_.Scale(-1.0);
    new_x_.AddVec(1.0, x_);
    if (&deriv_ != &gradient)
      deriv_.CopyFromVec(gradient);
    f_ = function_value;
    d_ = opts_.d;
    num_wolfe_i_failures_ = 0;
    num_wolfe_ii_failures_ = 0;
    last_failure_type_ = kNone;
    computation_state_ = kWithinStep;
  }
  
  
  template<typename Real>
  bool OptimizeLbfgs<Real>::AcceptStep(Real function_value,
                                       const VectorBase<Real> &gradient) {
    // Save s_k = x_{k+1} - x_{k}, and y_k = 
  abla f_{k+1} - 
  abla f_k.
    SubVector<Real> s = S(k_), y = Y(k_);
    s.CopyFromVec(new_x_);
    s.AddVec(-1.0, x_); // s = new_x_ - x_.
    y.CopyFromVec(gradient);
    y.AddVec(-1.0, deriv_); // y = gradient - deriv_.
    
    // Warning: there is a division in the next line.  This could
    // generate inf or nan, but this wouldn't necessarily be an error
    // at this point because for zero step size or derivative we should
    // terminate the iterations.  But this is up to the calling code.
    Real prod = VecVec(y, s);
    rho_(k_ % opts_.m) = 1.0 / prod;
    Real len = s.Norm(2.0);
  
    if ((opts_.minimize && prod <= 1.0e-20) || (!opts_.minimize && prod >= -1.0e-20)
        || len == 0.0)
      return false; // This will force restart.
    
    KALDI_VLOG(3) << "Accepted step; length was " << len
                  << ", prod was " << prod;
    RecordStepLength(len);
    
    // store x_{k+1} and the function value f_{k+1}.
    x_.CopyFromVec(new_x_);
    f_ = function_value;
    k_++;
  
    return true; // We successfully accepted the step.
  }
  
  template<typename Real>
  void OptimizeLbfgs<Real>::RecordStepLength(Real s) {
    step_lengths_.push_back(s);
    if (step_lengths_.size() > static_cast<size_t>(opts_.avg_step_length))
      step_lengths_.erase(step_lengths_.begin(), step_lengths_.begin() + 1);
  }
  
  
  template<typename Real>
  void OptimizeLbfgs<Real>::Restart(const VectorBase<Real> &x,
                                    Real f,
                                    const VectorBase<Real> &gradient) {
    // Note: we will consider restarting (the transition of x_ -> x)
    // as a step, even if it has zero step size.  This is necessary in
    // order for convergence to be detected.
    {
      Vector<Real> &diff(temp_);
      diff.CopyFromVec(x);
      diff.AddVec(-1.0, x_);
      RecordStepLength(diff.Norm(2.0));
    }
    k_ = 0; // Restart the iterations!  [But note that the Hessian,
    // whatever it was, stays as before.]
    if (&x_ != &x)
      x_.CopyFromVec(x);
    new_x_.CopyFromVec(x);
    f_ = f;
    computation_state_ = kBeforeStep;
    ComputeNewDirection(f, gradient);
  }
  
  template<typename Real>
  void OptimizeLbfgs<Real>::StepSizeIteration(Real function_value,
                                              const VectorBase<Real> &gradient) {
    KALDI_VLOG(3) << "In step size iteration, function value changed "
                  << f_ << " to " << function_value;
    
    // We're in some part of the backtracking, and the user is providing
    // the objective function value and gradient.
    // We're checking two conditions: Wolfe i) [the Armijo rule] and
    // Wolfe ii).
    
    // The Armijo rule (when minimizing) is:
    // f(k_k + \alpha_k p_k) <= f(x_k) + c_1 \alpha_k p_k^T 
  abla f(x_k), where
    //  
  abla means the derivative.
    // Below, "temp" is the RHS of this equation, where (\alpha_k p_k) equals
    // (new_x_ - x_); we don't store \alpha or p_k separately, they are implicit
    // as the difference new_x_ - x_.
  
    // Below, pf is \alpha_k p_k^T 
  abla f(x_k).
    Real pf = VecVec(new_x_, deriv_) - VecVec(x_, deriv_);
    Real temp = f_ + opts_.c1 * pf;
    
    bool wolfe_i_ok;
    if (opts_.minimize) wolfe_i_ok = (function_value <= temp);
    else wolfe_i_ok = (function_value >= temp);
    
    // Wolfe condition ii) can be written as:
    //  p_k^T 
  abla f(x_k + \alpha_k p_k) >= c_2 p_k^T 
  abla f(x_k)
    // p2f equals \alpha_k p_k^T 
  abla f(x_k + \alpha_k p_k), where
    // (\alpha_k p_k^T) is (new_x_ - x_).
    // Note that in our version of Wolfe condition (ii) we have an extra
    // factor alpha, which doesn't affect anything.
    Real p2f = VecVec(new_x_, gradient) - VecVec(x_, gradient);
    //eps = (sizeof(Real) == 4 ? 1.0e-05 : 1.0e-10) *
    //(std::abs(p2f) + std::abs(pf));
    bool wolfe_ii_ok;
    if (opts_.minimize) wolfe_ii_ok = (p2f >= opts_.c2 * pf);
    else wolfe_ii_ok = (p2f <= opts_.c2 * pf);
  
    enum { kDecrease, kNoChange } d_action; // What do do with d_: leave it alone,
    // or take the square root.
    enum { kAccept, kDecreaseStep, kIncreaseStep, kRestart } iteration_action;
    // What we'll do in the overall iteration: accept this value, DecreaseStep
    // (reduce the step size), IncreaseStep (increase the step size), or kRestart
    // (set k back to zero).  Generally when we can't get both conditions to be
    // true with a reasonable period of time, it makes sense to restart, because
    // probably we've almost converged and got into numerical issues; from here
    // we'll just produced NaN's.  Restarting is a safe thing to do and the outer
    // code will quickly detect convergence.
  
    d_action = kNoChange; // the default.
    
    if (wolfe_i_ok && wolfe_ii_ok) {
      iteration_action = kAccept;
      d_action = kNoChange; // actually doesn't matter, it'll get reset.
    } else if (!wolfe_i_ok) {
      // If wolfe i) [the Armijo rule] failed then we went too far (or are
      // meeting numerical problems).
      if (last_failure_type_ == kWolfeII) { // Last time we failed it was Wolfe ii).
        // When we switch between them we decrease d.
        d_action = kDecrease;
      }
      iteration_action = kDecreaseStep;
      last_failure_type_ = kWolfeI;
      num_wolfe_i_failures_++;
    } else if (!wolfe_ii_ok) {
      // Curvature condition failed -> we did not go far enough.
      if (last_failure_type_ == kWolfeI) // switching between wolfe i and ii failures->
        d_action = kDecrease; // decrease value of d.
      iteration_action = kIncreaseStep;
      last_failure_type_ = kWolfeII;
      num_wolfe_ii_failures_++;
    }
  
    // Test whether we've been switching too many times betwen wolfe i) and ii)
    // failures, or overall have an excessive number of failures.  We just give up
    // and restart L-BFGS.  Probably we've almost converged.
    if (num_wolfe_i_failures_ + num_wolfe_ii_failures_ >
        opts_.max_line_search_iters) {
      KALDI_VLOG(2) << "Too many steps in line search -> restarting.";
      iteration_action = kRestart;
    }
  
    if (d_action == kDecrease)
      d_ = std::sqrt(d_);
    
    KALDI_VLOG(3) << "d = " << d_ << ", iter = " << k_ << ", action = "
                  << (iteration_action == kAccept ? "accept" :
                      (iteration_action == kDecreaseStep ? "decrease" :
                       (iteration_action == kIncreaseStep ? "increase" :
                        "reject")));
    
    // Note: even if iteration_action != Restart at this point,
    // some code below may set it to Restart.
    if (iteration_action == kAccept) {
      if (AcceptStep(function_value, gradient)) { // If we did
        // not detect a problem while accepting the step..
        computation_state_ = kBeforeStep;
        ComputeNewDirection(function_value, gradient);
      } else {
        KALDI_VLOG(2) << "Restarting L-BFGS computation; problem found while "
                      << "accepting step.";
        iteration_action = kRestart; // We'll have to restart now.
      }
    }
    if (iteration_action == kDecreaseStep || iteration_action == kIncreaseStep) {
      Real scale = (iteration_action == kDecreaseStep ? 1.0 / d_ : d_);
      temp_.CopyFromVec(new_x_);
      new_x_.Scale(scale);
      new_x_.AddVec(1.0 - scale, x_);
      if (new_x_.ApproxEqual(temp_, 0.0)) {
        // Value of new_x_ did not change at all --> we must restart.
        KALDI_VLOG(3) << "Value of x did not change, when taking step; "
                      << "will restart computation.";
        iteration_action = kRestart;
      }
      if (new_x_.ApproxEqual(temp_, 1.0e-08) &&
          std::abs(f_ - function_value) < 1.0e-08 *
          std::abs(f_) && iteration_action == kDecreaseStep) {
        // This is common and due to roundoff.
        KALDI_VLOG(3) << "We appear to be backtracking while we are extremely "
                      << "close to the old value; restarting.";
        iteration_action = kRestart;
      }
          
      if (iteration_action == kDecreaseStep) {
        num_wolfe_i_failures_++;
        last_failure_type_ = kWolfeI;
      } else {
        num_wolfe_ii_failures_++;
        last_failure_type_ = kWolfeII;
      }
    }
    if (iteration_action == kRestart) {
      // We want to restart the computation.  If the objf at new_x_ is
      // better than it was at x_, we'll start at new_x_, else at x_.
      bool use_newx;
      if (opts_.minimize) use_newx = (function_value < f_);
      else use_newx = (function_value > f_);
      KALDI_VLOG(3) << "Restarting computation.";
      if (use_newx) Restart(new_x_, function_value, gradient);
      else Restart(x_, f_, deriv_);
    }
  }
  
  template<typename Real>
  void OptimizeLbfgs<Real>::DoStep(Real function_value,
                                   const VectorBase<Real> &gradient) {
    if (opts_.minimize ? function_value < best_f_ : function_value > best_f_) {
      best_f_ = function_value;
      best_x_.CopyFromVec(new_x_);
    }
    if (computation_state_ == kBeforeStep)
      ComputeNewDirection(function_value, gradient);
    else // kWithinStep{1,2,3}
      StepSizeIteration(function_value, gradient);
  }
  
  template<typename Real>
  void OptimizeLbfgs<Real>::DoStep(Real function_value,
                                   const VectorBase<Real> &gradient,
                                   const VectorBase<Real> &diag_approx_2nd_deriv) {
    if (opts_.minimize ? function_value < best_f_ : function_value > best_f_) {
      best_f_ = function_value;
      best_x_.CopyFromVec(new_x_);
    }
    if (opts_.minimize) {
      KALDI_ASSERT(diag_approx_2nd_deriv.Min() > 0.0);
    } else {
      KALDI_ASSERT(diag_approx_2nd_deriv.Max() < 0.0);
    }
    H_was_set_ = true;
    H_.CopyFromVec(diag_approx_2nd_deriv);
    H_.InvertElements();
    DoStep(function_value, gradient);
  }
  
  template<typename Real>
  const VectorBase<Real>&
  OptimizeLbfgs<Real>::GetValue(Real *objf_value) const {
    if (objf_value != NULL) *objf_value = best_f_;
    return best_x_;
  }
  
  // to compute the alpha, we are minimizing f(x) =  x^T b - 0.5 x_k^T A x_k  along
  // direction p_k... consider alpha
  // d/dx of f(x) = b - A x_k = r.
  
  // Notation based on Sec. 5.1 of Nocedal and Wright
  // Computation based on Alg. 5.2 of Nocedal and Wright (Pg. 112)
  // Notation (replicated for convenience):
  //  To solve Ax=b for x
  //  k : current iteration
  //  x_k : estimate of x (at iteration k)
  //  r_k : residual ( r_k \eqdef A x_k - b )
  //  \alpha_k : step size
  //  p_k : A-conjugate direction
  //  \beta_k  : coefficient used in A-conjugate direction computation for next
  //  iteration
  //  
  //  Algo.  LinearCG(A,b,x_0)
  //  ========================
  //  r_0 = Ax_0 - b
  //  p_0 = -r_0
  //  k = 0
  //
  //  while r_k != 0
  //    \alpha_k = (r_k^T  r_k) / (p_k^T  A  p_k)
  //    x_{k+1} = x_k + \alpha_k  p_k;
  //    r_{k+1} = r_k + \alpha_k  A  p_k
  //    \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_K}
  //    p_{k+1} = -r_{k+1} + \beta_{k+1} p_k
  //    k = k + 1
  //  end
  
  template<class Real>
  int32 LinearCgd(const LinearCgdOptions &opts,
                  const SpMatrix<Real> &A,
                  const VectorBase<Real> &b,
                  VectorBase<Real> *x) {
    // Initialize the variables
    //
    int32 M = A.NumCols();
  
    Matrix<Real> storage(4, M);
    SubVector<Real> r(storage, 0), p(storage, 1), Ap(storage, 2), x_orig(storage, 3);
    p.CopyFromVec(b);
    p.AddSpVec(-1.0, A, *x, 1.0);  // p_0 = b - A x_0
    r.AddVec(-1.0, p);  // r_0 = - p_0
    x_orig.CopyFromVec(*x);  // in case of failure.
    
    Real r_cur_norm_sq = VecVec(r, r),
        r_initial_norm_sq = r_cur_norm_sq,
        r_recompute_norm_sq = r_cur_norm_sq;
  
    KALDI_VLOG(5) << "In linear CG: initial norm-square of residual = "
                  << r_initial_norm_sq;
    
    KALDI_ASSERT(opts.recompute_residual_factor <= 1.0);
    Real max_error_sq = std::max<Real>(opts.max_error * opts.max_error,
                                       std::numeric_limits<Real>::min()),
        residual_factor = opts.recompute_residual_factor *
                          opts.recompute_residual_factor,
        inv_residual_factor = 1.0 / residual_factor;
    
    // Note: although from a mathematical point of view the method should converge
    // after M iterations, in practice (due to roundoff) it does not always
    // converge to good precision after that many iterations so we let the maximum
    // be M + 5 instead.
    int32 k = 0;
    for (; k < M + 5 && k != opts.max_iters; k++) {
      // Note: we'll break from this loop if we converge sooner due to
      // max_error.
      Ap.AddSpVec(1.0, A, p, 0.0);  // Ap = A p
  
      // Below is how the code used to look.
      // // next line: \alpha_k = (r_k^T r_k) / (p_k^T A p_k)
      // Real alpha = r_cur_norm_sq / VecVec(p, Ap);
      // 
      // We changed r_cur_norm_sq below to -VecVec(p, r).  Although this is
      // slightly less efficient, it seems to make the algorithm dramatically more
      // robust.  Note that -p^T r is the mathematically more natural quantity to
      // use here, that corresponds to minimizing along that direction... r^T r is
      // recommended in Nocedal and Wright only as a kind of optimization as it is
      // supposed to be the same as -p^T r and we already have it computed.
      Real alpha = -VecVec(p, r) / VecVec(p, Ap);
      
      // next line: x_{k+1} = x_k + \alpha_k p_k;
      x->AddVec(alpha, p);
      // next line: r_{k+1} = r_k + \alpha_k A p_k
      r.AddVec(alpha, Ap);
      Real r_next_norm_sq = VecVec(r, r);
      
      if (r_next_norm_sq < residual_factor * r_recompute_norm_sq ||
          r_next_norm_sq > inv_residual_factor * r_recompute_norm_sq) {
           
        // Recompute the residual from scratch if the residual norm has decreased
        // a lot; this costs an extra matrix-vector multiply, but helps keep the
        // residual accurate.
        // Also do the same if the residual norm has increased a lot since
        // the last time we recomputed... this shouldn't happen often, but
        // it can indicate bad stuff is happening.
        
        // r_{k+1} = A x_{k+1} - b
        r.AddSpVec(1.0, A, *x, 0.0);
        r.AddVec(-1.0, b);
        r_next_norm_sq = VecVec(r, r);
        r_recompute_norm_sq = r_next_norm_sq;
  
        KALDI_VLOG(5) << "In linear CG: recomputing residual.";
      }
      KALDI_VLOG(5) << "In linear CG: k = " << k
                    << ", r_next_norm_sq = " << r_next_norm_sq;
      // Check if converged.
      if (r_next_norm_sq <= max_error_sq)
        break;
      
      // next line: \beta_{k+1} = \frac{r_{k+1}^T r_{k+1}}{r_k^T r_K}
      Real beta_next = r_next_norm_sq / r_cur_norm_sq;
      // next lines: p_{k+1} = -r_{k+1} + \beta_{k+1} p_k
      Vector<Real> p_old(p);
      p.Scale(beta_next);
      p.AddVec(-1.0, r);
      r_cur_norm_sq = r_next_norm_sq;
    }
  
    // note: the first element of the && is only there to save compute.
    // the residual r is A x - b, and r_cur_norm_sq and r_initial_norm_sq are
    // of the form r * r, so it's clear that b * b has the right dimension to
    // compare with the residual.
    if (r_cur_norm_sq > r_initial_norm_sq &&
        r_cur_norm_sq > r_initial_norm_sq + 1.0e-10 * VecVec(b, b)) {
      KALDI_WARN << "Doing linear CGD in dimension " << A.NumRows() << ", after " << k
                << " iterations the squared residual has got worse, "
                 << r_cur_norm_sq << " > " << r_initial_norm_sq
                 << ".  Will do an exact optimization.";
      SolverOptions opts("called-from-linearCGD");
      x->CopyFromVec(x_orig);
      SolveQuadraticProblem(A, b, opts, x);
    }
    return k;
  } 
      
  // Instantiate the class for float and double.
  template
  class OptimizeLbfgs<float>;
  template
  class OptimizeLbfgs<double>;
  
  
  template
  int32 LinearCgd<float>(const LinearCgdOptions &opts,
                        const SpMatrix<float> &A, const VectorBase<float> &b,
                        VectorBase<float> *x);
  
  template
  int32 LinearCgd<double>(const LinearCgdOptions &opts,
                          const SpMatrix<double> &A, const VectorBase<double> &b,
                          VectorBase<double> *x);
  
  } // end namespace kaldi