// ivector/logistic-regression.cc

// Copyright 2014  David Snyder

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.


#include "ivector/logistic-regression.h"
#include "gmm/model-common.h" // For GetSplitTargets()
#include <numeric> // For std::accumulate

namespace kaldi {

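// Trains a multinomial logistic regression model on the rows of xs with
// integer labels ys.  A column of ones is appended to xs so that the last
// column of weights_ acts as a per-class bias (prior) term; if conf.mix_up
// exceeds the number of classes, the classes are split into mixture
// components in MixUp() and the model is retrained.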
void LogisticRegression::Train(const Matrix<BaseFloat> &xs,
                               const std::vector<int32> &ys,
                               const LogisticRegressionConfig &conf) {

  int32 xs_num_rows = xs.NumRows(), xs_num_cols = xs.NumCols(),
        num_ys = ys.size();
  KALDI_ASSERT(xs_num_rows == num_ys);

  // Adding on extra column for each x to handle the prior.
  Matrix<BaseFloat> xs_with_prior(xs_num_rows, xs_num_cols + 1);
  SubMatrix<BaseFloat> sub_xs(xs_with_prior, 0, xs_num_rows, 0, xs_num_cols);
  sub_xs.CopyFromMat(xs);

  int32 num_classes = *std::max_element(ys.begin(), ys.end()) + 1;
  weights_.Resize(num_classes, xs_num_cols + 1);
  Matrix<BaseFloat> xw(xs_num_rows, num_classes);

  // Set the appended bias/prior column to 1.0 for every example.
  for (int32 i = 0; i < xs_num_rows; i++) {
    xs_with_prior(i, xs_num_cols) = 1.0;
  }

  // At the start of training there are no mixture components, so class_
  // is the identity mapping: class_[i] = i.
  for (int32 i = 0; i < num_classes; i++) {
    class_.push_back(i);
  }

  weights_.SetZero();
  TrainParameters(xs_with_prior, ys, conf, &xw);
  KALDI_LOG << "Finished training parameters without mixture components.";

  // If we are using mixture components, we add those components
  // in MixUp and retrain with the extra weights.
  if (conf.mix_up > num_classes) {
    MixUp(ys, num_classes, conf);
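    // MixUp() has added rows to weights_, so allocate a larger xw to match.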
    Matrix<BaseFloat> xw(xs_num_rows, weights_.NumRows());
    TrainParameters(xs_with_prior, ys, conf, &xw);
    KALDI_LOG << "Finished training mixture components.";
  }
}


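// Splits classes into mixture components.  GetSplitTargets() uses the
// per-class counts in ys (together with conf.mix_up and conf.power) to
// decide how many components each class gets; each new component is a copy
// of the class's weight row plus a small amount of Gaussian noise, and
// class_ records which class each row of weights_ belongs to.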
void LogisticRegression::MixUp(const std::vector<int32> &ys,
                               const int32 &num_classes,
                               const LogisticRegressionConfig &conf) {

  Vector<BaseFloat> counts(num_classes);
  for (int32 i = 0; i < ys.size(); i++) {
    counts(ys[i]) += 1.0;
  }

  // TODO: Figure out what min_count should be
  int32 min_count = 1;
  std::vector<int32> targets;
  GetSplitTargets(counts, conf.mix_up, conf.power, min_count, &targets);
  int32 new_dim = std::accumulate(targets.begin(), targets.end(),
                                  static_cast<int32>(0));

  KALDI_LOG << "Target number mixture components was " << conf.mix_up
            << ". Training " << new_dim << " mixture components.";

  int32 old_dim = weights_.NumRows(),
        num_components = old_dim,
        num_feats = weights_.NumCols();

  Matrix<BaseFloat> old_weights(weights_);
  weights_.Resize(new_dim, num_feats);
  SubMatrix<BaseFloat> sub_weights(weights_, 0, num_classes, 0, num_feats);
  // We need to retain the original weights
  sub_weights.CopyFromMat(old_weights);
  class_.resize(new_dim);
  // For each class i, add any extra components it was assigned.
  for (int32 i = 0; i < targets.size(); i++) {
    int32 mixes = targets[i];
    // We start at j = 1 since one copy of each component already
    // exists in weights_.
    for (int32 j = 1; j < mixes; j++) {
      int32 offset = num_components;
      weights_.Row(offset).CopyRowFromMat(weights_, i);
      Vector<BaseFloat> noise(num_feats);
      noise.SetRandn();
      weights_.Row(offset).AddVec(1.0e-05, noise);
      class_[offset] = i; // The class i maps to the row at offset
      num_components += 1;
    }
  }
}

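// Runs conf.max_steps iterations of L-BFGS on the weight matrix (flattened
// row-by-row into a vector) and keeps the best weights found.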
void LogisticRegression::TrainParameters(const Matrix<BaseFloat> &xs,
    const std::vector<int32> &ys, const LogisticRegressionConfig &conf,
    Matrix<BaseFloat> *xw) {
  int32 max_steps = conf.max_steps;
  BaseFloat normalizer = conf.normalizer;
  LbfgsOptions lbfgs_opts;
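  // We are maximizing the objective (average log posterior of the correct
  // class minus an L2 penalty), so L-BFGS should not minimize.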
  lbfgs_opts.minimize = false;
  // Get initial w vector
  Vector<BaseFloat> init_w(weights_.NumRows() * weights_.NumCols());
  init_w.CopyRowsFromMat(weights_);
  OptimizeLbfgs<BaseFloat> lbfgs(init_w, lbfgs_opts);

  for (int32 step = 0; step < max_steps; step++) {
    DoStep(xs, xw, ys, &lbfgs, normalizer);
  }

  Vector<BaseFloat> best_w(lbfgs.GetValue());
  weights_.CopyRowsFromVec(best_w);
}

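// Computes log p(class | x) for every row of xs.  Scores of mixture
// components belonging to the same class are combined with LogAdd(), and
// each row is then normalized by the log-sum over all components.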
void LogisticRegression::GetLogPosteriors(const Matrix<BaseFloat> &xs,
                                          Matrix<BaseFloat> *log_posteriors) {
  int32 xs_num_rows = xs.NumRows(),
      xs_num_cols = xs.NumCols(),
      num_mixes = weights_.NumRows();

  int32 num_classes = *std::max_element(class_.begin(), class_.end()) + 1;

  log_posteriors->Resize(xs_num_rows, num_classes);
  Matrix<BaseFloat> xw(xs_num_rows, num_mixes);

  Matrix<BaseFloat> xs_with_prior(xs_num_rows, xs_num_cols + 1);
  SubMatrix<BaseFloat> sub_xs(xs_with_prior, 0, xs_num_rows, 0, xs_num_cols);
  sub_xs.CopyFromMat(xs);
  // Adding on extra column for each x to handle the prior.
  for (int32 i = 0; i < xs_num_rows; i++) {
    xs_with_prior(i, xs_num_cols) = 1.0;
  }
  xw.AddMatMat(1.0, xs_with_prior, kNoTrans, weights_,
               kTrans, 0.0);

  log_posteriors->Set(-std::numeric_limits<BaseFloat>::infinity());

  // i is the training example
  for (int32 i = 0; i < xs_num_rows; i++) {
    for (int32 j = 0; j < num_mixes; j++) {
      int32 k = class_[j];
      (*log_posteriors)(i,k) = LogAdd((*log_posteriors)(i,k), xw(i, j));
    }
    // Normalize the row.
    log_posteriors->Row(i).Add(-xw.Row(i).LogSumExp());
  }
}

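// Single-example version of GetLogPosteriors() above.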
void LogisticRegression::GetLogPosteriors(const Vector<BaseFloat> &x,
                                          Vector<BaseFloat> *log_posteriors) {
  int32 x_dim = x.Dim();
  int32 num_classes = *std::max_element(class_.begin(), class_.end()) + 1,
      num_mixes = weights_.NumRows();
  log_posteriors->Resize(num_classes);
  Vector<BaseFloat> xw(weights_.NumRows());

  Vector<BaseFloat> x_with_prior(x_dim + 1);
  SubVector<BaseFloat> sub_x(x_with_prior, 0, x_dim);
  sub_x.CopyFromVec(x);
  // Adding on extra element to handle the prior
  x_with_prior(x_dim) = 1.0;

  xw.AddMatVec(1.0, weights_, kNoTrans, x_with_prior, 0.0);

  log_posteriors->Set(-std::numeric_limits<BaseFloat>::infinity());

  for (int32 i = 0; i < num_mixes; i++) {
    int32 j = class_[i];
    (*log_posteriors)(j) = LogAdd((*log_posteriors)(j), xw(i));
  }
  log_posteriors->Add(-log_posteriors->LogSumExp());
}

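// Performs one iteration of L-BFGS: computes xw = xs * weights_^T, evaluates
// the objective and its gradient, passes both to the optimizer, and copies
// the proposed weights back into weights_.  Returns the objective value.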
BaseFloat LogisticRegression::DoStep(const Matrix<BaseFloat> &xs,
    Matrix<BaseFloat> *xw,
    const std::vector<int32> &ys, OptimizeLbfgs<BaseFloat> *lbfgs,
    BaseFloat normalizer) {
  Matrix<BaseFloat> gradient(weights_.NumRows(), weights_.NumCols());
  // Vector form of the above matrix
  Vector<BaseFloat> grad_vec(weights_.NumRows() * weights_.NumCols());

  // Calculate xw = xs * weights_^T.  The rows correspond to the training
  // examples and the columns to the mixture components (or to the classes,
  // if there are no mixture components).
  xw->AddMatMat(1.0, xs, kNoTrans, weights_, kTrans, 0.0);

  // Calculate both the gradient and the objective function.
  BaseFloat objf = GetObjfAndGrad(xs, ys, *xw, &gradient, normalizer);

  // Convert gradient (a matrix) into a vector of size
  // gradient.NumCols * gradient.NumRows.
  grad_vec.CopyRowsFromMat(gradient);

  // Compute next step in L-BFGS.
  lbfgs->DoStep(objf, grad_vec);

  // Update weights
  Vector<BaseFloat> new_w(lbfgs->GetProposedValue());
  weights_.CopyRowsFromVec(new_w);
  KALDI_LOG << "Objective function is " << objf;
  return objf;
}

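// Computes the average log posterior of the correct class over the training
// examples plus the L2 regularization term, and writes the corresponding
// gradient with respect to weights_ to *grad.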
BaseFloat LogisticRegression::GetObjfAndGrad(
    const Matrix<BaseFloat> &xs,
    const std::vector<int32> &ys, const Matrix<BaseFloat> &xw,
    Matrix<BaseFloat> *grad, BaseFloat normalizer) {
  BaseFloat raw_objf = 0.0;
  int32 num_classes = *std::max_element(ys.begin(), ys.end()) + 1;
  std::vector<std::vector<int32> > class_to_cols(num_classes,
                                                 std::vector<int32>());
  for (int32 i = 0; i < class_.size(); i++) {
    class_to_cols[class_[i]].push_back(i);
  }
  // For each training example i.
  for (int32 i = 0; i < ys.size(); i++) {
    Vector<BaseFloat> row(xw.NumCols());
    row.CopyFromVec(xw.Row(i));
    row.ApplySoftMax();
    // Identify the rows of weights_ (which are a set of columns in xw)
    // that correspond to class ys[i].
    const std::vector<int32> &cols = class_to_cols[ys[i]];
    SubVector<BaseFloat> x = xs.Row(i);
    BaseFloat class_sum = 0.0;
    for (int32 j = 0; j < cols.size(); j++) {
      class_sum += row(cols[j]);
    }
    if (class_sum < 1.0e-20) class_sum = 1.0e-20;
    raw_objf += Log(class_sum);
    // Iterate over the weights for each component.  If there are no
    // mixture components, each row corresponds to a class.
    for (int32 k = 0; k < weights_.NumRows(); k++) {
      // p(y = k | x_i) where k is a component.
      BaseFloat p = row(k);
      if (class_[k] == ys[i]) {
        // If the classes aren't split into mixture components
        // then p/class_sum = 1.0.
        grad->Row(k).AddVec(p/class_sum - p, x);
      } else {
        grad->Row(k).AddVec(-1.0 * p, x);
      }
    }
  }
  // Scale and add regularization term.
  grad->Scale(1.0/ys.size());
  grad->AddMat(-1.0 * normalizer, weights_);
  raw_objf /= ys.size();
  BaseFloat regularizer = - 0.5 * normalizer
                          * TraceMatMat(weights_, weights_, kTrans);
  KALDI_VLOG(2) << "Objf is " << raw_objf << " + " << regularizer
                << " = " << (raw_objf + regularizer);
  return raw_objf + regularizer;
}

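// Directly sets the weight matrix and the component-to-class mapping.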
void LogisticRegression::SetWeights(const Matrix<BaseFloat> &weights,
                                    const std::vector<int32> classes) {
  weights_.Resize(weights.NumRows(), weights.NumCols());
  weights_.CopyFromMat(weights);
  class_.resize(classes.size());
  for (int32 i = 0; i < class_.size(); i++)
    class_[i] = classes[i];
}

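// Rescales the class priors.  The last column of weights_ multiplies the
// constant 1.0 appended to each input, so adding log(scales[c]) to that
// entry effectively scales the prior of class c.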
void LogisticRegression::ScalePriors(const Vector<BaseFloat> &scales) {
  Vector<BaseFloat> log_scales(scales);
  log_scales.ApplyLog();

  for (int32 i = 0; i < weights_.NumRows(); i++)
    weights_(i, weights_.NumCols() - 1) += log_scales(class_[i]);
}

void LogisticRegression::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<LogisticRegression>");
  WriteToken(os, binary, "<weights>");
  weights_.Write(os, binary);
  WriteToken(os, binary, "<class>");
  WriteIntegerVector(os, binary, class_);
  WriteToken(os, binary, "</LogisticRegression>");
}

void LogisticRegression::Read(std::istream &is, bool binary) {
  ExpectToken(is, binary, "<LogisticRegression>");
  ExpectToken(is, binary, "<weights>");
  weights_.Read(is, binary);
  std::string token;
  ReadToken(is, binary, &token);
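  // If the <class> list is absent (presumably an older model format),
  // default to the identity mapping with one component per class.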
  if (token == "<class>") {
    ReadIntegerVector(is, binary, &class_);
  } else {
    int32 num_classes = weights_.NumRows();
    for (int32 i = 0; i < num_classes; i++) {
      class_.push_back(i);
    }
  }
  ExpectToken(is, binary, "</LogisticRegression>");
}

}  // namespace kaldi