// sampling-lm.cc

// Copyright 2017  Ke Li
//           2017  Johns Hopkins University (author: Daniel Povey)

// See ../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "rnnlm/sampling-lm.h"

namespace kaldi {
namespace rnnlm {

// This function is called for each n-gram line read from the ARPA file.
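// Storage convention: unigram_probs_[w] holds the unigram probability of
// word w, while higher_order_probs_[n-1] maps each history of length n to
// its HistoryState (the explicitly-listed (word, prob) pairs plus the
// backoff weight of that history).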
void SamplingLm::ConsumeNGram(const NGram& ngram) {
  int32 cur_order = ngram.words.size(),
      max_order = Order();
  int32 word = ngram.words.back();  // 'word' is the last word in the n-gram.
  KALDI_ASSERT(cur_order > 0 && word > 0);

  if (cur_order == 1) {
    // unigram
    if (unigram_probs_.size() <= static_cast<size_t>(word))
      unigram_probs_.resize(static_cast<size_t>(word + 1), 0.0);
  KALDI_ASSERT(unigram_probs_[word] == 0.0);  // fails if a unigram is repeated.
    unigram_probs_[word] = Exp(ngram.logprob);
    if (ngram.backoff != 0.0)
      higher_order_probs_[cur_order - 1][ngram.words].backoff_prob =
        Exp(ngram.backoff);
  } else {
    HistType history(ngram.words.begin(), ngram.words.end() - 1);
    // Note: we'll later on change the probability, subtracting the
    // part that is due to backoff.  This change of format is
    // convenient for our application.
    // ngram.logprob has already been converted to log-base e at
    // this point.
    higher_order_probs_[cur_order - 2][history].words_and_probs.push_back(
        std::pair<int32, BaseFloat>(word, Exp(ngram.logprob)));
    if (ngram.backoff != 0.0) {
      KALDI_ASSERT(cur_order != max_order);
      higher_order_probs_[cur_order - 1][ngram.words].backoff_prob =
          Exp(ngram.backoff);
    }
  }
}

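// Called by the ArpaFileParser base class after the \data\ section of the
// ARPA file (which contains the n-gram counts) has been read; we use those
// counts to size our data structures before the n-grams themselves arrive.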
void SamplingLm::HeaderAvailable() {
  unigram_probs_.reserve(NgramCounts()[0] + 100);
  // e.g. for a trigram LM we store the bigram and trigram history
  // states in higher_order_probs_, while unigram_probs_ stores
  // the unigram probabilities.
  int32 ngram_order = NgramCounts().size();
  higher_order_probs_.resize(ngram_order - 1);
}

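// Returns p(word | history) including any backoff contribution, using the
// probabilities as stored at the time of the call (during ReadComplete()
// these are still the full ARPA probabilities).  If 'state' is non-NULL it
// must be the history state corresponding to 'history'; if it is NULL the
// state is looked up, or unigram_probs_ is used when the history is empty.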
BaseFloat SamplingLm::GetProbWithBackoff(
    const std::vector<int32> &history,
    const HistoryState *state,
    int32 word) const {
  if (state == NULL) {
    int32 order = history.size() + 1;
    if (order == 1) {
      KALDI_ASSERT(static_cast<size_t>(word) < unigram_probs_.size());
      return unigram_probs_[word];
    } else {
      std::unordered_map<HistType, HistoryState, VectorHasher<int32> >::const_iterator
          hist_iter = higher_order_probs_[order - 2].find(history);
      KALDI_ASSERT(hist_iter != higher_order_probs_[order - 2].end());
      // it's not optimally efficient to recurse here, but this is on a code
      // path that will rarely be taken in practice.
      return GetProbWithBackoff(history, &(hist_iter->second), word);
    }
  } else {
    std::pair<int32, BaseFloat> p(word, 0.0);
    std::vector<std::pair<int32, BaseFloat> >::const_iterator iter =
        std::lower_bound(state->words_and_probs.begin(),
                         state->words_and_probs.end(), p);
    if (iter != state->words_and_probs.end() && iter->first == word) {
      // the probability for this word was given in this history state.  (Note:
      // we assume that at the time this function is called, the entire
      // probability is present here, as it is in the ARPA format LM.  See the
      // documentation for this function for more explanation.)
      return iter->second;
    } else {
      // we have to back off.
      std::vector<int32> backoff_history(history.begin() + 1,
                                         history.end());
      return state->backoff_prob *
          GetProbWithBackoff(backoff_history, NULL, word);
    }
  }
}

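// Sorts the (word, prob) pairs of every history state on word-id, so that
// GetProbWithBackoff() can locate a word with std::lower_bound.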
void SamplingLm::EnsureHistoryStatesSorted() {
  for (size_t i = 0; i < higher_order_probs_.size(); i++) {
    std::unordered_map<HistType, HistoryState, VectorHasher<int32> >::iterator
        iter = higher_order_probs_[i].begin(),
        end = higher_order_probs_[i].end();
    for (; iter != end; ++iter)
      std::sort(iter->second.words_and_probs.begin(),
                iter->second.words_and_probs.end());
  }
}

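// Called by the ArpaFileParser base class once the whole ARPA file has been
// read.  Working from the highest order downwards, we replace each stored
// probability with its "excess" probability, i.e. the part not already
// accounted for by backing off; after this, the excess probabilities of a
// state plus its backoff weight should sum to (approximately) one.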
void SamplingLm::ReadComplete() {
  EnsureHistoryStatesSorted();
  int32 max_order = Order();
  for (int32 order = max_order; order >= 2; order--) {
    std::unordered_map<HistType, HistoryState, VectorHasher<int32> >
        &this_map = higher_order_probs_[order - 2];
    std::unordered_map<HistType, HistoryState,
        VectorHasher<int32> >::iterator
        hist_iter = this_map.begin(), hist_end = this_map.end();
    for (; hist_iter != hist_end; ++hist_iter) {
      const HistType &history = hist_iter->first;
      HistoryState &history_state = hist_iter->second;
      BaseFloat backoff_prob = history_state.backoff_prob;
      HistoryState *backoff_state;
      HistType backoff_history(history.begin() + 1, history.end());
      if (order == 2) backoff_state = NULL;  // unigram has different format.
      else backoff_state = &(higher_order_probs_[order - 3][backoff_history]);

      std::vector<std::pair<int32, BaseFloat> >::iterator
          word_iter = history_state.words_and_probs.begin(),
          word_end = history_state.words_and_probs.end();
      double total_prob_after_subtracting = 0.0;
      for (; word_iter != word_end; ++word_iter) {
        int32 word = word_iter->first;
        BaseFloat prob = word_iter->second;
        // OK, we want to subtract the backoff part.
        BaseFloat backoff_part_of_prob = backoff_prob *
            GetProbWithBackoff(backoff_history, backoff_state, word);
        if (backoff_part_of_prob > 1.01 * prob) {
          KALDI_WARN << "Backoff part of prob is larger than prob itself: "
                     << backoff_part_of_prob << " > " << prob
                     << ".  This may mean your language model was not "
                     << "Kneser-Ney 'with addition'.  We advise to use "
                     << "Kneser-Ney with addition or some other type of "
                     << "LM 'with addition'.";
        }
        // OK, this could now be negative; that shouldn't matter.
        BaseFloat new_prob = prob - backoff_part_of_prob;
        word_iter->second = new_prob;
        total_prob_after_subtracting += new_prob;
      }
      BaseFloat new_total = total_prob_after_subtracting + backoff_prob;
      if (fabs(new_total - 1.0) > 0.01)
        KALDI_WARN << "Expected LM-state to sum to one, got "
                   << new_total;
    }
  }
}

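// Computes the backoff "closure" of the weighted histories in 'histories':
// each input history is first backed off (dropping its leftmost word) until
// a history state that actually exists is found; its weight is then added
// to that state and, scaled by the successive backoff probabilities, to all
// lower-order states, with whatever reaches the empty history accumulated
// into 'unigram_weight_out'.  'total_weight_out' is the sum of the input
// weights.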
void SamplingLm::AddBackoffToHistoryStates(
    const WeightedHistType &histories,
    WeightedHistType *histories_closure,
    BaseFloat *total_weight_out,
    BaseFloat *unigram_weight_out) const {
  // the implementation of this function is not as efficient as it could be,
  // but it should not dominate.
  std::vector<std::pair<HistType, BaseFloat> >::const_iterator
      histories_iter = histories.begin(), histories_end = histories.end();
  int32 max_order = Order();
  std::unordered_map<HistType, BaseFloat,
      VectorHasher<int32> > hist_to_weight_map;
  double total_weight = 0.0, total_unigram_weight = 0.0;
  for (; histories_iter != histories_end; ++histories_iter) {
    std::vector<int32> history = histories_iter->first;
    int32 cur_hist_len = history.size();
    BaseFloat weight = histories_iter->second;
    total_weight += weight;
    KALDI_ASSERT(history.size() <= max_order - 1 && weight > 0);

    // back off until the history exists or until we reach the unigram state.
    while (cur_hist_len > 0 &&
           higher_order_probs_[cur_hist_len - 1].count(history) == 0) {
      history.erase(history.begin());
      cur_hist_len--;
    }
    // At this point either the history-state exists, or we have backed off
    // all the way to the empty (unigram) history.
    while (cur_hist_len > 0) {
      hist_to_weight_map[history] += weight;
      std::unordered_map<HistType, HistoryState, VectorHasher<int32> >::const_iterator
          iter = higher_order_probs_[cur_hist_len - 1].find(history);
      KALDI_ASSERT(iter != higher_order_probs_[cur_hist_len - 1].end());
      weight *= iter->second.backoff_prob;
      history.erase(history.begin());
      cur_hist_len--;
    }
    // at this point, 'history' is empty and 'weight' is the unigram
    // backoff weight for this history state.
    total_unigram_weight += weight;
  }
  histories_closure->clear();
  histories_closure->resize(hist_to_weight_map.size());
  std::unordered_map<HistType, BaseFloat, VectorHasher<int32> >::iterator
      hist_to_weight_iter = hist_to_weight_map.begin(),
      hist_to_weight_end = hist_to_weight_map.end();
  size_t pos = 0;
  for (; hist_to_weight_iter != hist_to_weight_end; ++hist_to_weight_iter) {
    (*histories_closure)[pos].first = hist_to_weight_iter->first;
    (*histories_closure)[pos].second = hist_to_weight_iter->second;
    pos++;
  }
  *total_weight_out = total_weight;
  *unigram_weight_out = total_unigram_weight;
  KALDI_ASSERT(pos == hist_to_weight_map.size());
}


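// This overload returns the non-unigram part of the distribution as a vector
// of (word, prob) pairs sorted on word-id; it is a thin wrapper around the
// unordered_map version of GetDistribution() below.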
BaseFloat SamplingLm::GetDistribution(
    const WeightedHistType &histories,
    std::vector<std::pair<int32, BaseFloat> > *non_unigram_probs_out) const {
  std::unordered_map<int32, BaseFloat> non_unigram_probs_temp;
  // Call the other version of GetDistribution().
  BaseFloat ans = GetDistribution(histories, &non_unigram_probs_temp);
  non_unigram_probs_out->clear();
  non_unigram_probs_out->reserve(non_unigram_probs_temp.size());
  non_unigram_probs_out->insert(non_unigram_probs_out->end(),
                                non_unigram_probs_temp.begin(),
                                non_unigram_probs_temp.end());
  std::sort(non_unigram_probs_out->begin(),
            non_unigram_probs_out->end());
  return ans;
}

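// Computes the distribution over words for the weighted set of histories:
// the explicitly-listed ("excess") probabilities are accumulated per word
// into 'non_unigram_probs', and the return value is the total weight that
// backs off to the unigram distribution.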
BaseFloat SamplingLm::GetDistribution(
    const WeightedHistType &histories,
    std::unordered_map<int32, BaseFloat> *non_unigram_probs) const {
  WeightedHistType histories_closure;
  BaseFloat total_weight, total_unigram_weight;
  AddBackoffToHistoryStates(histories, &histories_closure,
                            &total_weight, &total_unigram_weight);
  non_unigram_probs->clear();
  double total_weight_check = total_unigram_weight;
  WeightedHistType::const_iterator iter = histories_closure.begin(),
      end = histories_closure.end();
  for (; iter != end; ++iter) {
    const HistType &history = iter->first;
    BaseFloat hist_weight = iter->second;
    int32 order = history.size() + 1;
    KALDI_ASSERT(order > 1);  // unigram history is not included at this point.
    std::unordered_map<HistType, HistoryState,
        VectorHasher<int32> >::const_iterator it_hist =
           higher_order_probs_[order - 2].find(history);
    KALDI_ASSERT(it_hist != higher_order_probs_[order - 2].end());
    std::vector<std::pair<int32, BaseFloat> >::const_iterator
        word_iter = it_hist->second.words_and_probs.begin(),
        word_end = it_hist->second.words_and_probs.end();
    for (; word_iter != word_end; ++word_iter) {
      int32 word = word_iter->first;
      BaseFloat prob = word_iter->second;
      // note: if 'word' was not yet in the map, operator[] value-initializes
      // the entry to zero (guaranteed for std::unordered_map since C++11),
      // so the += below is well-defined.
      (*non_unigram_probs)[word] += prob * hist_weight;
      total_weight_check += prob * hist_weight;
    }
  }
  // Check that 'total_weight' and 'total_weight_check' are
  // the same.  'total_weight' is the total of the .second
  // members of the input 'histories', and 'total_weight_check' is the
  // total weight of 'non_unigram_probs' plus 'total_unigram_weight'.
  // Essentially this is a check that the distribution given
  // by the ARPA file (and as processed by us) sums to one for each
  // history state.  If this check fails, it could either be
  // a problem with this code, or an issue with the software that
  // created the ARPA file.
  if (fabs(total_weight - total_weight_check) >
      0.01 * total_weight) {
    static int32 num_times_warned = 0;
    if (num_times_warned < 10) {
      KALDI_WARN << "Total weight does not have expected value (problem in "
          "your ARPA file, or this code).  Won't warn >10 times.";
      num_times_warned++;
    }
  }
  KALDI_ASSERT(total_unigram_weight > 0.0);
  return total_unigram_weight;
}

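// Constructs a SamplingLm directly from a SamplingLmEstimator: each history
// state's counts are turned into probabilities by dividing by that state's
// total count, and its backoff count becomes the backoff probability in the
// same way.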
SamplingLm::SamplingLm(const SamplingLmEstimator &estimator):
    ArpaFileParser(ArpaParseOptions(), NULL),
    unigram_probs_(estimator.unigram_probs_),
    higher_order_probs_(estimator.history_states_.size() - 1) {
  for (int32 o = 2;
       o <= static_cast<int32>(estimator.history_states_.size()); o++) {
    higher_order_probs_[o-2].reserve(estimator.history_states_[o-1].size());
    unordered_map<std::vector<int32>, SamplingLmEstimator::HistoryState*,
                  VectorHasher<int32> >::const_iterator
        iter = estimator.history_states_[o-1].begin(),
        end =  estimator.history_states_[o-1].end();
    for (; iter != end; ++iter) {
      const std::vector<int32> &history = iter->first;
      const SamplingLmEstimator::HistoryState &src_state = *(iter->second);
      // the next statement adds a history state to the map.
      HistoryState &dest_state = higher_order_probs_[o-2][history];
      BaseFloat inv_total_count = BaseFloat(1.0) / src_state.total_count;
      dest_state.backoff_prob = src_state.backoff_count * inv_total_count;
      dest_state.words_and_probs.resize(src_state.counts.size());
      std::vector<SamplingLmEstimator::Count>::const_iterator
          src_iter = src_state.counts.begin(),
          src_end = src_state.counts.end();
      std::vector<std::pair<int32, BaseFloat> >::iterator
          dest_iter = dest_state.words_and_probs.begin();
      for (; src_iter != src_end; ++src_iter, ++dest_iter) {
        dest_iter->first = src_iter->word;
        dest_iter->second = inv_total_count * src_iter->count;
      }
    }
  }
}

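// Writes the model in Kaldi binary or text format: the order, the vocabulary
// size, the unigram probabilities (as a Vector), and then, for each order
// >= 2, the history states with their backoff probabilities and
// (word, prob) lists.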
void SamplingLm::Write(std::ostream &os, bool binary) const {
  WriteToken(os, binary, "<SamplingLm>");
  WriteToken(os, binary, "<Order>");
  int32 order = higher_order_probs_.size() + 1;
  WriteBasicType(os, binary, order);
  WriteToken(os, binary, "<VocabSize>");
  int32 vocab_size = unigram_probs_.size();
  WriteBasicType(os, binary, vocab_size);
  KALDI_ASSERT(!unigram_probs_.empty());
  // class Vector has Read and Write functions, so we use those.
  SubVector<BaseFloat> probs(const_cast<BaseFloat*>(&(unigram_probs_[0])),
                             static_cast<int32>(unigram_probs_.size()));
  probs.Write(os, binary);
  for (int32 o = 2; o <= order; o++) {
    WriteToken(os, binary, "<StatesOfOrder>");
    WriteBasicType(os, binary, o);
    WriteToken(os, binary, "<NumStates>");
    int32 num_states = higher_order_probs_[o-2].size();
    WriteBasicType(os, binary, num_states);

    unordered_map<std::vector<int32>, HistoryState,
                  VectorHasher<int32> >::const_iterator
        iter = higher_order_probs_[o-2].begin(),
        end = higher_order_probs_[o-2].end();
    for (; iter != end; ++iter) {
      const std::vector<int32> &history = iter->first;
      const HistoryState &state = iter->second;
      WriteIntegerVector(os, binary, history);
      WriteBasicType(os, binary, state.backoff_prob);
      int32 num_words = state.words_and_probs.size();
      WriteBasicType(os, binary, num_words);
      for (int32 i = 0; i < num_words; i++) {
        WriteBasicType(os, binary, state.words_and_probs[i].first);
        WriteBasicType(os, binary, state.words_and_probs[i].second);
      }
      if (!binary) os << std::endl;
    }
  }
  WriteToken(os, binary, "</SamplingLm>");
}


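// Reads a model previously written by Write(), reconstructing
// unigram_probs_ and higher_order_probs_ in the same layout.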
void SamplingLm::Read(std::istream &is, bool binary) {
  ExpectToken(is, binary, "<SamplingLm>");
  ExpectToken(is, binary, "<Order>");
  int32 order;
  ReadBasicType(is, binary, &order);
  KALDI_ASSERT(order >= 1 && order < 100);
  higher_order_probs_.resize(order - 1);
  ExpectToken(is, binary, "<VocabSize>");
  int32 vocab_size;
  ReadBasicType(is, binary, &vocab_size);
  unigram_probs_.resize(vocab_size);
  // class Vector has Read and Write functions, so we use those.
  SubVector<BaseFloat> probs(&(unigram_probs_[0]), vocab_size);
  probs.Read(is, binary);
  for (int32 o = 2; o <= order; o++) {
    ExpectToken(is, binary, "<StatesOfOrder>");
    int32 o2;
    ReadBasicType(is, binary, &o2);
    KALDI_ASSERT(o2 == o);
    int32 num_states;
    ExpectToken(is, binary, "<NumStates>");
    ReadBasicType(is, binary, &num_states);
    higher_order_probs_[o-2].reserve(num_states);
    for (int32 s = 0; s < num_states; s++) {
      std::vector<int32> history;
      ReadIntegerVector(is, binary, &history);
      HistoryState &state = higher_order_probs_[o-2][history];
      ReadBasicType(is, binary, &(state.backoff_prob));
      int32 num_words;
      ReadBasicType(is, binary, &num_words);
      KALDI_ASSERT(num_words >= 0);
      state.words_and_probs.resize(num_words);
      for (int32 i = 0; i < num_words; i++) {
        ReadBasicType(is, binary, &(state.words_and_probs[i].first));
        ReadBasicType(is, binary, &(state.words_and_probs[i].second));
      }
    }
  }
  ExpectToken(is, binary, "</SamplingLm>");
}

// TODO: delete if unused.
void SamplingLm::Swap(SamplingLm *other) {
  unigram_probs_.swap(other->unigram_probs_);
  higher_order_probs_.swap(other->higher_order_probs_);
}

}  // namespace rnnlm
}  // namespace kaldi