// lm/const-arpa-lm.cc

// Copyright 2014  Guoguo Chen

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY
// IMPLIED WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR
// PURPOSE, MERCHANTABILITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <limits>
#include <sstream>
#include <utility>

#include "base/kaldi-math.h"
#include "lm/arpa-file-parser.h"
#include "lm/const-arpa-lm.h"
#include "util/stl-utils.h"
#include "util/text-utils.h"

namespace kaldi {

// Auxiliary struct for converting a ConstArpaLm-format language model to Arpa
// format.
struct ArpaLine {
  std::vector<int32> words;  // Sequence of words to be printed.
  float logprob;             // Logprob corresponding to the word sequence.
  float backoff_logprob;     // Backoff logprob of the word sequence.

  // Comparison function for sorting.
  bool operator < (const ArpaLine &other) const {
    if (words.size() < other.words.size()) {
      return true;
    } else if (words.size() > other.words.size()) {
      return false;
    } else {
      return words < other.words;
    }
  }
};

// Auxiliary class to build ConstArpaLm. We first use this class to figure out
// the relative addresses of the different LmStates, and then put everything
// into one block in memory.
class LmState {
 public:
  union ChildType {
    // If the child is not of the final order, we keep a pointer to its
    // LmState.
    LmState* state;

    // If the child is of the final order, we only keep its log probability.
    float prob;
  };

  struct ChildrenVectorLessThan {
    bool operator()(
        const std::pair<int32, union ChildType>& lhs,
        const std::pair<int32, union ChildType>& rhs) const {
      return lhs.first < rhs.first;
    }
  };

  LmState(const bool is_unigram, const bool is_child_final_order,
          const float logprob, const float backoff_logprob) :
      is_unigram_(is_unigram), is_child_final_order_(is_child_final_order),
      logprob_(logprob), backoff_logprob_(backoff_logprob) {}

  void SetMyAddress(const int64 address) {
    my_address_ = address;
  }

  void AddChild(const int32 word, LmState* child_state) {
    KALDI_ASSERT(!is_child_final_order_);
    ChildType child;
    child.state = child_state;
    children_.push_back(std::make_pair(word, child));
  }

  void AddChild(const int32 word, const float child_prob) {
    KALDI_ASSERT(is_child_final_order_);
    ChildType child;
    child.prob = child_prob;
    children_.push_back(std::make_pair(word, child));
  }

  int64 MyAddress() const {
    return my_address_;
  }

  bool IsUnigram() const {
    return is_unigram_;
  }

  bool IsChildFinalOrder() const {
    return is_child_final_order_;
  }

  float Logprob() const {
    return logprob_;
  }

  float BackoffLogprob() const {
    return backoff_logprob_;
  }

  int32 NumChildren() const {
    return children_.size();
  }

  std::pair<int32, union ChildType> GetChild(const int32 index) {
    KALDI_ASSERT(index < children_.size());
    KALDI_ASSERT(index >= 0);
    return children_[index];
  }

  void SortChildren() {
    std::sort(children_.begin(), children_.end(), ChildrenVectorLessThan());
  }
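  // Illustrative note (not in the original source): in a trigram LM, the
  // LmState for history "A B" has is_child_final_order_ == true, so the
  // second AddChild() overload above stores only the float logprob of
  // "A B C"; the LmState for "A" has is_child_final_order_ == false and
  // stores LmState* pointers to states such as "A B".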
  // Checks if the current LmState is a leaf.
  bool IsLeaf() const {
    return (backoff_logprob_ == 0.0 && children_.empty());
  }

  // Computes the size of the memory that the current LmState would take in
  // the <lm_states> array. It's the number of 4-byte chunks.
  int32 MemSize() const {
    if (IsLeaf() && !is_unigram_) {
      // We don't create an entry in this case; the logprob will be stored in
      // the same int32 that we would normally store the pointer in.
      return 0;
    } else {
      // We store the following information:
      // logprob, backoff_logprob, children.size() and children data.
      return (3 + 2 * children_.size());
    }
  }

 private:
  // Unigram states will have LmStates even if they are leaves; therefore we
  // need to note whether this is a unigram or not.
  bool is_unigram_;

  // If the current LmState has an order of (final_order - 1), then its
  // children must be of the final order, and we only keep the log
  // probabilities for those children.
  bool is_child_final_order_;

  // Address of the current LmState, computed as an offset into the
  // <lm_states_> array. Note that this is just an offset, not an actual
  // pointer.
  int64 my_address_;

  // Language model log probability of the current sequence. For example, if
  // this state is "A B", then it would be the logprob of "A -> B".
  float logprob_;

  // Language model backoff log probability of the current sequence, e.g.,
  // state "A B -> X" backing off to "B -> X".
  float backoff_logprob_;

  // List of children.
  std::vector<std::pair<int32, union ChildType> > children_;
};
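// Illustrative example (not in the original source): an LmState with two
// children occupies MemSize() = 3 + 2 * 2 = 7 int32 slots in <lm_states_>:
//   [0] logprob   [1] backoff_logprob   [2] num_children = 2
//   [3] word_1    [4] child_info_1      [5] word_2   [6] child_info_2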
// Class to build ConstArpaLm from an Arpa-format language model. It relies on
// the auxiliary class LmState above.
class ConstArpaLmBuilder : public ArpaFileParser {
 public:
  explicit ConstArpaLmBuilder(ArpaParseOptions options)
      : ArpaFileParser(options, NULL) {
    ngram_order_ = 0;
    num_words_ = 0;
    overflow_buffer_size_ = 0;
    lm_states_size_ = 0;
    max_address_offset_ = pow(2, 30) - 1;
    is_built_ = false;
    lm_states_ = NULL;
    unigram_states_ = NULL;
    overflow_buffer_ = NULL;
  }

  ~ConstArpaLmBuilder() {
    unordered_map<std::vector<int32>,
                  LmState*, VectorHasher<int32> >::iterator iter;
    for (iter = seq_to_state_.begin(); iter != seq_to_state_.end(); ++iter) {
      delete iter->second;
    }
    if (is_built_) {
      delete[] lm_states_;
      delete[] unigram_states_;
      delete[] overflow_buffer_;
    }
  }

  // Writes ConstArpaLm.
  void Write(std::ostream &os, bool binary) const;

  void SetMaxAddressOffset(const int32 max_address_offset) {
    KALDI_WARN << "You are changing <max_address_offset_>; the default should "
               << "not be changed unless you are in testing mode.";
    max_address_offset_ = max_address_offset;
  }

 protected:
  // ArpaFileParser overrides.
  virtual void HeaderAvailable();
  virtual void ConsumeNGram(const NGram& ngram);
  virtual void ReadComplete();

 private:
  struct WordsAndLmStatePairLessThan {
    bool operator()(
        const std::pair<std::vector<int32>*, LmState*>& lhs,
        const std::pair<std::vector<int32>*, LmState*>& rhs) const {
      return *(lhs.first) < *(rhs.first);
    }
  };

 private:
  // Indicates whether the ConstArpaLm has been built or not.
  bool is_built_;

  // Maximum relative address for a child. We put it here just for testing.
  // The default value is 30 bits and should not be changed except for
  // testing.
  int32 max_address_offset_;

  // N-gram order of the language model. This can be figured out from the
  // "\data\" section of the Arpa-format language model.
  int32 ngram_order_;

  // Index of the largest word-id plus one. It defines the end of the
  // <unigram_states_> array.
  int32 num_words_;

  // Number of entries in the overflow buffer for pointers that couldn't be
  // represented as a 30-bit relative index.
  int32 overflow_buffer_size_;

  // Size of the <lm_states_> array, which will be needed by I/O.
  int64 lm_states_size_;

  // Memory block for storing LmStates.
  int32* lm_states_;

  // Memory block for storing pointers of unigram LmStates.
  int32** unigram_states_;

  // Memory block for storing pointers of the LmStates that have a large
  // relative address to their parents.
  int32** overflow_buffer_;

  // Hash table from word sequences to LmStates.
  unordered_map<std::vector<int32>,
                LmState*, VectorHasher<int32> > seq_to_state_;
};

void ConstArpaLmBuilder::HeaderAvailable() {
  ngram_order_ = NgramCounts().size();
}

void ConstArpaLmBuilder::ConsumeNGram(const NGram &ngram) {
  int32 cur_order = ngram.words.size();
  // If <ngram_order_> is larger than 1, then we do not create an LmState for
  // the final-order entries. We only keep the log probabilities for them.
  LmState *lm_state = NULL;
  if (cur_order != ngram_order_ || ngram_order_ == 1) {
    lm_state = new LmState(cur_order == 1,
                           cur_order == ngram_order_ - 1,
                           ngram.logprob, ngram.backoff);

    if (seq_to_state_.find(ngram.words) != seq_to_state_.end()) {
      std::ostringstream os;
      os << "[ ";
      for (size_t i = 0; i < ngram.words.size(); i++) {
        os << ngram.words[i] << " ";
      }
      os << "]";
      KALDI_ERR << "N-gram " << os.str() << " appears twice in the arpa file";
    }
    seq_to_state_[ngram.words] = lm_state;
  }

  // If the n-gram order is larger than 1, we have to add it as a possible
  // child to existing LmStates. We have the following two assumptions:
  // 1. N-grams are processed from small orders to larger ones, i.e., from
  //    1, 2, ... to the highest order.
  // 2. If an n-gram exists in the Arpa-format language model, then its
  //    "history" n-gram also exists. For example, if "A B C" is a valid
  //    n-gram, then "A B" is also a valid n-gram.
  int32 last_word = ngram.words[cur_order - 1];
  if (cur_order > 1) {
    std::vector<int32> hist(ngram.words.begin(), ngram.words.end() - 1);
    unordered_map<std::vector<int32>,
                  LmState*, VectorHasher<int32> >::iterator hist_iter;
    hist_iter = seq_to_state_.find(hist);
    if (hist_iter == seq_to_state_.end()) {
      std::ostringstream ss;
      for (int i = 0; i < cur_order; ++i)
        ss << (i == 0 ? '[' : ' ') << ngram.words[i];
      KALDI_ERR << "In line " << LineNumber() << ": "
                << cur_order << "-gram " << ss.str() << "] does not have "
                << "a parent model " << (cur_order - 1) << "-gram.";
    }
    if (cur_order != ngram_order_ || ngram_order_ == 1) {
      KALDI_ASSERT(lm_state != NULL);
      KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder());
      hist_iter->second->AddChild(last_word, lm_state);
    } else {
      KALDI_ASSERT(lm_state == NULL);
      KALDI_ASSERT(hist_iter->second->IsChildFinalOrder());
      hist_iter->second->AddChild(last_word, ngram.logprob);
    }
  } else {
    // Figures out <num_words_>.
    num_words_ = std::max(num_words_, last_word + 1);
  }
}
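// Illustrative note (not in the original source): the two assumptions in
// ConsumeNGram() above mean that for a hypothetical ARPA file containing the
// trigram "1 2 3", the unigram "1" and the bigram "1 2" must already have
// been consumed, so the history LmState is guaranteed to exist by the time
// the child is added.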
// ConstArpaLm can be built in the following steps, assuming we have already
// created the LmStates in <seq_to_state_>:
// 1. Sort the LmStates lexicographically.
//    This enables us to compute relative addresses. When we say
//    lexicographic, we treat the word-ids as letters. After sorting, the
//    LmStates are in the following order:
//    ...
//    A B
//    A B A
//    A B B
//    A B C
//    ...
//    where each line represents an LmState.
// 2. Update <my_address> in each LmState, which is relative to the first
//    element in <sorted_vec>.
// 3. Put the following structure into the memory block:
//    struct LmState {
//      float logprob;
//      float backoff_logprob;
//      int32 num_children;
//      std::pair<int32, int32> [] children;
//    }
//
//    At the same time, we will also create two special buffers:
//    <unigram_states_>
//    <overflow_buffer_>
void ConstArpaLmBuilder::ReadComplete() {
  // STEP 1: sorting the LmStates lexicographically.
  // Vector for holding the sorted LmStates.
  std::vector<std::pair<std::vector<int32>*, LmState*> > sorted_vec;
  unordered_map<std::vector<int32>,
                LmState*, VectorHasher<int32> >::iterator iter;
  for (iter = seq_to_state_.begin(); iter != seq_to_state_.end(); ++iter) {
    if (iter->second->MemSize() > 0) {
      sorted_vec.push_back(
          std::make_pair(const_cast<std::vector<int32>*>(&(iter->first)),
                         iter->second));
    }
  }
  std::sort(sorted_vec.begin(), sorted_vec.end(),
            WordsAndLmStatePairLessThan());

  // STEP 2: updating <my_address> in each LmState.
  for (int32 i = 0; i < sorted_vec.size(); ++i) {
    lm_states_size_ += sorted_vec[i].second->MemSize();
    if (i == 0) {
      sorted_vec[i].second->SetMyAddress(0);
    } else {
      sorted_vec[i].second->SetMyAddress(sorted_vec[i - 1].second->MyAddress()
          + sorted_vec[i - 1].second->MemSize());
    }
  }

  // STEP 3: creating a memory block to store the LmStates.
  // Reserves a memory block for the LmStates.
  int64 lm_states_index = 0;
  try {
    lm_states_ = new int32[lm_states_size_];
  } catch(const std::exception &e) {
    KALDI_ERR << e.what();
  }

  // Puts data into the memory block.
  unigram_states_ = new int32*[num_words_];
  std::vector<int32*> overflow_buffer_vec;
  for (int32 i = 0; i < num_words_; ++i) {
    unigram_states_[i] = NULL;
  }
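  // Worked example of the <child_info> encoding used in the loop below
  // (illustrative, not in the original source):
  //   - Leaf child: the float logprob's bit pattern with the last bit
  //     cleared, so the stored value is always even.
  //   - Non-leaf child at relative offset 5: 5 * 2 + 1 = 11 (odd, positive).
  //   - Non-leaf child stored in overflow buffer slot 3: -(3 * 2 + 1) = -7
  //     (odd, negative); DecodeChildInfo() recovers slot 3 as -(child_info / 2).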
  for (int32 i = 0; i < sorted_vec.size(); ++i) {
    // Current address.
    int32* parent_address = lm_states_ + lm_states_index;

    // Adds logprob.
    Int32AndFloat logprob_f(sorted_vec[i].second->Logprob());
    lm_states_[lm_states_index++] = logprob_f.i;

    // Adds backoff_logprob.
    Int32AndFloat backoff_logprob_f(sorted_vec[i].second->BackoffLogprob());
    lm_states_[lm_states_index++] = backoff_logprob_f.i;

    // Adds num_children.
    lm_states_[lm_states_index++] = sorted_vec[i].second->NumChildren();

    // Adds children; there are 3 cases:
    // 1. Child is a leaf and not a unigram
    // 2. Child is not a leaf, or is a unigram
    //    2.1 Relative address can be represented by 30 bits
    //    2.2 Relative address cannot be represented by 30 bits
    sorted_vec[i].second->SortChildren();
    for (int32 j = 0; j < sorted_vec[i].second->NumChildren(); ++j) {
      int32 child_info;
      if (sorted_vec[i].second->IsChildFinalOrder() ||
          sorted_vec[i].second->GetChild(j).second.state->MemSize() == 0) {
        // Child is a leaf and not a unigram. In this case we will not create
        // an entry in <lm_states_>; instead, we put the logprob in the place
        // where we normally store the pointer.
        Int32AndFloat child_logprob_f;
        if (sorted_vec[i].second->IsChildFinalOrder()) {
          child_logprob_f.f = sorted_vec[i].second->GetChild(j).second.prob;
        } else {
          child_logprob_f.f =
              sorted_vec[i].second->GetChild(j).second.state->Logprob();
        }
        child_info = child_logprob_f.i;
        child_info &= ~1;  // Sets the last bit to 0 so <child_info> is even.
      } else {
        // Child is not a leaf, or is a unigram.
        int64 offset =
            sorted_vec[i].second->GetChild(j).second.state->MyAddress()
            - sorted_vec[i].second->MyAddress();
        KALDI_ASSERT(offset > 0);
        if (offset <= max_address_offset_) {
          // Relative address can be represented by 30 bits.
          child_info = offset * 2;
          child_info |= 1;
        } else {
          // Relative address cannot be represented by 30 bits, so we have to
          // put the child address into <overflow_buffer_>.
          int32* abs_address = parent_address + offset;
          overflow_buffer_vec.push_back(abs_address);
          int32 overflow_buffer_index = overflow_buffer_vec.size() - 1;
          child_info = overflow_buffer_index * 2;
          child_info |= 1;
          child_info *= -1;
        }
      }
      // Child word.
      lm_states_[lm_states_index++] = sorted_vec[i].second->GetChild(j).first;
      // Child info.
      lm_states_[lm_states_index++] = child_info;
    }

    // If the current state corresponds to a unigram, then create a separate
    // lookup table to improve efficiency, since unigrams will be looked up
    // pretty frequently.
    if (sorted_vec[i].second->IsUnigram()) {
      KALDI_ASSERT(sorted_vec[i].first->size() == 1);
      unigram_states_[(*sorted_vec[i].first)[0]] = parent_address;
    }
  }
  KALDI_ASSERT(lm_states_size_ == lm_states_index);

  // Moves <overflow_buffer_> from the vector holder to an array.
  overflow_buffer_size_ = overflow_buffer_vec.size();
  overflow_buffer_ = new int32*[overflow_buffer_size_];
  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
    overflow_buffer_[i] = overflow_buffer_vec[i];
  }

  is_built_ = true;
}

void ConstArpaLmBuilder::Write(std::ostream &os, bool binary) const {
  if (!binary) {
    KALDI_ERR << "text-mode writing is not implemented for ConstArpaLmBuilder.";
  }
  KALDI_ASSERT(is_built_);

  // Creates the ConstArpaLm.
  ConstArpaLm const_arpa_lm(
      Options().bos_symbol, Options().eos_symbol, Options().unk_symbol,
      ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_,
      unigram_states_, overflow_buffer_, lm_states_);
  const_arpa_lm.Write(os, binary);
}

void ConstArpaLm::Write(std::ostream &os, bool binary) const {
  KALDI_ASSERT(initialized_);
  if (!binary) {
    KALDI_ERR << "text-mode writing is not implemented for ConstArpaLm.";
  }

  WriteToken(os, binary, "<ConstArpaLm>");

  // Misc info.
  WriteToken(os, binary, "<LmInfo>");
  WriteBasicType(os, binary, bos_symbol_);
  WriteBasicType(os, binary, eos_symbol_);
  WriteBasicType(os, binary, unk_symbol_);
  WriteBasicType(os, binary, ngram_order_);
  WriteToken(os, binary, "</LmInfo>");

  // LmStates section.
  WriteToken(os, binary, "<LmStates>");
  WriteBasicType(os, binary, lm_states_size_);
  os.write(reinterpret_cast<char *>(lm_states_),
           sizeof(int32) * lm_states_size_);
  if (!os.good()) {
    KALDI_ERR << "ConstArpaLm <LmStates> section writing failed.";
  }
  WriteToken(os, binary, "</LmStates>");

  // Unigram section. We write memory offsets to disk instead of the absolute
  // pointers.
  WriteToken(os, binary, "<LmUnigram>");
  WriteBasicType(os, binary, num_words_);
  int64* tmp_unigram_address = new int64[num_words_];
  for (int32 i = 0; i < num_words_; ++i) {
    // The relative address here is a little bit tricky:
    // 1. If the original address is NULL, then we set the relative address to
    //    zero.
    // 2. If the original address is not NULL, we set it to the following:
    //      unigram_states_[i] - lm_states_ + 1
    //    We add 1 to ensure that the above value is positive.
    tmp_unigram_address[i] = (unigram_states_[i] == NULL) ? 0 :
        unigram_states_[i] - lm_states_ + 1;
  }
  os.write(reinterpret_cast<char *>(tmp_unigram_address),
           sizeof(int64) * num_words_);
  if (!os.good()) {
    KALDI_ERR << "ConstArpaLm <LmUnigram> section writing failed.";
  }
  delete[] tmp_unigram_address;  // Releases the memory.
  tmp_unigram_address = NULL;
  WriteToken(os, binary, "</LmUnigram>");
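  // On-disk layout written by this function (illustrative summary, derived
  // from the calls above and below):
  //   <ConstArpaLm> <LmInfo> bos eos unk order </LmInfo>
  //   <LmStates> size int32[size] </LmStates>
  //   <LmUnigram> num_words int64[num_words] </LmUnigram>
  //   <LmOverflow> size int64[size] </LmOverflow> </ConstArpaLm>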
  // Overflow section. We write memory offsets to disk instead of the absolute
  // pointers.
  WriteToken(os, binary, "<LmOverflow>");
  WriteBasicType(os, binary, overflow_buffer_size_);
  int64* tmp_overflow_address = new int64[overflow_buffer_size_];
  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
    // The relative address here is a little bit tricky:
    // 1. If the original address is NULL, then we set the relative address to
    //    zero.
    // 2. If the original address is not NULL, we set it to the following:
    //      overflow_buffer_[i] - lm_states_ + 1
    //    We add 1 to ensure that the above value is positive.
    tmp_overflow_address[i] = (overflow_buffer_[i] == NULL) ? 0 :
        overflow_buffer_[i] - lm_states_ + 1;
  }
  os.write(reinterpret_cast<char *>(tmp_overflow_address),
           sizeof(int64) * overflow_buffer_size_);
  if (!os.good()) {
    KALDI_ERR << "ConstArpaLm <LmOverflow> section writing failed.";
  }
  delete[] tmp_overflow_address;
  tmp_overflow_address = NULL;
  WriteToken(os, binary, "</LmOverflow>");

  WriteToken(os, binary, "</ConstArpaLm>");
}

void ConstArpaLm::Read(std::istream &is, bool binary) {
  KALDI_ASSERT(!initialized_);
  if (!binary) {
    KALDI_ERR << "text-mode reading is not implemented for ConstArpaLm.";
  }
  int first_char = is.peek();
  if (first_char == 4) {
    // The old on-disk format starts with the length of an int32.
    ReadInternalOldFormat(is, binary);
  } else {
    // The new on-disk format starts with the token <ConstArpaLm>.
    ReadInternal(is, binary);
  }
}
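// Note on the format check in Read() above (illustrative; based on Kaldi's
// binary I/O convention): in binary mode, WriteBasicType() precedes each
// value with a one-byte size marker, so a file in the old format begins with
// the byte 0x04 (sizeof(int32)), while the new format begins with the '<' of
// the "<ConstArpaLm>" token.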
void ConstArpaLm::ReadInternal(std::istream &is, bool binary) {
  KALDI_ASSERT(!initialized_);
  if (!binary) {
    KALDI_ERR << "text-mode reading is not implemented for ConstArpaLm.";
  }
  ExpectToken(is, binary, "<ConstArpaLm>");

  // Misc info.
  ExpectToken(is, binary, "<LmInfo>");
  ReadBasicType(is, binary, &bos_symbol_);
  ReadBasicType(is, binary, &eos_symbol_);
  ReadBasicType(is, binary, &unk_symbol_);
  ReadBasicType(is, binary, &ngram_order_);
  ExpectToken(is, binary, "</LmInfo>");

  // LmStates section.
  ExpectToken(is, binary, "<LmStates>");
  ReadBasicType(is, binary, &lm_states_size_);
  lm_states_ = new int32[lm_states_size_];
  is.read(reinterpret_cast<char *>(lm_states_),
          sizeof(int32) * lm_states_size_);
  if (!is.good()) {
    KALDI_ERR << "ConstArpaLm <LmStates> section reading failed.";
  }
  ExpectToken(is, binary, "</LmStates>");

  // Unigram section. We wrote memory offsets to disk instead of the absolute
  // pointers.
  ExpectToken(is, binary, "<LmUnigram>");
  ReadBasicType(is, binary, &num_words_);
  unigram_states_ = new int32*[num_words_];
  int64* tmp_unigram_address = new int64[num_words_];
  is.read(reinterpret_cast<char *>(tmp_unigram_address),
          sizeof(int64) * num_words_);
  if (!is.good()) {
    KALDI_ERR << "ConstArpaLm <LmUnigram> section reading failed.";
  }
  for (int32 i = 0; i < num_words_; ++i) {
    // Check out how we compute the relative address in ConstArpaLm::Write().
    unigram_states_[i] = (tmp_unigram_address[i] == 0) ? NULL :
        lm_states_ + tmp_unigram_address[i] - 1;
  }
  delete[] tmp_unigram_address;
  tmp_unigram_address = NULL;
  ExpectToken(is, binary, "</LmUnigram>");

  // Overflow section. We wrote memory offsets to disk instead of the absolute
  // pointers.
  ExpectToken(is, binary, "<LmOverflow>");
  ReadBasicType(is, binary, &overflow_buffer_size_);
  overflow_buffer_ = new int32*[overflow_buffer_size_];
  int64* tmp_overflow_address = new int64[overflow_buffer_size_];
  is.read(reinterpret_cast<char *>(tmp_overflow_address),
          sizeof(int64) * overflow_buffer_size_);
  if (!is.good()) {
    KALDI_ERR << "ConstArpaLm <LmOverflow> section reading failed.";
  }
  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
    // Check out how we compute the relative address in ConstArpaLm::Write().
    overflow_buffer_[i] = (tmp_overflow_address[i] == 0) ? NULL :
        lm_states_ + tmp_overflow_address[i] - 1;
  }
  delete[] tmp_overflow_address;
  tmp_overflow_address = NULL;
  ExpectToken(is, binary, "</LmOverflow>");

  ExpectToken(is, binary, "</ConstArpaLm>");
  KALDI_ASSERT(ngram_order_ > 0);
  KALDI_ASSERT(bos_symbol_ < num_words_ && bos_symbol_ > 0);
  KALDI_ASSERT(eos_symbol_ < num_words_ && eos_symbol_ > 0);
  KALDI_ASSERT(unk_symbol_ < num_words_ &&
               (unk_symbol_ > 0 || unk_symbol_ == -1));
  lm_states_end_ = lm_states_ + lm_states_size_ - 1;
  memory_assigned_ = true;
  initialized_ = true;
}

void ConstArpaLm::ReadInternalOldFormat(std::istream &is, bool binary) {
  KALDI_ASSERT(!initialized_);
  if (!binary) {
    KALDI_ERR << "text-mode reading is not implemented for ConstArpaLm.";
  }
  // Misc info.
  ReadBasicType(is, binary, &bos_symbol_);
  ReadBasicType(is, binary, &eos_symbol_);
  ReadBasicType(is, binary, &unk_symbol_);
  ReadBasicType(is, binary, &ngram_order_);

  // LmStates section.
  // In the deprecated version, <lm_states_size_> used to be of type int32,
  // which was a bug. We therefore read an int32 here for back-compatibility.
  int32 lm_states_size_int32;
  ReadBasicType(is, binary, &lm_states_size_int32);
  lm_states_size_ = static_cast<int64>(lm_states_size_int32);
  lm_states_ = new int32[lm_states_size_];
  for (int64 i = 0; i < lm_states_size_; ++i) {
    ReadBasicType(is, binary, &lm_states_[i]);
  }

  // Unigram section. We wrote memory offsets to disk instead of the absolute
  // pointers.
  ReadBasicType(is, binary, &num_words_);
  unigram_states_ = new int32*[num_words_];
  for (int32 i = 0; i < num_words_; ++i) {
    int64 tmp_address;
    ReadBasicType(is, binary, &tmp_address);
    // Check out how we compute the relative address in ConstArpaLm::Write().
    unigram_states_[i] = (tmp_address == 0) ? NULL :
        lm_states_ + tmp_address - 1;
  }

  // Overflow section. We wrote memory offsets to disk instead of the absolute
  // pointers.
  ReadBasicType(is, binary, &overflow_buffer_size_);
  overflow_buffer_ = new int32*[overflow_buffer_size_];
  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
    int64 tmp_address;
    ReadBasicType(is, binary, &tmp_address);
    // Check out how we compute the relative address in ConstArpaLm::Write().
    overflow_buffer_[i] = (tmp_address == 0) ? NULL :
        lm_states_ + tmp_address - 1;
  }
  KALDI_ASSERT(ngram_order_ > 0);
  KALDI_ASSERT(bos_symbol_ < num_words_ && bos_symbol_ > 0);
  KALDI_ASSERT(eos_symbol_ < num_words_ && eos_symbol_ > 0);
  KALDI_ASSERT(unk_symbol_ < num_words_ &&
               (unk_symbol_ > 0 || unk_symbol_ == -1));
  lm_states_end_ = lm_states_ + lm_states_size_ - 1;
  memory_assigned_ = true;
  initialized_ = true;
}
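// Note (illustrative): the old format above reads <lm_states_> one element at
// a time with ReadBasicType(), i.e. with a per-value size marker in binary
// mode, while the new format in ReadInternal() block-reads the whole array
// with is.read(); this is the main practical difference between the readers.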
bool ConstArpaLm::HistoryStateExists(const std::vector<int32>& hist) const {
  // We do not create an LmState for the empty word sequence, but technically
  // it is the history state of all unigrams.
  if (hist.size() == 0) {
    return true;
  }

  // Tries to locate the LmState of the given word sequence.
  int32* lm_state = GetLmState(hist);
  if (lm_state == NULL) {
    // If <lm_state> does not exist, then <hist> has no children.
    return false;
  } else {
    // Note that we always create LmStates for unigrams, so even if <lm_state>
    // is not NULL, we still have to check whether it has children.
    KALDI_ASSERT(lm_state >= lm_states_);
    KALDI_ASSERT(lm_state + 2 <= lm_states_end_);

    // <lm_state + 2> points to <num_children>.
    if (*(lm_state + 2) > 0) {
      return true;
    } else {
      return false;
    }
  }
  return true;
}

float ConstArpaLm::GetNgramLogprob(const int32 word,
                                   const std::vector<int32>& hist) const {
  KALDI_ASSERT(initialized_);

  // If the history size plus one is larger than <ngram_order_>, remove the
  // oldest words.
  std::vector<int32> mapped_hist(hist);
  while (mapped_hist.size() >= ngram_order_) {
    mapped_hist.erase(mapped_hist.begin(), mapped_hist.begin() + 1);
  }
  KALDI_ASSERT(mapped_hist.size() + 1 <= ngram_order_);

  // TODO(guoguo): check with Dan if this is reasonable.
  // Maps possible out-of-vocabulary words to <unk>. If a word does not have a
  // corresponding LmState, we treat it as <unk>, provided <unk> is specified.
  int32 mapped_word = word;
  if (unk_symbol_ != -1) {
    KALDI_ASSERT(mapped_word >= 0);
    if (mapped_word >= num_words_ ||
        unigram_states_[mapped_word] == NULL) {
      mapped_word = unk_symbol_;
    }
    for (int32 i = 0; i < mapped_hist.size(); ++i) {
      KALDI_ASSERT(mapped_hist[i] >= 0);
      if (mapped_hist[i] >= num_words_ ||
          unigram_states_[mapped_hist[i]] == NULL) {
        mapped_hist[i] = unk_symbol_;
      }
    }
  }

  // Looks up the n-gram probability.
  return GetNgramLogprobRecurse(mapped_word, mapped_hist);
}

float ConstArpaLm::GetNgramLogprobRecurse(
    const int32 word, const std::vector<int32>& hist) const {
  KALDI_ASSERT(initialized_);
  KALDI_ASSERT(hist.size() + 1 <= ngram_order_);

  // Unigram case.
  if (hist.size() == 0) {
    if (word >= num_words_ || unigram_states_[word] == NULL) {
      // If <unk> is defined, then the word sequence would already have been
      // mapped to <unk> if necessary; this branch is for the case where <unk>
      // is not defined.
      return std::numeric_limits<float>::min();
    } else {
      Int32AndFloat logprob_i(*unigram_states_[word]);
      return logprob_i.f;
    }
  }

  // Higher n-gram orders.
  float logprob = 0.0;
  float backoff_logprob = 0.0;
  int32* state;
  if ((state = GetLmState(hist)) != NULL) {
    int32 child_info;
    int32* child_lm_state = NULL;
    if (GetChildInfo(word, state, &child_info)) {
      DecodeChildInfo(child_info, state, &child_lm_state, &logprob);
      return logprob;
    } else {
      Int32AndFloat backoff_logprob_i(*(state + 1));
      backoff_logprob = backoff_logprob_i.f;
    }
  }
  std::vector<int32> new_hist(hist);
  new_hist.erase(new_hist.begin(), new_hist.begin() + 1);
  return backoff_logprob + GetNgramLogprobRecurse(word, new_hist);
}

int32* ConstArpaLm::GetLmState(const std::vector<int32>& seq) const {
  KALDI_ASSERT(initialized_);

  // No LmState exists for the empty word sequence.
  if (seq.size() == 0) return NULL;

  // If <unk> is defined, then the word sequence would already have been
  // mapped to <unk> if necessary; this check is for the case where <unk> is
  // not defined.
  if (seq[0] >= num_words_ || unigram_states_[seq[0]] == NULL) return NULL;

  int32* parent = unigram_states_[seq[0]];
  int32 child_info;
  int32* child_lm_state = NULL;
  float logprob;
  for (int32 i = 1; i < seq.size(); ++i) {
    if (!GetChildInfo(seq[i], parent, &child_info)) {
      return NULL;
    }
    DecodeChildInfo(child_info, parent, &child_lm_state, &logprob);
    if (child_lm_state == NULL) {
      return NULL;
    } else {
      parent = child_lm_state;
    }
  }
  return parent;
}

bool ConstArpaLm::GetChildInfo(const int32 word,
                               int32* parent, int32* child_info) const {
  KALDI_ASSERT(initialized_);

  KALDI_ASSERT(parent != NULL);
  KALDI_ASSERT(parent >= lm_states_);
  KALDI_ASSERT(child_info != NULL);

  KALDI_ASSERT(parent + 2 <= lm_states_end_);
  int32 num_children = *(parent + 2);
  KALDI_ASSERT(parent + 2 + 2 * num_children <= lm_states_end_);

  if (num_children == 0) return false;
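  // Layout assumed by the binary search below (illustrative): the children of
  // <parent> are stored as (word, child_info) int32 pairs, sorted by word,
  // with the mid_index-th word at *(parent + 1 + 2 * mid_index) and its
  // child_info at *(parent + 2 + 2 * mid_index), for mid_index in
  // 1 ... num_children.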
  // A binary search into the children memory block.
  int32 start_index = 1;
  int32 end_index = num_children;
  while (start_index <= end_index) {
    int32 mid_index = round((start_index + end_index) / 2);
    int32 mid_word = *(parent + 1 + 2 * mid_index);
    if (mid_word == word) {
      *child_info = *(parent + 2 + 2 * mid_index);
      return true;
    } else if (mid_word < word) {
      start_index = mid_index + 1;
    } else {
      end_index = mid_index - 1;
    }
  }

  return false;
}

void ConstArpaLm::DecodeChildInfo(const int32 child_info,
                                  int32* parent,
                                  int32** child_lm_state,
                                  float* logprob) const {
  KALDI_ASSERT(initialized_);

  KALDI_ASSERT(logprob != NULL);
  if (child_info % 2 == 0) {
    // Child is a leaf; we only return the log probability.
    *child_lm_state = NULL;
    Int32AndFloat logprob_i(child_info);
    *logprob = logprob_i.f;
  } else {
    int32 child_offset = child_info / 2;
    if (child_offset > 0) {
      *child_lm_state = parent + child_offset;
      Int32AndFloat logprob_i(**child_lm_state);
      *logprob = logprob_i.f;
    } else {
      KALDI_ASSERT(-child_offset < overflow_buffer_size_);
      *child_lm_state = overflow_buffer_[-child_offset];
      Int32AndFloat logprob_i(**child_lm_state);
      *logprob = logprob_i.f;
    }
    KALDI_ASSERT(*child_lm_state >= lm_states_);
    KALDI_ASSERT(*child_lm_state <= lm_states_end_);
  }
}

void ConstArpaLm::WriteArpaRecurse(int32* lm_state,
                                   const std::vector<int32>& seq,
                                   std::vector<ArpaLine> *output) const {
  if (lm_state == NULL) return;

  KALDI_ASSERT(lm_state >= lm_states_);
  KALDI_ASSERT(lm_state + 2 <= lm_states_end_);

  // Inserts the current LmState into <output>.
  ArpaLine arpa_line;
  arpa_line.words = seq;
  Int32AndFloat logprob_i(*lm_state);
  arpa_line.logprob = logprob_i.f;
  Int32AndFloat backoff_logprob_i(*(lm_state + 1));
  arpa_line.backoff_logprob = backoff_logprob_i.f;
  output->push_back(arpa_line);

  // Scans for possible children, and recursively adds each child to <output>.
  int32 num_children = *(lm_state + 2);
  KALDI_ASSERT(lm_state + 2 + 2 * num_children <= lm_states_end_);
  for (int32 i = 0; i < num_children; ++i) {
    std::vector<int32> new_seq(seq);
    new_seq.push_back(*(lm_state + 3 + 2 * i));
    int32 child_info = *(lm_state + 4 + 2 * i);
    float logprob;
    int32* child_lm_state = NULL;
    DecodeChildInfo(child_info, lm_state, &child_lm_state, &logprob);

    if (child_lm_state == NULL) {
      // Leaf case.
      ArpaLine child_arpa_line;
      child_arpa_line.words = new_seq;
      child_arpa_line.logprob = logprob;
      child_arpa_line.backoff_logprob = 0.0;
      output->push_back(child_arpa_line);
    } else {
      WriteArpaRecurse(child_lm_state, new_seq, output);
    }
  }
}
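// Note (illustrative): WriteArpa() below prints integer word-ids rather than
// word strings, so mapping its output back to words requires the original
// symbol table.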
void ConstArpaLm::WriteArpa(std::ostream &os) const {
  KALDI_ASSERT(initialized_);
  std::vector<ArpaLine> tmp_output;
  for (int32 i = 0; i < num_words_; ++i) {
    if (unigram_states_[i] != NULL) {
      std::vector<int32> seq(1, i);
      WriteArpaRecurse(unigram_states_[i], seq, &tmp_output);
    }
  }

  // Sorts the ArpaLines and collects header information.
  std::sort(tmp_output.begin(), tmp_output.end());
  std::vector<int32> ngram_count(1, 0);
  for (int32 i = 0; i < tmp_output.size(); ++i) {
    if (tmp_output[i].words.size() >= ngram_count.size()) {
      ngram_count.resize(tmp_output[i].words.size() + 1);
      ngram_count[tmp_output[i].words.size()] = 1;
    } else {
      ngram_count[tmp_output[i].words.size()] += 1;
    }
  }

  // Writes the header.
  os << std::endl;
  os << "\\data\\" << std::endl;
  for (int32 i = 1; i < ngram_count.size(); ++i) {
    os << "ngram " << i << "=" << ngram_count[i] << std::endl;
  }

  // Writes the n-grams.
  int32 current_order = 0;
  for (int32 i = 0; i < tmp_output.size(); ++i) {
    // Beginning of an n-gram section.
    if (tmp_output[i].words.size() != current_order) {
      current_order = tmp_output[i].words.size();
      os << std::endl;
      os << "\\" << current_order << "-grams:" << std::endl;
    }

    // Writes the logprob.
    os << tmp_output[i].logprob << '\t';

    // Writes the word sequence.
    for (int32 j = 0; j < tmp_output[i].words.size(); ++j) {
      os << tmp_output[i].words[j];
      if (j != tmp_output[i].words.size() - 1) {
        os << " ";
      }
    }

    // Writes the backoff_logprob if it is not zero.
    if (tmp_output[i].backoff_logprob != 0.0) {
      os << '\t' << tmp_output[i].backoff_logprob;
    }
    os << std::endl;
  }
  os << std::endl << "\\end\\" << std::endl;
}

ConstArpaLmDeterministicFst::ConstArpaLmDeterministicFst(
    const ConstArpaLm& lm) : lm_(lm) {
  // Creates a history state for <s>.
  std::vector<Label> bos_state(1, lm_.BosSymbol());
  state_to_wseq_.push_back(bos_state);
  wseq_to_state_[bos_state] = 0;
  start_state_ = 0;
}

fst::StdArc::Weight ConstArpaLmDeterministicFst::Final(StateId s) {
  // At this point, we should have created the state.
  KALDI_ASSERT(static_cast<size_t>(s) < state_to_wseq_.size());
  const std::vector<Label>& wseq = state_to_wseq_[s];
  float logprob = lm_.GetNgramLogprob(lm_.EosSymbol(), wseq);
  return Weight(-logprob);
}

bool ConstArpaLmDeterministicFst::GetArc(StateId s,
                                         Label ilabel, fst::StdArc *oarc) {
  // At this point, we should have created the state.
  KALDI_ASSERT(static_cast<size_t>(s) < state_to_wseq_.size());
  std::vector<Label> wseq = state_to_wseq_[s];

  float logprob = lm_.GetNgramLogprob(ilabel, wseq);
  if (logprob == std::numeric_limits<float>::min()) {
    return false;
  }

  // Locates the next state in ConstArpaLm. Note that OOV words and backoff
  // have already been taken care of in ConstArpaLm.
  wseq.push_back(ilabel);
  while (wseq.size() >= lm_.NgramOrder()) {
    // A history state has at most lm_.NgramOrder() - 1 words.
    wseq.erase(wseq.begin(), wseq.begin() + 1);
  }
  while (!lm_.HistoryStateExists(wseq)) {
    KALDI_ASSERT(wseq.size() > 0);
    wseq.erase(wseq.begin(), wseq.begin() + 1);
  }

  std::pair<const std::vector<Label>, StateId> wseq_state_pair(
      wseq, static_cast<Label>(state_to_wseq_.size()));

  // Attempts to insert the current <wseq_state_pair>. If the pair already
  // exists, then the insert returns false.
  typedef MapType::iterator IterType;
  std::pair<IterType, bool> result = wseq_to_state_.insert(wseq_state_pair);

  // If the pair was just inserted, then also add it to <state_to_wseq_>.
  if (result.second == true)
    state_to_wseq_.push_back(wseq);

  // Creates the arc.
  oarc->ilabel = ilabel;
  oarc->olabel = ilabel;
  oarc->nextstate = result.first->second;
  oarc->weight = Weight(-logprob);

  return true;
}

bool BuildConstArpaLm(const ArpaParseOptions& options,
                      const std::string& arpa_rxfilename,
                      const std::string& const_arpa_wxfilename) {
  ConstArpaLmBuilder lm_builder(options);
  KALDI_LOG << "Reading " << arpa_rxfilename;
  Input ki(arpa_rxfilename);
  lm_builder.Read(ki.Stream());
  WriteKaldiObject(lm_builder, const_arpa_wxfilename, true);
  return true;
}

}  // namespace kaldi
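// Usage sketch (illustrative, not part of the original file; the symbol-id
// values and filenames below are hypothetical):
//
//   ArpaParseOptions options;
//   options.bos_symbol = 1;   // id of <s> in the symbol table.
//   options.eos_symbol = 2;   // id of </s>.
//   options.unk_symbol = 3;   // id of <unk>; use -1 if the LM has none.
//   kaldi::BuildConstArpaLm(options, "lm.arpa", "lm.const_arpa");
//
//   kaldi::ConstArpaLm lm;
//   ReadKaldiObject("lm.const_arpa", &lm);
//   std::vector<kaldi::int32> hist;   // empty history => unigram lookup.
//   float logprob = lm.GetNgramLogprob(options.eos_symbol, hist);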