// lm/const-arpa-lm.cc

// Copyright 2014  Guoguo Chen
// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//  http://www.apache.org/licenses/LICENSE-2.0
//
// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include <algorithm>
#include <limits>
#include <sstream>
#include <utility>

#include "base/kaldi-math.h"
#include "lm/arpa-file-parser.h"
#include "lm/const-arpa-lm.h"
#include "util/stl-utils.h"
#include "util/text-utils.h"

namespace kaldi {

// Auxiliary struct for converting a ConstArpaLm-format language model back to
// ARPA format.
struct ArpaLine {
  std::vector<int32> words;  // Sequence of words to be printed.
  float logprob;             // Logprob corresponding to the word sequence.
  float backoff_logprob;     // Backoff logprob corresponding to the sequence.

  // Comparison function for sorting.
  bool operator < (const ArpaLine &other) const {
    if (words.size() < other.words.size()) {
      return true;
    } else if (words.size() > other.words.size()) {
      return false;
    } else {
      return words < other.words;
    }
  }
};

// Auxiliary class used to build ConstArpaLm. We first use this class to figure
// out the relative addresses of the different LmStates, and then put
// everything into one block in memory.
class LmState {
 public:
  union ChildType {
    // If the child is not of the final order, we keep a pointer to its
    // LmState.
    LmState* state;

    // If the child is of the final order, we only keep its log probability.
    float prob;
  };

  struct ChildrenVectorLessThan {
    bool operator()(
        const std::pair<int32, union ChildType>& lhs,
        const std::pair<int32, union ChildType>& rhs) const {
      return lhs.first < rhs.first;
    }
  };

  LmState(const bool is_unigram, const bool is_child_final_order,
          const float logprob, const float backoff_logprob) :
      is_unigram_(is_unigram), is_child_final_order_(is_child_final_order),
      logprob_(logprob), backoff_logprob_(backoff_logprob) {}

  void SetMyAddress(const int64 address) {
    my_address_ = address;
  }

  void AddChild(const int32 word, LmState* child_state) {
    KALDI_ASSERT(!is_child_final_order_);
    ChildType child;
    child.state = child_state;
    children_.push_back(std::make_pair(word, child));
  }

  void AddChild(const int32 word, const float child_prob) {
    KALDI_ASSERT(is_child_final_order_);
    ChildType child;
    child.prob = child_prob;
    children_.push_back(std::make_pair(word, child));
  }

  int64 MyAddress() const {
    return my_address_;
  }

  bool IsUnigram() const {
    return is_unigram_;
  }

  bool IsChildFinalOrder() const {
    return is_child_final_order_;
  }

  float Logprob() const {
    return logprob_;
  }

  float BackoffLogprob() const {
    return backoff_logprob_;
  }

  int32 NumChildren() const {
    return children_.size();
  }

  std::pair<int32, union ChildType> GetChild(const int32 index) {
    KALDI_ASSERT(index < children_.size());
    KALDI_ASSERT(index >= 0);
    return children_[index];
  }

  void SortChildren() {
    std::sort(children_.begin(), children_.end(), ChildrenVectorLessThan());
  }

  // Checks if the current LmState is a leaf.
  bool IsLeaf() const {
    return (backoff_logprob_ == 0.0 && children_.empty());
  }

  // Computes the size of the memory that the current LmState would take in the
  // <lm_states_> array. It is the number of 4-byte chunks.
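  // As an illustrative sketch (not data from any particular model): a
  // non-leaf state with 2 children would take 3 + 2 * 2 = 7 int32 slots --
  // logprob, backoff_logprob, num_children, then one (word, child_info) pair
  // per child.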
  int32 MemSize() const {
    if (IsLeaf() && !is_unigram_) {
      // We don't create an entry in this case; the logprob will be stored in
      // the same int32 where we would normally store the pointer.
      return 0;
    } else {
      // We store the following information:
      // logprob, backoff_logprob, children.size() and children data.
      return (3 + 2 * children_.size());
    }
  }

 private:
  // Unigram states will have LmStates even if they are leaves, therefore we
  // need to note whether this is a unigram or not.
  bool is_unigram_;

  // If the current LmState has an order of (final_order - 1), then its child
  // must be of the final order. We only keep the log probability for such a
  // child.
  bool is_child_final_order_;

  // We compute the addresses of the LmStates as offsets into the <lm_states_>
  // pointer, and put the offsets here. Note that this is just an offset, not
  // an actual pointer.
  int64 my_address_;

  // Language model log probability of the current sequence. For example, if
  // this state is "A B", then it would be the logprob of "A -> B".
  float logprob_;

  // Language model backoff log probability of the current sequence, e.g.,
  // state "A B -> X" backing off to "B -> X".
  float backoff_logprob_;

  // List of children.
  std::vector<std::pair<int32, union ChildType> > children_;
};

// Class to build ConstArpaLm from an ARPA-format language model. It relies on
// the auxiliary class LmState above.
class ConstArpaLmBuilder : public ArpaFileParser {
 public:
  explicit ConstArpaLmBuilder(ArpaParseOptions options)
      : ArpaFileParser(options, NULL) {
    ngram_order_ = 0;
    num_words_ = 0;
    overflow_buffer_size_ = 0;
    lm_states_size_ = 0;
    max_address_offset_ = pow(2, 30) - 1;
    is_built_ = false;
    lm_states_ = NULL;
    unigram_states_ = NULL;
    overflow_buffer_ = NULL;
  }

  ~ConstArpaLmBuilder() {
    unordered_map<std::vector<int32>, LmState*,
                  VectorHasher<int32> >::iterator iter;
    for (iter = seq_to_state_.begin(); iter != seq_to_state_.end(); ++iter) {
      delete iter->second;
    }
    if (is_built_) {
      delete[] lm_states_;
      delete[] unigram_states_;
      delete[] overflow_buffer_;
    }
  }

  // Writes ConstArpaLm.
  void Write(std::ostream &os, bool binary) const;

  void SetMaxAddressOffset(const int32 max_address_offset) {
    KALDI_WARN << "You are changing <max_address_offset_>; the default should "
        << "not be changed unless you are in testing mode.";
    max_address_offset_ = max_address_offset;
  }

 protected:
  // ArpaFileParser overrides.
  virtual void HeaderAvailable();
  virtual void ConsumeNGram(const NGram& ngram);
  virtual void ReadComplete();

 private:
  struct WordsAndLmStatePairLessThan {
    bool operator()(
        const std::pair<std::vector<int32>*, LmState*>& lhs,
        const std::pair<std::vector<int32>*, LmState*>& rhs) const {
      return *(lhs.first) < *(rhs.first);
    }
  };

 private:
  // Indicates whether the ConstArpaLm has been built or not.
  bool is_built_;

  // Maximum relative address for a child. We put it here just for testing.
  // The default value is 30 bits and should not be changed except for testing.
  int32 max_address_offset_;

  // N-gram order of the language model. This can be figured out from the
  // "\data\" section of the ARPA-format language model.
  int32 ngram_order_;

  // Index of the largest word-id plus one. It defines the end of the
  // <unigram_states_> array.
  int32 num_words_;

  // Number of entries in the overflow buffer for pointers that could not be
  // represented as a 30-bit relative index.
  int32 overflow_buffer_size_;

  // Size of the <lm_states_> array, which will be needed by I/O.
  int64 lm_states_size_;

  // Memory block for storing LmStates.
  int32* lm_states_;

  // Memory block for storing pointers of unigram LmStates.
  int32** unigram_states_;

  // Memory block for storing pointers of the LmStates that have a large
  // relative address to their parents.
  int32** overflow_buffer_;

  // Hash table from word sequences to LmStates.
  unordered_map<std::vector<int32>, LmState*,
                VectorHasher<int32> > seq_to_state_;
};

void ConstArpaLmBuilder::HeaderAvailable() {
  ngram_order_ = NgramCounts().size();
}

void ConstArpaLmBuilder::ConsumeNGram(const NGram &ngram) {
  int32 cur_order = ngram.words.size();
  // If <ngram_order_> is larger than 1, then we do not create an LmState for
  // the final-order entry. We only keep the log probability for it.
  LmState *lm_state = NULL;
  if (cur_order != ngram_order_ || ngram_order_ == 1) {
    lm_state = new LmState(cur_order == 1,
                           cur_order == ngram_order_ - 1,
                           ngram.logprob, ngram.backoff);

    if (seq_to_state_.find(ngram.words) != seq_to_state_.end()) {
      std::ostringstream os;
      os << "[ ";
      for (size_t i = 0; i < ngram.words.size(); i++) {
        os << ngram.words[i] << " ";
      }
      os << "]";
      KALDI_ERR << "N-gram " << os.str() << " appears twice in the ARPA file";
    }
    seq_to_state_[ngram.words] = lm_state;
  }

  // If the n-gram order is larger than 1, we have to add the possible child to
  // the existing LmStates. We have the following two assumptions:
  // 1. N-grams are processed from smaller orders to larger ones, i.e., from
  //    1, 2, ... to the highest order.
  // 2. If an n-gram exists in the ARPA-format language model, then its
  //    "history" n-gram also exists. For example, if "A B C" is a valid
  //    n-gram, then "A B" is also a valid n-gram.
  int32 last_word = ngram.words[cur_order - 1];
  if (cur_order > 1) {
    std::vector<int32> hist(ngram.words.begin(), ngram.words.end() - 1);
    unordered_map<std::vector<int32>, LmState*,
                  VectorHasher<int32> >::iterator hist_iter;
    hist_iter = seq_to_state_.find(hist);
    if (hist_iter == seq_to_state_.end()) {
      std::ostringstream ss;
      for (int i = 0; i < cur_order; ++i)
        ss << (i == 0 ? '[' : ' ') << ngram.words[i];
      KALDI_ERR << "In line " << LineNumber() << ": "
                << cur_order << "-gram " << ss.str() << "] does not have "
                << "a parent model " << cur_order << "-gram.";
    }
    if (cur_order != ngram_order_ || ngram_order_ == 1) {
      KALDI_ASSERT(lm_state != NULL);
      KALDI_ASSERT(!hist_iter->second->IsChildFinalOrder());
      hist_iter->second->AddChild(last_word, lm_state);
    } else {
      KALDI_ASSERT(lm_state == NULL);
      KALDI_ASSERT(hist_iter->second->IsChildFinalOrder());
      hist_iter->second->AddChild(last_word, ngram.logprob);
    }
  } else {
    // Figures out <num_words_>.
    num_words_ = std::max(num_words_, last_word + 1);
  }
}

// ConstArpaLm can be built with the following steps, assuming we have already
// created the LmStates in <seq_to_state_>:
// 1. Sort LmStates lexicographically.
//    This enables us to compute relative addresses. When we say lexicographic,
//    we treat the word-ids as letters. After sorting, the LmStates are in the
//    following order:
//    ...
//    A B
//    A B A
//    A B B
//    A B C
//    ...
//    where each line represents an LmState.
// 2. Update <my_address_> in each LmState, which is relative to the first
//    element in <sorted_vec>.
// 3. Put the following structure into the memory block:
//    struct LmState {
//      float logprob;
//      float backoff_logprob;
//      int32 num_children;
//      std::pair<int32, int32> [] children;
//    }
//
//    At the same time, we will also create two special buffers:
//    <unigram_states_>
//    <overflow_buffer_>
void ConstArpaLmBuilder::ReadComplete() {
  // STEP 1: sorting LmStates lexicographically.
  // Vector for holding the sorted LmStates.
  std::vector<std::pair<std::vector<int32>*, LmState*> > sorted_vec;
  unordered_map<std::vector<int32>, LmState*,
                VectorHasher<int32> >::iterator iter;
  for (iter = seq_to_state_.begin(); iter != seq_to_state_.end(); ++iter) {
    if (iter->second->MemSize() > 0) {
      sorted_vec.push_back(
          std::make_pair(const_cast<std::vector<int32>*>(&(iter->first)),
                         iter->second));
    }
  }
  std::sort(sorted_vec.begin(), sorted_vec.end(),
            WordsAndLmStatePairLessThan());

  // STEP 2: updating <my_address_> in each LmState.
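  // As an illustrative sketch (not from a real model): if the first sorted
  // state occupies 5 int32 slots (one child) and the second occupies 7 (two
  // children), their addresses become 0 and 5, and the third state would
  // start at 12 -- each address is the previous address plus the previous
  // state's MemSize().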
  for (int32 i = 0; i < sorted_vec.size(); ++i) {
    lm_states_size_ += sorted_vec[i].second->MemSize();
    if (i == 0) {
      sorted_vec[i].second->SetMyAddress(0);
    } else {
      sorted_vec[i].second->SetMyAddress(sorted_vec[i - 1].second->MyAddress()
          + sorted_vec[i - 1].second->MemSize());
    }
  }

  // STEP 3: creating the memory block to store LmStates.
  // Reserves a memory block for LmStates.
  int64 lm_states_index = 0;
  try {
    lm_states_ = new int32[lm_states_size_];
  } catch(const std::exception &e) {
    KALDI_ERR << e.what();
  }

  // Puts data into the memory block.
  unigram_states_ = new int32*[num_words_];
  std::vector<int32*> overflow_buffer_vec;
  for (int32 i = 0; i < num_words_; ++i) {
    unigram_states_[i] = NULL;
  }
  for (int32 i = 0; i < sorted_vec.size(); ++i) {
    // Current address.
    int32* parent_address = lm_states_ + lm_states_index;

    // Adds logprob.
    Int32AndFloat logprob_f(sorted_vec[i].second->Logprob());
    lm_states_[lm_states_index++] = logprob_f.i;

    // Adds backoff_logprob.
    Int32AndFloat backoff_logprob_f(sorted_vec[i].second->BackoffLogprob());
    lm_states_[lm_states_index++] = backoff_logprob_f.i;

    // Adds num_children.
    lm_states_[lm_states_index++] = sorted_vec[i].second->NumChildren();

    // Adds children; there are 3 cases:
    // 1. Child is a leaf and not a unigram
    // 2. Child is not a leaf or is a unigram
    //    2.1 Relative address can be represented by 30 bits
    //    2.2 Relative address cannot be represented by 30 bits
    sorted_vec[i].second->SortChildren();
    for (int32 j = 0; j < sorted_vec[i].second->NumChildren(); ++j) {
      int32 child_info;
      if (sorted_vec[i].second->IsChildFinalOrder() ||
          sorted_vec[i].second->GetChild(j).second.state->MemSize() == 0) {
        // Child is a leaf and not a unigram. In this case we will not create
        // an entry in <lm_states_>; instead, we put the logprob in the place
        // where we would normally store the pointer.
        Int32AndFloat child_logprob_f;
        if (sorted_vec[i].second->IsChildFinalOrder()) {
          child_logprob_f.f = sorted_vec[i].second->GetChild(j).second.prob;
        } else {
          child_logprob_f.f =
              sorted_vec[i].second->GetChild(j).second.state->Logprob();
        }
        child_info = child_logprob_f.i;
        child_info &= ~1;  // Sets the last bit to 0 so <child_info> is even.
      } else {
        // Child is not a leaf or is a unigram.
        int64 offset =
            sorted_vec[i].second->GetChild(j).second.state->MyAddress()
            - sorted_vec[i].second->MyAddress();
        KALDI_ASSERT(offset > 0);
        if (offset <= max_address_offset_) {
          // Relative address can be represented by 30 bits.
          child_info = offset * 2;
          child_info |= 1;
        } else {
          // Relative address cannot be represented by 30 bits, we have to put
          // the child address into <overflow_buffer_>.
          int32* abs_address = parent_address + offset;
          overflow_buffer_vec.push_back(abs_address);
          int32 overflow_buffer_index = overflow_buffer_vec.size() - 1;
          child_info = overflow_buffer_index * 2;
          child_info |= 1;
          child_info *= -1;
        }
      }
      // Child word.
      lm_states_[lm_states_index++] = sorted_vec[i].second->GetChild(j).first;
      // Child info.
      lm_states_[lm_states_index++] = child_info;
    }

    // If the current state corresponds to a unigram, then create a separate
    // lookup table to improve efficiency, since those will be looked up pretty
    // frequently.
    if (sorted_vec[i].second->IsUnigram()) {
      KALDI_ASSERT(sorted_vec[i].first->size() == 1);
      unigram_states_[(*sorted_vec[i].first)[0]] = parent_address;
    }
  }
  KALDI_ASSERT(lm_states_size_ == lm_states_index);

  // Moves <overflow_buffer_vec> from vector holder to array.
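  // Note that the overflow buffer holds absolute pointers into <lm_states_>;
  // when the model is serialized, ConstArpaLm::Write() below converts them
  // back to offsets relative to <lm_states_>.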
  overflow_buffer_size_ = overflow_buffer_vec.size();
  overflow_buffer_ = new int32*[overflow_buffer_size_];
  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
    overflow_buffer_[i] = overflow_buffer_vec[i];
  }
  is_built_ = true;
}

void ConstArpaLmBuilder::Write(std::ostream &os, bool binary) const {
  if (!binary) {
    KALDI_ERR << "text-mode writing is not implemented for ConstArpaLmBuilder.";
  }
  KALDI_ASSERT(is_built_);

  // Creates ConstArpaLm.
  ConstArpaLm const_arpa_lm(
      Options().bos_symbol, Options().eos_symbol, Options().unk_symbol,
      ngram_order_, num_words_, overflow_buffer_size_, lm_states_size_,
      unigram_states_, overflow_buffer_, lm_states_);
  const_arpa_lm.Write(os, binary);
}

void ConstArpaLm::Write(std::ostream &os, bool binary) const {
  KALDI_ASSERT(initialized_);
  if (!binary) {
    KALDI_ERR << "text-mode writing is not implemented for ConstArpaLm.";
  }

  WriteToken(os, binary, "<ConstArpaLm>");

  // Misc info.
  WriteToken(os, binary, "<LmInfo>");
  WriteBasicType(os, binary, bos_symbol_);
  WriteBasicType(os, binary, eos_symbol_);
  WriteBasicType(os, binary, unk_symbol_);
  WriteBasicType(os, binary, ngram_order_);
  WriteToken(os, binary, "</LmInfo>");

  // LmStates section.
  WriteToken(os, binary, "<LmStates>");
  WriteBasicType(os, binary, lm_states_size_);
  os.write(reinterpret_cast<char *>(lm_states_),
           sizeof(int32) * lm_states_size_);
  if (!os.good()) {
    KALDI_ERR << "ConstArpaLm <LmStates> section writing failed.";
  }
  WriteToken(os, binary, "</LmStates>");

  // Unigram section. We write memory offsets to disk instead of the absolute
  // pointers.
  WriteToken(os, binary, "<LmUnigram>");
  WriteBasicType(os, binary, num_words_);
  int64* tmp_unigram_address = new int64[num_words_];
  for (int32 i = 0; i < num_words_; ++i) {
    // The relative address here is a little bit tricky:
    // 1. If the original address is NULL, then we set the relative address to
    //    zero.
    // 2. If the original address is not NULL, we set it to the following:
    //    unigram_states_[i] - lm_states_ + 1
    //    We add 1 to ensure that the above value is positive.
    tmp_unigram_address[i] = (unigram_states_[i] == NULL) ? 0 :
        unigram_states_[i] - lm_states_ + 1;
  }
  os.write(reinterpret_cast<char *>(tmp_unigram_address),
           sizeof(int64) * num_words_);
  if (!os.good()) {
    KALDI_ERR << "ConstArpaLm <LmUnigram> section writing failed.";
  }
  delete[] tmp_unigram_address;  // Releases the memory.
  tmp_unigram_address = NULL;
  WriteToken(os, binary, "</LmUnigram>");

  // Overflow section. We write memory offsets to disk instead of the absolute
  // pointers.
  WriteToken(os, binary, "<LmOverflow>");
  WriteBasicType(os, binary, overflow_buffer_size_);
  int64* tmp_overflow_address = new int64[overflow_buffer_size_];
  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
    // The relative address here is a little bit tricky:
    // 1. If the original address is NULL, then we set the relative address to
    //    zero.
    // 2. If the original address is not NULL, we set it to the following:
    //    overflow_buffer_[i] - lm_states_ + 1
    //    We add 1 to ensure that the above value is positive.
    tmp_overflow_address[i] = (overflow_buffer_[i] == NULL) ? 0 :
        overflow_buffer_[i] - lm_states_ + 1;
  }
  os.write(reinterpret_cast<char *>(tmp_overflow_address),
           sizeof(int64) * overflow_buffer_size_);
  if (!os.good()) {
    KALDI_ERR << "ConstArpaLm <LmOverflow> section writing failed.";
  }
  delete[] tmp_overflow_address;
  tmp_overflow_address = NULL;
  WriteToken(os, binary, "</LmOverflow>");
  WriteToken(os, binary, "</ConstArpaLm>");
}

void ConstArpaLm::Read(std::istream &is, bool binary) {
  KALDI_ASSERT(!initialized_);
  if (!binary) {
    KALDI_ERR << "text-mode reading is not implemented for ConstArpaLm.";
  }

  int first_char = is.peek();
  if (first_char == 4) {  // Old on-disk format starts with the length of int32.
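    // (The old format begins with a binary int32 written by WriteBasicType(),
    // which Kaldi precedes with a single size byte equal to sizeof(int32) == 4;
    // the new format instead starts with the token "<ConstArpaLm>", i.e. '<'.)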
    ReadInternalOldFormat(is, binary);
  } else {
    // New on-disk format starts with the token <ConstArpaLm>.
    ReadInternal(is, binary);
  }
}

void ConstArpaLm::ReadInternal(std::istream &is, bool binary) {
  KALDI_ASSERT(!initialized_);
  if (!binary) {
    KALDI_ERR << "text-mode reading is not implemented for ConstArpaLm.";
  }

  ExpectToken(is, binary, "<ConstArpaLm>");

  // Misc info.
  ExpectToken(is, binary, "<LmInfo>");
  ReadBasicType(is, binary, &bos_symbol_);
  ReadBasicType(is, binary, &eos_symbol_);
  ReadBasicType(is, binary, &unk_symbol_);
  ReadBasicType(is, binary, &ngram_order_);
  ExpectToken(is, binary, "</LmInfo>");

  // LmStates section.
  ExpectToken(is, binary, "<LmStates>");
  ReadBasicType(is, binary, &lm_states_size_);
  lm_states_ = new int32[lm_states_size_];
  is.read(reinterpret_cast<char *>(lm_states_),
          sizeof(int32) * lm_states_size_);
  if (!is.good()) {
    KALDI_ERR << "ConstArpaLm <LmStates> section reading failed.";
  }
  ExpectToken(is, binary, "</LmStates>");

  // Unigram section. Memory offsets were written to disk instead of the
  // absolute pointers; convert them back here.
  ExpectToken(is, binary, "<LmUnigram>");
  ReadBasicType(is, binary, &num_words_);
  unigram_states_ = new int32*[num_words_];
  int64* tmp_unigram_address = new int64[num_words_];
  is.read(reinterpret_cast<char *>(tmp_unigram_address),
          sizeof(int64) * num_words_);
  if (!is.good()) {
    KALDI_ERR << "ConstArpaLm <LmUnigram> section reading failed.";
  }
  for (int32 i = 0; i < num_words_; ++i) {
    // Check out how we compute the relative address in ConstArpaLm::Write().
    unigram_states_[i] = (tmp_unigram_address[i] == 0) ? NULL
        : lm_states_ + tmp_unigram_address[i] - 1;
  }
  delete[] tmp_unigram_address;
  tmp_unigram_address = NULL;
  ExpectToken(is, binary, "</LmUnigram>");

  // Overflow section. Memory offsets were written to disk instead of the
  // absolute pointers; convert them back here.
  ExpectToken(is, binary, "<LmOverflow>");
  ReadBasicType(is, binary, &overflow_buffer_size_);
  overflow_buffer_ = new int32*[overflow_buffer_size_];
  int64* tmp_overflow_address = new int64[overflow_buffer_size_];
  is.read(reinterpret_cast<char *>(tmp_overflow_address),
          sizeof(int64) * overflow_buffer_size_);
  if (!is.good()) {
    KALDI_ERR << "ConstArpaLm <LmOverflow> section reading failed.";
  }
  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
    // Check out how we compute the relative address in ConstArpaLm::Write().
    overflow_buffer_[i] = (tmp_overflow_address[i] == 0) ? NULL
        : lm_states_ + tmp_overflow_address[i] - 1;
  }
  delete[] tmp_overflow_address;
  tmp_overflow_address = NULL;
  ExpectToken(is, binary, "</LmOverflow>");
  ExpectToken(is, binary, "</ConstArpaLm>");

  KALDI_ASSERT(ngram_order_ > 0);
  KALDI_ASSERT(bos_symbol_ < num_words_ && bos_symbol_ > 0);
  KALDI_ASSERT(eos_symbol_ < num_words_ && eos_symbol_ > 0);
  KALDI_ASSERT(unk_symbol_ < num_words_ &&
               (unk_symbol_ > 0 || unk_symbol_ == -1));
  lm_states_end_ = lm_states_ + lm_states_size_ - 1;
  memory_assigned_ = true;
  initialized_ = true;
}

void ConstArpaLm::ReadInternalOldFormat(std::istream &is, bool binary) {
  KALDI_ASSERT(!initialized_);
  if (!binary) {
    KALDI_ERR << "text-mode reading is not implemented for ConstArpaLm.";
  }

  // Misc info.
  ReadBasicType(is, binary, &bos_symbol_);
  ReadBasicType(is, binary, &eos_symbol_);
  ReadBasicType(is, binary, &unk_symbol_);
  ReadBasicType(is, binary, &ngram_order_);

  // LmStates section.
  // In the deprecated version, <lm_states_size_> used to be of type int32,
  // which was a bug. We therefore read an int32 here for backward
  // compatibility.
  int32 lm_states_size_int32;
  ReadBasicType(is, binary, &lm_states_size_int32);
  lm_states_size_ = static_cast<int64>(lm_states_size_int32);
  lm_states_ = new int32[lm_states_size_];
  for (int64 i = 0; i < lm_states_size_; ++i) {
    ReadBasicType(is, binary, &lm_states_[i]);
  }

  // Unigram section.
  // Memory offsets were written to disk instead of the absolute pointers;
  // convert them back here.
  ReadBasicType(is, binary, &num_words_);
  unigram_states_ = new int32*[num_words_];
  for (int32 i = 0; i < num_words_; ++i) {
    int64 tmp_address;
    ReadBasicType(is, binary, &tmp_address);
    // Check out how we compute the relative address in ConstArpaLm::Write().
    unigram_states_[i] = (tmp_address == 0) ? NULL
        : lm_states_ + tmp_address - 1;
  }

  // Overflow section. Memory offsets were written to disk instead of the
  // absolute pointers; convert them back here.
  ReadBasicType(is, binary, &overflow_buffer_size_);
  overflow_buffer_ = new int32*[overflow_buffer_size_];
  for (int32 i = 0; i < overflow_buffer_size_; ++i) {
    int64 tmp_address;
    ReadBasicType(is, binary, &tmp_address);
    // Check out how we compute the relative address in ConstArpaLm::Write().
    overflow_buffer_[i] = (tmp_address == 0) ? NULL
        : lm_states_ + tmp_address - 1;
  }
  KALDI_ASSERT(ngram_order_ > 0);
  KALDI_ASSERT(bos_symbol_ < num_words_ && bos_symbol_ > 0);
  KALDI_ASSERT(eos_symbol_ < num_words_ && eos_symbol_ > 0);
  KALDI_ASSERT(unk_symbol_ < num_words_ &&
               (unk_symbol_ > 0 || unk_symbol_ == -1));
  lm_states_end_ = lm_states_ + lm_states_size_ - 1;
  memory_assigned_ = true;
  initialized_ = true;
}

bool ConstArpaLm::HistoryStateExists(const std::vector<int32>& hist) const {
  // We do not create an LmState for the empty word sequence, but technically
  // it is the history state of all unigrams.
  if (hist.size() == 0) {
    return true;
  }

  // Tries to locate the LmState of the given word sequence.
  int32* lm_state = GetLmState(hist);
  if (lm_state == NULL) {
    // <lm_state> not existing means <hist> has no child.
    return false;
  } else {
    // Note that we always create LmStates for unigrams, so even if <lm_state>
    // is not NULL, we still have to check if it has children.
    KALDI_ASSERT(lm_state >= lm_states_);
    KALDI_ASSERT(lm_state + 2 <= lm_states_end_);

    // <lm_state + 2> points to <num_children>.
    if (*(lm_state + 2) > 0) {
      return true;
    } else {
      return false;
    }
  }
  return true;
}

float ConstArpaLm::GetNgramLogprob(const int32 word,
                                   const std::vector<int32>& hist) const {
  KALDI_ASSERT(initialized_);

  // If the history size plus one is larger than <ngram_order_>, remove the
  // oldest words.
  std::vector<int32> mapped_hist(hist);
  while (mapped_hist.size() >= ngram_order_) {
    mapped_hist.erase(mapped_hist.begin(), mapped_hist.begin() + 1);
  }
  KALDI_ASSERT(mapped_hist.size() + 1 <= ngram_order_);

  // TODO(guoguo): check with Dan if this is reasonable.
  // Maps possible out-of-vocabulary words to <unk>. If a word does not have a
  // corresponding LmState, we treat it as <unk>. We map it to <unk_symbol_> if
  // <unk_symbol_> is specified.
  int32 mapped_word = word;
  if (unk_symbol_ != -1) {
    KALDI_ASSERT(mapped_word >= 0);
    if (mapped_word >= num_words_ ||
        unigram_states_[mapped_word] == NULL) {
      mapped_word = unk_symbol_;
    }
    for (int32 i = 0; i < mapped_hist.size(); ++i) {
      KALDI_ASSERT(mapped_hist[i] >= 0);
      if (mapped_hist[i] >= num_words_ ||
          unigram_states_[mapped_hist[i]] == NULL) {
        mapped_hist[i] = unk_symbol_;
      }
    }
  }

  // Looks up the n-gram probability.
  return GetNgramLogprobRecurse(mapped_word, mapped_hist);
}

float ConstArpaLm::GetNgramLogprobRecurse(
    const int32 word, const std::vector<int32>& hist) const {
  KALDI_ASSERT(initialized_);
  KALDI_ASSERT(hist.size() + 1 <= ngram_order_);

  // Unigram case.
  if (hist.size() == 0) {
    if (word >= num_words_ || unigram_states_[word] == NULL) {
      // If <unk_symbol_> is defined, then the word sequence should already
      // have been mapped to <unk_symbol_> if necessary; this is for the case
      // where <unk_symbol_> is not defined.
      return std::numeric_limits<float>::min();
    } else {
      Int32AndFloat logprob_i(*unigram_states_[word]);
      return logprob_i.f;
    }
  }

  // High n-gram orders.
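  // If the history state exists and has an entry for <word>, return that
  // logprob directly; otherwise back off: take the history state's backoff
  // weight (0.0 if the state does not exist) and recurse with the oldest
  // history word dropped, i.e. roughly
  //   P(word | h) = backoff(h) + P(word | h with its first word removed).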
  float logprob = 0.0;
  float backoff_logprob = 0.0;
  int32* state;
  if ((state = GetLmState(hist)) != NULL) {
    int32 child_info;
    int32* child_lm_state = NULL;
    if (GetChildInfo(word, state, &child_info)) {
      DecodeChildInfo(child_info, state, &child_lm_state, &logprob);
      return logprob;
    } else {
      Int32AndFloat backoff_logprob_i(*(state + 1));
      backoff_logprob = backoff_logprob_i.f;
    }
  }
  std::vector<int32> new_hist(hist);
  new_hist.erase(new_hist.begin(), new_hist.begin() + 1);
  return backoff_logprob + GetNgramLogprobRecurse(word, new_hist);
}

int32* ConstArpaLm::GetLmState(const std::vector<int32>& seq) const {
  KALDI_ASSERT(initialized_);

  // No LmState exists for the empty word sequence.
  if (seq.size() == 0) return NULL;

  // If <unk_symbol_> is defined, then the word sequence should already have
  // been mapped to <unk_symbol_> if necessary; this is for the case where
  // <unk_symbol_> is not defined.
  if (seq[0] >= num_words_ || unigram_states_[seq[0]] == NULL) return NULL;

  int32* parent = unigram_states_[seq[0]];
  int32 child_info;
  int32* child_lm_state = NULL;
  float logprob;
  for (int32 i = 1; i < seq.size(); ++i) {
    if (!GetChildInfo(seq[i], parent, &child_info)) {
      return NULL;
    }
    DecodeChildInfo(child_info, parent, &child_lm_state, &logprob);
    if (child_lm_state == NULL) {
      return NULL;
    } else {
      parent = child_lm_state;
    }
  }
  return parent;
}

bool ConstArpaLm::GetChildInfo(const int32 word,
                               int32* parent, int32* child_info) const {
  KALDI_ASSERT(initialized_);

  KALDI_ASSERT(parent != NULL);
  KALDI_ASSERT(parent >= lm_states_);
  KALDI_ASSERT(child_info != NULL);

  KALDI_ASSERT(parent + 2 <= lm_states_end_);
  int32 num_children = *(parent + 2);
  KALDI_ASSERT(parent + 2 + 2 * num_children <= lm_states_end_);

  if (num_children == 0) return false;

  // A binary search into the children memory block.
  int32 start_index = 1;
  int32 end_index = num_children;
  while (start_index <= end_index) {
    int32 mid_index = round((start_index + end_index) / 2);
    int32 mid_word = *(parent + 1 + 2 * mid_index);
    if (mid_word == word) {
      *child_info = *(parent + 2 + 2 * mid_index);
      return true;
    } else if (mid_word < word) {
      start_index = mid_index + 1;
    } else {
      end_index = mid_index - 1;
    }
  }

  return false;
}

void ConstArpaLm::DecodeChildInfo(const int32 child_info,
                                  int32* parent,
                                  int32** child_lm_state,
                                  float* logprob) const {
  KALDI_ASSERT(initialized_);

  KALDI_ASSERT(logprob != NULL);
  if (child_info % 2 == 0) {
    // Child is a leaf, only returns the log probability.
    *child_lm_state = NULL;
    Int32AndFloat logprob_i(child_info);
    *logprob = logprob_i.f;
  } else {
    int32 child_offset = child_info / 2;
    if (child_offset > 0) {
      *child_lm_state = parent + child_offset;
      Int32AndFloat logprob_i(**child_lm_state);
      *logprob = logprob_i.f;
    } else {
      KALDI_ASSERT(-child_offset < overflow_buffer_size_);
      *child_lm_state = overflow_buffer_[-child_offset];
      Int32AndFloat logprob_i(**child_lm_state);
      *logprob = logprob_i.f;
    }
    KALDI_ASSERT(*child_lm_state >= lm_states_);
    KALDI_ASSERT(*child_lm_state <= lm_states_end_);
  }
}

void ConstArpaLm::WriteArpaRecurse(int32* lm_state,
                                   const std::vector<int32>& seq,
                                   std::vector<ArpaLine> *output) const {
  if (lm_state == NULL) return;

  KALDI_ASSERT(lm_state >= lm_states_);
  KALDI_ASSERT(lm_state + 2 <= lm_states_end_);

  // Inserts the current LmState into <output>.
  ArpaLine arpa_line;
  arpa_line.words = seq;
  Int32AndFloat logprob_i(*lm_state);
  arpa_line.logprob = logprob_i.f;
  Int32AndFloat backoff_logprob_i(*(lm_state + 1));
  arpa_line.backoff_logprob = backoff_logprob_i.f;
  output->push_back(arpa_line);

  // Scans for possible children, and recursively adds each child to <output>.
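  // Each child occupies two int32 slots starting at <lm_state + 3>: first the
  // child word, then its encoded <child_info> (see DecodeChildInfo() above).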
  int32 num_children = *(lm_state + 2);
  KALDI_ASSERT(lm_state + 2 + 2 * num_children <= lm_states_end_);
  for (int32 i = 0; i < num_children; ++i) {
    std::vector<int32> new_seq(seq);
    new_seq.push_back(*(lm_state + 3 + 2 * i));
    int32 child_info = *(lm_state + 4 + 2 * i);
    float logprob;
    int32* child_lm_state = NULL;
    DecodeChildInfo(child_info, lm_state, &child_lm_state, &logprob);

    if (child_lm_state == NULL) {
      // Leaf case.
      ArpaLine child_arpa_line;
      child_arpa_line.words = new_seq;
      child_arpa_line.logprob = logprob;
      child_arpa_line.backoff_logprob = 0.0;
      output->push_back(child_arpa_line);
    } else {
      WriteArpaRecurse(child_lm_state, new_seq, output);
    }
  }
}

void ConstArpaLm::WriteArpa(std::ostream &os) const {
  KALDI_ASSERT(initialized_);

  std::vector<ArpaLine> tmp_output;
  for (int32 i = 0; i < num_words_; ++i) {
    if (unigram_states_[i] != NULL) {
      std::vector<int32> seq(1, i);
      WriteArpaRecurse(unigram_states_[i], seq, &tmp_output);
    }
  }

  // Sorts ArpaLines and collects header information.
  std::sort(tmp_output.begin(), tmp_output.end());
  std::vector<int32> ngram_count(1, 0);
  for (int32 i = 0; i < tmp_output.size(); ++i) {
    if (tmp_output[i].words.size() >= ngram_count.size()) {
      ngram_count.resize(tmp_output[i].words.size() + 1);
      ngram_count[tmp_output[i].words.size()] = 1;
    } else {
      ngram_count[tmp_output[i].words.size()] += 1;
    }
  }

  // Writes the header.
  os << std::endl;
  os << "\\data\\" << std::endl;
  for (int32 i = 1; i < ngram_count.size(); ++i) {
    os << "ngram " << i << "=" << ngram_count[i] << std::endl;
  }

  // Writes the n-grams.
  int32 current_order = 0;
  for (int32 i = 0; i < tmp_output.size(); ++i) {
    // Beginning of an n-gram section.
    if (tmp_output[i].words.size() != current_order) {
      current_order = tmp_output[i].words.size();
      os << std::endl;
      os << "\\" << current_order << "-grams:" << std::endl;
    }

    // Writes logprob.
    os << tmp_output[i].logprob << '\t';

    // Writes the word sequence.
    for (int32 j = 0; j < tmp_output[i].words.size(); ++j) {
      os << tmp_output[i].words[j];
      if (j != tmp_output[i].words.size() - 1) {
        os << " ";
      }
    }

    // Writes backoff_logprob if it is not zero.
    if (tmp_output[i].backoff_logprob != 0.0) {
      os << '\t' << tmp_output[i].backoff_logprob;
    }
    os << std::endl;
  }
  os << std::endl << "\\end\\" << std::endl;
}

ConstArpaLmDeterministicFst::ConstArpaLmDeterministicFst(
    const ConstArpaLm& lm) : lm_(lm) {
  // Creates a history state for <s>.
  std::vector