Blame view
src/lat/word-align-lattice-lexicon.cc
42.5 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 |
// lat/word-align-lattice-lexicon.cc // Copyright 2013 Johns Hopkins University (Author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "lat/phone-align-lattice.h" #include "lat/word-align-lattice-lexicon.h" #include "lat/lattice-functions.h" #include "hmm/transition-model.h" #include "hmm/hmm-utils.h" #include "util/stl-utils.h" namespace kaldi { const int kTemporaryEpsilon = -2; const int kNumStatesOffset = 1000; // relates to how we apply the // max-states to the lattices; relates to the --max-expand option which // stops this blowing up for pathological cases or in case of a mismatch. class LatticeLexiconWordAligner { public: typedef CompactLatticeArc::StateId StateId; typedef CompactLatticeArc::Label Label; typedef WordAlignLatticeLexiconInfo::ViabilityMap ViabilityMap; typedef WordAlignLatticeLexiconInfo::LexiconMap LexiconMap; typedef WordAlignLatticeLexiconInfo::NumPhonesMap NumPhonesMap; /* The Freshness enum is applied to phone and word-sequences in the computation state; it is related to the epsilon sequencing problem. If a phone or word is new (added by the latest transition), it is fresh. We are only concerned with the freshness of the left-most word (i.e. word index 0) in words_, and the freshness of that can take only two values, kNotFresh or kFresh. As regards the phones_ variable, the difference between kFresh and kAllFresh is: if we just appended a phone it's kFresh, but if we just shifted off a phone or phones by outputting a nonempty word it's kAllFresh, meaning that all sub-sequences of the phone sequence are new. Note: if a phone or word-sequence is empty the freshness of that sequence does not matter or is not defined; we'll let it default to kNotFresh. */ typedef enum { kNotFresh, kFresh, kAllFresh } Freshness; class ComputationState { /// The state of the computation in which, /// along a single path in the lattice, we work out the word /// boundaries and output aligned arcs. public: /// Advance the computation state by adding the symbols and weights from /// this arc. Outputs weight to "leftover_weight" and sets the weight to /// 1.0 (this helps keep the state space small). Note: because we /// previously did PhoneAlignLattice, we can assume this arc corresponds to /// exactly one or zero phones. void Advance(const CompactLatticeArc &arc, const TransitionModel &tmodel, LatticeWeight *leftover_weight); /// Returns true if, assuming we were to add one or more phones by calling /// Advance one or more times on this, we might be able later to /// successfully call TakeTransition. It's a kind of co-accessibility test /// that avoids us creating an exponentially large number of states that /// would contribute nothing to the final output. bool ViableIfAdvanced(const ViabilityMap &viability_map) const; int32 NumPhones() const { return phones_.size(); } int32 NumWords() const { return words_.size(); } int32 PendingWord() const { KALDI_ASSERT(!words_.empty()); return words_[0]; } Freshness WordFreshness() const { return word_fresh_; } Freshness PhoneFreshness() const { return phone_fresh_; } /// This may be called at the end of a lattice, if it was forced /// out. Note: we will only use "partial_word_label" if there are /// phones without corresponding words; otherwise we'll use the /// word label that was there. void TakeForcedTransition(int32 partial_word_label, ComputationState *next_state, CompactLatticeArc *arc_out) const; /// Take a transition, if possible; consume "num_phones" phones and (if /// word_id != 0) the word "word_id" which must be the first word in words_. /// Returns true if we could take the transition. bool TakeTransition(const LexiconMap &lexicon_map, int32 word_id, int32 num_phones, ComputationState *next_state, CompactLatticeArc *arc_out) const; bool IsEmpty() const { return (transition_ids_.empty() && words_.empty()); } /// FinalWeight() will return "weight" if both transition_ids /// and word_labels are empty, otherwise it will return /// Weight::Zero(). LatticeWeight FinalWeight() const { return (IsEmpty() ? weight_ : LatticeWeight::Zero()); } size_t Hash() const { VectorHasher<int32> vh; const int32 p1 = 11117, p2 = 90647, p3 = 3967, p4 = 3557; // primes. int32 ans = 0; for (int32 i = 0; i < static_cast<int32>(transition_ids_.size()); i++) { ans *= p1; ans += vh(transition_ids_[i]); } ans += p2 * vh(words_) + static_cast<int32>(word_fresh_) * p3 + static_cast<int32>(phone_fresh_) * p4; // phones_ is determined by transition-id sequence so we don't // need to include it in the hash. return ans; } bool operator == (const ComputationState &other) const { // phones_ is determined by transition-id sequence so don't // need to compare it. return (transition_ids_ == other.transition_ids_ && words_ == other.words_ && weight_ == other.weight_ && phone_fresh_ == other.phone_fresh_ && word_fresh_ == other.word_fresh_); } ComputationState(): phone_fresh_(kNotFresh), word_fresh_(kNotFresh), weight_(LatticeWeight::One()) { } // initial state. ComputationState(const ComputationState &other): phones_(other.phones_), words_(other.words_), phone_fresh_(other.phone_fresh_), word_fresh_(other.word_fresh_), transition_ids_(other.transition_ids_), weight_(other.weight_) { } private: std::vector<int32> phones_; // sequence of pending phones std::vector<int32> words_; // sequence of pending words. // The following variables tell us whether the phones_ and/or words_ // variables were modified by the last operation on the computation state. // This is used to make sure we don't have multiple ways of handling the // same sequence, by taking transitions at multiple points (see code for // more details). It's related to the epsilon sequencing problem. // See the declaration above of the enum "Freshness". Freshness phone_fresh_; Freshness word_fresh_; std::vector<std::vector<int32> > transition_ids_; // sequence of transition-ids for each phone.. LatticeWeight weight_; // contains two floats. }; static void AppendVectors( std::vector<std::vector<int32> >::const_iterator input_begin, std::vector<std::vector<int32> >::const_iterator input_end, std::vector<int32> *output); struct Tuple { Tuple(StateId input_state, ComputationState comp_state): input_state(input_state), comp_state(comp_state) {} Tuple() {} StateId input_state; ComputationState comp_state; }; struct TupleHash { size_t operator() (const Tuple &state) const { return state.input_state + 102763 * state.comp_state.Hash(); // 102763 is just an arbitrary prime number } }; struct TupleEqual { bool operator () (const Tuple &state1, const Tuple &state2) const { // treat this like operator == return (state1.input_state == state2.input_state && state1.comp_state == state2.comp_state); } }; typedef unordered_map<Tuple, StateId, TupleHash, TupleEqual> MapType; // This function may alter queue_. StateId GetStateForTuple(const Tuple &tuple) { MapType::iterator iter = map_.find(tuple); if (iter == map_.end()) { // not in map. StateId output_state = lat_out_->AddState(); map_[tuple] = output_state; queue_.push_back(std::make_pair(tuple, output_state)); return output_state; } else { return iter->second; } } // This function may alter queue_, via GetStateForTuple. void ProcessTransition(StateId prev_output_state, // state-id of from-state in output lattice const Tuple &next_tuple, CompactLatticeArc *arc) { // arc to add (must first modify it by adding "nextstate") arc->nextstate = GetStateForTuple(next_tuple); // adds it to queue_ if new. lat_out_->AddArc(prev_output_state, *arc); } // Process any epsilon transitions out of this state. This refers to // filler-words, such as silence, which have epsilon as the symbol in the // original lattice, or no symbol at all (typically the original lattice // will be determinized with epsilon-removal so there is no separate arc, // just one or more extra phones that don't match up with any word. void ProcessEpsilonTransitions(const Tuple &tuple, StateId output_state); // Process any non-epsilon transitions out of this state in the output lattice. void ProcessWordTransitions(const Tuple &tuple, StateId output_state); // Take any transitions that correspond to advancing along arcs arc in the // original FST. void PossiblyAdvanceArc(const Tuple &tuple, StateId output_state); /// Process all final-probs (normal case, no forcing-out). /// returns true if we had at least one final-prob. bool ProcessFinal(); /// This function returns true if the state "output_state" in the output /// lattice has arcs out that have either a non-epsilon symbol or transition-ids /// in the string of the weight. bool HasNonEpsArcsOut(StateId output_state); /// Creates arcs from all the tuples that were final in the original lattice /// but have no arcs out of them in the output lattice that consume words or /// phones-- does so by "forcing out" any words and phones there are pending /// in the computation states. This function is only called if no states were /// "naturally" final; this will only happen for lattices that were forced out /// during decoding. void ProcessFinalForceOut(); // Process all final-probs -- a wrapper function that handles the forced-out case. void ProcessFinalWrapper() { if (final_queue_.empty()) { KALDI_WARN << "No final-probs to process."; error_ = true; return; } if (ProcessFinal()) return; error_ = true; KALDI_WARN << "Word-aligning lattice: lattice was forced out, will have partial words at end."; ProcessFinalForceOut(); if (ProcessFinal()) return; KALDI_WARN << "Word-aligning lattice: had no final-states even after forcing out " << "(result will be empty). This probably indicates wrong input."; return; } void ProcessQueueElement() { KALDI_ASSERT(!queue_.empty()); Tuple tuple = queue_.back().first; StateId output_state = queue_.back().second; queue_.pop_back(); ProcessEpsilonTransitions(tuple, output_state); ProcessWordTransitions(tuple, output_state); PossiblyAdvanceArc(tuple, output_state); // Note: we'll do a bit more filtering in ProcessFinal(), meaning // that we won't necessarily give a final-prob to all of the things // that go onto final_queue_. if (lat_in_.Final(tuple.input_state) != CompactLatticeWeight::Zero()) final_queue_.push_back(std::make_pair(tuple, output_state)); } LatticeLexiconWordAligner(const CompactLattice &lat, const TransitionModel &tmodel, const WordAlignLatticeLexiconInfo &lexicon_info, int32 max_states, int32 partial_word_label, CompactLattice *lat_out): lat_in_(lat), tmodel_(tmodel), lexicon_info_(lexicon_info), max_states_(max_states), lat_out_(lat_out), partial_word_label_(partial_word_label == 0 ? kTemporaryEpsilon : partial_word_label), error_(false) { // lat_in_ is after PhoneAlignLattice, it is not deterministic and contains epsilons fst::CreateSuperFinal(&lat_in_); // Creates a super-final state, so the // only final-probs are One(). Note: the member lat_in_ is not a reference. } // Removes epsilons; also removes unreachable states... // not sure if these would exist if original was connected. // This also replaces the temporary symbols for the silence // and partial-words, with epsilons, if we wanted epsilons. void RemoveEpsilonsFromLattice() { Connect(lat_out_); RmEpsilon(lat_out_, true); // true = connect. std::vector<int32> syms_to_remove; syms_to_remove.push_back(kTemporaryEpsilon); RemoveSomeInputSymbols(syms_to_remove, lat_out_); Project(lat_out_, fst::PROJECT_INPUT); } bool AlignLattice() { lat_out_->DeleteStates(); if (lat_in_.Start() == fst::kNoStateId) { KALDI_WARN << "Trying to word-align empty lattice."; return false; } ComputationState initial_comp_state; Tuple initial_tuple(lat_in_.Start(), initial_comp_state); StateId start_state = GetStateForTuple(initial_tuple); lat_out_->SetStart(start_state); while (!queue_.empty()) { if (max_states_ > 0 && lat_out_->NumStates() > max_states_) { KALDI_WARN << "Number of states in lattice exceeded max-states of " << max_states_ << ", original lattice had " << lat_in_.NumStates() << " states. Returning empty lattice."; lat_out_->DeleteStates(); return false; } ProcessQueueElement(); } ProcessFinalWrapper(); RemoveEpsilonsFromLattice(); return !error_; } CompactLattice lat_in_; const TransitionModel &tmodel_; const WordAlignLatticeLexiconInfo &lexicon_info_; int32 max_states_; CompactLattice *lat_out_; std::vector<std::pair<Tuple, StateId> > queue_; std::vector<std::pair<Tuple, StateId> > final_queue_; // as queue_, but // just contains states that may have final-probs to process. We process these // all at once, at the end. MapType map_; // map from tuples to StateId. int32 partial_word_label_; bool error_; }; // static void LatticeLexiconWordAligner::AppendVectors( std::vector<std::vector<int32> >::const_iterator input_begin, std::vector<std::vector<int32> >::const_iterator input_end, std::vector<int32> *output) { size_t size = 0; for (std::vector<std::vector<int32> >::const_iterator iter = input_begin; iter != input_end; ++iter) size += iter->size(); output->clear(); output->reserve(size); for (std::vector<std::vector<int32> >::const_iterator iter = input_begin; iter != input_end; ++iter) output->insert(output->end(), iter->begin(), iter->end()); } void LatticeLexiconWordAligner::ProcessEpsilonTransitions( const Tuple &tuple, StateId output_state) { const ComputationState &comp_state = tuple.comp_state; StateId input_state = tuple.input_state; StateId zero_word = 0; NumPhonesMap::const_iterator iter = lexicon_info_.num_phones_map_.find(zero_word); if (iter == lexicon_info_.num_phones_map_.end()) { return; // No epsilons to match; this can only happen if the lexicon // we were provided had no lines with 0 as the first entry, i.e. no // optional silences or the like. } // Now decide what range of phone-lengths we must process. This is all // about only getting a single opportunity to process any given sequence of // phones. int32 min_num_phones, max_num_phones; if (comp_state.PhoneFreshness() == kAllFresh) { // All sub-sequences of the phone sequence are fresh because we just // shifted some phones off, so we do this for all lengths. We can limit // ourselves to the range of possible lengths for the epsilon symbol, // in the lexicon. min_num_phones = iter->second.first; max_num_phones = std::min(iter->second.second, comp_state.NumPhones()); } else if (comp_state.PhoneFreshness() == kFresh) { // only last phone is "fresh", so only consider the sequence of all // phones including the last one. int32 num_phones = comp_state.NumPhones(); if (num_phones >= iter->second.first && num_phones <= iter->second.second) { min_num_phones = num_phones; max_num_phones = num_phones; } else { return; } } else { // kNotFresh return; } if (min_num_phones == 0) KALDI_ERR << "Lexicon error: epsilon transition that produces no output:"; for (int32 num_phones = min_num_phones; num_phones <= max_num_phones; num_phones++) { Tuple next_tuple; next_tuple.input_state = input_state; // We're not taking a transition in the // input FST so this stays the same. CompactLatticeArc arc; if (comp_state.TakeTransition(lexicon_info_.lexicon_map_, zero_word, num_phones, &next_tuple.comp_state, &arc)) { ProcessTransition(output_state, next_tuple, &arc); } } } void LatticeLexiconWordAligner::ProcessWordTransitions( const Tuple &tuple, StateId output_state) { const ComputationState &comp_state = tuple.comp_state; StateId input_state = tuple.input_state; if (comp_state.NumWords() > 0) { int32 min_num_phones, max_num_phones; int32 word_id = comp_state.PendingWord(); if (comp_state.WordFreshness() == kFresh || comp_state.PhoneFreshness() == kAllFresh) { // Just saw word, or shifted phones, // so 1st opportunity to process phone-sequences of all possible sizes, // with this word. NumPhonesMap::const_iterator iter = lexicon_info_.num_phones_map_.find(word_id); if (iter == lexicon_info_.num_phones_map_.end()) { KALDI_ERR << "Word " << word_id << " is not present in the lexicon."; } min_num_phones = iter->second.first; max_num_phones = std::min(iter->second.second, comp_state.NumPhones()); } else if (comp_state.PhoneFreshness() == kFresh) { // just the latest phone is new -> just try to process the // phone-sequence of all the phones we have. min_num_phones = comp_state.NumPhones(); max_num_phones = min_num_phones; } else { return; // Nothing to do, since neither the word nor the phones are fresh. } for (int32 num_phones = min_num_phones; num_phones <= max_num_phones; num_phones++) { Tuple next_tuple; next_tuple.input_state = input_state; // We're not taking a transition in the // input FST so this stays the same. CompactLatticeArc arc; if (comp_state.TakeTransition(lexicon_info_.lexicon_map_, word_id, num_phones, &next_tuple.comp_state, &arc)) { ProcessTransition(output_state, next_tuple, &arc); } } } } void LatticeLexiconWordAligner::PossiblyAdvanceArc( const Tuple &tuple, StateId output_state) { if (tuple.comp_state.ViableIfAdvanced(lexicon_info_.viability_map_)) { for(fst::ArcIterator<CompactLattice> aiter(lat_in_, tuple.input_state); !aiter.Done(); aiter.Next()) { const CompactLatticeArc &arc_in = aiter.Value(); Tuple next_tuple(arc_in.nextstate, tuple.comp_state); LatticeWeight arc_weight; next_tuple.comp_state.Advance(arc_in, tmodel_, &arc_weight); // Note: GetStateForTuple will add the tuple to the queue, // if necessary. StateId next_output_state = GetStateForTuple(next_tuple); CompactLatticeArc arc_out(0, 0, CompactLatticeWeight(arc_weight, std::vector<int32>()), next_output_state); lat_out_->AddArc(output_state, arc_out); } } } bool LatticeLexiconWordAligner::ProcessFinal() { bool saw_final = false; // Find final-states... for (size_t i = 0; i < final_queue_.size(); i++) { const Tuple &tuple = final_queue_[i].first; StateId output_state = final_queue_[i].second; KALDI_ASSERT(lat_in_.Final(tuple.input_state) == CompactLatticeWeight::One()); LatticeWeight final_weight = tuple.comp_state.FinalWeight(); if (final_weight != LatticeWeight::Zero()) { // note: final_weight is only nonzero if there are no // pending transition-ids, so there is no string component. std::vector<int32> empty_vec; lat_out_->SetFinal(output_state, CompactLatticeWeight(final_weight, empty_vec)); saw_final = true; } } return saw_final; } bool LatticeLexiconWordAligner::HasNonEpsArcsOut(StateId output_state) { for (fst::ArcIterator<CompactLattice> aiter(*lat_out_, output_state); !aiter.Done(); aiter.Next()) { const CompactLatticeArc &arc = aiter.Value(); if (arc.ilabel != 0 || arc.olabel != 0 || !arc.weight.String().empty()) return true; } return false; } void LatticeLexiconWordAligner::ProcessFinalForceOut() { KALDI_ASSERT(queue_.empty()); std::vector<std::pair<Tuple, StateId> > new_final_queue_; new_final_queue_.reserve(final_queue_.size()); for (size_t i = 0; i < final_queue_.size();i++) { // note: all the states will // be final in the orig. lattice const Tuple &tuple = final_queue_[i].first; StateId output_state = final_queue_[i].second; if (!HasNonEpsArcsOut(output_state)) { // This if-statement // avoids forcing things out too early, when they had words // that could naturally have been put out. [without it, // we'd have multiple alternate paths at the end.] CompactLatticeArc arc; Tuple next_tuple; next_tuple.input_state = tuple.input_state; tuple.comp_state.TakeForcedTransition(partial_word_label_, &next_tuple.comp_state, &arc); // Note: the following call may add to queue_, but we'll clear it, // we don't want to process these states. StateId new_state = GetStateForTuple(next_tuple); arc.nextstate = new_state; lat_out_->AddArc(output_state, arc); new_final_queue_.push_back(std::make_pair(next_tuple, new_state)); } } queue_.clear(); std::swap(final_queue_, new_final_queue_); } void LatticeLexiconWordAligner::ComputationState::Advance( const CompactLatticeArc &arc, const TransitionModel &tmodel, LatticeWeight *weight) { const std::vector<int32> &tids = arc.weight.String(); int32 phone; if (tids.empty()) phone = 0; else { phone = tmodel.TransitionIdToPhone(tids.front()); KALDI_ASSERT(phone == tmodel.TransitionIdToPhone(tids.back()) && "Error: lattice is not phone-aligned."); } if (arc.ilabel != 0) { // note: arc.ilabel==arc.olabel (acceptor) words_.push_back(arc.ilabel); // Note: the word freshness only applies to the word in position 0, // so only if the word-sequence is now of size 1, is it fresh. if (words_.size() == 1) word_fresh_ = kFresh; else word_fresh_ = kNotFresh; } else { // No word added -> word not fresh. word_fresh_ = kNotFresh; } if (phone != 0) { phones_.push_back(phone); transition_ids_.push_back(tids); phone_fresh_ = kFresh; } else { phone_fresh_ = kNotFresh; } *weight = Times(weight_, arc.weight.Weight()); // will go on arc in output lattice weight_ = LatticeWeight::One(); } bool LatticeLexiconWordAligner::ComputationState::ViableIfAdvanced( const ViabilityMap &viability_map) const { /* This will ideally to return true if and only if we can ever take any kind of transition out of this state after "advancing" it by adding words and/or phones. It's OK to return true in some cases where the condition is false, though, if it's a pain to check, because the result will just be doing extra work for nothing (those states won't be co-accessible in the output). */ if (phones_.empty()) return true; if (words_.empty()) return true; else { // neither phones_ or words_ is empty. Return true if a longer sequence // than this phone sequence can have either zero (<eps>/epsilon) or the // first element of words_, as an entry in the lexicon with that phone // sequence. ViabilityMap::const_iterator iter = viability_map.find(phones_); if (iter == viability_map.end()) return false; else { const std::vector<int32> &this_set = iter->second; // sorted vector. // Return true if either 0 or words_[0] is in the set. If 0 is // in the set, it will be the 1st element of the vector, because it's // the lowest element. return (this_set.front() == 0 || std::binary_search(this_set.begin(), this_set.end(), words_[0])); } } } void LatticeLexiconWordAligner::ComputationState::TakeForcedTransition( int32 partial_word_label, ComputationState *next_state, CompactLatticeArc *arc_out) const { KALDI_ASSERT(!IsEmpty()); next_state->phones_.clear(); next_state->words_.clear(); next_state->transition_ids_.clear(); // neither of the following variables should matter, actually, // they will never be inspected. So just set them to kFresh for consistency, // so they end up at the same place in the tuple-map_. next_state->word_fresh_ = kFresh; next_state->phone_fresh_ = kFresh; next_state->weight_ = LatticeWeight::One(); int32 word_id; if (words_.size() >= 1) { word_id = words_[0]; if (words_.size() > 1) KALDI_WARN << "Word-aligning lattice: discarding extra word at end of lattice" << "(forced-out)."; } else { word_id = partial_word_label; } KALDI_ASSERT(word_id != 0); // any zeros would have been replaced with // 'temporary epsilon' = 2. std::vector<int32> appended_transition_ids; AppendVectors(transition_ids_.begin(), transition_ids_.end(), &appended_transition_ids); arc_out->ilabel = word_id; arc_out->olabel = word_id; arc_out->weight = CompactLatticeWeight(weight_, appended_transition_ids); // arc_out->nextstate will be set by the calling code. } bool LatticeLexiconWordAligner::ComputationState::TakeTransition( const LexiconMap &lexicon_map, int32 word_id, int32 num_phones, ComputationState *next_state, CompactLatticeArc *arc_out) const { KALDI_ASSERT(word_id == 0 || (!words_.empty() && word_id == words_[0])); KALDI_ASSERT(num_phones <= static_cast<int32>(phones_.size())); std::vector<int32> lexicon_key; lexicon_key.reserve(1 + num_phones); lexicon_key.push_back(word_id); // put 1st word in lexicon_key. lexicon_key.insert(lexicon_key.end(), phones_.begin(), phones_.begin() + num_phones); LexiconMap::const_iterator iter = lexicon_map.find(lexicon_key); if (iter == lexicon_map.end()) { // no such entry return false; } else { // Entry exists. We'll create an arc. next_state->phones_.assign(phones_.begin() + num_phones, phones_.end()); next_state->words_.assign(words_.begin() + (word_id == 0 ? 0 : 1), words_.end()); next_state->transition_ids_.assign(transition_ids_.begin() + num_phones, transition_ids_.end()); next_state->word_fresh_ = (word_id != 0 && !next_state->words_.empty()) ? kFresh : kNotFresh; next_state->phone_fresh_ = (next_state->phones_.empty() || num_phones == 0) ? kNotFresh : kAllFresh; // this next thing is a bit hard to explain. If we just consumed a word with // no phones, we treat the phones as fresh. The idea is that if we need to // both consume a word with no phones and a phone with no words (e.g. // an empty word and then silence), we need to have the phones marked // as fresh in order for this to be possible. if (num_phones == 0 && word_id != 0 && !next_state->phones_.empty()) next_state->phone_fresh_ = kAllFresh; next_state->weight_ = LatticeWeight::One(); if (GetVerboseLevel() >= 5) { std::ostringstream ostr; for (size_t i = 0; i < num_phones; i++) ostr << phones_[i] << " "; KALDI_VLOG(5) << "Taking arc with word = " << word_id << " and phones = " << ostr.str() << ", output-word = " << iter->second << ", dest-state has num-words = " << next_state->words_.size() << " and num-phones = " << next_state->phones_.size(); } // Set arc_out: Label word_id = iter->second; // word_id will typically be // the same as words_[0], i.e. the // word we consumed. KALDI_ASSERT(word_id != 0); // we replaced zeros with 'temporary epsilon' = -2. std::vector<int32> appended_transition_ids; AppendVectors(transition_ids_.begin(), transition_ids_.begin() + num_phones, &appended_transition_ids); arc_out->ilabel = word_id; arc_out->olabel = word_id; arc_out->weight = CompactLatticeWeight(weight_, appended_transition_ids); // arc_out->nextstate will be set in the calling code. return true; } } // Returns true if this vector of transition-ids could be a valid // word. Note: for testing, we assume that the lexicon always // has the same input-word and output-word. The other case is complex // to test. static bool IsPlausibleWord(const WordAlignLatticeLexiconInfo &lexicon_info, const TransitionModel &tmodel, int32 word_id, const std::vector<int32> &transition_ids) { std::vector<std::vector<int32> > split_alignment; // Split into phones. if (!SplitToPhones(tmodel, transition_ids, &split_alignment)) { KALDI_WARN << "Could not split word into phones correctly (forced-out?)"; } std::vector<int32> phones(split_alignment.size()); for (size_t i = 0; i < split_alignment.size(); i++) { KALDI_ASSERT(!split_alignment[i].empty()); phones[i] = tmodel.TransitionIdToPhone(split_alignment[i][0]); } std::vector<int32> lexicon_entry; lexicon_entry.push_back(word_id); lexicon_entry.insert(lexicon_entry.end(), phones.begin(), phones.end()); if (!lexicon_info.IsValidEntry(lexicon_entry)) { std::ostringstream ostr; for (size_t i = 0; i < lexicon_entry.size(); i++) ostr << lexicon_entry[i] << ' '; KALDI_WARN << "Invalid arc in aligned lattice (code error?) lexicon-entry is " << ostr.str(); return false; } else { return true; } } void WordAlignLatticeLexiconInfo::UpdateViabilityMap( const std::vector<int32> &lexicon_entry) { int32 word = lexicon_entry[0]; // note: word may be zero. int32 num_phones = static_cast<int32>(lexicon_entry.size()) - 2; std::vector<int32> phones; if (num_phones > 0) phones.reserve(num_phones - 1); // for each nonempty sequence of phones that is a strict prefix of the phones // in the lexicon entry (i.e. lexicon_entry [2 ... ]), add the word to the set // in viability_map_[phones]. for (int32 n = 0; n < num_phones - 1; n++) { phones.push_back(lexicon_entry[n + 2]); // first phone is at position 2. // n+1 is the length of the sequence of phones viability_map_[phones].push_back(word); } } void WordAlignLatticeLexiconInfo::FinalizeViabilityMap() { for (ViabilityMap::iterator iter = viability_map_.begin(); iter != viability_map_.end(); ++iter) { std::vector<int32> &words = iter->second; SortAndUniq(&words); KALDI_ASSERT(words[0] >= 0 && "Error: negative labels in lexicon."); } } /// Update the map from a vector (orig-word-symbol phone1 phone2 ... ) to the /// new word-symbol. The new word-symbol must always be nonzero; we'll replace /// it with kTemporaryEpsilon = -2, if it was zero. void WordAlignLatticeLexiconInfo::UpdateLexiconMap( const std::vector<int32> &lexicon_entry) { KALDI_ASSERT(lexicon_entry.size() >= 2); std::vector<int32> key; key.reserve(lexicon_entry.size() - 1); // add the original word: key.push_back(lexicon_entry[0]); // add the phones: key.insert(key.end(), lexicon_entry.begin() + 2, lexicon_entry.end()); int32 new_word = lexicon_entry[1]; // This will typically be the same as // the original word at lexicon_entry[0] but is allowed to differ. if (new_word == 0) new_word = kTemporaryEpsilon; // replace 0's with -2; // we'll revert the change at the end. if (lexicon_map_.count(key) != 0) { if (lexicon_map_[key] == new_word) KALDI_WARN << "Duplicate entry in lexicon map for word " << lexicon_entry[0]; else KALDI_ERR << "Duplicate entry in lexicon map for word " << lexicon_entry[0] << " with inconsistent to-word."; } lexicon_map_[key] = new_word; if (lexicon_entry[0] != lexicon_entry[1]) { // Add reverse lexicon entry, this time with no 0 -> -2 mapping. key[0] = lexicon_entry[1]; // Note: we ignore the situation where there are conflicting // entries in reverse_lexicon_map_, as we never actually inspect // the contents so it won't matter. reverse_lexicon_map_[key] = lexicon_entry[0]; } } void WordAlignLatticeLexiconInfo::UpdateNumPhonesMap( const std::vector<int32> &lexicon_entry) { int32 num_phones = static_cast<int32>(lexicon_entry.size()) - 2; int32 word = lexicon_entry[0]; if (num_phones_map_.count(word) == 0) num_phones_map_[word] = std::make_pair(num_phones, num_phones); else { std::pair<int32, int32> &pr = num_phones_map_[word]; pr.first = std::min(pr.first, num_phones); // update min-num-phones pr.second = std::max(pr.second, num_phones); // update max-num-phones if (pr.first == 0 && word == 0) KALDI_ERR << "Zero word with empty pronunciation is not allowed."; } } /// Entry contains new-word-id phone1 phone2 ... /// equivalent to all but the 1st entry on a line of the input file. bool WordAlignLatticeLexiconInfo::IsValidEntry(const std::vector<int32> &entry) const { KALDI_ASSERT(!entry.empty()); LexiconMap::const_iterator iter = lexicon_map_.find(entry); if (iter != lexicon_map_.end()) { int32 tgt_word = (iter->second == kTemporaryEpsilon ? 0 : iter->second); if (tgt_word == entry[0]) return true; // symmetric entry. // this means that that there would be an output-word with this // value, and this sequence of phones. } // For entries that were not symmetric: return (reverse_lexicon_map_.count(entry) != 0); } int32 WordAlignLatticeLexiconInfo::EquivalenceClassOf(int32 word) const { unordered_map<int32, int32>::const_iterator iter = equivalence_map_.find(word); if (iter == equivalence_map_.end()) return word; // not in map. else return iter->second; } void WordAlignLatticeLexiconInfo::UpdateEquivalenceMap( const std::vector<std::vector<int32> > &lexicon) { std::vector<std::pair<int32, int32> > equiv_pairs; // pairs of // (lower,higher) words that are equivalent. for (size_t i = 0; i < lexicon.size(); i++) { KALDI_ASSERT(lexicon[i].size() >= 2); int32 w1 = lexicon[i][0], w2 = lexicon[i][1]; if (w1 == w2) continue; // They are the same; this provides no information // about equivalence, since any word is equivalent // to itself. if (w1 > w2) std::swap(w1, w2); // make sure w1 < w2. equiv_pairs.push_back(std::make_pair(w1, w2)); } SortAndUniq(&equiv_pairs); equivalence_map_.clear(); for (size_t i = 0; i < equiv_pairs.size(); i++) { int32 w1 = equiv_pairs[i].first, w2 = equiv_pairs[i].second, w1dash = EquivalenceClassOf(w1); equivalence_map_[w2] = w1dash; } } WordAlignLatticeLexiconInfo::WordAlignLatticeLexiconInfo( const std::vector<std::vector<int32> > &lexicon) { for (size_t i = 0; i < lexicon.size(); i++) { const std::vector<int32> &lexicon_entry = lexicon[i]; KALDI_ASSERT(lexicon_entry.size() >= 2); UpdateViabilityMap(lexicon_entry); UpdateLexiconMap(lexicon_entry); UpdateNumPhonesMap(lexicon_entry); } FinalizeViabilityMap(); UpdateEquivalenceMap(lexicon); } /// Testing code; map word symbols in the lattice "lat" using the equivalence-classes /// obtained from the lexicon, using the function EquivalenceClassOf in the lexicon_info /// object. static void MapSymbols(const WordAlignLatticeLexiconInfo &lexicon_info, CompactLattice *lat) { typedef CompactLattice::StateId StateId; for (StateId s = 0; s < lat->NumStates(); s++) { for (fst::MutableArcIterator<CompactLattice> aiter(lat, s); !aiter.Done(); aiter.Next()) { CompactLatticeArc arc (aiter.Value()); KALDI_ASSERT(arc.ilabel == arc.olabel); arc.ilabel = lexicon_info.EquivalenceClassOf(arc.ilabel); arc.olabel = arc.ilabel; aiter.SetValue(arc); } } } static bool TestWordAlignedLattice(const WordAlignLatticeLexiconInfo &lexicon_info, const TransitionModel &tmodel, CompactLattice clat, CompactLattice aligned_clat, bool allow_duplicate_paths) { int32 max_err = 5, num_err = 0; { // We test whether the forward-backward likelihoods differ; this is intended // to detect when we have duplicate paths in the aligned lattice, for some path // in the input lattice (e.g. due to epsilon-sequencing problems). Posterior post; Lattice lat, aligned_lat; ConvertLattice(clat, &lat); ConvertLattice(aligned_clat, &aligned_lat); TopSort(&lat); TopSort(&aligned_lat); BaseFloat like_before = LatticeForwardBackward(lat, &post), like_after = LatticeForwardBackward(aligned_lat, &post); if (fabs(like_before - like_after) > 1.0e-04 * (fabs(like_before) + fabs(like_after))) { KALDI_WARN << "Forward-backward likelihoods differ in word-aligned lattice " << "testing, " << like_before << " != " << like_after; if (!allow_duplicate_paths) num_err++; } } // Do a check on the arcs of the aligned lattice, that each arc corresponds // to an entry in the lexicon. for (CompactLattice::StateId s = 0; s < aligned_clat.NumStates(); s++) { for (fst::ArcIterator<CompactLattice> aiter(aligned_clat, s); !aiter.Done(); aiter.Next()) { const CompactLatticeArc &arc (aiter.Value()); KALDI_ASSERT(arc.ilabel == arc.olabel); int32 word_id = arc.ilabel; const std::vector<int32> &tids = arc.weight.String(); if (word_id == 0 && tids.empty()) continue; // We allow epsilon arcs. if (num_err < max_err) if (!IsPlausibleWord(lexicon_info, tmodel, word_id, tids)) num_err++; // Note: IsPlausibleWord will warn if there is an error. } if (!aligned_clat.Final(s).String().empty()) { KALDI_WARN << "Aligned lattice has nonempty string on its final-prob."; return false; } } // Next we'll do an equivalence test. // First map symbols into equivalence classes, so that we don't wrongly fail // due to the capability of the framework to map words to other words. // (e.g. mapping <eps> on silence arcs to SIL). MapSymbols(lexicon_info, &clat); MapSymbols(lexicon_info, &aligned_clat); /// Check equivalence. int32 num_paths = 5, seed = Rand(), max_path_length = -1; BaseFloat delta = 0.2; // some lattices have large costs -> use large delta. FLAGS_v = GetVerboseLevel(); // set the OpenFst verbose level to the Kaldi // verbose level. if (!RandEquivalent(clat, aligned_clat, num_paths, delta, seed, max_path_length)) { KALDI_WARN << "Equivalence test failed during lattice alignment."; return false; } FLAGS_v = 0; return (num_err == 0); } // This is the wrapper function for users to call. bool WordAlignLatticeLexicon(const CompactLattice &lat, const TransitionModel &tmodel, const WordAlignLatticeLexiconInfo &lexicon_info, const WordAlignLatticeLexiconOpts &opts, CompactLattice *lat_out) { PhoneAlignLatticeOptions phone_align_opts; phone_align_opts.reorder = opts.reorder; phone_align_opts.replace_output_symbols = false; phone_align_opts.remove_epsilon = false; // Input Lattice should be deterministic and w/o epsilons. bool test = true; uint64 props = lat.Properties(fst::kIDeterministic|fst::kIEpsilons, test); if (props != fst::kIDeterministic) { KALDI_WARN << "[Lattice has input epsilons and/or is not input-deterministic " << "(in Mohri sense)]-- i.e. lattice is not deterministic. " << "Word-alignment may be slow and-or blow up in memory."; } CompactLattice phone_aligned_lat; bool ans = PhoneAlignLattice(lat, tmodel, phone_align_opts, &phone_aligned_lat); // 'phone_aligned_lat' is no longer deterministic and contains epsilons. int32 max_states; if (opts.max_expand <= 0) { max_states = -1; } else { // The 1000 is a fixed offset to give it more wiggle room for very // small inputs. max_states = kNumStatesOffset + opts.max_expand * phone_aligned_lat.NumStates(); } // If ans == false, we hope this is due to a forced-out lattice, and we try to // continue. LatticeLexiconWordAligner aligner(phone_aligned_lat, tmodel, lexicon_info, max_states, opts.partial_word_label, lat_out); // We'll let the calling code warn if this is false; it will know the utterance-id. ans = aligner.AlignLattice() && ans; if (ans && opts.test) { // We only test if it succeeded. if (!TestWordAlignedLattice(lexicon_info, tmodel, lat, *lat_out, opts.allow_duplicate_paths)) { KALDI_WARN << "Lattice failed test (activated because --test=true). " << "Probable code error, please contact Kaldi maintainers."; ans = false; } } return ans; } bool ReadLexiconForWordAlign (std::istream &is, std::vector<std::vector<int32> > *lexicon) { lexicon->clear(); std::string line; while (std::getline(is, line)) { std::vector<int32> this_entry; if (!SplitStringToIntegers(line, " \t\r", false, &this_entry) || this_entry.size() < 2) { KALDI_WARN << "Lexicon line '" << line << "' is invalid"; return false; } lexicon->push_back(this_entry); } return (!lexicon->empty()); } } // namespace kaldi |