// kwsbin/lattice-to-kws-index.cc // Copyright 2012 Johns Hopkins University (Author: Guoguo Chen) // Lucas Ondel // See ../../COPYING for clarification regarding multiple authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, // MERCHANTABLITY OR NON-INFRINGEMENT. // See the Apache 2 License for the specific language governing permissions and // limitations under the License. #include "base/kaldi-common.h" #include "util/common-utils.h" #include "fstext/fstext-utils.h" #include "lat/kaldi-lattice.h" #include "lat/lattice-functions.h" #include "kws/kaldi-kws.h" #include "kws/kws-functions.h" #include "fstext/epsilon-property.h" int main(int argc, char *argv[]) { try { using namespace kaldi; using fst::VectorFst; typedef kaldi::int32 int32; typedef kaldi::uint64 uint64; const char *usage = "Create an inverted index of the given lattices. The output index is \n" "in the T*T*T semiring. For details for the semiring, please refer to\n" "Dogan Can and Murat Saraclar's paper named " "\"Lattice Indexing for Spoken Term Detection\"\n" "\n" "Usage: lattice-to-kws-index [options] " " \n" "e.g.: \n" " lattice-to-kws-index ark:utter.symtab ark:1.lats ark:global.idx\n"; ParseOptions po(usage); int32 frame_subsampling_factor = 1; int32 max_silence_frames = 50; bool strict = true; bool allow_partial = true; BaseFloat max_states_scale = 4; po.Register("frame-subsampling-factor", &frame_subsampling_factor, "Frame subsampling factor. (Default value 1)"); po.Register("max-silence-frames", &max_silence_frames, "If --frame-subsampling-factor is used, --max-silence-frames " "is relative to the the input, not the output frame rate " "(we divide by frame-subsampling-factor and round to " "the closest integer, to get the number of symbols in the " "lattice)."); po.Register("strict", &strict, "Setting --strict=false will cause " "successful termination even if we processed no lattices."); po.Register("max-states-scale", &max_states_scale, "Number of states in the" " original lattice times this scale is the number of states " "allowed when optimizing the index. Negative number means no " "limit on the number of states."); po.Register("allow-partial", &allow_partial, "Allow partial output if fails" " to determinize, otherwise skip determinization if it fails."); po.Read(argc, argv); if (po.NumArgs() != 3) { po.PrintUsage(); exit(1); } max_silence_frames = 0.5 + max_silence_frames / static_cast(frame_subsampling_factor); std::string usymtab_rspecifier = po.GetOptArg(1), lats_rspecifier = po.GetArg(2), index_wspecifier = po.GetArg(3); // We use RandomAccessInt32Reader to read the utterance symtab table. RandomAccessInt32Reader usymtab_reader(usymtab_rspecifier); // We read the lattice in as CompactLattice; We need the CompactLattice // structure for the rest of the work SequentialCompactLatticeReader clat_reader(lats_rspecifier); TableWriter< fst::VectorFstTplHolder > index_writer(index_wspecifier); int32 n_done = 0; int32 n_fail = 0; int32 max_states = -1; for (; !clat_reader.Done(); clat_reader.Next()) { std::string key = clat_reader.Key(); CompactLattice clat = clat_reader.Value(); clat_reader.FreeCurrent(); KALDI_LOG << "Processing lattice " << key; if (max_states_scale > 0) { max_states = static_cast( max_states_scale * static_cast(clat.NumStates())); } // Check if we have the corresponding utterance id. if (!usymtab_reader.HasKey(key)) { KALDI_WARN << "Cannot find utterance id for " << key; n_fail++; continue; } // Topologically sort the lattice, if not already sorted. uint64 props = clat.Properties(fst::kFstProperties, false); if (!(props & fst::kTopSorted)) { if (fst::TopSort(&clat) == false) { KALDI_WARN << "Cycles detected in lattice " << key; n_fail++; continue; } } // Get the alignments std::vector state_times; CompactLatticeStateTimes(clat, &state_times); // Cluster the arcs in the CompactLattice, write the cluster_id on the // output label side. // ClusterLattice() corresponds to the second part of the preprocessing in // Dogan and Murat's paper -- clustering. Note that we do the first part // of preprocessing (the weight pushing step) later when generating the // factor transducer. KALDI_VLOG(1) << "Arc clustering..."; bool success = false; success = kaldi::ClusterLattice(&clat, state_times); if (!success) { KALDI_WARN << "State id's and alignments do not match for lattice " << key; n_fail++; continue; } // The next part is something new, not in the Dogan and Can paper. It is // necessary because we have epsilon arcs, due to silences, in our // lattices. We modify the factor transducer, while maintaining // equivalence, to ensure that states don't have both epsilon *and* // non-epsilon arcs entering them. (and the same, with "entering" // replaced with "leaving"). Later we will find out which states have // non-epsilon arcs leaving/entering them and use it to be more selective // in adding arcs to connect them with the initial/final states. The goal // here is to disallow silences at the beginning or ending of a keyword // occurrence. if (true) { EnsureEpsilonProperty(&clat); fst::TopSort(&clat); // We have to recompute the state times because they will have changed. CompactLatticeStateTimes(clat, &state_times); } // Generate factor transducer // CreateFactorTransducer() corresponds to the "Factor Generation" part of // Dogan and Murat's paper. But we also move the weight pushing step to // this function as we have to compute the alphas and betas anyway. KALDI_VLOG(1) << "Generating factor transducer..."; KwsProductFst factor_transducer; int32 utterance_id = usymtab_reader.Value(key); success = kaldi::CreateFactorTransducer(clat, state_times, utterance_id, &factor_transducer); if (!success) { KALDI_WARN << "Cannot generate factor transducer for lattice " << key; n_fail++; } MaybeDoSanityCheck(factor_transducer); // Remove long silence arc // We add the filtering step in our implementation. This is because gap // between two successive words in a query term should be less than 0.5s KALDI_VLOG(1) << "Removing long silence..."; RemoveLongSilences(max_silence_frames, state_times, &factor_transducer); MaybeDoSanityCheck(factor_transducer); // Do factor merging, and return a transducer in T*T*T semiring. This step // corresponds to the "Factor Merging" part in Dogan and Murat's paper. KALDI_VLOG(1) << "Merging factors..."; KwsLexicographicFst index_transducer; DoFactorMerging(&factor_transducer, &index_transducer); MaybeDoSanityCheck(index_transducer); // Do factor disambiguation. It corresponds to the "Factor Disambiguation" // step in Dogan and Murat's paper. KALDI_VLOG(1) << "Doing factor disambiguation..."; DoFactorDisambiguation(&index_transducer); MaybeDoSanityCheck(index_transducer); // Optimize the above factor transducer. It corresponds to the // "Optimization" step in the paper. KALDI_VLOG(1) << "Optimizing factor transducer..."; OptimizeFactorTransducer(&index_transducer, max_states, allow_partial); MaybeDoSanityCheck(index_transducer); // Write result index_writer.Write(key, index_transducer); n_done++; } KALDI_LOG << "Done " << n_done << " lattices, failed for " << n_fail; if (strict == true) return (n_done != 0 ? 0 : 1); else return 0; } catch(const std::exception &e) { std::cerr << e.what(); return -1; } }