Yannick Estève / ONTRAC-Kaldi

Blame view

src/bin/prons-to-wordali.cc 5.28 KB
  // bin/prons-to-wordali.cc
  
  // Copyright 2009-2011  Microsoft Corporation
  
  // See ../../COPYING for clarification regarding multiple authors
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //  http://www.apache.org/licenses/LICENSE-2.0
  //
  // THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  // KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  // WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  // MERCHANTABLITY OR NON-INFRINGEMENT.
  // See the Apache 2 License for the specific language governing permissions and
  // limitations under the License.
  
  
  #include "base/kaldi-common.h"
  #include "hmm/transition-model.h"
  #include "hmm/hmm-utils.h"
  #include "util/common-utils.h"
  #include "fst/fstlib.h"
  #include "fstext/fstext-utils.h"
  
  
  int main(int argc, char *argv[]) {
    using namespace kaldi;
    using fst::VectorFst;
    using fst::StdArc;
    typedef kaldi::int32 int32;
    try {
      const char *usage =
          "Caution: this program relates to older scripts and is deprecated,
  "
          "for modern scripts see egs/wsj/s5/steps/{get_ctm,get_train_ctm}.sh
  "
          "Given per-utterance pronunciation information as output by 
  "
          "words-to-prons, and per-utterance phone alignment information
  "
          "as output by ali-to-phones --write-lengths, output word alignment
  "
          "information that can be turned into the ctm format.
  "
          "Outputs is pairs of (word, #frames), or if --per-frame is given,
  "
          "just the word for each frame.
  "
          "Note: zero word-id usually means optional silence.
  "
          "Format is standard format for archives of vector<pair<int32, int32> >
  "
          "i.e. :
  "
          "utt-id  600 22 ; 1028 32 ; 0 41
  "
          "where 600, 1028 and 0 are the word-ids, and 22, 32 and 41 are the
  "
          "lengths.
  "
          "
  "
          "Usage:  prons-to-wordali [options] <prons-rspecifier>"
          " <phone-lengths-rspecifier> <wordali-wspecifier>
  "
          "e.g.: 
  "
          " ali-to-phones 1.mdl ark:1.ali ark:- | \\
  "
          "  phones-to-prons L_align.fst 46 47 ark:- 'ark:sym2int.pl -f 2- words.txt text|' \\
  "
          "  ark:- | prons-to-wordali ark:- \\
  "
          "    \"ark:ali-to-phones --write-lengths 1.mdl ark:1.ali ark:-|\" ark:1.wali
  ";
      
      ParseOptions po(usage);
      bool per_frame = false;
      po.Register("per-frame", &per_frame, "If true, write out the frame-level word alignment (else word sequence)");
      po.Read(argc, argv);
  
      if (po.NumArgs() != 3) {
        po.PrintUsage();
        exit(1);
      }
  
      std::string prons_rspecifier = po.GetArg(1),
          phone_lengths_rspecifier = po.GetArg(2),
          wordali_wspecifier = po.GetArg(3);
          
                  
      SequentialInt32VectorVectorReader prons_reader(prons_rspecifier);
      RandomAccessInt32PairVectorReader phones_reader(phone_lengths_rspecifier);
  
      std::string empty;
      Int32PairVectorWriter pair_writer(per_frame ? empty : wordali_wspecifier);
      Int32VectorWriter frame_writer(per_frame ? wordali_wspecifier : empty);
  
      int32 n_done = 0, n_err = 0;
      
      for (; !prons_reader.Done(); prons_reader.Next()) {
        std::string key = prons_reader.Key();
        const std::vector<std::vector<int32> > &prons = prons_reader.Value();
        if (!phones_reader.HasKey(key)) {
          KALDI_WARN << "Not processing utterance " << key << " because no phone "
                     << "alignment found.";
          n_err++;
          continue;
        }
        // first member of each pair is phone; second is length in
        // frames.
        const std::vector<std::pair<int32, int32> > &phones =
            phones_reader.Value(key);
  
        std::vector<std::pair<int32, int32> > word_alignment;
  
        size_t p = 0; // index into "phones".
        for (size_t i = 0; i < prons.size(); i++) {
          if (!(prons[i].size() >= 1)) {
            KALDI_WARN << "Invalid, empty pronunciation.";
            n_err++;
            continue;
          }
          int32 word = prons[i][0], word_len = 0;
          for (size_t j = 1; j < prons[i].size(); j++, p++) {
            if (!(static_cast<size_t>(p) < phones.size() &&
                  prons[i][j] == phones[p].first) ) {
              KALDI_WARN << "For key " << key << ", mismatch between prons and phones.";
              n_err++;
              continue;
            }
            word_len += phones[p].second;
          }
          word_alignment.push_back(std::make_pair(word, word_len));
        }
        if (static_cast<size_t>(p) != phones.size()) {
          KALDI_WARN << "For key " << key << ", mismatch between prons and phones (wrong #phones)";
          n_err++;
          continue;
        }
  
        if (!per_frame) {
          pair_writer.Write(key, word_alignment);
        } else {
          std::vector<int32> word_per_frame;
          for (size_t i = 0; i < word_alignment.size(); i++) {
            int32 word = word_alignment[i].first,
                len = word_alignment[i].second;
            for (int32 j = 0; j < len; j++)
              word_per_frame.push_back(word);
          }
          frame_writer.Write(key, word_per_frame);
        }
        n_done++;
      }
      KALDI_LOG << "Done " << n_done << " utterances; " << n_err << " had errors.";
    } catch(const std::exception &e) {
      std::cerr << e.what();
      return -1;
    }
  }