Yannick Estève / ONTRAC-Kaldi

Blame view

egs/wsj/s5/utils/add_lex_disambig.pl 6.64 KB
  #!/usr/bin/env perl
  #  Copyright 2010-2011  Microsoft Corporation
  #            2013-2016  Johns Hopkins University (author: Daniel Povey)
  #                 2015  Hainan Xu
  #                 2015  Guoguo Chen
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  
  # Adds disambiguation symbols to a lexicon.
  # Outputs still in the normal lexicon format.
  # Disambig syms are numbered #1, #2, #3, etc. (#0
  # reserved for symbol in grammar).
  # Outputs the number of the highest-numbered disambig symbol used to the standard output.
  # With the --pron-probs option, expects the second field
  # of each lexicon line to be a pron-prob.
  # With the --sil-probs option, expects three additional
  # fields after the pron-prob, representing various components
  # of the silence probability model.
  
  # Command-line options.  All are optional, may appear in any order, and must
  # precede the two positional arguments:
  #   --pron-probs                  second lexicon field is a pron-prob
  #   --sil-probs                   three silence-model fields follow the pron-prob
  #   --first-allowed-disambig <n>  lowest disambig symbol number this script may add
  $pron_probs = 0;
  $sil_probs = 0;
  $first_allowed_disambig = 1;

  # Consume recognized options from the front of @ARGV.  Stop at the first
  # argument that is not a recognized option so that it is left in place for
  # the positional-argument check that follows.
  while (@ARGV > 0) {
    if ($ARGV[0] eq "--pron-probs") {
      $pron_probs = 1;
      shift @ARGV;
    } elsif ($ARGV[0] eq "--sil-probs") {
      $sil_probs = 1;
      shift @ARGV;
    } elsif ($ARGV[0] eq "--first-allowed-disambig") {
      shift @ARGV;
      my $disambig_arg = shift @ARGV;   # undef if the value is missing
      # The value becomes part of a symbol name ("#<n>"), so insist on a
      # positive integer instead of silently coercing with numeric addition.
      if (!defined $disambig_arg || $disambig_arg !~ /^\d+$/ || $disambig_arg < 1) {
        my $shown = defined $disambig_arg ? $disambig_arg : "(missing)";
        die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $shown\n";
      }
      $first_allowed_disambig = 0 + $disambig_arg;   # normalizes e.g. "007" to 7
    } else {
      last;
    }
  }
  
  # Exactly two positional arguments (input and output lexicon) must remain
  # once the options have been consumed; otherwise die with the usage text.
  if (@ARGV != 2) {
    # One list element per output line; joined with newlines below so that no
    # line can accidentally run into the next one.
    my @usage_lines = (
      "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>",
      "This script adds disambiguation symbols to a lexicon in order to",
      "make decoding graphs determinizable; it adds pseudo-phone",
      "disambiguation symbols #1, #2 and so on at the ends of phones",
      "to ensure that all pronunciations are different, and that none",
      "is a prefix of another.",
      "It prints to the standard output the number of the largest-numbered",
      "disambiguation symbol that was used.",
      "",
      "Options:   --pron-probs       Expect pronunciation probabilities in the 2nd field",
      "           --sil-probs        [should be with --pron-probs option]",
      "                              Expect 3 extra fields after the pron-probs, for aspects of",
      "                              the silence probability model",
      "           --first-allowed-disambig <n>  The number of the first disambiguation symbol",
      "                              that this script is allowed to add.  By default this is",
      "                              #1, but you can set this to a larger value using this option.",
      "e.g.:",
      " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt",
      " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt",
      " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt",
    );
    # The trailing newline stops die from appending "at <script> line <n>.".
    die join("\n", @usage_lines) . "\n";
  }
  
  
  # Positional arguments: input lexicon path, then output lexicon path.
  $lexfn = shift @ARGV;
  $lexoutfn = shift @ARGV;

  # (1)  Read in the lexicon.
  # Every line is split on whitespace and re-joined with single spaces, so
  # @L holds one whitespace-normalized entry per input line.
  @L = ( );
  {
    my $lex_in;
    if ($lexfn eq "-") {
      # The two-argument open used previously treated "<-" as standard input;
      # keep that behaviour explicitly now that the three-argument form is used.
      $lex_in = \*STDIN;
    } else {
      open($lex_in, "<", $lexfn) || die "Error opening lexicon $lexfn: $!";
    }
    while (my $line = <$lex_in>) {
      push @L, join(" ", split(" ", $line));
    }
    if ($lexfn ne "-") {
      close($lex_in) || die "Error closing lexicon $lexfn: $!";
    }
  }
  
  # (2) Work out the count of each phone-sequence in the
  # lexicon.
  # While doing so, validate the optional numeric fields: the pron-prob and
  # the first silprob field must lie in (0, 1], the two correction terms must
  # be positive, and at least one phone must remain after those fields.

  foreach my $entry (@L) {
    my @fields = split(" ", $entry);
    shift @fields; # Remove word.
    if ($pron_probs) {
      my $p = shift @fields;
      if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $entry (expecting pron-prob as second field)"; }
    }
    if ($sil_probs) {
      my $silp = shift @fields;
      if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $entry for silprobs"; }
      # Two correction terms follow; both are checked the same way.
      my $correction = shift @fields;
      if ($correction <= 0.0) { die "Bad lexicon line $entry for silprobs"; }
      $correction = shift @fields;
      if ($correction <= 0.0) { die "Bad lexicon line $entry for silprobs"; }
    }
    if (!@fields) {
      # Report the offending lexicon line itself (the message used to
      # interpolate the regex capture variable $1, which is empty here).
      die "Bad lexicon line $entry, no phone in phone list";
    }
    $count{join(" ", @fields)}++;
  }
  
  # (3) Record in %issubseq every proper prefix of every phone sequence
  # (the empty prefix included), so that a later pass can tell whether one
  # pronunciation is a prefix of a longer one.

  # Number of leading fields that are not phones: the word itself, the
  # pron-prob when --pron-probs is on, and the three silence-model numbers
  # when --sil-probs is on.
  my $num_non_phone_fields = 1 + ($pron_probs ? 1 : 0) + ($sil_probs ? 3 : 0);

  for my $entry (@L) {
    my @phones = split(" ", $entry);
    splice(@phones, 0, $num_non_phone_fields);
    # A prefix of length k is phones 0 .. k-1; k runs from 0 (empty prefix)
    # up to one less than the full length.
    for my $prefix_len (0 .. $#phones) {
      $issubseq{join(" ", @phones[0 .. $prefix_len - 1])} = 1;
    }
  }
  
  # (4) For each entry in the lexicon:
  #  if the phone sequence is unique and is not a
  #  prefix of another word, no disambig symbol.
  #  Else output #1, or #2, #3, ... if the same phone-seq
  #  has already been assigned a disambig symbol.
  
  
  # Write the output lexicon: one tab-separated line per input entry, with a
  # disambiguation symbol appended to the phone sequence wherever that
  # sequence is shared by several entries or is a prefix of another one.
  # Finally print the highest disambig symbol number used to standard output.
  my $lex_out;
  if ($lexoutfn eq "-") {
    # The two-argument open used previously treated ">-" as standard output;
    # keep that behaviour explicitly now that the three-argument form is used.
    $lex_out = \*STDOUT;
  } else {
    open($lex_out, ">", $lexoutfn) || die "Error opening lexicon file $lexoutfn for writing: $!\n";
  }

  # max_disambig will always be the highest-numbered disambiguation symbol that
  # has been used so far.
  $max_disambig = $first_allowed_disambig - 1;

  foreach my $entry (@L) {
    my @fields = split(" ", $entry);
    my $word = shift @fields;
    my ($pron_prob, $sil_word_prob, $word_sil_correction, $prev_nonsil_correction);
    if ($pron_probs) {
      $pron_prob = shift @fields;
    }
    if ($sil_probs) {
      $sil_word_prob = shift @fields;
      $word_sil_correction = shift @fields;
      $prev_nonsil_correction = shift @fields;
    }
    my $phnseq = join(" ", @fields);
    if (defined $issubseq{$phnseq} || $count{$phnseq} != 1) {
      if ($phnseq eq "") {
        # The empty string needs disambig symbols that are not used anywhere
        # else, so take a fresh number and reserve it.
        $max_disambig++;
        $reserved_for_the_empty_string{$max_disambig} = 1;
        $phnseq = "#$max_disambig";
      } else {
        # Numbering restarts at the first allowed symbol for each distinct
        # phone sequence; only entries sharing a sequence get increasing numbers.
        my $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
        if (!defined $cur_disambig) {
          $cur_disambig = $first_allowed_disambig;
        } else {
          $cur_disambig++;           # Get a number that has not been used yet for
                                     # this phone sequence.
        }
        # Skip over numbers reserved for empty-string entries.
        while (defined $reserved_for_the_empty_string{$cur_disambig}) {
          $cur_disambig++;
        }
        if ($cur_disambig > $max_disambig) {
          $max_disambig = $cur_disambig;
        }
        $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
        $phnseq = $phnseq . " #" . $cur_disambig;
      }
    }
    # else: the sequence is unique and not a prefix of another, so it is
    # written unchanged.

    # The silence-model fields are only written when --pron-probs is also on.
    # Lines end with a bare "\n" so no stray indentation leaks into the output.
    if ($pron_probs) {
      if ($sil_probs) {
        print $lex_out "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
      } else {
        print $lex_out "$word\t$pron_prob\t$phnseq\n";
      }
    } else {
      print $lex_out "$word\t$phnseq\n";
    }
  }

  # Buffered write errors only surface at close, so check it.
  if ($lexoutfn ne "-") {
    close($lex_out) || die "Error closing lexicon file $lexoutfn: $!\n";
  }

  print $max_disambig . "\n";