Yannick Estève / ONTRAC-Kaldi

Blame view

egs/aidatatang_200zh/s5/local/prepare_dict.sh 10.7 KB
  #!/bin/bash
  #Copyright 2016 LeSpeech (Author: Xingyu Na)
  
  # Prepare the dictionary for aidatatang.
  # This is done for English and Chinese separately.
  # For English, we use the CMU dictionary, and Sequitur G2P
  # for OOVs; the English phone set is converted to the Chinese
  # phone set at the end. For Chinese, we use an online dictionary;
  # for OOVs, we just produce pronunciations using character mapping.
  
  # Pull in the recipe environment (path.sh is expected to live in the
  # directory the script is run from).
  . ./path.sh

  # The script takes no positional arguments.
  if [ $# != 0 ]; then
    echo "Usage: $0"
    exit 1
  fi

  # Input transcript directories and the output dictionary directory.
  train_dir=data/local/train
  dev_dir=data/local/dev
  test_dir=data/local/test
  dict_dir=data/local/dict

  # Working area: the dictionary root plus one sub-directory per language.
  mkdir -p $dict_dir $dict_dir/lexicon-en $dict_dir/lexicon-ch
  
  # Extract the full vocabulary: every token after field 1 (presumably the
  # utterance id) of the train, dev and test transcripts, one token per line,
  # de-duplicated, with the non-speech tags removed.
  # The perl stage turns any remaining blank into a line break; the
  # replacement has to be the two-character escape \n (a raw line break inside
  # the quotes would also inject the indentation of the continuation line).
  cat $train_dir/text $dev_dir/text $test_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\
    perl -ape 's/ /\n/g;' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\
    grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/words.txt || exit 1;

  # Split into English and Chinese: a word containing at least one ASCII
  # letter goes to the English list, every other word to the Chinese list.
  grep '[a-zA-Z]' $dict_dir/words.txt > $dict_dir/lexicon-en/words-en.txt || exit 1;
  grep -v '[a-zA-Z]' $dict_dir/words.txt > $dict_dir/lexicon-ch/words-ch.txt || exit 1;
  
  
  ##### produce pronunciations for english
  # Check out CMUdict (pinned to SVN revision 13068) unless it is already there.
  if [ ! -f "$dict_dir/cmudict/cmudict.0.7a" ]; then
    echo "--- Downloading CMU dictionary ..."
    svn co -r 13068 https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \
      "$dict_dir/cmudict" || exit 1;
  fi

  # format cmudict: run the bundled make_baseform.pl (writing to stdout), then
  # let sed drop a "(N)" variant suffix that directly follows the head word.
  echo "--- Striping stress and pronunciation variant markers from cmudict ..."
  perl "$dict_dir/cmudict/scripts/make_baseform.pl" \
    "$dict_dir/cmudict/cmudict.0.7a" /dev/stdout |\
    sed -e 's:^\([^\s(]\+\)([0-9]\+)\(\s\+\)\(.*\):\1\2\3:' > "$dict_dir/cmudict/cmudict-plain.txt" || exit 1;

  # extract in-vocab lexicon and oov words
  # grep -E replaces the obsolescent egrep (recent GNU grep warns on egrep).
  echo "--- Searching for English OOV words ..."
  # OOV list: corpus words whose first field is not a CMUdict head word.
  awk 'NR==FNR{words[$1]; next;} !($1 in words)' \
    "$dict_dir/cmudict/cmudict-plain.txt" "$dict_dir/lexicon-en/words-en.txt" |\
    grep -E -v '<.?s>' > "$dict_dir/lexicon-en/words-en-oov.txt" || exit 1;

  # In-vocabulary lexicon: CMUdict entries whose head word occurs in the corpus.
  awk 'NR==FNR{words[$1]; next;} ($1 in words)' \
    "$dict_dir/lexicon-en/words-en.txt" "$dict_dir/cmudict/cmudict-plain.txt" |\
    grep -E -v '<.?s>' > "$dict_dir/lexicon-en/lexicon-en-iv.txt" || exit 1;

  # Report the size of both lists.
  wc -l "$dict_dir/lexicon-en/words-en-oov.txt"
  wc -l "$dict_dir/lexicon-en/lexicon-en-iv.txt"
  
  # setup g2p and generate oov lexicon
  # wget -O creates the output file even when the transfer fails, so check the
  # exit status and require a non-empty file rather than mere existence; a
  # partial/empty model is removed so the next run retries the download.
  if [ ! -s conf/g2p_model ]; then
    echo "--- Downloading a pre-trained Sequitur G2P model ..."
    if ! wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O conf/g2p_model ||
       [ ! -s conf/g2p_model ]; then
      rm -f conf/g2p_model
      echo "Failed to download the g2p model!"
      exit 1
    fi
  fi

  echo "--- Preparing pronunciations for OOV words ..."
  # Locate g2p.py on PATH. The result is tested for emptiness explicitly:
  # an unquoted empty value would make `[ ! -x $g2p ]` collapse to `[ ! -x ]`,
  # which is false, so a missing tool would go unnoticed.
  g2p=$(command -v g2p.py)
  if [ -z "$g2p" ] || [ ! -x "$g2p" ]; then
    echo "g2p.py is not found. Checkout tools/extras/install_sequitur.sh."
    exit 1
  fi
  # Apply the model to the OOV word list.
  g2p.py --model=conf/g2p_model --apply $dict_dir/lexicon-en/words-en-oov.txt \
    > $dict_dir/lexicon-en/lexicon-en-oov.txt || exit 1;

  # merge in-vocab and oov lexicon
  cat $dict_dir/lexicon-en/lexicon-en-oov.txt $dict_dir/lexicon-en/lexicon-en-iv.txt |\
    sort > $dict_dir/lexicon-en/lexicon-en-phn.txt || exit 1;
  
  # convert cmu phoneme to pinyin phoneme
  mkdir -p $dict_dir/map
  # map/cmu: the distinct first-column symbols of conf/cmu2pinyin.
  awk '{print $1;}' conf/cmu2pinyin | sort -u > $dict_dir/map/cmu || exit 1;
  # map/cmu-used: symbols from map/cmu that occur in fields 2..NF of
  # conf/pinyin2cmu. getline is compared with 0 because it returns -1 on a
  # read error, which a bare truth test would treat as "keep looping".
  awk -v cmu=$dict_dir/map/cmu \
    'BEGIN{while((getline<cmu) > 0) dict[$1] = 1;}
     {for (i = 2; i <=NF; i++) if (dict[$i]) print $i;}' conf/pinyin2cmu | sort -u > $dict_dir/map/cmu-used || exit 1;
  # map/cmu-not-used: symbols of map/cmu that are absent from map/cmu-used.
  awk -v cmu=$dict_dir/map/cmu-used \
    'BEGIN{while((getline<cmu) > 0) dict[$1] = 1;}
     {if (!dict[$1]) print $1;}' $dict_dir/map/cmu > $dict_dir/map/cmu-not-used || exit 1;

  # map/cmu-py: the conf/cmu2pinyin rows whose first field is in
  # map/cmu-not-used. grep -E replaces the obsolescent egrep.
  awk 'NR==FNR{words[$1]; next;} ($1 in words)' \
    $dict_dir/map/cmu-not-used conf/cmu2pinyin |\
    grep -E -v '<.?s>' > $dict_dir/map/cmu-py || exit 1;
  
  # Build map/cmu-cmu: for every row of map/cmu-py keep the first field and
  # replace each following symbol by the symbol list that conf/pinyin2cmu
  # (first field = key, remaining fields = value) assigns to it. A symbol
  # without an entry contributes nothing.
  cat $dict_dir/map/cmu-py | \
    perl -e '
    open(MAPS, $ARGV[0]) or die("could not open map file");
    my %py2ph;
    foreach $line (<MAPS>) {
      @A = split(" ", $line);
      $py = shift(@A);
      $py2ph{$py} = [@A];
    }
    close(MAPS);
    my @entry;
    while (<STDIN>) {
      @A = split(" ", $_);
      @entry = ();
      $W = shift(@A);
      push(@entry, $W);
      for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); }
      # Terminate the record with the \n escape so that no indentation from
      # this script leaks into the output.
      print "@entry\n";
    }
  ' conf/pinyin2cmu > $dict_dir/map/cmu-cmu || exit 1;
  
  # Build lexicon-en.txt: rewrite every pronunciation of lexicon-en-phn.txt
  # through map/cmu-cmu (first field = key, remaining fields = replacement).
  # Phones that have an entry are replaced by its symbol list, all other
  # phones are copied unchanged.
  cat $dict_dir/lexicon-en/lexicon-en-phn.txt | \
    perl -e '
    open(MAPS, $ARGV[0]) or die("could not open map file");
    my %py2ph;
    foreach $line (<MAPS>) {
      @A = split(" ", $line);
      $py = shift(@A);
      $py2ph{$py} = [@A];
    }
    close(MAPS);
    my @entry;
    while (<STDIN>) {
      @A = split(" ", $_);
      @entry = ();
      $W = shift(@A);
      push(@entry, $W);
      for($i = 0; $i < @A; $i++) {
        if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); }
        else {push(@entry, $A[$i])};
      }
      # Terminate the record with the \n escape so that no indentation from
      # this script leaks into the lexicon.
      print "@entry\n";
    }
  ' $dict_dir/map/cmu-cmu > $dict_dir/lexicon-en/lexicon-en.txt || exit 1;
  
  
  ##### produce pronunciations for chinese
  if [ ! -f $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt ]; then
    echo "------------- Downloading cedit dictionary ---------------"
    mkdir -p $dict_dir/cedict
    wget -P $dict_dir/cedict http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz
    gunzip $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz
  fi
  
  cat $dict_dir/cedict/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\
   perl -e '
    while (<STDIN>) {
      @A = split(" ", $_);
      print $A[1];
      for($n = 2; $n < @A; $n++) {
        $A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:;
        $tmp = uc($A[$n]);
        print " $tmp";
      }
      print "
  ";
    }
   ' | sort -k1 > $dict_dir/cedict/ch-dict.txt || exit 1;
  
  echo "--- Searching for Chinese OOV words ..."
  # OOV list: corpus words whose first field is not a head word of ch-dict.txt.
  # grep -E replaces the obsolescent egrep (recent GNU grep warns on egrep).
  awk 'NR==FNR{words[$1]; next;} !($1 in words)' \
    "$dict_dir/cedict/ch-dict.txt" "$dict_dir/lexicon-ch/words-ch.txt" |\
    grep -E -v '<.?s>' > "$dict_dir/lexicon-ch/words-ch-oov.txt" || exit 1;

  # In-vocabulary lexicon: ch-dict.txt entries whose head word is in the corpus.
  awk 'NR==FNR{words[$1]; next;} ($1 in words)' \
    "$dict_dir/lexicon-ch/words-ch.txt" "$dict_dir/cedict/ch-dict.txt" |\
    grep -E -v '<.?s>' > "$dict_dir/lexicon-ch/lexicon-ch-iv.txt" || exit 1;

  # Report the size of both lists.
  wc -l "$dict_dir/lexicon-ch/words-ch-oov.txt"
  wc -l "$dict_dir/lexicon-ch/lexicon-ch-iv.txt"
  
  
  # Keep only the dictionary entries whose head word has exactly as many
  # characters as the entry has pronunciation fields; these are the entries
  # from which a per-character dictionary can be derived for the OOV words.
  cat $dict_dir/cedict/ch-dict.txt |\
    perl -e '
    use utf8;
    binmode(STDIN,":encoding(utf8)");
    binmode(STDOUT,":encoding(utf8)");
    while (my $entry = <STDIN>) {
      my @fields = split(" ", $entry);
      # Characters in the head word vs. number of fields that follow it.
      my $n_chars = length($fields[0]);
      my $n_prons = $#fields;
      print $entry if $n_chars == $n_prons;
    }
    ' > $dict_dir/cedict/ch-dict-1.txt || exit 1;
  
  # extract chars
  # Emit every character of every head word of ch-dict-1.txt on its own line.
  # The separator must be exactly the \n escape: a raw line break inside the
  # perl string would append this script's indentation, leaving a
  # whitespace-only last line that the empty-line filter cannot remove and
  # that breaks the later character/pinyin count comparison.
  cat $dict_dir/cedict/ch-dict-1.txt | awk '{print $1}' |\
    perl -e '
    use utf8;
    binmode(STDIN,":encoding(utf8)");
    binmode(STDOUT,":encoding(utf8)");
    while (<STDIN>) {
      @A = split(" ", $_);
      @chars = split("", $A[0]);
      foreach (@chars) {
        print "$_\n";
      }
    }
    ' | grep -v '^$' > $dict_dir/lexicon-ch/ch-char.txt || exit 1;
  
  # extract individual pinyins
  # One pronunciation field of ch-dict-1.txt per output line, in file order.
  # The perl stage breaks any remaining blank into a line break; the
  # replacement is the \n escape (a raw line break would add indentation).
  cat $dict_dir/cedict/ch-dict-1.txt |\
    awk '{for(i=2; i<=NF; i++) print $i}' |\
    perl -ape 's/ /\n/g;' > $dict_dir/lexicon-ch/ch-char-pinyin.txt || exit 1;
  
  # The character list and the pinyin list are paired line by line below, so
  # they have to contain the same number of lines; stop otherwise.
  npinyin=$(wc -l < $dict_dir/lexicon-ch/ch-char-pinyin.txt)
  nchars=$(wc -l < $dict_dir/lexicon-ch/ch-char.txt)
  if [ $nchars -ne $npinyin ]; then
    echo "Found $nchars chars and $npinyin pinyin. Please check!"
    exit 1
  fi

  # Pair line i of the character list with line i of the pinyin list and keep
  # each distinct (character, pinyin) pair once, sorted.
  paste $dict_dir/lexicon-ch/ch-char.txt \
    $dict_dir/lexicon-ch/ch-char-pinyin.txt |\
    sort -u > $dict_dir/lexicon-ch/ch-char-dict.txt || exit 1;
  
  # create a multiple pronunciation dictionary
  # Input is the sorted (character, pinyin) list. Consecutive lines with the
  # same character are merged into one line in which the alternative pinyins
  # are joined by "/". A finished line is terminated with the \n escape (a raw
  # line break would add this script's indentation). As before, the final
  # line is written without a trailing line break.
  cat $dict_dir/lexicon-ch/ch-char-dict.txt |\
    perl -e '
    my $prev = "";
    my $out_line = "";
    while (<STDIN>) {
      @A = split(" ", $_);
      $cur = $A[0];
      $cur_py = $A[1];
      # First line: start the first group.
      if (length($prev) == 0) { $out_line = $_; chomp($out_line); }
      # New character: flush the finished group and start the next one.
      if (length($prev) > 0 && $cur ne $prev) { print "$out_line\n"; $out_line = $_; chomp($out_line); }
      # Same character again: append the alternative pronunciation.
      if (length($prev) > 0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py"; }
      $prev = $cur;
    }
    print $out_line;
    ' >  $dict_dir/lexicon-ch/ch-char-dict-mp.txt || exit 1;
  
  # get lexicon for Chinese OOV words
  # local/create_oov_char_lexicon.pl is given the per-character dictionary and
  # the OOV word list; its output is post-processed below.
  local/create_oov_char_lexicon.pl $dict_dir/lexicon-ch/ch-char-dict-mp.txt \
    $dict_dir/lexicon-ch/words-ch-oov.txt > $dict_dir/lexicon-ch/lexicon-ch-oov.txt || exit 1;

  # separate multiple prons for Chinese OOV lexicon
  # Every field after the word may hold "/"-separated alternatives; print one
  # line per combination of alternatives (the cross product over all fields).
  # Lines are terminated with the \n escape so that no indentation from this
  # script ends up in the lexicon.
  cat $dict_dir/lexicon-ch/lexicon-ch-oov.txt |\
    perl -e '
    my @entry;
    my @entry1;
    while (<STDIN>) {
      @A = split(" ", $_);
      @entry = ();
      push(@entry, $A[0]);
      for($i = 1; $i < @A; $i++ ) {
        @py = split("/", $A[$i]);
        # Extend every partial pronunciation built so far by every
        # alternative of the current field.
        @entry1 = @entry;
        @entry = ();
        for ($j = 0; $j < @entry1; $j++) {
          for ($k = 0; $k < @py; $k++) {
            $tmp = $entry1[$j]." ".$py[$k];
            push(@entry, $tmp);
          }
        }
      }
      for ($i = 0; $i < @entry; $i++) {
        print "$entry[$i]\n";
      }
    }
    ' > $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt || exit 1;
  
  # Chinese lexicon = OOV entries followed by in-vocabulary entries, keeping
  # only lines that have a second field containing an ASCII letter or digit.
  cat $dict_dir/lexicon-ch/lexicon-ch-oov-mp.txt $dict_dir/lexicon-ch/lexicon-ch-iv.txt |\
    awk 'NF > 1 && $2 ~ /[A-Za-z0-9]+/' > $dict_dir/lexicon-ch/lexicon-ch.txt || exit 1;

  # Rewrite the pinyin notation ("U:" becomes "V", " R<digit>" becomes
  # " ER<digit>") and map the result through conf/pinyin2cmu.
  cat $dict_dir/lexicon-ch/lexicon-ch.txt |\
    sed -e 's/U:/V/g' -e 's/ R\([0-9]\)/ ER\1/g' |\
    utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch/lexicon-ch-cmu.txt || exit 1;

  # Merge the English and the Chinese lexicon into one sorted, duplicate-free file.
  cat $dict_dir/lexicon-en/lexicon-en.txt \
      $dict_dir/lexicon-ch/lexicon-ch-cmu.txt |\
    sort -u > $dict_dir/lexicon1.txt || exit 1;
  
  # nonsilence_phones.txt: collect every phone used in lexicon1.txt (fields
  # 2..NF) and put phones that share a base name - the phone with one digit
  # after its leading capital letters removed - on the same line.
  # Each group is terminated with the \n escape; a raw line break inside the
  # perl string would prefix the following lines with script indentation.
  cat $dict_dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}'| \
    sort -u |\
    perl -e '
    my %ph_cl;
    while (<STDIN>) {
      $phone = $_;
      chomp($phone);
      chomp($_);
      $phone =~ s:([A-Z]+)[0-9]:$1:;
      if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_)  }
      else { $ph_cl{$phone} = [$_]; }
    }
    foreach $key ( keys %ph_cl ) {
       print "@{ $ph_cl{$key} }\n"
    }
    ' | sort -k1 > $dict_dir/nonsilence_phones.txt  || exit 1;
  
  # Silence phone inventory and the optional-silence phone.
  ( echo SIL; echo SPN; echo NSN; echo LAU ) > $dict_dir/silence_phones.txt

  echo SIL > $dict_dir/optional_silence.txt

  # extra_questions.txt:
  #   line 1     - all silence phones on one line;
  #   afterwards - one line per distinct numeric suffix, listing the
  #                non-silence phones that end in that suffix (phones without
  #                a suffix share one line).
  # The line terminators are written as the \n escape: awk rejects a raw line
  # break inside a string literal, and in perl it would add stray indentation.
  cat $dict_dir/silence_phones.txt| awk '{printf("%s ", $1);} END{printf "\n";}' > $dict_dir/extra_questions.txt || exit 1;
  cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", $_)) {
    $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_"; $q{$2} .= "$p "; } } foreach $l (values %q) {print "$l\n";}' \
   >> $dict_dir/extra_questions.txt || exit 1;
  
  # Final lexicon: the silence, noise and unknown-word entries first, then the
  # merged English/Chinese lexicon.
  printf '%s\n' \
    '!SIL SIL' \
    '[VOCALIZED-NOISE] SPN' \
    '[NOISE] NSN' \
    '[LAUGHTER] LAU' \
    '<UNK> SPN' | \
    cat - $dict_dir/lexicon1.txt  > $dict_dir/lexicon.txt || exit 1;

  echo "$0: aidatatang_200zh dict preparation succeeded"
  exit 0;