egs/wsj/s5/local/wsj_extend_char_dict.sh
3.16 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
#!/bin/bash

# Copyright 2017 Hossein Hadian

# This script extends the word list by including OOVs from the training
# transcripts. Since no phonemes are involved, we need no G2P models/rules.
# In other words, this script is like wsj_extend_dict.sh except that it deals
# with characters (i.e. graphemes) instead of phonemes, so it is much simpler.
# Parts of this script are taken from EESEN (https://github.com/srvk/eesen).

if [ $# -ne 3 ]; then
  echo "usage: $0 <wsj-corpus-dir> <dict-src-dir> <dict-larger-dir>"
  echo "e.g.: $0 WSJ/13-32.1/ data/local/lang_char data/local/lang_char_larger"
  exit 1;
fi

if [ "`basename $1`" != 13-32.1 ]; then
  echo "Expecting the first argument to this script to end in 13-32.1"
  exit 1
fi

corpusdir=$1
srcdir=$2
dir=$3
mincount=2  # Minimum count an OOV must have to be included in the lexicon.

mkdir -p $dir
cp $srcdir/lexicon.txt $dir/lexicon.ori.txt
cp $srcdir/nonsilence_phones.txt $dir
cp $srcdir/silence_phones.txt $dir
cp $srcdir/optional_silence.txt $dir

# The original wordlist.
cat $dir/lexicon.ori.txt | awk '{print $1}' | sort | uniq > $dir/wordlist.ori

# Get the training transcripts.
echo "Getting the training transcripts, may take some time ..."
touch $dir/cleaned.gz
if [ `du -m $dir/cleaned.gz | cut -f 1` -eq 73 ]; then
  # The fully cleaned file is about 73 MB; if it is already there, skip the
  # slow cleaning step below.
  echo "Not getting cleaned data in $dir/cleaned.gz again [already exists]";
else
  # Drop SGML markup lines, uppercase the LM training text, and split
  # sentence-final periods off nonwords, treating them as sentence breaks.
  gunzip -c $corpusdir/wsj1/doc/lng_modl/lm_train/np_data/{87,88,89}/*.z \
    | awk '/^</{next}{print toupper($0)}' | perl -e '
    open(F, "<$ARGV[0]") || die;
    while(<F>) { chop; $isword{$_} = 1; }
    while(<STDIN>) {
      @A = split(" ", $_);
      for ($n = 0; $n < @A; $n++) {
        $a = $A[$n];
        if (! $isword{$a} && $a =~ s/^([^\.]+)\.$/$1/) {
          # Nonwords that end in "." and have no other "." in them:
          # treat the "." as a period (sentence boundary).
          print "$a";
          if ($n+1 < @A) { print "\n"; }
        } else {
          print "$a ";
        }
      }
      print "\n";
    } ' $dir/wordlist.ori | gzip -c > $dir/cleaned.gz
fi

# Get unigram counts and the counts of the OOV words.
echo "Getting unigram counts"
gunzip -c $dir/cleaned.gz | tr -s ' ' '\n' | \
  awk '{count[$1]++} END{for (w in count) { print count[w], w; }}' | \
  sort -nr > $dir/unigrams

cat $dir/unigrams | awk -v dict=$dir/wordlist.ori \
  'BEGIN{while(getline<dict) seen[$1]=1;} {if(!seen[$2]){print;}}' \
  > $dir/oov.counts

echo "Most frequent unseen unigrams are: "
head $dir/oov.counts

# Select the OOVs whose counts are >= $mincount and include them in the
# lexicon, spelling each one out as a sequence of its characters.
cat $dir/oov.counts | awk -v thresh=$mincount \
  '{if ($1 >= thresh) { print $2; }}' > $dir/oovlist
cat $dir/oovlist | perl -e 'while(<>){ chop; $str="$_";
  foreach $p (split("", $_)) { $str="$str $p"; } print "$str\n"; }' \
  > $dir/lexicon.oov.txt

# Filter out OOV words that contain characters not among the non-silence
# characters.
cat $dir/lexicon.oov.txt | awk -v dict=$dir/nonsilence_phones.txt \
  'BEGIN{while(getline<dict) seen[$1]=1;}
   {for(i=2;i<=NF;i++) {if(!seen[$i]){break;}}; if (i==(NF+1)){print;}}' \
  > $dir/lexicon.oov.filt.txt

# The final expanded lexicon.
cat $dir/lexicon.ori.txt $dir/lexicon.oov.filt.txt > $dir/lexicon.txt

echo "Number of OOVs we handled is `cat $dir/lexicon.oov.filt.txt | wc -l`"
echo "Created the larger lexicon $dir/lexicon.txt"
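
# Illustration (not part of the original recipe): each line of lexicon.oov.txt
# pairs an OOV word with its spelled-out character sequence, so a hypothetical
# OOV such as "NETSCAPE" would yield the entry:
#   NETSCAPE N E T S C A P E
# After running the script, the surviving entries can be inspected with e.g.:
#   head data/local/lang_char_larger/lexicon.oov.filt.txt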
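
# Cross-check sketch (assumes bash and single-byte characters, as in the
# uppercase-ASCII WSJ text): the character-spelling perl step above is
# equivalent to spacing out each word with sed and pasting it next to itself:
#   paste -d' ' $dir/oovlist <(sed 's/\(.\)/\1 /g; s/ $//' $dir/oovlist)
# which produces the same "WORD W O R D" lines as lexicon.oov.txt.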