Blame view
egs/wsj/s5/steps/dict/train_g2p.sh
3.03 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
#!/bin/bash # Copyright 2014 Johns Hopkins University (Author: Yenda Trmal) # Copyright 2016 Xiaohui Zhang # Apache 2.0 # Begin configuration section. iters=5 stage=0 encoding='utf-8' only_words=true cmd=run.pl # a list of silence phones, like data/local/dict/silence_phones.txt silence_phones= # End configuration section. echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . utils/parse_options.sh || exit 1; set -u set -e if [ $# != 2 ]; then echo "Usage: $0 [options] <lexicon-in> <work-dir>" echo " where <lexicon-in> is the training lexicon (one pronunciation per " echo " word per line, with lines like 'hello h uh l ow') and" echo " <work-dir> is directory where the models will be stored" echo "e.g.: train_g2p.sh --silence-phones data/local/dict/silence_phones.txt data/local/dict/lexicon.txt exp/g2p/" echo "" echo "main options (for others, see top of script file)" echo " --iters <int> # How many iterations. Relates to N-ngram order" echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." echo " --silence-phones <silphones-list> # e.g. data/local/dict/silence_phones.txt." echo " # A list of silence phones, one or more per line" echo " # Relates to --only-words option" echo " --only-words (true|false) (default: true) # If true, exclude silence words, i.e." echo " # words with 1 phone which is a silence." exit 1; fi lexicon=$1 wdir=$2 mkdir -p $wdir/log [ ! -f $lexicon ] && echo "$0: Training lexicon does not exist." && exit 1 # Optionally remove words that are mapped to a single silence phone from the lexicon. if $only_words && [ ! -z "$silence_phones" ]; then awk -v s=$silence_phones \ 'BEGIN{while((getline<s)>0) {for(i=1;i<=NF;i++) sil[$i]=1;}} {if (!(NF == 2 && $2 in sil)) print;}' $lexicon > $wdir/lexicon_onlywords.txt lexicon=$wdir/lexicon_onlywords.txt fi if ! g2p=`which g2p.py` ; then echo "Sequitur was not found !" echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh" exit 1 fi echo "Training the G2P model (iter 0)" if [ $stage -le 0 ]; then $cmd $wdir/log/g2p.0.log \ g2p.py -S --encoding $encoding --train $lexicon --devel 5% --write-model $wdir/g2p.model.0 fi for i in `seq 0 $(($iters-2))`; do echo "Training the G2P model (iter $[$i + 1] )" if [ $stage -le $i ]; then $cmd $wdir/log/g2p.$(($i + 1)).log \ g2p.py -S --encoding $encoding --model $wdir/g2p.model.$i --ramp-up --train $lexicon --devel 5% --write-model $wdir/g2p.model.$(($i+1)) fi done ! (set -e; cd $wdir; ln -sf g2p.model.$[$iters-1] g2p.model.final ) && echo "Problem finalizing training... " && exit 1 if [ $stage -le $(($i + 2)) ]; then echo "Running test..." $cmd $wdir/log/test.log \ g2p.py --encoding $encoding --model $wdir/g2p.model.final --test $lexicon fi |