Blame view
egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh
3.27 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
#!/bin/bash

# Copyright 2017  Intellisist, Inc. (Author: Navneeth K)
#           2017  Xiaohui Zhang
#           2018  Ruizhe Huang
# Apache License 2.0

# This script trains a g2p model using Phonetisaurus.
#
# Steps (each guarded by --stage so a run can be resumed part-way):
#   stage 0: rewrite the lexicon as "<word>\t<phones>" lines, passed through
#            uconv (Any-NFC)           -> <work-dir>/lexicon_tab_separated.txt
#   stage 1: phonetisaurus-align       -> <work-dir>/aligned_lexicon.corpus
#   stage 2: utils/lang/make_kn_lm.py  -> <work-dir>/aligned_lexicon.arpa
#   stage 3: phonetisaurus-arpa2wfst   -> <work-dir>/model.fst

# Options; these defaults are set before utils/parse_options.sh is sourced.
stage=0
encoding='utf-8'
only_words=true
silence_phones=

echo "$0 $*"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

set -u
set -e

# NOTE(review): the usage text below advertises --cmd, but this script declares
# no `cmd` variable -- confirm whether parse_options.sh accepts the option.
if [ $# != 2 ]; then
  echo "Usage: $0 [options] <lexicon-in> <work-dir>"
  echo " where <lexicon-in> is the training lexicon (one pronunciation per "
  echo " word per line, with lines like 'hello h uh l ow') and"
  echo " <work-dir> is directory where the models will be stored"
  echo "e.g.: $0 --silence-phones data/local/dict/silence_phones.txt data/local/dict/lexicon.txt exp/g2p/"
  echo ""
  echo "main options (for others, see top of script file)"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --silence-phones <silphones-list> # e.g. data/local/dict/silence_phones.txt."
  echo " # A list of silence phones, one or more per line"
  echo " # Relates to --only-words option"
  echo " --only-words (true|false) (default: true) # If true, exclude silence words, i.e."
  echo " # words with one or multiple phones which are all silence."
  exit 1;
fi

lexicon=$1
wdir=$2

# A bare `exit` would return the status of the preceding echo (0), making a
# missing lexicon look like success; fail explicitly instead.
if [ ! -f "$lexicon" ]; then
  echo "Cannot find $lexicon"
  exit 1
fi

# Test the tools inside `if` conditions: under `set -e` a failing
# `var=$(which tool)` assignment would abort the script before the hint below
# could be printed.
if ! command -v uconv >/dev/null 2>&1; then
  echo "uconv was not found. You must install the icu4c package."
  exit 1;
fi

if ! command -v phonetisaurus-apply >/dev/null 2>&1; then
  echo "Phonetisarus was not found !"
  # Fall back to the literal name so `set -u` does not abort when KALDI_ROOT
  # is not set in the environment.
  echo "Go to ${KALDI_ROOT:-\$KALDI_ROOT}/tools and execute extras/install_phonetisaurus.sh"
  exit 1
fi

mkdir -p "$wdir"

# Rewrites "<word> <p1> <p2> ..." lines as "<word>\t<p1> <p2> ...", one entry
# per output line. Reads the files given as arguments, or stdin when none.
to_tab_separated() {
  awk '{printf("%s\t",$1); for (i=2;i<NF;i++){printf("%s ",$i);} printf("%s\n",$NF);}' "$@"
}

# Prepare the training lexicon: pass it through uconv (Any-NFC, same source and
# target encoding) and drop empty lines. When --only-words is true and a
# silence-phones file is given, first drop entries whose whole pronunciation
# equals the first field of some line in that file.
# NOTE(review): that filter is narrower than the usage text ("one or multiple
# phones which are all silence") -- confirm which behaviour is intended.
if [ "$stage" -le 0 ]; then
  if [ "$only_words" = true ] && [ -n "$silence_phones" ]; then
    awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i; if(!(s in a)) print $1" "s}' \
      "$silence_phones" "$lexicon" | \
      to_tab_separated | \
      uconv -f "$encoding" -t "$encoding" -x Any-NFC - | \
      awk 'NF > 0' > "$wdir/lexicon_tab_separated.txt"
  else
    to_tab_separated "$lexicon" | \
      uconv -f "$encoding" -t "$encoding" -x Any-NFC - | \
      awk 'NF > 0' > "$wdir/lexicon_tab_separated.txt"
  fi
fi

if [ "$stage" -le 1 ]; then
  # Align lexicon stage. Lexicon is assumed to have first column tab separated
  phonetisaurus-align --input="$wdir/lexicon_tab_separated.txt" \
    --ofile="${wdir}/aligned_lexicon.corpus" || exit 1;
fi

if [ "$stage" -le 2 ]; then
  # Convert aligned lexicon to arpa using make_kn_lm.py, a re-implementation of
  # srilm's ngram-count functionality.
  ./utils/lang/make_kn_lm.py -ngram-order 7 \
    -text "${wdir}/aligned_lexicon.corpus" -lm "${wdir}/aligned_lexicon.arpa"
fi

if [ "$stage" -le 3 ]; then
  # Convert the arpa file to FST.
  phonetisaurus-arpa2wfst --lm="${wdir}/aligned_lexicon.arpa" --ofile="${wdir}/model.fst"
fi