Blame view
egs/wsj/s5/steps/dict/apply_g2p.sh
3.03 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
#!/bin/bash
# Copyright 2014  Johns Hopkins University (Author: Yenda Trmal)
# Copyright 2016  Xiaohui Zhang
# Apache 2.0

# Applies a Sequitur G2P model (<g2p-model-dir>/g2p.model.final) to a word
# list, in $nj parallel jobs, and writes the result to <output-dir>:
#   wordlist.txt   copy of the input word list
#   output.N       raw g2p.py output of job N
#   output         concatenation of all output.N files
#   lexicon.lex    columns 1,3,4 of 'output'
#   log/           per-job logs
# A warning is printed when the number of distinct words in lexicon.lex
# differs from the number of distinct words in the word list.

# Begin configuration section.
stage=0
encoding='utf-8'
var_counts=3  # Generate up to N variants
var_mass=0.9  # Generate so many variants to produce 90 % of the prob mass
cmd=run.pl
nj=10         # Split the task into several parallel jobs, to speed things up
model=        # NOTE: always overwritten below with <g2p-model-dir>/g2p.model.final
# End configuration section.

echo "$0 $*"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh  # source the path.
. parse_options.sh || exit 1

set -u
set -e

if [ $# != 3 ]; then
  echo "Usage: $0 [options] <word-list> <g2p-model-dir> <output-dir>"
  echo "... where <word-list> is a list of words whose pronunciation is to be generated"
  echo " <g2p-model-dir> is a directory used as a target during training of G2P"
  echo " <output-dir> is the directory where the output lexicon should be stored"
  echo "e.g.: $0 oov_words exp/g2p exp/g2p/oov_lex"
  echo ""
  echo "main options (for others, see top of script file)"
  echo " --nj <int> # How many tasks should be spawn (to speedup things)"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  exit 1
fi

wordlist=$1
modeldir=$2
output=$3

mkdir -p "$output/log"

model=$modeldir/g2p.model.final
if [ ! -f "$model" ]; then
  echo "File $model not found in the directory $modeldir."
  exit 1
fi

# No explicit existence check for the word list: with 'set -e' active a
# failing cp aborts the script.
cp "$wordlist" "$output/wordlist.txt"

if ! command -v g2p.py >/dev/null 2>&1; then
  echo "The Sequitur was not found !"
  # KALDI_ROOT may be unset here; a bare expansion would abort under 'set -u'
  # before the hint is printed, so fall back to the literal variable name.
  echo "Go to ${KALDI_ROOT:-\$KALDI_ROOT}/tools and execute extras/install_sequitur.sh"
  exit 1
fi

echo "Applying the G2P model to wordlist $wordlist"

if [ "$stage" -le 0 ]; then
  # Drop per-job parts left behind by an earlier run (e.g. with a larger
  # --nj); otherwise the glob used for concatenation below would pick them up.
  rm -f -- "$output"/output.*

  # The escaped '|' and '>' are handed to $cmd as literal arguments so that
  # the pipeline and redirection happen inside each job; JOBS is a placeholder
  # that $cmd is expected to replace with the job index.
  # $cmd is intentionally left unquoted: it may carry extra options
  # (see the --cmd usage line above).
  # shellcheck disable=SC2086
  $cmd JOBS=1:"$nj" "$output/log/apply.JOBS.log" \
    split -n "l/JOBS/$nj" "$output/wordlist.txt" \| \
    g2p.py -V "$var_mass" --variants-number "$var_counts" --encoding "$encoding" \
      --model "$model" --apply - \
    \> "$output/output.JOBS"
fi

cat "$output"/output.* > "$output/output"

# The sequitur output is not readily usable as a lexicon (it adds one more
# column with the ordering of the pronunciation variants), so convert it into
# the proper lexicon form by keeping columns 1, 3 and 4 only.
output_lex=$output/lexicon.lex
cut -f 1,3,4 "$output/output" > "$output_lex"

# Some words might have been removed or skipped during the process,
# let's check it and warn the user if so...
nlex=$(cut -f 1 "$output_lex" | sort -u | wc -l)
nwlist=$(cut -f 1 "$output/wordlist.txt" | sort -u | wc -l)
if [ "$nlex" -ne "$nwlist" ]; then
  echo "WARNING: Unable to generate pronunciation for all words. "
  echo "WARNING: Wordlist: $nwlist words"
  echo "WARNING: Lexicon : $nlex words"
  echo "WARNING:Diff example: "
  # diff exits non-zero when the inputs differ, which is the expected case
  # here; '|| true' keeps 'set -e' from aborting the script.
  diff <(cut -f 1 "$output_lex" | sort -u) \
       <(cut -f 1 "$output/wordlist.txt" | sort -u) || true
fi

exit 0