Blame view
egs/wsj/s5/steps/dict/apply_g2p_phonetisaurus.sh
4.27 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
#!/bin/bash # Copyright 2014 Johns Hopkins University (Author: Yenda Trmal) # Copyright 2016 Xiaohui Zhang # 2018 Ruizhe Huang # Apache 2.0 # This script applies a trained Phonetisarus G2P model to # synthesize pronunciations for missing words (i.e., words in # transcripts but not the lexicon), and output the expanded lexicon. # The user could specify either nbest or pmass option # to determine the number of output pronunciation variants, # or use them together to get the intersection of two options. # Begin configuration section. stage=0 nbest= # Generate up to N, like N=3, pronunciation variants for each word # (The maximum size of the nbest list, not considering pruning and taking the prob-mass yet). thresh=5 # Pruning threshold for the n-best list, in (0, 99], which is a -log-probability value. # A large threshold makes the nbest list shorter, and less likely to hit the max size. # This value corresponds to the weight_threshold in shortest-path.h of openfst. pmass= # Select the top variants from the pruned nbest list, # summing up to this total prob-mass for a word. # On the "boundary", it's greedy by design, e.g. if pmass = 0.8, # and we have prob(pron_1) = 0.5, and prob(pron_2) = 0.4, then we get both. # End configuration section. echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. . utils/parse_options.sh || exit 1; set -u set -e if [ $# != 3 ]; then echo "Usage: $0 [options] <word-list> <g2p-model-dir> <output-dir>" echo "... where <word-list> is a list of words whose pronunciation is to be generated." echo " <g2p-model-dir> is a directory used as a target during training of G2P" echo " <output-dir> is the directory where the output lexicon should be stored." echo " The format of the output lexicon output-dir/lexicon.lex is" echo " <word>\t<prob>\t<pronunciation> per line." echo "e.g.: $0 --nbest 1 exp/g2p/oov_words.txt exp/g2p exp/g2p/oov_lex" echo "" echo "main options (for others, see top of script file)" echo " --nbest <int> # Generate upto N pronunciation variants for each word." echo " --pmass <float> # Select the top variants from the pruned nbest list," echo " # summing up to this total prob-mass, within [0, 1], for a word." echo " --thresh <int> # Pruning threshold for n-best." exit 1; fi wordlist=$1 modeldir=$2 outdir=$3 model=$modeldir/model.fst output_lex=$outdir/lexicon.lex mkdir -p $outdir [ ! -f ${model:-} ] && echo "$0: File $model not found in the directory $modeldir." && exit 1 [ ! -f $wordlist ] && echo "$0: File $wordlist not found!" && exit 1 [ -z $pmass ] && [ -z $nbest ] && echo "$0: nbest or/and pmass should be specified." && exit 1; if ! phonetisaurus=`which phonetisaurus-apply` ; then echo "Phonetisarus was not found !" echo "Go to $KALDI_ROOT/tools and execute extras/install_phonetisaurus.sh" exit 1 fi cp $wordlist $outdir/wordlist.txt # three options: 1) nbest, 2) pmass, 3) nbest+pmass, nbest=${nbest:-20} # if nbest is not specified, set it to 20, due to Phonetisaurus mechanism pmass=${pmass:-1.0} # if pmass is not specified, set it to 1.0, due to Phonetisaurus mechanism [[ ! $nbest =~ ^[1-9][0-9]*$ ]] && echo "$0: nbest should be a positive integer." && exit 1; echo "Applying the G2P model to wordlist $wordlist" phonetisaurus-apply --pmass $pmass --nbest $nbest --thresh $thresh \ --word_list $wordlist --model $model \ --accumulate --verbose --prob \ 1>$output_lex echo "Completed. Synthesized lexicon for new words is in $output_lex" # Some words might have been removed or skipped during the process, # let's check it and warn the user if so... nlex=`cut -f 1 $output_lex | sort -u | wc -l` nwlist=`cut -f 1 $wordlist | sort -u | wc -l` if [ $nlex -ne $nwlist ] ; then failed_wordlist=$outdir/lexicon.failed echo "WARNING: Unable to generate pronunciation for all words. "; echo "WARINNG: Wordlist: $nwlist words" echo "WARNING: Lexicon : $nlex words" comm -13 <(cut -f 1 $output_lex | sort -u ) \ <(cut -f 1 $wordlist | sort -u ) \ >$failed_wordlist && echo "WARNING: The list of failed words is in $failed_wordlist" fi exit 0 |