Blame view

egs/wsj/s5/steps/dict/apply_g2p_phonetisaurus.sh 4.27 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
  #!/bin/bash
  # Copyright 2014  Johns Hopkins University (Author: Yenda Trmal)
  # Copyright 2016  Xiaohui Zhang
  #           2018  Ruizhe Huang
  # Apache 2.0
  
  # This script applies a trained Phonetisaurus G2P model to
  # synthesize pronunciations for missing words (i.e., words in
  # transcripts but not the lexicon), and outputs the expanded lexicon.
  # The user can specify either the nbest or the pmass option
  # to determine the number of output pronunciation variants,
  # or use both together to get the intersection of the two options.
  
  # Begin configuration section.
  # Default values of the script's options; the usage message refers here for
  # the options it does not list itself.

  # Not consulted by any of the steps further down in this script.
  stage=0

  # Generate up to N (e.g. N=3) pronunciation variants for each word.
  # This is the maximum size of the nbest list, before pruning is applied and
  # before the prob-mass selection is taken into account.
  nbest=""

  # Pruning threshold for the n-best list: a -log-probability value in (0, 99].
  # A large threshold makes the nbest list shorter, and less likely to hit the
  # max size. The value corresponds to weight_threshold in shortest-path.h of
  # openfst.
  thresh=5

  # Select the top variants from the pruned nbest list, summing up to this
  # total prob-mass for a word. On the "boundary" the selection is greedy by
  # design: e.g. with pmass = 0.8, prob(pron_1) = 0.5 and prob(pron_2) = 0.4,
  # both variants are returned.
  pmass=""
  # End configuration section.
  
  # Log the invocation: the script name followed by all of its arguments.
  echo "$0 $*"

  # Pick up the environment from path.sh when the working directory has one.
  if [ -f ./path.sh ]; then
    . ./path.sh
  fi
  # Load utils/parse_options.sh (presumably it consumes the leading --option
  # arguments, since the positional count is checked right afterwards);
  # stop if sourcing it fails.
  . utils/parse_options.sh || exit 1

  # From this point on, treat unset variables and failing commands as fatal.
  set -ue
  
  # Exactly three positional arguments are required; anything else prints the
  # help text and terminates with status 1.
  if [[ $# -ne 3 ]]; then
    printf '%s\n' \
      "Usage: $0 [options] <word-list> <g2p-model-dir> <output-dir>" \
      "... where <word-list> is a list of words whose pronunciation is to be generated." \
      "          <g2p-model-dir> is a directory used as a target during training of G2P" \
      "          <output-dir> is the directory where the output lexicon should be stored." \
      "                       The format of the output lexicon output-dir/lexicon.lex is" \
      "                       <word>\t<prob>\t<pronunciation> per line." \
      "e.g.: $0 --nbest 1 exp/g2p/oov_words.txt exp/g2p exp/g2p/oov_lex" \
      "" \
      "main options (for others, see top of script file)" \
      "  --nbest <int>    # Generate upto N pronunciation variants for each word." \
      "  --pmass <float>  # Select the top variants from the pruned nbest list," \
      "                   # summing up to this total prob-mass, within [0, 1], for a word." \
      "  --thresh <int>   # Pruning threshold for n-best."
    exit 1
  fi
  
  # Positional arguments: <word-list> <g2p-model-dir> <output-dir>.
  wordlist=$1
  modeldir=$2
  outdir=$3

  # Paths derived from the arguments: the G2P model that is read and the
  # lexicon that is written.
  model=$modeldir/model.fst
  output_lex=$outdir/lexicon.lex
  mkdir -p "$outdir"

  # Sanity checks on the inputs. All expansions are quoted so that paths or
  # option values containing whitespace or glob characters are tested as-is.
  if [ ! -f "$model" ]; then
    echo "$0: File $model not found in the directory $modeldir."
    exit 1
  fi
  if [ ! -f "$wordlist" ]; then
    echo "$0: File $wordlist not found!"
    exit 1
  fi
  # At least one of the two selection options has to be given.
  if [ -z "$pmass" ] && [ -z "$nbest" ]; then
    echo "$0: nbest or/and pmass should be specified."
    exit 1
  fi
  # The phonetisaurus-apply executable must be reachable through PATH.
  if ! command -v phonetisaurus-apply >/dev/null 2>&1; then
    echo "Phonetisaurus was not found !"
    # KALDI_ROOT may be unset; with `set -u` in effect a bare $KALDI_ROOT would
    # abort here with "unbound variable" instead of printing the hint.
    echo "Go to ${KALDI_ROOT:-<kaldi-root>}/tools and execute extras/install_phonetisaurus.sh"
    exit 1
  fi
  
  # Keep a copy of the input word list next to the results. The copy is skipped
  # when the caller already passed <output-dir>/wordlist.txt: cp refuses to copy
  # a file onto itself, and with `set -e` in effect that would abort the script.
  if [ ! "$wordlist" -ef "$outdir/wordlist.txt" ]; then
    cp "$wordlist" "$outdir/wordlist.txt"
  fi

  # three options: 1) nbest, 2) pmass, 3) nbest+pmass,
  nbest=${nbest:-20}   # if nbest is not specified, set it to 20, due to Phonetisaurus mechanism
  pmass=${pmass:-1.0}  # if pmass is not specified, set it to 1.0, due to Phonetisaurus mechanism

  # nbest must be a decimal integer >= 1 (no sign, no leading zero).
  if [[ ! $nbest =~ ^[1-9][0-9]*$ ]]; then
    echo "$0: nbest should be a positive integer."
    exit 1
  fi

  echo "Applying the G2P model to wordlist $wordlist"
  # The tool's standard output is captured as the lexicon file. Arguments are
  # quoted so that paths containing whitespace reach the tool as single words.
  phonetisaurus-apply --pmass "$pmass" --nbest "$nbest" --thresh "$thresh" \
    --word_list "$wordlist" --model "$model" \
    --accumulate --verbose --prob \
    1>"$output_lex"

  echo "Completed. Synthesized lexicon for new words is in $output_lex"
  
  # Some words might have been removed or skipped during the process,
  # let's check it and warn the user if so...
  #
  # report_missing_words <lexicon> <word-list> <failed-list>
  #   Counts the distinct values of the first tab-separated field in <lexicon>
  #   and in <word-list>. When the two counts differ, prints WARNING lines to
  #   stdout and writes the entries found only in <word-list> to <failed-list>.
  #   Nothing is printed or written when the counts agree.
  #   Always returns 0, so a failure while writing <failed-list> does not abort
  #   the caller under `set -e`.
  report_missing_words() {
    local lexicon=$1
    local words=$2
    local failed_wordlist=$3
    local nlex nwlist

    nlex=$(cut -f 1 "$lexicon" | sort -u | wc -l)
    nwlist=$(cut -f 1 "$words" | sort -u | wc -l)

    if [ "$nlex" -ne "$nwlist" ]; then
      echo "WARNING: Unable to generate pronunciation for all words. "
      echo "WARNING:   Wordlist: $nwlist words"
      echo "WARNING:   Lexicon : $nlex words"
      # comm -13 keeps only the lines unique to its second input, i.e. the
      # words from the word list that have no entry in the lexicon.
      if comm -13 <(cut -f 1 "$lexicon" | sort -u) \
                  <(cut -f 1 "$words" | sort -u) \
                  >"$failed_wordlist"; then
        echo "WARNING: The list of failed words is in $failed_wordlist"
      fi
    fi
    return 0
  }

  report_missing_words "$output_lex" "$wordlist" "$outdir/lexicon.failed"
  exit 0