Blame view

egs/wsj/s5/steps/dict/train_g2p.sh 3.03 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
  #!/bin/bash
  # Copyright 2014  Johns Hopkins University (Author: Yenda Trmal)
  # Copyright 2016  Xiaohui Zhang
  # Apache 2.0
  
  # Begin configuration section.
  # Step gating: a step runs only while this value does not exceed the
  # threshold written next to that step further down.
  stage=0
  # Number of models trained in a row (g2p.model.0 up to g2p.model.<iters-1>).
  iters=5
  # Job launcher; it is invoked as: $cmd <log-file> <command ...>
  cmd=run.pl
  # Character encoding handed to g2p.py via --encoding.
  encoding=utf-8
  # When true and silence_phones is set, lexicon entries whose pronunciation is
  # a single silence phone are left out of the training data.
  only_words=true
  # File listing the silence phones, e.g. data/local/dict/silence_phones.txt
  silence_phones=
  # End configuration section.
  
  # Record the exact invocation in the output, for later reference in logs.
  echo "$0 $@"

  # Pull in the environment from ./path.sh when that file is present.
  if [ -f ./path.sh ]; then
    . ./path.sh
  fi

  # utils/parse_options.sh presumably maps --name value arguments onto the
  # configuration variables and leaves only the positional arguments behind
  # (confirm against that script). Stop if sourcing it fails.
  if ! . utils/parse_options.sh; then
    exit 1
  fi

  # From this point on, unset variables and failing commands abort the script.
  set -eu
  
  # Exactly two positional arguments are required; otherwise print the help
  # text and stop with a non-zero status.
  if [ $# -ne 2 ]; then
    printf '%s\n' \
      "Usage: $0 [options] <lexicon-in> <work-dir>" \
      "    where <lexicon-in> is the training lexicon (one pronunciation per " \
      "    word per line, with lines like 'hello h uh l ow') and" \
      "    <work-dir> is directory where the models will be stored" \
      "e.g.: train_g2p.sh --silence-phones data/local/dict/silence_phones.txt data/local/dict/lexicon.txt exp/g2p/" \
      "" \
      "main options (for others, see top of script file)" \
      "  --iters <int>                                    # How many iterations. Relates to N-ngram order" \
      "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs." \
      "  --silence-phones <silphones-list>                # e.g. data/local/dict/silence_phones.txt." \
      "                                                   # A list of silence phones, one or more per line" \
      "                                                   # Relates to  --only-words option" \
      "  --only-words (true|false)    (default: true)     # If true, exclude silence words, i.e." \
      "                                                   # words with 1 phone which is a silence."
    exit 1
  fi
  
  # Positional arguments: the training lexicon and the directory that receives
  # the models and logs.
  lexicon=$1
  wdir=$2

  # Create the work directory together with its log/ subdirectory.
  mkdir -p "$wdir/log"

  # The expansion is quoted so the check still applies when the argument is
  # empty or contains whitespace; unquoted, `[ ! -f ]` either evaluates to
  # false or fails with a syntax error, and the script would carry on.
  if [ ! -f "$lexicon" ]; then
    echo "$0: Training lexicon does not exist."
    exit 1
  fi
  
  # filter_silence_words <silence-phones> <lexicon-in> <lexicon-out>
  #
  # Copies <lexicon-in> to <lexicon-out>, leaving out every line that has
  # exactly two fields (a word and one phone) where the phone appears in
  # <silence-phones>. The silence list is read field by field, so it may hold
  # one or more phones per line. All three paths are passed quoted, so they
  # may contain whitespace.
  filter_silence_words() {
    awk -v s="$1" \
      'BEGIN { while ((getline < s) > 0) { for (i = 1; i <= NF; i++) sil[$i] = 1 } }
       !(NF == 2 && ($2 in sil))' "$2" > "$3"
  }

  # Optionally remove words that are mapped to a single silence phone from the lexicon.
  # $only_words is deliberately run as a command: its value is true or false.
  if $only_words && [ -n "$silence_phones" ]; then
    filter_silence_words "$silence_phones" "$lexicon" "$wdir/lexicon_onlywords.txt"
    # All later steps train and test on the filtered copy.
    lexicon=$wdir/lexicon_onlywords.txt
  fi
  
  # Sequitur's g2p.py has to be reachable through PATH. `command -v` is a shell
  # builtin, so the lookup does not depend on an external `which` program; the
  # resolved path itself is not needed afterwards.
  if ! command -v g2p.py >/dev/null 2>&1; then
    echo "Sequitur was not found !"
    # KALDI_ROOT may be unset here; the fallback keeps `set -u` from aborting
    # before the hint is printed.
    echo "Go to ${KALDI_ROOT:-<kaldi-root>}/tools and execute extras/install_sequitur.sh"
    exit 1
  fi
  
  # Iteration 0 builds the first model directly from the lexicon. g2p.py is
  # told to use 5% of it as development data (--devel 5%).
  # $cmd stays unquoted on purpose: it may carry launcher options and has to
  # split into words. Paths are quoted so they are passed as single words.
  echo "Training the G2P model (iter 0)"

  if [ "$stage" -le 0 ]; then
    $cmd "$wdir/log/g2p.0.log" \
      g2p.py -S --encoding "$encoding" --train "$lexicon" --devel 5% --write-model "$wdir/g2p.model.0"
  fi

  # Each further iteration starts from the previous model (--model), passes
  # --ramp-up and writes model number i+1. Iteration i+1 runs when stage <= i.
  for i in $(seq 0 "$(($iters - 2))"); do
    next_iter=$(($i + 1))

    echo "Training the G2P model (iter $next_iter )"

    if [ "$stage" -le "$i" ]; then
      $cmd "$wdir/log/g2p.$next_iter.log" \
        g2p.py -S --encoding "$encoding" --model "$wdir/g2p.model.$i" --ramp-up --train "$lexicon" --devel 5% --write-model "$wdir/g2p.model.$next_iter"
    fi

  done
  
  # Point g2p.model.final at the model written by the last iteration. The link
  # target is a bare file name, so it resolves inside the work directory.
  # cd and ln are joined with && because `set -e` has no effect inside a
  # command whose status is tested by `if !`: a failed cd would otherwise be
  # ignored and the link would be created in the current directory.
  if ! (cd "$wdir" && ln -sf "g2p.model.$(($iters - 1))" g2p.model.final); then
    echo "Problem finalizing training... "
    exit 1
  fi
  
  # Last step: run g2p.py --test with the final model on the lexicon that was
  # used for training.
  # The threshold is $iters, which is the value the training loop's counter
  # plus two has after the loop. Reading the counter itself fails under
  # `set -u` when --iters is 1, because the loop body never runs and the
  # counter is never assigned.
  if [ "$stage" -le "$iters" ]; then
    echo "Running test..."
    # $cmd is left unquoted so that launcher options split into words.
    $cmd "$wdir/log/test.log" \
      g2p.py --encoding "$encoding" --model "$wdir/g2p.model.final" --test "$lexicon"
  fi