  #!/bin/bash
  #
  # This script demonstrates a lexicon learning recipe, which aims to improve
  # the pronunciation of abbreviated words in the TED-LIUM lexicon. It assumes
  # the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_greedy.sh
  # for an explanation of the options.
  #
  # Copyright 2018  Xiaohui Zhang
  # Apache 2.0
  
  . ./cmd.sh
  . ./path.sh
  
  oov_symbol="<unk>"
  # The user may have a Phonetisaurus-trained English G2P model ready.
  g2p_mdl_dir=
  # The dir which contains the reference lexicon (most probably hand-derived)
  # we want to expand/improve, and nonsilence_phones.txt, etc., which we need
  # for building new dict dirs.
  ref_dict=data/local/dict
  # Acoustic training data we use to get alternative
  # pronunciations and collect acoustic evidence.
  data=data/train
  # The cut-off parameter used to select pronunciation candidates from phone
  # decoding. We remove pronunciations with probabilities less than this value
  # after normalizing the probs s.t. the max-prob is 1.0 for each word.
  min_prob=0.1
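  # For example (hypothetical numbers): if a word's candidates come out of
  # phone decoding with probs 0.50, 0.20 and 0.04, max-normalizing gives
  # 1.00, 0.40 and 0.08, so only the last one falls below min_prob=0.1
  # and gets removed.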
  # Refer to steps/dict/select_prons_greedy.sh for the detailed meaning of
  # alpha, beta and delta. Basically, the three dimensions of alpha
  # and beta correspond to three pronunciation sources: phonetic
  # decoding, G2P and the reference lexicon, and the larger a value is,
  # the more aggressively we'll prune pronunciations from that source.
  # The valid range of each dimension is [0, 1] for alpha (where 0 means
  # we never prune prons from that source) and [0, 100] for beta.
  alpha="0.04,0.02,0"
  beta="30,5,0"
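  # With the defaults above, phonetic-decoding prons are pruned most
  # aggressively (0.04, 30), G2P prons less so (0.02, 5), and reference
  # prons are never pruned (0, 0).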
  # Floor value of the pronunciation posterior statistics.
  delta=0.00000001
  # This parameter determines how many pronunciations we keep for each word
  # after the first pass pruning. See steps/dict/internal/prune_pron_candidates.py
  # for details.
  vcr=16 
  
  # Intermediate outputs of the lexicon learning stage will be put into this dir.
  dir=exp/tri3_lex_greedy_work
  nj=35
  decode_nj=30
  stage=0
  lexlearn_stage=0
  affix="learned_greedy"
  
  . utils/parse_options.sh # accept options
  
  # The reference vocab is the list of words for which we already have hand-derived pronunciations.
  ref_vocab=data/local/vocab.txt
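  # Each line of lexicon.txt has the form "<word> <phone1> <phone2> ...",
  # so taking the first field below yields the reference vocabulary.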
  awk '{print $1}' $ref_dict/lexicon.txt | sort -u > $ref_vocab || exit 1;
  
  # Get a G2P-generated lexicon for OOV words (w.r.t. the reference lexicon)
  # in the acoustic training data.
  if [ $stage -le 0 ]; then
    if [ -z "$g2p_mdl_dir" ]; then
      g2p_mdl_dir=exp/g2p_phonetisaurus
      steps/dict/train_g2p_phonetisaurus.sh $ref_dict/lexicon.txt $g2p_mdl_dir || exit 1;
    fi
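    # Collect the training vocab: each line of $data/text is
    # "<utt-id> <word1> <word2> ...", so we gather fields 2..NF.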
    awk '{for (n=2;n<=NF;n++) vocab[$n]=1;} END{for (w in vocab) printf "%s\n",w;}' \
      $data/text | sort -u > $data/train_vocab.txt || exit 1;
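    # Words in the training vocab that are missing from the reference vocab.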
    awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $ref_vocab \
      $data/train_vocab.txt | sort > $data/oov_train.txt || exit 1;
    steps/dict/apply_g2p_phonetisaurus.sh --nbest 5 $data/train_vocab.txt $g2p_mdl_dir \
      exp/g2p_phonetisaurus/lex_train || exit 1;
  fi
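  # Note that apply_g2p_phonetisaurus.sh above is run on the full training
  # vocab, so G2P candidates are also produced for in-vocab words, which the
  # learning stage can weigh against the other pronunciation sources. If you
  # only want G2P prons for the OOVs collected in oov_train.txt, an (untested)
  # variant is to pass $data/oov_train.txt instead of $data/train_vocab.txt.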
  
  # Learn a lexicon based on the acoustic training data and the reference lexicon.
  if [ $stage -le 1 ]; then
    steps/dict/learn_lexicon_greedy.sh --lexiconp-g2p "exp/g2p_phonetisaurus/lex_train/lexicon.lex" \
      --alpha $alpha --beta $beta --delta $delta \
      --min-prob $min_prob --cmd "$train_cmd" \
      --variant-counts-ratio $vcr \
      --stage $lexlearn_stage --nj 60 --oov-symbol $oov_symbol --retrain-src-mdl false \
      $ref_dict $ref_vocab $data exp/tri3 data/lang data/local/dict_${affix}_nosp \
      $dir || exit 1;
  fi
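  # Optionally, inspect what the learning stage changed by listing entries that
  # are in the learned lexicon but not in the reference one (this assumes the
  # learned dict dir contains a plain lexicon.txt, as Kaldi dict dirs usually do):
  #   comm -13 <(sort $ref_dict/lexicon.txt) \
  #     <(sort data/local/dict_${affix}_nosp/lexicon.txt) | head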
  
  # Add pronunciation probs to the learned lexicon.
  if [ $stage -le 2 ]; then
    utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
      data/local/dict_${affix}_nosp $oov_symbol data/local/lang_${affix}_nosp data/lang_${affix}_nosp || exit 1;
    
    steps/align_si.sh --nj $nj --cmd "$train_cmd" \
      $data data/lang_${affix}_nosp exp/tri2 exp/tri2_ali_${affix}_nosp || exit 1;
    
    steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_${affix}_nosp exp/tri2_ali_${affix}_nosp || exit 1;
    
    utils/dict_dir_add_pronprobs.sh --max-normalize true \
      data/local/dict_${affix}_nosp exp/tri2_ali_${affix}_nosp/pron_counts_nowb.txt \
      exp/tri2_ali_${affix}_nosp/sil_counts_nowb.txt \
      exp/tri2_ali_${affix}_nosp/pron_bigram_counts_nowb.txt data/local/dict_${affix} || exit 1;
    
    utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
      data/local/dict_${affix} $oov_symbol data/local/lang_${affix} data/lang_${affix} || exit 1;
  fi
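  # At this point data/local/dict_${affix} contains the learned lexicon with
  # per-pronunciation probabilities (lexiconp.txt) and, since we supplied the
  # silence counts, per-pronunciation silence probabilities as well.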
  
  # Re-decode
  if [ $stage -le 3 ]; then
    ! cmp -s data/lang_nosp/words.txt data/lang_${affix}/words.txt &&\
      echo "$0: The vocab of the learned lexicon and the reference vocab may be incompatible."
    cp data/lang_nosp/G.fst data/lang_${affix}/
    utils/mkgraph.sh data/lang_${affix} exp/tri3 exp/tri3/graph_${affix} || exit 1;
    
    for dset in dev test; do
      (
        steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
          exp/tri3/graph_${affix} data/${dset} exp/tri3/decode_${affix}_${dset} || exit 1;
      ) &
    done
  fi
  
  # RESULTS:
  # Baseline:
  # %WER 18.7 | 507 17783 | 83.9 11.4 4.7 2.6 18.7 92.3 | -0.006 | exp/tri3/decode_dev/score_17_0.0/ctm.filt.filt.sys
  # %WER 17.6 | 1155 27500 | 84.7 11.6 3.7 2.4 17.6 87.2 | 0.013 | exp/tri3/decode_test/score_15_0.0/ctm.filt.filt.sys
  
  # Re-decoding with the learned lexicon:
  # %WER 18.5 | 507 17783 | 84.3 11.2 4.5 2.8 18.5 92.3 | -0.007 | exp/tri3/decode_learned_greedy_dev/score_16_0.0/ctm.filt.filt.sys
  # %WER 17.5 | 1155 27500 | 84.9 11.5 3.6 2.4 17.5 87.5 | 0.035 | exp/tri3/decode_learned_greedy_test/score_14_0.0/ctm.filt.filt.sys
  
  # To see the effect on neural-net results, one should re-train the NN with the
  # learned lexicon. Experiments have shown that, with the new lang dir, one can
  # simply re-run NN training starting from the supervision-generation stage
  # (steps/align_fmllr_lats.sh), and should expect improved overall WERs and
  # better recognition of words whose pronunciations were changed.
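  # A minimal sketch of that first retraining step (the directory names here
  # are placeholders; adapt them to your chain recipe):
  #   steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" \
  #     data/train data/lang_${affix} exp/tri3 exp/tri3_lats_${affix}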
  
  wait
  exit 0