Blame view

egs/iam/v1/local/prepare_dict.sh 1.34 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
  #!/usr/bin/env bash
  
  # Copyright      2017  Hossein Hadian
  #                2017  Chun Chieh Chang
  #                2017  Ashish Arora
  
  # This script prepares the dictionary.
  
  # Abort on the first failing command. pipefail extends that to pipelines:
  # without it only the last stage's status counts, so a missing input file
  # feeding `... | sort -u > out` would silently produce an empty output file.
  set -e -o pipefail

  # Defaults for the output directory and the number of most-frequent words
  # kept in the lexicon. They are set before sourcing parse_options.sh,
  # presumably so that it can override them from the command line
  # (Kaldi convention: --dir, --vocab-size) -- confirm against that script.
  dir=data/local/dict
  vocab_size=50000
  . ./utils/parse_options.sh

  # Quoted so that a directory name containing blanks or glob characters
  # is passed through as a single argument.
  mkdir -p -- "$dir"
  
  # First get the set of all letters that occur in data/train/text.

  # list_letters: reads whitespace-separated lines on stdin, skips the first
  # field of each line (presumably the utterance id), and writes every
  # character of every remaining word to stdout, one character per line.
  # Duplicates are kept; the caller de-duplicates with sort -u.
  list_letters() {
    # The separator must be the two-character escape \n. A literal line break
    # inside the Perl string would drag the script's own indentation into the
    # output and produce bogus "letters" with leading blanks.
    perl -ne '@A = split; shift @A; for (@A) { print join("\n", split(//)), "\n"; }'
  }

  list_letters < data/train/text | sort -u > "$dir/nonsilence_phones.txt"
  
  # Now use pocolm's word list, i.e. the N most frequent words in
  # data/train/text and the LOB+Brown corpora (dev and test excluded), with
  # their constituent letters as their transcription. Only words made up
  # entirely of the letters collected above are included.
  # (Letter # is replaced with <HASH>.)

  # Collapse the one-letter-per-line phone list into a single string and hand
  # it to Perl through the environment. Assignment and export are separate
  # statements so that a failing command substitution is not masked by the
  # exit status of `export`.
  letters=$(tr -d '\n' < "$dir/nonsilence_phones.txt")
  export letters

  # words_to_lexicon: reads one word per line on stdin. For each word that
  # consists only of characters found in the exported $letters, it writes
  # "<word> <characters separated by single spaces>" to stdout, with the
  # character # spelled as <HASH>. Other words are dropped.
  words_to_lexicon() {
    perl -e '
      my $letters = $ENV{letters};
      while (<>) {
        chomp;  # strip only a trailing newline (chop would eat any last char)
        my $w = $_;
        # \Q...\E keeps characters such as ] ^ - \ literal inside the class.
        if ($w =~ m/^[\Q$letters\E]+$/) {
          my $trans = join(" ", split(//, $w));
          $trans =~ s/#/<HASH>/g;
          print "$w $trans\n";
        }
      }'
  }

  # Second column of the word_count file is taken as the word.
  head -n "$vocab_size" data/local/local_lm/data/word_count \
    | awk '{print $2}' \
    | words_to_lexicon \
    | sort -u > "$dir/lexicon.txt"
  
  
  # Rename the # letter in the phone list so that it matches the <HASH>
  # spelling used for it in the lexicon transcriptions. The Perl script is
  # single-quoted so the shell never touches it.
  perl -i -pe 's/#/<HASH>/' "$dir/nonsilence_phones.txt"

  # Add the two special lexicon entries; both are transcribed as SIL.
  echo '<sil> SIL' >> "$dir/lexicon.txt"
  echo '<unk> SIL' >> "$dir/lexicon.txt"

  # SIL is the only silence phone and also the optional-silence phone.
  echo SIL > "$dir/silence_phones.txt"
  echo SIL > "$dir/optional_silence.txt"

  # Create (or truncate to) an empty extra_questions.txt. The null command
  # does this without relying on echo's non-portable -n flag.
  : > "$dir/extra_questions.txt"