Blame view

egs/tedlium/s5/local/prepare_dict.sh 1.38 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
  #!/bin/bash
  #
  # Copyright  2014 Nickolay V. Shmyrev
  #            2014 Brno University of Technology (Author: Karel Vesely)
  #            2016 Daniel Galvez
  # Apache 2.0
  #
  
  # Destination directory for every file this script generates.
  dir=data/local/dict_nosp
  # Stop right away if the directory cannot be created: all later
  # redirections write into it.
  mkdir -p "$dir" || exit 1

  # Source pronunciation dictionary that the lexicon is built from.
  srcdict=db/cantab-TEDLIUM/cantab-TEDLIUM.dct

  # Abort when the source dictionary is missing or unreadable. The diagnostic
  # goes to stderr so it is not mixed into captured stdout.
  if [ ! -r "$srcdict" ]; then
    echo "Missing $srcdict" >&2
    exit 1
  fi
  
  # Build $dir/lexicon_words.txt from the source dictionary: drop every line
  # that contains <s>, </s> or <unk> as a whole word, sort the rest, then
  # strip single-digit "(N)" markers such as "(2)".
  # LC_ALL=C is set explicitly: "LANG= LC_ALL=" only blanks those two
  # variables, so an inherited LC_COLLATE could still change the sort order.
  # The trailing "|| exit 1" stops the script if the output cannot be written.
  grep -v -w -e "<s>" -e "</s>" -e "<unk>" "$srcdict" \
    | LC_ALL=C sort \
    | sed 's:([0-9])::g' > "$dir/lexicon_words.txt" || exit 1
  
  # Collect the phone inventory: every field after the first one on each line
  # of the word lexicon is recorded once. Phones whose name contains "SIL" are
  # left out, and the remaining ones are written in sorted order.
  awk '
    { for (col = 2; col <= NF; col++) seen[$col] = 1 }
    END { for (ph in seen) if (ph !~ /SIL/) print ph }
  ' "$dir/lexicon_words.txt" | sort > "$dir/nonsilence_phones.txt"
  
  # Silence and noise phones, one per line.
  printf '%s\n' SIL BRH CGH NSN SMK UM UHH > "$dir/silence_phones.txt"

  # SIL is the only entry of the optional-silence file.
  printf '%s\n' SIL > "$dir/optional_silence.txt"

  # The extra-questions file is created empty: this setup has no stress or
  # tone information to derive extra questions from.
  : > "$dir/extra_questions.txt"
  
  # Final lexicon: the silence/noise entries followed by the word lexicon,
  # merged and de-duplicated.
  # "<unk> NSN" is used where one would typically write "<UNK> NSN", because
  # the Cantab Research language models use <unk> instead of <UNK> to
  # represent out-of-vocabulary words.
  # Sorting runs under LC_ALL=C so that line order and duplicate removal do
  # not depend on the caller's locale; "|| exit 1" stops the script if the
  # lexicon cannot be written.
  {
    printf '%s\n' \
      '!SIL SIL' '[BREATH] BRH' '[NOISE] NSN' '[COUGH] CGH' \
      '[SMACK] SMK' '[UM] UM' '[UH] UHH' \
      '<unk> NSN'
    cat "$dir/lexicon_words.txt"
  } | LC_ALL=C sort -u > "$dir/lexicon.txt" || exit 1
  
  # Run the dictionary-directory validator on the generated files; if it
  # reports a failure, the script ends with status 1.
  if ! utils/validate_dict_dir.pl "$dir"; then
    exit 1
  fi