Blame view

egs/wsj/s5/local/wsj_prepare_char_dict.sh 1.18 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
  #!/bin/bash
  
  # Copyright 2017  Hossein Hadian
  
  # phone_dir holds the phone-based lexicon this script reads (an intermediate
  # file is also written there); dir receives the character-based dictionary.
  phone_dir="data/local/dict_nosp"
  dir="data/local/dict_char"
  mkdir -p "$dir"

  # Source the environment setup when it is present in the working directory.
  if [ -f path.sh ]; then
    . ./path.sh
  fi
  
  # Transcribe each word as the sequence of its constituent characters:
  
  # Keep only the first pronunciation listed for each word; alternative
  # pronunciations on later lines are discarded.
  # -a splits every input line on whitespace into @F, so $F[0] is the word.
  # The input is redirected directly instead of being piped from cat: a
  # missing or unreadable lexicon then makes the command itself fail and
  # reach the exit below, rather than silently yielding an empty output file.
  perl -ane 'print unless $seen{$F[0]}++;' \
    < "$phone_dir/lexicon1_raw_nosil.txt" \
    > "$phone_dir/lexicon2_raw_nosil.txt" || exit 1;
  
  
  # Spell each word out as characters.  For every input line the word is the
  # text before the first space (after trimming surrounding whitespace); the
  # output line is that word followed by its characters, each preceded by a
  # single space.  Characters listed in $skip stay part of the word but are
  # not emitted as separate symbols.
  # This is done in perl, which this script already uses, so the result does
  # not depend on which Python major version (if any) "python" resolves to.
  # The input is redirected directly so that a missing file aborts the script.
  perl -ne '
    s/^\s+//; s/\s+$//;                       # trim the line
    my ($word) = /^([^ ]*)/;                  # text before the first space
    my $skip = q[!~@#$%^&*()+=/",;:?_{}-];    # characters that get no symbol
    my $out = $word;
    foreach my $ch (split //, $word) {
      $out .= " $ch" if index($skip, $ch) < 0;
    }
    print "$out\n";
  ' < "$phone_dir/lexicon2_raw_nosil.txt" \
    > "$dir/lexicon2_raw_nosil.txt" || exit 1;
  
  # Silence phone inventory (SIL, SPN, NSN); SIL is also the optional silence.
  printf '%s\n' SIL SPN NSN > "$dir/silence_phones.txt"
  printf '%s\n' SIL > "$dir/optional_silence.txt"

  # Final lexicon: the four special entries below followed by every
  # character-level entry, sorted with adjacent duplicate lines removed.
  {
    printf '%s\n' '!SIL SIL' '<SPOKEN_NOISE> SPN' '<UNK> SPN' '<NOISE> NSN'
    cat "$dir/lexicon2_raw_nosil.txt"
  } | sort | uniq > "$dir/lexicon.txt" || exit 1;
  
  # Collect the set of non-silence phones: drop the word (first field) from
  # each character-lexicon entry, put every remaining symbol on its own line,
  # and keep one sorted copy of each.
  # NOTE(review): cut passes lines that contain no space through unchanged, so
  # a word that received no character symbols would be listed here as a
  # phone -- confirm that no such entries occur in the lexicon.
  cut -d ' ' -f 2- "$dir/lexicon2_raw_nosil.txt" \
    | tr ' ' '\n' \
    | sort -u > "$dir/nonsilence_phones.txt"

  printf '%s\n' "Character-based dictionary preparation succeeded."