Blame view

egs/zeroth_korean/s5/local/prepare_dict.sh 2.45 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
  #!/bin/bash
  
  # Copyright 2014 Vassil Panayotov
  # Apache 2.0
  
  # Modified by Lucas Jo 2017 (Atlas Guide)
  # Prepare dictionary
  
  # Command line: exactly two positional arguments are required.
  #   $1  <lm-dir>   directory the raw lexicon is read from
  #   $2  <dst-dir>  directory that receives the generated dictionary files
  # Anything else prints the usage text and exits with status 1.
  if [[ $# -ne 2 ]]; then
    echo "Usage: $0 <lm-dir> <dst-dir>"
    echo "e.g.: /data/local/lm data/local/dict_nosp"
    exit 1
  fi
  lm_dir=$1
  dst_dir=$2

  # Quoted (and guarded with --) so a destination containing spaces, glob
  # characters or a leading dash is created as one directory rather than being
  # word-split or parsed as an option.
  mkdir -p -- "$dst_dir" || exit 1
  
  # Working copy of the lexicon obtained from the download_lm.sh process.
  lexicon_raw_nosil=$dst_dir/lexicon_raw_nosil.txt

  # Copy only when no non-empty copy is present yet; an existing file is reused
  # as is.  Paths are quoted so directories with spaces or glob characters work.
  if [[ ! -s "$lexicon_raw_nosil" ]]; then
    cp -- "$lm_dir/zeroth_lexicon" "$lexicon_raw_nosil" || exit 1
  fi
  
  # Output files written into the destination directory.
  silence_phones=$dst_dir/silence_phones.txt
  optional_silence=$dst_dir/optional_silence.txt
  nonsil_phones=$dst_dir/nonsilence_phones.txt
  extra_questions=$dst_dir/extra_questions.txt

  echo "Preparing phone lists and clustering questions"

  # Silence phones, one per line: SIL and SPN.  A failed write aborts the
  # script, consistent with the error handling of the later steps.
  printf '%s\n' SIL SPN > "$silence_phones" || exit 1
  # Alternative, finer-grained noise phone set (disabled):
  #( echo SIL; echo BRH; echo CGH; echo NSN ; echo SMK; echo UM; echo UHH ) > $silence_phones

  # The optional silence phone is SIL alone.
  printf '%s\n' SIL > "$optional_silence" || exit 1
  # Non-silence phones: each output line lists the phones that correspond to
  # the same base phone (the digit-free form plus any digit-suffixed variants),
  # separated by spaces.
  #   awk    - for every lexicon line, emits each phone (fields 2..NF) twice:
  #            verbatim and with all digits removed
  #   sort   - drops duplicates
  #   perl   - groups the phones by their digit-free base
  # Bases are printed in sorted key order so repeated runs produce the same
  # file (plain hash iteration order is not stable).  pipefail is enabled only
  # inside the subshell so that a failure of awk or sort is fatal as well,
  # without changing shell options for the rest of the script.
  (
    set -o pipefail
    awk '{ for (i = 2; i <= NF; ++i) { print $i; gsub(/[0-9]/, "", $i); print $i } }' \
      < "$lexicon_raw_nosil" |
      sort -u |
      perl -e '
        while (<>) {
          chomp;
          m:^([^\d]+)(\d*)$: || die "Bad phone $_";
          $phones_of{$1} .= "$_ ";
        }
        foreach $base (sort keys %phones_of) { print $phones_of{$base} . "\n"; }
      '
  ) > "$nonsil_phones" || exit 1
  # A few extra questions that will be added to those obtained by
  # automatically clustering the "real" phones:
  #   1. one line holding the first field of every silence-phone line;
  #   2. one line per distinct trailing-digit suffix, listing the non-silence
  #      phones that carry it (phones without digits share the empty suffix).
  #      These ask about stress.
  # The newline is written as the escape \n: awk does not accept a raw line
  # break inside a string constant.  Input files are attached via redirection
  # so an unreadable file makes the command (and the script) fail.
  awk '{ printf("%s ", $1); } END { printf "\n"; }' \
    < "$silence_phones" > "$extra_questions" || exit 1

  # Suffix groups are printed in sorted key order for reproducible output.
  perl -e '
    while (<>) {
      foreach $p (split(" ", $_)) {
        $p =~ m:^([^\d]+)(\d*)$: || die "Bad phone $_";
        $q{$2} .= "$p ";
      }
    }
    foreach $suffix (sort keys %q) { print "$q{$suffix}\n"; }
  ' < "$nonsil_phones" >> "$extra_questions" || exit 1
  
  # Report how many lines each generated file contains.  The redirect sources
  # are quoted so paths with spaces do not trigger "ambiguous redirect".
  echo "$(wc -l < "$silence_phones") silence phones saved to: $silence_phones"
  echo "$(wc -l < "$optional_silence") optional silence saved to: $optional_silence"
  echo "$(wc -l < "$nonsil_phones") non-silence phones saved to: $nonsil_phones"
  echo "$(wc -l < "$extra_questions") extra triphone clustering-related questions saved to: $extra_questions"
  
  # Final lexicon: the silence, spoken-noise and unknown-word entries followed
  # by the raw lexicon, sorted and de-duplicated.
  # Alternative entries for the finer-grained noise phone set (disabled):
  #(echo '!SIL SIL'; echo '[BREATH] BRH'; echo '[NOISE] NSN'; echo '[COUGH] CGH';
  # echo '[SMACK] SMK'; echo '[UM] UM'; echo '[UH] UHH'
  #  echo '<UNK> NSN' ) | \
  # pipefail (scoped to the subshell) makes an unreadable raw lexicon or a
  # failing sort abort the script instead of reporting success.
  (
    set -o pipefail
    {
      printf '%s\n' '!SIL SIL' '<SPOKEN_NOISE> SPN' '<UNK> SPN'
      cat -- "$lexicon_raw_nosil"
    } | sort | uniq
  ) > "$dst_dir/lexicon.txt" || exit 1
  echo "Lexicon text file saved as: $dst_dir/lexicon.txt"
  exit 0