  #!/usr/bin/env bash
  # Copyright 2014  Gaurav Kumar.   Apache 2.0
  
  . ./path.sh
  
  #First get the list of unique words from our text file
  if [ $# -lt 1 ]; then
  echo 'Usage: fsp_prepare_dict.sh <lexicon-dir>'
    exit 1;
  fi
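
# Example invocation (the argument must be a directory containing the LDC
# CALLHOME Spanish lexicon folder callhome_spanish_lexicon_970908, e.g. a
# local copy of LDC96L16):
#   local/fsp_prepare_dict.sh /export/corpora/LDC/LDC96L16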
  
  stage=0
  
  dir=`pwd`/data/local/dict
  datadir=`pwd`/data/local/data/train_all
  mkdir -p $dir
  local=`pwd`/local
  utils=`pwd`/utils
  tmpdir=`pwd`/data/local/tmp
  lexicon=$1
  
  #Get all unique words, remove punctuation.
  if [ $stage -le 0 ]; then
  cat $datadir/text | sed 's:[0-9][0-9]\S*::g' | sed 's:[\.,\?]::g' \
    | tr " " "\n" | sort | uniq | awk '{if (NF > 0){ print; }}' > $tmpdir/uniquewords
    if [ ! -f "${tmpdir}/es_wordlist.json" ]; then
      echo "Could not find the large collection of Spanish words es_wordlist.json"
      echo "Trying to download it via wget"
  
      if ! which wget >&/dev/null; then
        echo "This script requires you to first install wget"
        exit 1;
      fi
  
      cwd=`pwd`
      cd $tmpdir
      wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz
  
      if [ ! -e ${tmpdir}/es_wordlist.json.tgz ]; then
        echo "Download of the large Spanish word list failed"
        exit 1;
      fi
  
      tar -xovzf es_wordlist.json.tgz || exit 1;
      cd $cwd
    fi
  
    # Merge with gigaword corpus
    $local/merge_lexicons.py ${tmpdir} ${lexicon}
    mv $tmpdir/uniquewords $tmpdir/uniquewords.small
    mv $tmpdir/uniquewords64k $tmpdir/uniquewords
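  # merge_lexicons.py is expected to merge $tmpdir/uniquewords with
  # es_wordlist.json into $tmpdir/uniquewords64k; the renames above make the
  # merged ~64k-word list the word list used by the rest of the script.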
  fi
  
#Then get the list of phones from basic_rules in the lexicon folder
  if [ $stage -le 1 ]; then
    if [ ! -d "$lexicon/callhome_spanish_lexicon_970908" ]; then
      echo "Could not find folder callhome_spanish_lexicon_970908 in the lexicon folder"
      exit 1;
    fi
  
    # This is a preliminary attempt to get the unique phones from the LDC lexicon
    # This will be extended based on our lexicon later
    perl $local/find_unique_phones.pl $lexicon/callhome_spanish_lexicon_970908 $tmpdir
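  # find_unique_phones.pl is expected to write the initial phone inventory to
  # $tmpdir/phones; stage 3 extends it and renames this file to phones.small.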
  
  fi
  
  #Get pronunciation for each word using the spron.pl file in the lexicon folder
  if [ $stage -le 2 ]; then
    #cd $lexicon/callhome_spanish_lexicon_970908
    # Replace all words for which no pronunciation was generated with an orthographic
    # representation
    cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \
      | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \
      | awk -F '[/][/]' '{print $1}' \
      > $tmpdir/lexicon_raw
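  # The cut/sed/awk chain keeps spron.pl's first output field, replaces
  # '#'-prefixed failure lines with their final (orthographic) field, and
  # drops '//'-delimited trailing output, leaving one pronunciation string
  # per line in lexicon_raw; isolate_phones.pl splits these into phones later.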
  fi
  
  #Break the pronunciation down according to the format required by Kaldi
  if [ $stage -le 3 ]; then
    # Creates a KALDI compatible lexicon, and extends the phone list
    perl $local/isolate_phones.pl $tmpdir
    cat $tmpdir/phones_extended | sort | awk '{if ($1 != "") {print;}}' > $tmpdir/phones_extended.1
    mv $tmpdir/phones $tmpdir/phones.small
    mv $tmpdir/phones_extended.1 $tmpdir/phones
    sort $tmpdir/phones -o $tmpdir/phones
    paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | sed -r 's:(\S+)\s#.*:\1 oov:g' > $tmpdir/lexicon.1
    #paste -d ' ' $tmpdir/uniquewords $tmpdir/lexicon_one_column | grep -v '#' > $tmpdir/lexicon.1
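  # lexicon.1 pairs each word with its pronunciation, one "word phones" entry
  # per line; any entry whose pronunciation still carries a '#' failure marker
  # is mapped to the oov phone by the sed above.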
  fi
  
  if [ $stage -le 4 ]; then
    # silence phones, one per line.
    for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt
    echo sil > $dir/optional_silence.txt
  
    # An extra question will be added by including the silence phones in one class.
  cat $dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' \
    > $dir/extra_questions.txt || exit 1;
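  # extra_questions.txt ends up holding the silence phones on a single line:
  #   sil laughter noise oov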
  
    # Remove [] chars from phones
    cat $tmpdir/phones | awk '{if ($1 != "_" && $1 != "[" && $1 != "]") {print;}}' > $tmpdir/phones.1
    rm $tmpdir/phones
    mv $tmpdir/phones.1 $tmpdir/phones
    cp $tmpdir/phones $dir/nonsilence_phones.txt
  
    if [ -f $tmpdir/lexicon.2 ]; then rm $tmpdir/lexicon.2; fi
    cp "$tmpdir/lexicon.1" "$tmpdir/lexicon.2"
  
    # Add prons for laughter, noise, oov
  w=$(grep -v sil $dir/silence_phones.txt | tr '\n' '|')
    perl -i -ne "print unless /\[(${w%?})\]/"  $tmpdir/lexicon.2
  
    for w in `grep -v sil $dir/silence_phones.txt`; do
      echo "[$w] $w"
    done | cat - $tmpdir/lexicon.2  > $tmpdir/lexicon.3 || exit 1;
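  # The loop prepends one pronunciation entry per non-sil silence phone, i.e.:
  #   [laughter] laughter
  #   [noise] noise
  #   [oov] oov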
  
    cat $tmpdir/lexicon.3  \
     <( echo "mm m"
        echo "<unk> oov" ) > $tmpdir/lexicon.4
  
    # From the lexicon remove _ from the phonetic representation
    cat $tmpdir/lexicon.4 | sed 's:\s_::g' > $tmpdir/lexicon.5
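  # e.g. an entry like "va _ b a" would become "va b a" (illustrative phones).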
  
    cp "$tmpdir/lexicon.5" $dir/lexicon.txt
  
    cat $datadir/text  | \
    awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \
    sort -nr > $tmpdir/word_counts
  
    awk '{print $1}' $dir/lexicon.txt | \
    perl -e '($word_counts)=@ARGV;
     open(W, "<$word_counts")||die "opening word-counts $word_counts";
     while(<STDIN>) { chop; $seen{$_}=1; }
     while(<W>) {
       ($c,$w) = split;
       if (!defined $seen{$w}) { print; }
     } ' $tmpdir/word_counts > $tmpdir/oov_counts.txt
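  # oov_counts.txt holds "count word" pairs for training words absent from the
  # lexicon, most frequent first, e.g. (illustrative):
  #   53 este
  #   17 ujum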
    echo "*Highest-count OOVs are:"
    head -n 20 $tmpdir/oov_counts.txt
  fi
  
  $utils/validate_dict_dir.pl $dir
  exit 0;