Blame view
egs/gale_arabic/s5/local/gale_prep_dict.sh
821 Bytes
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
#!/bin/bash

# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0

# Builds the dictionary directory data/local/dict from the QCRI lexicon.
# Run this from ../ (all paths below are relative to the current directory).
#
# Files written under $dir:
#   lexicon.txt            downloaded lexicon with an "<UNK> SIL" entry prepended
#   silence_phones.txt     the single phone SIL
#   optional_silence.txt   the single phone SIL
#   nonsilence_phones.txt  every distinct phone used in the lexicon, one per line
#
# Exits 1 if the download or any processing step fails, 0 on success.

# Without pipefail a pipeline reports only its last stage, so a failing
# bzcat/cut/tr would go unnoticed and leave truncated output behind.
set -o pipefail

dir=data/local/dict
lexicon_url=http://alt.qcri.org//resources/speech/dictionary/ar-ar_lexicon_2014-03-17.txt.bz2
lexicon_archive=${lexicon_url##*/}   # ar-ar_lexicon_2014-03-17.txt.bz2

mkdir -p "$dir" || exit 1

# (1) Get QCRI dictionary
# -O pins the output name: without it, a stale archive left by an interrupted
# run makes wget save to "<name>.1" and the stale file would be unpacked instead.
if ! wget -O "$lexicon_archive" "$lexicon_url"; then
  rm -f "$lexicon_archive"   # do not leave a partial download behind
  exit 1
fi

# The first three lines of the archive are dropped (presumably a header;
# confirm against the upstream file if its format changes).
if ! bzcat "$lexicon_archive" | sed '1,3d' > "$dir/lexicon.txt"; then
  rm -f "$lexicon_archive"
  exit 1
fi
rm -f "$lexicon_archive"

# (2) Dictionary preparation:

# silence phones, one per line.
echo SIL > "$dir/silence_phones.txt"
echo SIL > "$dir/optional_silence.txt"

# nonsilence phones; on each line is a list of phones that correspond
# really to the same base phone (here: exactly one phone per line).
# Field 1 of each lexicon line is the word; the remaining fields are phones.
# tr puts each phone on its own line (-s also squeezes repeated separators).
# This must run before <UNK> is added below so that SIL is not collected.
cut -d ' ' -f2- "$dir/lexicon.txt" | tr -s ' ' '\n' \
  | sort -u > "$dir/nonsilence_phones.txt" || exit 1

# Prepend the unknown-word entry as its own first line of the lexicon.
perl -i -pe 'print "<UNK> SIL\n" if $.==1' "$dir/lexicon.txt" || exit 1

echo Dictionary preparation succeeded

exit 0