Blame view
egs/spanish_dimex100/s5/local/lang_prep.sh
1.59 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
#!/bin/bash
#
# Builds the pronunciation-dictionary files under data/local/dict from the
# corpus dictionary CORPUS_DIR/diccionarios/T22.full.dic.
#
# Only run this file from the example root directory:
#     $ ./local/lang_prep.sh CORPUS_DIR
#
# Files written:
#     data/local/dict/lexicon.txt
#     data/local/dict/silence_phones.txt
#     data/local/dict/optional_silence.txt
#     data/local/dict/nonsilence_phones.txt
# NOTE(review): the original section header also listed
# data/local/dict/extra_questions.txt, but nothing visible in this section
# writes it -- confirm it is produced elsewhere.
#
# Exit status: 0 on success, 2 when CORPUS_DIR is not given, 1 on any other
# failure.

readonly DICT_DIR="data/local/dict"

#######################################
# Cleans raw dictionary lines into lexicon entries.
# The substitutions run in the order written; the two "n~" rules rely on tabs
# still being tabs when they run, so tab-to-space conversion stays last.
# Arguments: none
# Inputs:    raw dictionary lines on stdin
# Outputs:   sorted, de-duplicated entries on stdout
# Returns:   status of the pipeline
#######################################
normalize_dictionary() {
  # Steps, in order:
  #   - lowercase everything
  #   - drop parenthesised digit groups such as "(2)"
  #   - "n~" directly after a non-space character loses its "~"
  #   - strip the "_7", "-7" and "_" markers that follow a vowel
  #   - cut a line at the first remaining "_7n"
  #   - repair five specific words that still carry a "_7" marker
  #   - rewrite the remaining "n~" as "ni" and "r(" as "rh"
  #   - turn tabs into spaces ($'...' makes the tab literal, so no reliance on
  #     sed understanding "\t") and delete empty lines
  tr '[:upper:]' '[:lower:]' \
    | sed -e 's/([0123456789]*)//g' \
        -e 's/\([^ ]\)n\~/\1n/g' \
        -e 's/a_7/a/g' -e 's/e_7/e/g' -e 's/i_7/i/g' -e 's/o_7/o/g' -e 's/u_7/u/g' \
        -e 's/a-7/a/g' -e 's/e-7/e/g' -e 's/i-7/i/g' -e 's/o-7/o/g' -e 's/u-7/u/g' \
        -e 's/a_/a/g' -e 's/e_/e/g' -e 's/i_/i/g' -e 's/o_/o/g' -e 's/u_/u/g' \
        -e 's/_7n.*$//' \
        -e 's/atl_7tica/atletica/' \
        -e 's/biol_7gicas/biologicas/' \
        -e 's/elec_7ctrico/electrico/' \
        -e 's/gr_7afico/grafico/' \
        -e 's/s_7lo/solo/' \
        -e 's/n~/ni/g' -e 's/r(/rh/g' \
        -e $'s/\t/ /g' -e '/^$/d' \
    | sort -u
}

#######################################
# Lists every phone used by the regular (non-silence) lexicon entries.
# Lines containing "<UNK>" or "!SIL" are skipped; for every other line the
# first field (the word) is dropped and each remaining field is printed on
# its own line.
# Arguments: none
# Inputs:    lexicon lines on stdin
# Outputs:   sorted, de-duplicated phones on stdout, one per line
# Returns:   status of the pipeline
#######################################
list_nonsilence_phones() {
  awk 'index($0, "<UNK>") || index($0, "!SIL") { next }
       { for (i = 2; i <= NF; i++) print $i }' \
    | sort -u
}

#######################################
# Writes all dictionary files for the given corpus.
# Globals:   DICT_DIR (read)
# Arguments: $1 - corpus root directory (must contain
#                 diccionarios/T22.full.dic)
# Outputs:   files under DICT_DIR; diagnostics on stderr
# Returns:   0 on success, 2 on missing argument, 1 on other failures
#######################################
main() {
  local corpus_dir="${1:-}"
  if [[ -z "$corpus_dir" ]]; then
    printf 'Usage: %s CORPUS_DIR\n' "${0##*/}" >&2
    return 2
  fi

  local dictionary="$corpus_dir/diccionarios/T22.full.dic"
  if [[ ! -r "$dictionary" ]]; then
    printf '%s: cannot read dictionary: %s\n' "${0##*/}" "$dictionary" >&2
    return 1
  fi

  mkdir -p "$DICT_DIR" || return 1

  #############################
  # data/local/dict/lexicon.txt
  #############################

  # One entry per line: the two fixed entries first, then the corpus words.
  printf '%s\n' '!SIL sil' '<UNK> spn' > "$DICT_DIR/lexicon.txt" || return 1
  normalize_dictionary < "$dictionary" >> "$DICT_DIR/lexicon.txt" || return 1

  #######################################
  # data/local/dict/silence_phones.txt
  # data/local/dict/optional_silence.txt
  # data/local/dict/nonsilence_phones.txt
  #######################################

  printf '%s\n' 'sil' 'spn' > "$DICT_DIR/silence_phones.txt" || return 1
  printf '%s\n' 'sil' > "$DICT_DIR/optional_silence.txt" || return 1
  list_nonsilence_phones < "$DICT_DIR/lexicon.txt" \
    > "$DICT_DIR/nonsilence_phones.txt" || return 1
}

CORPUS_DIR="${1:-}"

# Sourced best-effort, exactly as before: if path.sh is missing bash prints a
# diagnostic and the script carries on. No `set -e`/`set -u` here because they
# would also apply to whatever path.sh does; every step in main is checked
# explicitly instead.
# shellcheck disable=SC1091
source ./path.sh

# Byte-wise collation so that sort output is reproducible.
export LC_ALL=C

# A failing stage anywhere in a pipeline makes the whole pipeline fail.
set -o pipefail

# Must stay the last command: the script's exit status is main's status.
main "$CORPUS_DIR"