lang_prep.sh
1.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/bin/bash
## Builds the dictionary files under data/local/dict from a corpus directory.
##
## Only run this file from the example root directory: ./path.sh is sourced
## and every output path (data/local/dict/...) is relative to the current
## working directory.
##
## Usage:
##   $ ./local/lang_prep.sh <CORPUS_DIR>
##
## Arguments:
##   CORPUS_DIR  corpus root; the pronunciation dictionary is read from
##               $CORPUS_DIR/diccionarios/T22.full.dic

# Fail fast on a missing argument: with an empty CORPUS_DIR the dictionary
# path would silently resolve to /diccionarios/T22.full.dic and the script
# would write a lexicon that contains only the silence/unknown entries.
if [[ $# -lt 1 || -z "$1" ]]; then
  echo "Usage: $0 <CORPUS_DIR>" >&2
  exit 1
fi
CORPUS_DIR="$1"
if [[ ! -d "$CORPUS_DIR" ]]; then
  echo "$0: corpus directory not found: $CORPUS_DIR" >&2
  exit 1
fi

mkdir -p "data/local/dict"
# Environment setup supplied by the example root (contents not shown here).
source ./path.sh
#############################
# data/local/dict/lexicon.txt
#############################
# Byte-wise character classes and collation for tr/sed/sort below; exported,
# so it also applies to every command that runs later in the script.
export LC_ALL=C

#######################################
# Filter that turns raw dictionary text into sorted, unique lexicon lines.
# Reads stdin, writes stdout. The sed expressions run in this exact order on
# each line (earlier rewrites change what the later ones can match).
# NOTE(review): "_7" / "-7" / "_" look like accent or stress markers of the
# source dictionary - confirm against the corpus documentation.
# Globals:   LC_ALL (read; expected to be C, see export above)
# Arguments: none
# Outputs:   normalized lines on stdout
#######################################
normalize_dictionary() {
  tr '[:upper:]' '[:lower:]' \
    | sed \
      -e 's/([0123456789]*)//g' \
      -e 's/\([^ ]\)n\~/\1n/g' \
      -e 's/a_7/a/g' -e 's/e_7/e/g' -e 's/i_7/i/g' -e 's/o_7/o/g' -e 's/u_7/u/g' \
      -e 's/a-7/a/g' -e 's/e-7/e/g' -e 's/i-7/i/g' -e 's/o-7/o/g' -e 's/u-7/u/g' \
      -e 's/a_/a/g' -e 's/e_/e/g' -e 's/i_/i/g' -e 's/o_/o/g' -e 's/u_/u/g' \
      -e 's/_7n.*$//' \
      -e 's/atl_7tica/atletica/' \
      -e 's/biol_7gicas/biologicas/' \
      -e 's/elec_7ctrico/electrico/' \
      -e 's/gr_7afico/grafico/' \
      -e 's/s_7lo/solo/' \
      -e 's/n~/ni/g' -e 's/r(/rh/g' \
      -e $'s/\t/ /g' -e '/^$/d' \
    | sort -u
  # Expression summary, in order:
  #   1. drop parenthesised digit runs such as "(2)"
  #   2. drop the "~" of "n~" when a non-space character precedes it
  #   3. vowel + "_7", vowel + "-7", vowel + "_"  ->  bare vowel
  #   4. cut the line at the first remaining "_7n"
  #   5. five word-specific spelling repairs (first match per line only)
  #   6. every remaining "n~" -> "ni", every "r(" -> "rh"
  #   7. tab -> space (literal tab via $'...', since "\t" in a sed regex is
  #      a GNU extension), then delete lines that ended up empty
}

dict_src="$CORPUS_DIR/diccionarios/T22.full.dic"
{
  # Fixed entries for silence and unknown words come first, unsorted.
  printf '!SIL sil\n<UNK> spn\n'
  normalize_dictionary < "$dict_src"
} > data/local/dict/lexicon.txt
#######################################
# data/local/dict/silence_phones.txt
# data/local/dict/optional_silence.txt
# data/local/dict/nonsilence_phones.txt
# (data/local/dict/extra_questions.txt is not written by this section.)
#######################################

#######################################
# Write the silence / optional-silence phone files and derive the sorted,
# unique list of non-silence phones from the lexicon.
# Arguments: $1 - dictionary directory that contains lexicon.txt
#                 (default: data/local/dict)
# Outputs:   $1/silence_phones.txt, $1/optional_silence.txt,
#            $1/nonsilence_phones.txt
#######################################
write_phone_lists() {
  local dict_dir=${1:-data/local/dict}

  printf 'sil\nspn\n' > "$dict_dir/silence_phones.txt"
  printf 'sil\n' > "$dict_dir/optional_silence.txt"

  # Skip the silence/unknown entries, then print every field after the word,
  # one per line. Looping over awk fields (instead of cut + sed) means that
  # an entry with no pronunciation contributes nothing rather than leaking
  # the word itself into the phone list, and doubled or trailing spaces do
  # not produce empty "phones".
  grep -v -e '<UNK>' -e '!SIL' "$dict_dir/lexicon.txt" \
    | awk '{ for (i = 2; i <= NF; i++) print $i }' \
    | LC_ALL=C sort -u \
    > "$dict_dir/nonsilence_phones.txt"
}

write_phone_lists data/local/dict