02_lexicon_LIA.sh
2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/bin/sh
# Prepare the dictionary directory used by prepare_lang.sh from the
# LEXIQUE_V3 phonetic lexicon.
#
# Usage: <this script> EXPE_DIR
#   EXPE_DIR  experiment root; ling_Data/, ling_Data/dict/ and TEMP/ are
#             created underneath it.
#
# All ../ paths are relative to the current working directory, so the script
# has to be started from the directory it expects to live in.
#. 00_init_paths.sh
echo "==> $0"
# Without this guard an empty argument would make the directories below
# resolve to /ling_Data/ and /TEMP at the filesystem root.
if [ -z "$1" ]; then
  echo "Usage: $0 EXPE_DIR" >&2
  exit 1
fi
EXPE_DIR=$1
. ../LIA_kaldiUtils/path.sh
# ================================== #
#LM_SOURCE=../LM_DATABASE
#LM_DIR=$EXPE_DIR/LANGUAGE_MODEL
#SPEERAL_LEXICON=$LM_SOURCE/LEXIQUE_V3.fmt
#TEXT=$LM_DATA/text
#CLEAN_TEXT=$LM_DIR/text.no_oov
# ================================== #
# The trailing slashes are kept so the resulting path strings stay exactly
# what later steps (and their log lines) have always seen.
LM_DATA=$EXPE_DIR/ling_Data/
DICT=$LM_DATA/dict/
mkdir -p "$LM_DATA" "$DICT" "$EXPE_DIR/TEMP" || exit 1
# Output files written by the later steps of this script.
NON_SILENCE_PHONEMES=$DICT/nonsilence_phones.txt
SILENCE_PHONEMES=$DICT/silence_phones.txt
OPTIONAL_SILENCES=$DICT/optional_silence.txt
EXTRA_QUESTIONS=$DICT/extra_questions.txt
OOV=$LM_DATA/oov.txt
# LEXICON is only referenced from commented-out code in this script.
LEXICON=../LM_DATABASE/LEXIQUE_V3.lst
LEXICON_PHON=../LM_DATABASE/LEXIQUE_V3.fmt
# Log the lexicon path and stop right away when the file is missing, rather
# than carrying on and producing an empty dictionary.
ls "$LEXICON_PHON" || exit 1
#====================================#
# Build $DICT/lexicon1.txt from the phonetic lexicon, then derive the list
# of non-silence phones from it.
cp "$LEXICON_PHON" "$DICT" || exit 1
# Filtering rules:
#   NR > 1      skip the first line of the file
#   !~ /^#/     drop lines starting with '#'
#   NF > 0      drop empty and whitespace-only lines
# The previous pattern /^[:space:]*$/ was a plain bracket list of the
# characters ':', 's', 'p', 'a', 'c', 'e', so it did not match blanks.
awk 'NR > 1 && $0 !~ /^#/ && NF > 0' "$DICT/LEXIQUE_V3.fmt" \
  | sort > "$DICT/lexicon1.txt" || exit 1
#cat $LEXICON | sed 's/(.)//' > $DICT/lexicon.txt
wc -l "$DICT/lexicon1.txt"
# Field 1 of each entry is the word; every further field is collected as a
# phone symbol (printed once each, in awk's unspecified array order).
# grep -v SIL removes any symbol containing "SIL"; because grep exits
# non-zero when no line is left, an empty phone list aborts the script.
awk '{ for (n = 2; n <= NF; n++) phones[$n] = 1 }
     END { for (p in phones) print p }' "$DICT/lexicon1.txt" \
  | grep -v SIL > "$NON_SILENCE_PHONEMES" || exit 1
#cat $KALDI_LEXICON | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | grep -v SIL > $NON_SILENCE_PHONEMES || exit 1;
#head $NON_SILENCE_PHONEMES
# Write the fixed phone lists and the OOV symbol, one symbol per line.
# Each write is checked so an unwritable dictionary directory stops the run.
printf '%s\n' SIL SPN NSN LAU > "$SILENCE_PHONEMES" || exit 1
printf '%s\n' SIL > "$OPTIONAL_SILENCES" || exit 1
printf '%s\n' '<UNK>' > "$OOV" || exit 1
# No "extra questions" in the input to this setup, as we don't have stress or tone.
touch "$EXTRA_QUESTIONS" || exit 1
# Prepend the '<UNK> SPN' entry to the cleaned lexicon and publish the result
# as lexicon.txt. The silence/noise/laughter entries below are disabled, so
# <UNK> is the only entry added here.
#echo '!SIL SIL'; echo '[VOCALIZED-NOISE] SPN'; echo '[NOISE] NSN'; echo '[LAUGHTER] LAU';
{
  echo '<UNK> SPN'
  cat "$DICT/lexicon1.txt"
} > "$DICT/lexicon2.txt" || exit 1
cp "$DICT/lexicon2.txt" "$DICT/lexicon.txt" || exit 1
#utils/prepare_lang.sh $LM_DATA "<UNK>" $EXPE_DIR/TEMP $LM_DATA
#prepare_lang.sh $LM_DATA "<UNK>" $EXPE_DIR/TEMP $LM_DATA
# Run prepare_lang.sh on the dictionary directory built above.
# NOTE(review): prepare_lang.sh is presumably put on PATH by the sourced
# path.sh, and the arguments look like <dict dir> <oov word> <tmp dir>
# <output dir> -- confirm against the prepare_lang.sh in use.
# This is the last command, so its exit status is the script's exit status.
echo "====> prepare_lang.sh $DICT <UNK> $EXPE_DIR/TEMP $LM_DATA"
prepare_lang.sh "$DICT" "<UNK>" "$EXPE_DIR/TEMP" "$LM_DATA"