prepare_dict.sh
4.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/bin/bash
# Copyright 2017 Intellisist, Inc. (Author: Navneeth K)
# 2017 Xiaohui Zhang
# Apache License 2.0
# This script first prepares the switchboard lexicon and the combined CMUDict + tedlium lexicon (referred to as cmudict below, for simplicity).
# It then maps the phones in the switchboard lexicon to cmudict phones and merges the two lexicons to produce the final lexicon in data/local/dict_combined.
# After phone mapping, all alternative pronunciations from the switchboard lexicon are included.
# Phone mapping table: the switchboard phone at index k of replace_swbd_symbols
# is rewritten to the cmudict phone sequence at index k of
# replace_cmudict_symbols.
replace_swbd_symbols=("ax" "el" "en")
replace_cmudict_symbols=("ah" "ah l" "ah n")

# Pull in the recipe-level settings from the current directory.
source ./cmd.sh
source ./path.sh

# Exactly one or two positional arguments are accepted: the SWBD path is
# required, the TEDLIUM path is optional.
case $# in
  1|2)
    ;;
  *)
    echo "Usage: prepare_dict.sh /path/to/SWBD [/path/to/TEDLIUM_r2]"
    exit 1
    ;;
esac

SWBD_DIR="$1"
TEDLIUM_DIR="$2"
#######################################
# Print every line of file $2 that also occurs, verbatim, in file $1.
# Arguments: $1 - reference file
#            $2 - file to filter
# Outputs:   the matching lines of $2 on stdout, in $2's order
#            (a line repeated in $2 is printed each time it occurs)
# Returns:   awk's exit status (non-zero if a file cannot be read)
#######################################
filter_common() {
  # NR==FNR is true only while awk is reading the first file: remember its
  # lines, then print the lines of the second file that were remembered.
  # If the first file is empty the condition stays true for the second file,
  # so nothing is printed -- which is the correct answer for an intersection.
  # The file arguments are quoted so paths containing spaces are not split.
  awk 'NR==FNR { seen[$0] = 1; next } ($0 in seen)' "$1" "$2"
}
#######################################
# Print every line of file $2 that does NOT occur, verbatim, in file $1.
# Arguments: $1 - reference file (lines to exclude)
#            $2 - file to filter
# Outputs:   the surviving lines of $2 on stdout, in $2's order
# Returns:   awk's exit status; 2 if the reference file cannot be read
#######################################
filter_different() {
  # The reference file is loaded explicitly in BEGIN rather than with the
  # NR==FNR idiom: when the first file is empty, NR==FNR stays true for the
  # second file as well, so every line would be swallowed and nothing printed,
  # although all of $2 should be printed in that case.
  awk '
    BEGIN {
      ref = ARGV[1]
      while ((rc = (getline line < ref)) > 0) seen[line] = 1
      if (rc < 0) {
        print "filter_different: cannot read " ref > "/dev/stderr"
        exit 2
      }
      close(ref)
      # An empty ARGV element is skipped, so only $2 is read as main input.
      ARGV[1] = ""
    }
    !($0 in seen)
  ' "$1" "$2"
}
#######################################
# Build the phone-mapping pipeline used to rewrite switchboard phones as
# cmudict phones.
# Globals:   replace_swbd_symbols, replace_cmudict_symbols (read)
#            substitute_arg (written) - text of a shell pipeline of sed
#              commands, meant to be appended after "cat FILE |"
#            num_syms (written) - number of mapped symbols
# Returns:   0 on success, 1 if the two mapping arrays differ in length
#######################################
build_substitute_arg() {
  local swbd_phone cmu_phones stage
  num_syms=0
  substitute_arg=""
  # A length mismatch would silently map a phone to the empty string.
  if [ "${#replace_swbd_symbols[@]}" -ne "${#replace_cmudict_symbols[@]}" ]; then
    echo "$0: replace_swbd_symbols and replace_cmudict_symbols must have the same length" >&2
    return 1
  fi
  for swbd_phone in "${replace_swbd_symbols[@]}"; do
    cmu_phones=${replace_cmudict_symbols[${num_syms}]}
    # The mid-line substitution is run twice for EVERY symbol: a match consumes
    # the trailing space, so in a run such as " ax ax " (e.g.
    # "personably p er s ax ax n b l iy") every second phone is only seen by
    # the second pass.
    stage="sed 's: ${swbd_phone} : ${cmu_phones} :g'"
    stage="$stage | sed 's: ${swbd_phone} : ${cmu_phones} :g'"
    # Line-final phone: require whitespace before it so that a token merely
    # ending in the symbol is left alone. '/' is the delimiter here because the
    # [[:space:]] class itself contains ':'.
    stage="$stage | sed 's/\\([[:space:]]\\)${swbd_phone}\$/\\1${cmu_phones}/'"
    if [ "$num_syms" -eq 0 ]; then
      # Leading space kept so the generated text matches the historical layout.
      substitute_arg=" $stage"
    else
      substitute_arg="$substitute_arg | $stage"
    fi
    num_syms=$((num_syms+1))
  done
  # With nothing to map, fall back to a pass-through so "cat FILE | ..." stays
  # a valid pipeline.
  if [ "$num_syms" -eq 0 ]; then
    substitute_arg=" cat"
  fi
  return 0
}
build_substitute_arg || exit 1
# Prepare the switchboard lexicon. Each helper's failure aborts the script so
# that the merge below never runs on missing or stale lexicons.
local/swbd1_data_download.sh "$SWBD_DIR" || exit 1
local/swbd1_prepare_dict.sh || exit 1
# Prepare the cmudict + tedlium lexicon. The TEDLIUM path is optional: when it
# was not given, no argument at all is passed (rather than an empty one).
local/cmu_tedlium_prepare_dict.sh ${TEDLIUM_DIR:+"$TEDLIUM_DIR"} || exit 1
# Output directory for the combined lexicon and the two input dict directories.
dir=data/local/dict_combined
swbd_dir=data/local/dict_swbd
cmudict_dir=data/local/dict_cmu_tedlium
# Start from an empty output directory on every run.
rm -rf "$dir" || exit 1
mkdir -p "$dir" || exit 1
# Input lexicons and the intermediate files written under $dir.
swbd_lex="${swbd_dir}/lexicon.txt"
cmu_lex="${cmudict_dir}/lexicon.txt"
swbd_only="${dir}/lexicon_swbd_unique.txt"
swbd_only_mapped="${dir}/lexicon_swbd_unique_cmuphones.txt"
swbd_shared="${dir}/lexicon_swbd1.txt"
same_pron="${dir}/lexicon_re_match_pron.txt"
diff_pron="${dir}/lexicon_swbd2.txt"
diff_pron_mapped="${dir}/lexicon_swbd3.txt"
mapped_match="${dir}/lexicon_re_swbd4.txt"
mapped_alt="${dir}/lexicon_swbd4.txt"
cmu_for_alt="${dir}/lexicon_cmudict4.txt"

# Drop non-scored entries: lines matching the bracket pattern and <unk>.
drop_nonscored() {
  grep -v '\[*\]' | grep -v '<unk>'
}

# Rewrite swbd phones as cmu phones: $1 = input lexicon, $2 = output lexicon.
# The pipeline text in $substitute_arg is written to ${dir}/substitute.sh and
# that script is then run with bash.
map_to_cmu_phones() {
  echo "cat $1 | $substitute_arg" > "${dir}/substitute.sh"
  bash "${dir}/substitute.sh" > "$2"
}

# 1. Entries whose word is absent from the cmudict lexicon (non-scored removed).
utils/filter_scp.pl --exclude "$cmu_lex" "$swbd_lex" \
  | drop_nonscored > "$swbd_only" || exit 1;

# 2. Map the phones of those swbd-only entries to cmu phones.
map_to_cmu_phones "$swbd_only" "$swbd_only_mapped" || exit 1;

# 3. The remaining swbd entries, i.e. words present in both lexicons
#    (non-scored removed).
utils/filter_scp.pl --exclude "$swbd_only" "$swbd_lex" \
  | drop_nonscored > "$swbd_shared" || exit 1;

# 4. Shared-word entries whose full line (word + pronunciation) is identical
#    in both lexicons.
filter_common "$cmu_lex" "$swbd_shared" > "$same_pron" || exit 1;

# 5. Shared-word entries whose line differs from cmudict.
filter_different "$same_pron" "$swbd_shared" > "$diff_pron" || exit 1;

# 6. Map the phones of those differing entries to cmu phones.
map_to_cmu_phones "$diff_pron" "$diff_pron_mapped" || exit 1;

# 7. Lines that match cmudict once the phones have been mapped.
filter_common "$cmu_lex" "$diff_pron_mapped" > "$mapped_match" || exit 1;

# 8. Lines that still do not match after mapping (alternative pronunciations).
filter_different "$cmu_lex" "$diff_pron_mapped" > "$mapped_alt" || exit 1;

# 9. The cmudict entries for the words that have alternative pronunciations.
utils/filter_scp.pl "$mapped_alt" "$cmu_lex" > "$cmu_for_alt" || exit 1;
# Write lexicon.txt: the swbd alternative pronunciations, the phone-mapped
# swbd-only entries and the whole cmudict lexicon, sorted with duplicates
# removed.
sort -u "${dir}/lexicon_swbd4.txt" "${dir}/lexicon_swbd_unique_cmuphones.txt" \
  "${cmudict_dir}/lexicon.txt" > "${dir}/lexicon.txt" || exit 1
# Separate the word from its phone sequence by a TAB; phones stay separated by
# single spaces. Lines with fewer than two fields are dropped: formatting them
# would emit the word as its own pronunciation (one field) or a lone TAB
# (blank line).
awk '
  NF < 2 {
    if (NF == 1) {
      print "prepare_dict.sh: dropping entry without pronunciation: " $1 > "/dev/stderr"
    }
    next
  }
  {
    printf("%s\t", $1)
    for (i = 2; i < NF; i++) {
      printf("%s ", $i)
    }
    printf("%s\n", $NF)
  }
' "${dir}/lexicon.txt" > "${dir}/lexicon_tab_separated.txt" || exit 1
mv "${dir}/lexicon_tab_separated.txt" "${dir}/lexicon.txt" || exit 1
# Copy the silence, nonsilence and optional-silence phone lists from the
# cmudict + tedlium dict directory (all phones were mapped to that phone set).
for phone_file in nonsilence_phones.txt silence_phones.txt optional_silence.txt; do
  cp "${cmudict_dir}/${phone_file}" "${dir}" || exit 1
done
# As before, a failure to copy extra_questions.txt is reported by cp but does
# not stop the script; validation below has the final say.
cp "${cmudict_dir}/extra_questions.txt" "${dir}"
# Validate the dict directory; its exit status is the script's exit status.
utils/validate_dict_dir.pl "$dir"