gp_data_prep.sh
4.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/bin/bash -u
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Abort on the first failing command and on any use of an unset variable.
# nounset is enabled here in addition to the -u on the shebang line because
# shebang options are dropped when the script is started as
# `bash gp_data_prep.sh`.
set -o errexit -o nounset
# Report a fatal error and stop.
# Arguments: the message words. They are passed to `echo -e`, so backslash
#            escapes such as \n and \t in the message are expanded.
# Outputs:   the message on stderr.
# Exits with status 1; never returns to the caller.
error_exit() {
  {
    echo -e "$@"
  } 1>&2
  exit 1
}
# Resolve the directory named in a --option=DIR style argument to an
# absolute path.
# Arguments: $1 - the option string; the directory is everything after the
#                 first '='.
# Outputs:   the absolute path of the directory on stdout.
# Exits (through error_exit) when the value is not a directory or cannot be
# entered. Callers invoke this inside a command substitution, so that exit
# ends the substitution's subshell and shows up as a failed assignment.
read_dirname() {
  local dir_name=""
  local abs_path
  # Keep only what follows the first '='. An argument without '=' leaves the
  # name empty, which is rejected by the directory test below.
  case "$1" in
    *=*) dir_name=${1#*=} ;;
  esac
  [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"
  # Declaration and assignment are separate so a failing cd is not masked by
  # `local`. CDPATH is cleared so a relative name is resolved against the
  # current directory only and cd prints nothing of its own on stdout.
  abs_path=$(CDPATH= cd -- "$dir_name" 2>/dev/null && pwd) \
    || error_exit "Cannot enter directory '$dir_name'"
  printf '%s\n' "$abs_path"
}
# Name of this script as invoked; used as a prefix in messages.
PROG=$(basename -- "$0")

# Usage text. It contains backslash escapes (\n, \t) and is meant to be
# printed with `echo -e`. Each source line ends in a backslash so the only
# line breaks in the output are the explicit \n sequences.
usage="Usage: $PROG <arguments>\n\
Prepare train, dev, eval file lists for a language.\n\n\
Required arguments:\n\
--config-dir=DIR\tDirectory containing the necessary config files\n\
--corpus-dir=DIR\tDirectory for the GlobalPhone corpus\n\
--lm-dir=DIR\t\tDirectory containing language models\n\
--work-dir=DIR\t\tWorking directory\n"

# Start from empty values so the completeness check below reflects only what
# was given on the command line.
CONFDIR=""
GPDIR=""
LMDIR=""
WDIR=""

# Parse the command line. The options are handled before any completeness
# check so that `--help` works on its own.
# Each directory option is resolved to an absolute path by read_dirname.
while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      echo -e "$usage"
      exit 0
      ;;
    --config-dir=*)
      CONFDIR=$(read_dirname "$1") || exit 1
      ;;
    --corpus-dir=*)
      GPDIR=$(read_dirname "$1") || exit 1
      ;;
    --lm-dir=*)
      LMDIR=$(read_dirname "$1") || exit 1
      ;;
    --work-dir=*)
      WDIR=$(read_dirname "$1") || exit 1
      ;;
    *)
      echo "Unknown argument: $1, exiting" >&2
      error_exit "$usage"
      ;;
  esac
  shift
done

# All four directories are required. Counting arguments is not enough, since
# the same option could have been repeated.
if [ -z "$CONFDIR" ] || [ -z "$GPDIR" ] || [ -z "$LMDIR" ] || [ -z "$WDIR" ]; then
  error_exit "$usage"
fi
# (1) check if the config files are in place:
# ${VAR:?} aborts with a message when the variable is empty or unset; an
# unquoted empty value would instead make `cd` silently go to $HOME.
cd -- "${CONFDIR:?config directory is not set}" || exit 1
[ -f dev_spk.list ] || error_exit "$PROG: Dev-set speaker list not found."
[ -f eval_spk.list ] || error_exit "$PROG: Eval-set speaker list not found."
[ -f lang_codes.txt ] || error_exit "$PROG: Mapping for language name to 2-letter code not found."

# The remaining steps use paths relative to the working directory.
cd -- "${WDIR:?working directory is not set}" || exit 1
# Sets the PATH to contain necessary executables
if [ -f path.sh ]; then
  . ./path.sh
fi
# (2) get the various file lists (for audio, transcription, etc.) for the
# specified language.
# Running these in parallel since this does audio conversion (to figure out
# which files cannot be processed) and takes some time to run.
# Each job's stderr goes to data/<code>/prep_flists.log.
flist_jobs=""
for LCODE in GE PO SP SW; do
  mkdir -p "data/$LCODE"
  gp_prep_flists.sh --corpus-dir="$GPDIR" --dev-spk="$CONFDIR/dev_spk.list" \
    --eval-spk="$CONFDIR/eval_spk.list" --lang-map="$CONFDIR/lang_codes.txt" \
    --work-dir=data "$LCODE" 2>"data/$LCODE/prep_flists.log" &
  # Remember "<code>:<pid>" so every job can be waited for individually.
  flist_jobs="$flist_jobs $LCODE:$!"
done

# A bare `wait` always succeeds and errexit does not cover background jobs,
# so each job is waited for by PID and its exit status checked explicitly.
flist_failed=""
# Word splitting of $flist_jobs is intentional: entries contain no whitespace.
for flist_job in $flist_jobs; do
  wait "${flist_job#*:}" || flist_failed="$flist_failed ${flist_job%%:*}"
done
[ -z "$flist_failed" ] \
  || error_exit "$PROG: gp_prep_flists.sh failed for:$flist_failed (see data/<code>/prep_flists.log)."
# (3) Normalize the dictionary and transcripts.
# For every language this writes, under data/<code>/local: the lexicon with
# and without silence entries, the disambiguated lexicon and its symbol
# count, the phone and word symbol tables, and normalized transcripts
# (*.trans2) for train, dev and eval.
for LCODE in GE PO SP SW; do
  lex_dir="data/$LCODE/local"

  # Second field of the lang_codes.txt line(s) matching the language code.
  # The code is passed with -v instead of being spliced into the awk program.
  full_name=$(awk -v code="$LCODE" '$0 ~ code {print $2}' "$CONFDIR/lang_codes.txt")
  [ -n "$full_name" ] \
    || error_exit "$PROG: No entry for '$LCODE' in $CONFDIR/lang_codes.txt."

  "gp_norm_dict_${LCODE}.pl" -i "$GPDIR/Dictionaries/${LCODE}/${full_name}-GPDict.txt" \
    | sort -u > "$lex_dir/lexicon_nosil_${LCODE}.txt"

  # Full lexicon = silence and unknown-word entries followed by the
  # normalized dictionary.
  {
    printf '!SIL\tSIL\n<UNK>\tSPN\n'
    cat "$lex_dir/lexicon_nosil_${LCODE}.txt"
  } > "$lex_dir/lexicon_${LCODE}.txt"

  # add disambig symbols to the lexicon:
  ndisambig=$(add_lex_disambig.pl "$lex_dir/lexicon_${LCODE}.txt" \
    "$lex_dir/lexicon_disambig_${LCODE}.txt")
  ndisambig=$((ndisambig + 1)) # add one disambig symbol for silence
  echo "$ndisambig" > "$lex_dir/lex_ndisambig"

  # Get the list of phones and map them to integers (adding silence and spoken
  # noise to the list).
  # NOTE(review): the sed expression deletes from the first '_' to the end of
  # the line, so phones that follow a suffixed phone on the same line would be
  # dropped as well - confirm this is intended for the lexicon format.
  cut -f2 "$lex_dir/lexicon_nosil_${LCODE}.txt" | sed -e "s?_.*??g" \
    | tr ' ' '\n' | sort -u \
    | awk 'BEGIN{ print "<eps> 0"; print "SIL 1"; print "SPN 2"; N=3; }
           { printf("%s %d\n", $1, N++); }' > "$lex_dir/phones.txt"
  # If using word-boundary markers on phones, use this in the awk command above
  # { printf("%s_WB %d\n", $1, N++); }
  # If using position markers on phones, use these in the awk command above
  # { printf("%s_B %d\n", $1, N++); }
  # { printf("%s_E %d\n", $1, N++); }
  # { printf("%s_S %d\n", $1, N++); }

  # Get the list of words: <eps> is 0, the words are numbered from 1 in sorted
  # order, and #0 takes the next free integer.
  cut -f1 "$lex_dir/lexicon_${LCODE}.txt" | sort -u \
    | awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);}
           END{printf("#0 %d\n", NR+1);}' > "$lex_dir/words.txt"

  for x in train dev eval; do
    "gp_norm_trans_${LCODE}.pl" -i "$lex_dir/${x}_${LCODE}.trans" \
      > "$lex_dir/${x}_${LCODE}.trans2"
  done
done
# (4) Normalize the LMs - this is very Edinburgh-specific since we have some
# LMs that came with the GlobalPhone corpus.
# The directories are quoted so that a path containing whitespace reaches the
# helper script as a single argument.
gp_prep_lms_edin.sh --lm-dir="$LMDIR" --work-dir="$WDIR"
echo "Finished data preparation."