gp_data_prep.sh
3.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/bin/bash -u
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Abort on the first failing command and on any expansion of an unset
# variable. nounset is also requested by the "-u" on the shebang line, but
# shebang options are dropped when the script is started as "bash <script>",
# so it is repeated here to make both invocation styles behave the same.
set -o errexit
set -o nounset
# Write a message to stderr and exit the current shell with status 1.
# Arguments: $@ - message words; echo joins them with single spaces and,
#                 because of -e, expands backslash escapes such as \n and \t
# Outputs:   the message on stderr
error_exit() {
  echo -e "$@" 1>&2
  exit 1
}
# Extract the directory from a "--option=DIR" style argument and print its
# absolute path.
# Arguments: $1 - argument of the form NAME=DIR; everything after the first
#                 "=" is taken as the directory name
# Outputs:   the absolute path of DIR (as reported by pwd) on stdout
# Exits:     through error_exit (status 1) when DIR is not a directory or
#            cannot be entered
read_dirname() {
  local dir_name abs_path
  case "$1" in
    *=*) dir_name=${1#*=} ;;
    *)   dir_name= ;;  # no "=" present: nothing to extract
  esac
  [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"
  # Assigned separately from "local" so that a failing cd is not masked.
  # CDPATH is cleared so cd cannot resolve the name elsewhere or echo the
  # target to stdout; the path is quoted so names with spaces survive.
  abs_path=$(CDPATH= cd -- "$dir_name" 2>/dev/null && pwd) \
    || error_exit "Cannot enter directory '$dir_name'"
  printf '%s\n' "$abs_path"
}
PROG=$(basename -- "$0")

# Help text. It is always printed with "echo -e", so \t expands to a tab;
# line breaks are literal newlines inside the string.
usage="Usage: $PROG <arguments>
Prepare train, dev, eval file lists for a language.
e.g.: $PROG --config-dir=conf --corpus-dir=corpus --languages=\"GE PO SP\"

Required arguments:
--config-dir=DIR\tDirectory containing the necessary config files
--corpus-dir=DIR\tDirectory for the GlobalPhone corpus
--languages=STR\tSpace separated list of two letter language codes"

# Parse the --name=value arguments. --help is honoured even when it is the
# only argument; anything unrecognised is reported on stderr.
while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      echo -e "$usage"
      exit 0 ;;
    --config-dir=*)
      # read_dirname validates the directory and returns its absolute path.
      CONFDIR=$(read_dirname "$1"); shift ;;
    --corpus-dir=*)
      GPDIR=$(read_dirname "$1"); shift ;;
    --languages=*)
      # Parameter expansion instead of expr: expr exits non-zero for an empty
      # value, which would silently abort the script under errexit.
      LANGUAGES=${1#*=}; shift ;;
    *)
      echo "Unknown argument: $1, exiting" >&2
      error_exit "$usage" ;;
  esac
done

# Every option is required. Checking each one by name (rather than counting
# arguments) catches duplicated or empty options before they surface later as
# an unbound-variable error.
[ -n "${CONFDIR:-}" ] || error_exit "$PROG: --config-dir is required.\n\n$usage"
[ -n "${GPDIR:-}" ] || error_exit "$PROG: --corpus-dir is required.\n\n$usage"
[ -n "${LANGUAGES:-}" ] || error_exit "$PROG: --languages is required.\n\n$usage"
# (1) Check that the config files are in place. The paths are tested directly
# (and quoted) instead of via pushd/popd, so a config directory whose name
# contains spaces works and the working directory is never changed.
[ -f "$CONFDIR/dev_spk.list" ] \
  || error_exit "$PROG: Dev-set speaker list not found."
[ -f "$CONFDIR/eval_spk.list" ] \
  || error_exit "$PROG: Eval-set speaker list not found."
[ -f "$CONFDIR/lang_codes.txt" ] \
  || error_exit "$PROG: Mapping for language name to 2-letter code not found."

# path.sh is optional; when present in the current directory it is sourced to
# set the PATH to contain the necessary executables.
if [ -f path.sh ]; then
  . ./path.sh
fi
# (2) Get the various file lists (for audio, transcription, etc.) for the
# specified languages.
printf "Preparing file lists ... "
prep_jobs=""   # space separated "pid:language" pairs of the background jobs
for L in $LANGUAGES; do   # intentionally unquoted: split the list on spaces
  mkdir -p "data/$L/local/data"
  # Running these in parallel since this does audio conversion (to figure out
  # which files cannot be processed) and takes some time to run.
  local/gp_prep_flists.sh --corpus-dir="$GPDIR" \
    --dev-spk="$CONFDIR/dev_spk.list" --eval-spk="$CONFDIR/eval_spk.list" \
    --lang-map="$CONFDIR/lang_codes.txt" --work-dir=data "$L" \
    > "data/$L/prep_flists.log" 2>&1 &
  prep_jobs="$prep_jobs $!:$L"
done

# A bare "wait" always returns 0, which would hide failed jobs. Wait for each
# job by PID and remember the languages whose preparation failed.
failed_langs=""
for job in $prep_jobs; do
  wait "${job%%:*}" || failed_langs="$failed_langs ${job#*:}"
done
if [ -n "$failed_langs" ]; then
  echo "Failed"
  error_exit "$PROG: gp_prep_flists.sh failed for:$failed_langs (see data/<language>/prep_flists.log)"
fi
echo "Done"
# (3) Normalize the transcripts: for every language, run its
# local/gp_norm_trans_<LANG>.pl script on the train/dev/eval .trans1 files and
# write the result next to them as <set>_<LANG>.txt.
for L in $LANGUAGES; do   # intentionally unquoted: split the list on spaces
  # The language code is passed as an argument, not embedded in the format
  # string, so a "%" or backslash in it cannot corrupt the output.
  printf 'Language - %s: normalizing transcripts ... ' "$L"
  for x in train dev eval; do
    "local/gp_norm_trans_${L}.pl" -i "data/$L/local/data/${x}_${L}.trans1" \
      > "data/$L/local/data/${x}_${L}.txt"
  done
  echo "Done"
done
# (4) Create the directories containing the files needed in training and
# testing: data/<LANG>/{train,dev,eval}, each holding wav.scp, text, spk2utt
# and utt2spk copied from data/<LANG>/local/data.
for L in $LANGUAGES; do   # intentionally unquoted: split the list on spaces
  # The language code is passed as an argument, not embedded in the format
  # string, so a "%" or backslash in it cannot corrupt the output.
  printf 'Language - %s: formatting train/test data ... ' "$L"
  for x in train dev eval; do
    src="data/$L/local/data/${x}_${L}"
    dest="data/$L/$x"
    mkdir -p "$dest"
    cp "${src}_wav.scp" "$dest/wav.scp"
    cp "${src}.txt" "$dest/text"
    cp "${src}.spk2utt" "$dest/spk2utt"
    cp "${src}.utt2spk" "$dest/utt2spk"
  done
  echo "Done"
done
echo "Finished data preparation."