gp_prep_flists.sh
4.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/bin/bash -u
# Copyright 2012 Arnab Ghoshal
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Abort on the first failing command, and make a pipeline fail when any of
# its stages fails (not only the last one).
set -o errexit
set -o pipefail
# Options given on the "#!" line are dropped when the script is started as
# "bash <script>", so ask for unset-variable checking here as well.
set -o nounset
# Print the absolute path of the directory named in a "--option=DIR" argument.
# Arguments: $1 - an argument of the form NAME=DIR
# Outputs:   the absolute path of DIR on stdout; an error message on stderr
# Exits:     with status 1 if DIR is not a directory or cannot be entered.
#            Callers use this inside a command substitution, so the exit
#            ends that subshell and surfaces as a non-zero status.
function read_dirname () {
  local dir_name abs_path
  # Everything after the first '=' is the directory; an argument without
  # any '=' yields an empty name and is rejected below.
  case "$1" in
    *=*) dir_name=${1#*=} ;;
    *) dir_name= ;;
  esac
  if [ ! -d "$dir_name" ]; then
    echo "Argument '$dir_name' not a directory" >&2
    exit 1
  fi
  # Declared separately from the assignment so that a failing cd is not
  # masked by the exit status of "local".
  abs_path=$(cd -- "$dir_name" >/dev/null 2>&1 && pwd) || exit 1
  printf '%s\n' "$abs_path"
}
# Command-line handling. Sets GPDIR, WDIR, DEVSPK, EVALSPK, LANGMAP and LCODE
# from the arguments, or prints the usage message and exits.
PROG=${0##*/}
# Every line of the message ends in a literal "\n" escape followed by a real
# newline. It is printed with an unquoted `echo -e $usage` on purpose: word
# splitting folds the real newlines into spaces, so only the "\n" and "\t"
# escapes shape the output.
usage="Usage: $PROG <arguments> <2-letter language code>\n
Prepare train, dev, eval file lists for a language.\n\n
Required arguments:\n
--corpus-dir=DIR\tDirectory for the GlobalPhone corpus\n
--dev-spk=FILE\t\tDev set speaker list\n
--eval-spk=FILE\tEval set speaker list\n
--lang-map=FILE\tMapping from 2-letter language code to full name\n
--work-dir=DIR\t\tPlace to write the files (in a subdirectory with the 2-letter language code)\n
";

# Five options plus the language code are required.
if [ $# -lt 6 ]; then
  echo -e $usage >&2
  exit 1
fi

while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      echo -e $usage
      exit 0
      ;;
    # Directory options are checked and made absolute by read_dirname.
    --corpus-dir=*) GPDIR=$(read_dirname "$1") ;;
    --work-dir=*) WDIR=$(read_dirname "$1") ;;
    # File options: keep whatever follows the first '='.
    --dev-spk=*) DEVSPK=${1#*=} ;;
    --eval-spk=*) EVALSPK=${1#*=} ;;
    --lang-map=*) LANGMAP=${1#*=} ;;
    # Any two-character argument is taken as the language code.
    ??) LCODE=$1 ;;
    *)
      echo "Unknown argument: $1, exiting" >&2
      echo -e $usage >&2
      exit 1
      ;;
  esac
  shift
done

# Six arguments do not guarantee six distinct, non-empty settings (an option
# may be repeated or given as "--name="), so check each one explicitly and
# report what is missing.
missing=""
[ -n "${GPDIR:-}" ] || missing="$missing --corpus-dir"
[ -n "${WDIR:-}" ] || missing="$missing --work-dir"
[ -n "${DEVSPK:-}" ] || missing="$missing --dev-spk"
[ -n "${EVALSPK:-}" ] || missing="$missing --eval-spk"
[ -n "${LANGMAP:-}" ] || missing="$missing --lang-map"
[ -n "${LCODE:-}" ] || missing="$missing <2-letter language code>"
if [ -n "$missing" ]; then
  echo "$PROG: missing required argument(s):$missing" >&2
  echo -e $usage >&2
  exit 1
fi
# Scratch directory for intermediate lists; removed on every exit path.
tmpdir=$(mktemp -d)
trap 'rm -rf "$tmpdir"' EXIT

# Build the dev and eval speaker patterns: from the speaker-list line(s) that
# start with $LCODE, drop the first tab-separated field, put each
# space-separated entry on its own line, and rewrite every entry as
# "<LCODE><entry>_".
grep "^$LCODE" "$DEVSPK" | cut -f2- | tr ' ' '\n' \
  | sed -e "s?^?$LCODE?" -e 's?$?_?' > "$tmpdir/dev_spk"
grep "^$LCODE" "$EVALSPK" | cut -f2- | tr ' ' '\n' \
  | sed -e "s?^?$LCODE?" -e 's?$?_?' > "$tmpdir/eval_spk"

# Currently the Dev/Eval info is missing for some languages and is marked
# by either TBA or XXX in the speaker list. We are currently not processing
# such languages.
if grep -Eq 'XXX|TBA' "$tmpdir/dev_spk"; then
  echo "Dev speaker list not defined. File contents:" >&2
  cat "$tmpdir/dev_spk" >&2
  exit 1
fi
if grep -Eq 'XXX|TBA' "$tmpdir/eval_spk"; then
  echo "Eval speaker list not defined. File contents:" >&2
  cat "$tmpdir/eval_spk" >&2
  exit 1
fi
# We are going to use the 2-letter codes throughout, but the top-level
# directories of the GlobalPhone corpus use the full names of languages.
# The code is compared exactly against the first field of the map, so it
# cannot accidentally match text elsewhere on a line; only the first match
# is used.
# NOTE(review): assumes each map line is "<code> <full name>" - confirm
# against the --lang-map file.
full_name=$(awk -v code="$LCODE" '$1 == code {print $2; exit}' "$LANGMAP")
if [ -z "$full_name" ]; then
  echo "No entry for language code '$LCODE' in $LANGMAP" >&2
  exit 1
fi

# One pattern per entry under adc/, in the same "<LCODE><name>_" form as the
# dev and eval speaker lists.
ls "$GPDIR/$full_name/adc" | sed -e "s?^?$LCODE?" -e 's?$?_?' \
  > "$tmpdir/all_spk"
# Training speakers are those that match neither the dev nor the eval list.
grep -v -f "$tmpdir/dev_spk" -f "$tmpdir/eval_spk" "$tmpdir/all_spk" \
  > "$tmpdir/train_spk"

# All transcript (.rmn) files of the language.
find "$GPDIR/$full_name/rmn" -name '*.rmn' > "$tmpdir/trans.list"

ODIR=$WDIR/$LCODE/local  # Directory to write file lists & transcripts
mkdir -p "$ODIR" "$WDIR/$LCODE/wav"  # Directory for WAV files
# For each of dev, eval and train: list the audio files of that set's
# speakers, run the audio conversion, and write the wav.scp, transcript,
# utt2spk and spk2utt files for the utterances that have both converted
# audio and a transcript.
for x in dev eval train; do
  find "$GPDIR/$full_name/adc" -name "${LCODE}*\.adc\.shn" \
    | grep -f "$tmpdir/${x}_spk" > "$ODIR/${x}_${LCODE}.flist"
  # The audio conversion is done here since some files cannot be converted,
  # and those need to be removed from the file lists.
  gp_convert_audio.sh --input-list="$ODIR/${x}_${LCODE}.flist" \
    --output-dir="$WDIR/$LCODE/wav" \
    --output-list="$ODIR/${x}_${LCODE}_wav.flist"
  # Get the utterance IDs for the audio files successfully converted to WAV:
  # strip the directory part and the ".wav" suffix (the dot is escaped so
  # that only a literal ".wav" is removed).
  sed -e "s?.*/??" -e 's?\.wav$??' "$ODIR/${x}_${LCODE}_wav.flist" \
    > "$tmpdir/${x}_basenames_wav"
  # "<utterance ID><TAB><wav path>", sorted by ID so it can be joined below.
  paste "$tmpdir/${x}_basenames_wav" "$ODIR/${x}_${LCODE}_wav.flist" \
    | sort -k1,1 > "$tmpdir/${x}_${LCODE}_wav.scp"
  cut -f1 "$tmpdir/${x}_${LCODE}_wav.scp" > "$tmpdir/${x}_basenames_wav2"
  # Now, get the transcripts: each line of the output contains an utterance
  # ID followed by the transcript.
  sed -e 's?_$??' "$tmpdir/${x}_spk" | grep -f - "$tmpdir/trans.list" \
    | gp_extract_transcripts.pl | sort -k1,1 > "$tmpdir/${x}_${LCODE}.trans"
  # Intersect the set of utterances with transcripts with the set of those
  # with valid audio.
  cut -f1 "$tmpdir/${x}_${LCODE}.trans" \
    | join "$tmpdir/${x}_basenames_wav2" - > "$tmpdir/${x}_basenames"
  # Get the common set of WAV files and transcripts.
  join "$tmpdir/${x}_basenames" "$tmpdir/${x}_${LCODE}_wav.scp" \
    > "$ODIR/${x}_${LCODE}_wav.scp"
  join "$tmpdir/${x}_basenames" "$tmpdir/${x}_${LCODE}.trans" \
    > "$ODIR/${x}_${LCODE}.trans"
  # utt2spk: the speaker is the utterance ID up to its first underscore.
  sed -e 's?_.*$??' "$tmpdir/${x}_basenames" \
    | paste -d' ' "$tmpdir/${x}_basenames" - \
    > "$ODIR/${x}_${LCODE}.utt2spk"
  utt2spk_to_spk2utt.pl "$ODIR/${x}_${LCODE}.utt2spk" \
    > "$ODIR/${x}_${LCODE}.spk2utt" || exit 1
done