#!/bin/bash
# Copyright 2015 Johns Hopkins University (Author: Vijayaditya Peddinti)
# Apache 2.0.
# Abort on the first unhandled command failure.
set -e

# Default configuration. These are assigned before utils/parse_options.sh is
# sourced; NOTE(review): presumably that script lets command-line options
# override them (Kaldi convention) -- confirm against utils/parse_options.sh.
stage=0
# Location of aspire data.
aspire_data=/export/corpora/LDC/LDC2017S21/IARPA-ASpIRE-Dev-Sets-v2.0/data # for JHU
mean_rms=0.0417 # determined from the mean rms value of data/train_rvb/mean_rms

. ./path.sh # Needed for KALDI_ROOT
. utils/parse_options.sh

# Input locations inside the corpus directory.
dev_transcript=$aspire_data/dev_and_dev_test_STM_files
dev_audio=$aspire_data/dev_and_dev_test_audio/ASpIRE_single_dev
test_audio=$aspire_data/dev_and_dev_test_audio/ASpIRE_single_dev_test

# The glm file is copied into every prepared data directory later on, so
# refuse to start without it.
if [ ! -f "$aspire_data/my_english.glm" ]; then
  echo "Expected to find the glm file, provided in ASpIRE challenge." >&2
  echo "Please provide the glm file in $aspire_data." >&2
  exit 1
fi

# (1) Get transcripts in one file, and clean them up ..
# Scratch directory for file lists and intermediate transcripts; made absolute
# relative to the current working directory.
tmpdir=$(pwd)/data/local/data
mkdir -p "$tmpdir"
#######################################
# Counts the distinct recording ids (first whitespace-separated field of each
# non-blank line) across a set of STM files.
# Arguments: $1 - file listing STM paths, one per line
# Outputs:   the count, on stdout
#######################################
count_stm_recordings() {
  local flist=$1 stm
  while IFS= read -r stm; do
    [ -n "$stm" ] || continue
    awk 'NF > 0 {print $1}' "$stm"
  done < "$flist" | sort -u | wc -l   # sort -u: ids need not be adjacent
}

# Stage 0: locate the transcripts and audio, and verify the expected counts.
if [ "$stage" -le 0 ]; then
  find "$dev_transcript/" -name 'dev.stm' > "$tmpdir/transcripts.flist"
  find "$dev_audio/" -name '*.wav' > "$tmpdir/wav.flist"
  find "$test_audio/" -name '*.wav' > "$tmpdir/wav_test.flist"

  n=$(count_stm_recordings "$tmpdir/transcripts.flist")
  if [ "$n" -ne 30 ]; then
    echo "Expected to find 30 transcript files in the aspire_single_dev_transcript directory, found $n" >&2
    exit 1
  fi

  n=$(wc -l < "$tmpdir/wav.flist")
  if [ "$n" -ne 30 ]; then
    echo "Expected to find 30 .wav files in the aspire_single_dev directory, found $n" >&2
    exit 1
  fi

  n=$(wc -l < "$tmpdir/wav_test.flist")
  if [ "$n" -ne 60 ]; then
    echo "Expected to find 60 .wav files in the aspire_single_dev_test data, found $n" >&2
    exit 1
  fi
fi
# create the dev_aspire files
dev=data/dev_aspire
# Stage 1: turn the STM transcripts into the raw transcript ($tmpdir/text.1)
# plus $dev/utt2spk, $dev/segments and $dev/stm.
if [ "$stage" -le 1 ]; then
  mkdir -p "$dev"
  # transcription file format
  # single_074f59de 1 single_074f59de 497.775 506.595 um everybody can't get their needs met in in in in a in a negotiations or to to their satisfaction but at least you're attemptin
  : > "$tmpdir/text.1" || exit 1
  # Paths are handed to Python through argv rather than being interpolated
  # into the program text, so unusual characters in them cannot alter the code.
  python -c '
import sys

flist_path, text_path, out_dir = sys.argv[1:4]

with open(flist_path) as flist:
  stm_paths = [p.strip() for p in flist if p.strip()]

utt2spk = []
with open(text_path, "w") as trans_file, \
     open(out_dir + "/segments", "w") as segments_file, \
     open(out_dir + "/stm", "w") as stm_file:
  for stm_path in stm_paths:
    with open(stm_path) as stm_in:
      for line_no, line in enumerate(stm_in, 1):
        parts = line.split()
        # Skip blank lines and STM comment lines (";;" prefix).
        if not parts or parts[0].startswith(";;"):
          continue
        if len(parts) < 5:
          sys.exit("{0}:{1}: expected at least 5 fields, found {2}".format(
              stm_path, line_no, len(parts)))
        file_id, channel, begin, end = parts[0], parts[1], parts[3], parts[4]
        spk_id = "{0}-{1}".format(file_id, channel)
        # Utterance id: speaker id plus start/end times in zero-padded ms.
        utt_id = "{0}-{1:06}-{2:06}".format(
            spk_id, int(float(begin) * 1000), int(float(end) * 1000))
        stm_file.write("{0} A {0} {1}\n".format(spk_id, " ".join(parts[3:])))
        trans_file.write("{0} {1}\n".format(utt_id, " ".join(parts[5:])))
        segments_file.write("{0} {1}-1 {2} {3}\n".format(utt_id, file_id, begin, end))
        utt2spk.append("{0} {1}\n".format(utt_id, spk_id))

# utt2spk is written sorted; the other files keep transcript order.
utt2spk.sort()
with open(out_dir + "/utt2spk", "w") as utt2spk_file:
  utt2spk_file.write("".join(utt2spk))
' "$tmpdir/transcripts.flist" "$tmpdir/text.1" "$dev" || exit 1
fi
#######################################
# Cleans raw "utt-id word..." transcript lines.
# Sorts the lines, drops any line containing "((", drops lines with no words
# after the utterance id, renames [laugh] to [laughter], and maps the other
# non-speech tags ([sigh], [cough], [mn], [breath], [lipsmack]) to [noise].
# Inputs:  transcript lines on stdin
# Outputs: cleaned lines on stdout
#######################################
normalize_transcripts() {
  sort \
    | grep -v '((' \
    | awk 'NF > 1' \
    | sed -e 's:\[laugh\]:[laughter]:g' \
          -e 's:\[sigh\]:[noise]:g' \
          -e 's:\[cough\]:[noise]:g' \
          -e 's:\[mn\]:[noise]:g' \
          -e 's:\[breath\]:[noise]:g' \
          -e 's:\[lipsmack\]:[noise]:g'
}

# Stage 2: write the cleaned transcript to $dev/text and derive spk2utt.
if [ "$stage" -le 2 ]; then
  normalize_transcripts < "$tmpdir/text.1" > "$tmpdir/text.2"
  cp "$tmpdir/text.2" "$dev/text"
  utils/utt2spk_to_spk2utt.pl < "$dev/utt2spk" > "$dev/spk2utt"
fi
#######################################
# Builds a wav.scp in which every recording is piped through sox, scaled so
# that its RMS amplitude (as reported by "sox <wav> -n stat") becomes
# $mean_rms, and resampled to 8 kHz. Recording ids are the wav base name
# (extension removed) with "-1" appended; output is sorted and de-duplicated
# on the id.
# Globals:   mean_rms (read)
# Arguments: $1 - list of wav paths, one per line
#            $2 - file to receive the absolute version of that list
#            $3 - wav.scp file to write
# Returns:   non-zero if path conversion, sox/python or sort fails
#######################################
write_normalized_wav_scp() {
  local flist=$1 abs_flist=$2 wav_scp=$3 wav
  while IFS= read -r wav; do
    [ -n "$wav" ] || continue
    # convert to absolute path
    utils/make_absolute.sh "$wav" || return 1
  done < "$flist" > "$abs_flist"
  (
    # pipefail (scoped to this subshell) so that a failure in the Python
    # helper is not hidden by the exit status of sort.
    set -o pipefail
    python -c '
import os, re, subprocess, sys

target_rms = float(sys.argv[1])
for line in sys.stdin:
  wav = line.strip()
  if not wav:
    continue
  proc = subprocess.Popen(["sox", wav, "-n", "stat"],
                          stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  # sox prints the statistics on stderr.
  err = proc.communicate()[1]
  if not isinstance(err, str):
    # Python 3 returns bytes here; Python 2 already returns str.
    err = err.decode("utf-8", "replace")
  match = re.search(r"RMS\s+amplitude:\s*(\S+)", err)
  if match is None:
    sys.exit("sox stat reported no RMS amplitude for " + wav)
  rms = float(match.group(1))
  if rms <= 0.0:
    sys.exit("cannot normalize a recording with zero RMS amplitude: " + wav)
  file_id = os.path.splitext(os.path.basename(wav))[0] + "-1"
  print("{0} sox --vol {1} {2} -r 8000 -t wav - |".format(file_id, target_rms / rms, wav))
' "$mean_rms" < "$abs_flist" | sort -k1,1 -u > "$wav_scp"
  ) || return 1
}

# Stage 3: wav.scp, reco2file_and_channel and glm for the dev set.
if [ "$stage" -le 3 ]; then
  write_normalized_wav_scp \
    "$tmpdir/wav.flist" "$tmpdir/wav_abs.flist" "$dev/wav.scp" || exit 1
  awk '{printf("%s %s A\n", $1, $1)}' "$dev/wav.scp" > "$dev/reco2file_and_channel"
  cp "$aspire_data/my_english.glm" "$dev/glm"
fi

# prepare test data
if [ "$stage" -le 4 ]; then
  for dataset in test; do
    test=data/${dataset}_aspire
    mkdir -p "$test"
    write_normalized_wav_scp \
      "$tmpdir/wav_${dataset}.flist" "$tmpdir/wav_${dataset}_abs.flist" \
      "$test/wav.scp" || exit 1
    # No segmentation for the test data: each recording is its own utterance
    # and its own speaker, so utt2spk and spk2utt have identical content.
    awk '{printf("%s %s\n", $1, $1)}' "$test/wav.scp" > "$test/utt2spk"
    cp "$test/utt2spk" "$test/spk2utt"
    awk '{printf("%s %s A\n", $1, $1)}' "$test/wav.scp" > "$test/reco2file_and_channel"
    cp "$aspire_data/my_english.glm" "$test/glm"
  done
fi

echo "Aspire dev/test/eval data preparation succeeded"