fix_data_dir.sh
5.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/bin/bash
# This script makes sure that only the segments present in
# all of "feats.scp", "wav.scp" [if present], segments [if present],
# text, and utt2spk are present in any of them.
# It puts the original contents of data-dir into
# data-dir/.backup
# --- Argument handling and environment setup -------------------------------
# Takes exactly one argument: the data directory to fix.
if [ $# != 1 ]; then
  echo "Usage: fix_data_dir.sh data-dir"
  exit 1
fi

data=$1

# Validate BEFORE creating anything: `mkdir -p "$data/.backup"` would create a
# missing data directory as a side effect and make the directory check below
# pass vacuously.
[ ! -d "$data" ] && echo "$0: no such directory $data" && exit 1;
[ ! -f "$data/utt2spk" ] && echo "$0: no such file $data/utt2spk" && exit 1;

mkdir -p "$data/.backup" || exit 1

# Scratch space for intermediate id lists; removed on every exit path.
tmpdir=$(mktemp -d) || { echo "$0: could not create a temporary directory" >&2; exit 1; }
trap 'rm -rf "$tmpdir"' EXIT
# A signal handler that only cleans up would let the script keep running
# without its scratch directory; exit instead (this fires the EXIT trap).
trap 'exit 1' HUP INT PIPE TERM

# Byte-wise collation so that sort/cmp results are locale-independent.
export LC_ALL=C
# Ensures a file is sorted on its first field with unique keys, rewriting it
# in place when it is not.
# Arguments: $1 - path of the file to check (may contain whitespace)
# Outputs:   a notice on stdout when the file had to be rewritten
# Note:      uses "$1.tmp" as scratch; ordering follows the current locale
#            (this script exports LC_ALL=C before calling).
check_sorted() {
  local file=$1
  local sorted="${file}.tmp"

  sort -k1,1 -u <"$file" >"$sorted"
  if cmp -s "$file" "$sorted"; then
    # Already sorted and unique: discard the scratch copy.
    rm "$sorted"
  else
    echo "$0: file $1 is not in sorted order or not unique, sorting it"
    mv "$sorted" "$file"
  fi
}
# Back up every known data file that exists, then sort/de-duplicate it in
# place. The backup must succeed before the file is touched, otherwise the
# original contents could be lost.
for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp reco2file_and_channel spk2gender; do
  if [ -f "$data/$x" ]; then
    cp "$data/$x" "$data/.backup/$x" \
      || { echo "$0: failed to back up $data/$x" >&2; exit 1; }
    check_sorted "$data/$x"
  fi
done
# Filters a file in place, keeping only the lines that utils/filter_scp.pl
# selects for the given id list.
# Arguments: $1 - filter (id list passed as first argument to filter_scp.pl)
#            $2 - file to filter; rewritten in place
# Outputs:   a notice on stdout when the line count changed
# Returns:   non-zero if the file could not be copied or the filter command
#            failed (the original contents are restored in the latter case)
filter_file() {
  local filter=$1
  local file_to_filter=$2
  local unfiltered="${file_to_filter}.tmp"
  local length1 length2

  cp "$file_to_filter" "$unfiltered" || return 1
  if ! utils/filter_scp.pl "$filter" "$unfiltered" >"$file_to_filter"; then
    # The redirection has already truncated the target; put the data back.
    echo "$0: utils/filter_scp.pl failed on $file_to_filter, restoring it" >&2
    mv "$unfiltered" "$file_to_filter"
    return 1
  fi

  if ! cmp -s "$unfiltered" "$file_to_filter"; then
    # $(( )) strips any padding some wc implementations add around the count.
    length1=$(( $(wc -l <"$unfiltered") ))
    length2=$(( $(wc -l <"$file_to_filter") ))
    if [ "$length1" -ne "$length2" ]; then
      echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter."
    fi
  fi

  # Do not leave the scratch copy behind in the data directory.
  rm -f "$unfiltered"
}
# Makes segments, wav.scp and (if present) reco2file_and_channel agree on the
# same set of recording-ids. Does nothing when there is no segments file.
# We call this once before the stage when we filter on utterance-id, and once
# after.
# Globals:   data (read), tmpdir (read)
# Outputs:   diagnostics on stdout
# Exits:     with status 1 if segments exists without wav.scp, or if no
#            recording-ids can be read from segments
filter_recordings() {
  [ -f "$data/segments" ] || return 0

  # We have a segments file -> we need to filter this and the file wav.scp, and
  # reco2file_and_channel, if it exists, to make sure they have the same list
  # of recording-ids.
  if [ ! -f "$data/wav.scp" ]; then
    echo "$0: $data/segments exists but not $data/wav.scp"
    exit 1
  fi

  # Recording-id is the second field of segments.
  awk '{print $2}' <"$data/segments" | sort -u >"$tmpdir/recordings"
  if [ ! -s "$tmpdir/recordings" ]; then
    echo "Empty list of recordings (bad file $data/segments)?"
    exit 1
  fi
  # Keep only the recordings that wav.scp also lists.
  utils/filter_scp.pl "$data/wav.scp" "$tmpdir/recordings" >"$tmpdir/recordings.tmp"
  mv "$tmpdir/recordings.tmp" "$tmpdir/recordings"

  # filter_file matches on the first field, so temporarily swap the first two
  # fields of segments (utterance-id <-> recording-id), filter, and swap back.
  cp "$data/segments" "$data/segments.tmp"
  awk '{print $2, $1, $3, $4}' <"$data/segments.tmp" >"$data/segments"
  filter_file "$tmpdir/recordings" "$data/segments"
  cp "$data/segments" "$data/segments.tmp"
  awk '{print $2, $1, $3, $4}' <"$data/segments.tmp" >"$data/segments"
  # Do not leave the scratch copy behind in the data directory.
  rm -f "$data/segments.tmp"

  filter_file "$tmpdir/recordings" "$data/wav.scp"
  if [ -f "$data/reco2file_and_channel" ]; then
    filter_file "$tmpdir/recordings" "$data/reco2file_and_channel"
  fi
}
# Reconciles the speaker-level files. spk2utt is regenerated from utt2spk;
# when cmvn.scp exists, cmvn.scp/spk2utt/utt2spk are restricted to the
# speakers present in both; when spk2gender exists it is restricted to the
# speakers in spk2utt.
# Globals:   data (read), tmpdir (read)
# Outputs:   whatever filter_file reports on stdout
filter_speakers() {
  # throughout this program, we regard utt2spk as primary and spk2utt as derived, so...
  utils/utt2spk_to_spk2utt.pl "$data/utt2spk" >"$data/spk2utt"

  if [ -f "$data/cmvn.scp" ]; then
    # Speakers that appear in both spk2utt and cmvn.scp.
    awk '{print $1}' <"$data/spk2utt" >"$tmpdir/speakers"
    utils/filter_scp.pl "$data/cmvn.scp" "$tmpdir/speakers" >"$tmpdir/speakers.tmp"
    mv "$tmpdir/speakers.tmp" "$tmpdir/speakers"

    filter_file "$tmpdir/speakers" "$data/cmvn.scp"
    filter_file "$tmpdir/speakers" "$data/spk2utt"
    # spk2utt may have lost speakers; bring utt2spk back in line with it.
    utils/spk2utt_to_utt2spk.pl "$data/spk2utt" >"$data/utt2spk"
  fi

  if [ -f "$data/spk2gender" ]; then
    # We don't handle the case when the spk2gender does not cover all speakers.
    awk '{print $1}' <"$data/spk2utt" >"$tmpdir/speakers"
    filter_file "$tmpdir/speakers" "$data/spk2gender"
  fi
}
# Restricts utt2spk, feats.scp, text, segments (and wav.scp when there is no
# segments file) to the utterance-ids that appear in every one of those files
# that exists.
# Globals:   data (read), tmpdir (read)
# Outputs:   a summary line and any sanity-check diagnostics on stdout
# Exits:     with status 1 if utt2spk/spk2utt fail the sort checks, if no
#            utterances remain, or if a file cannot be filtered
filter_utts() {
  local x nutts nfeats ntext
  local -a utt_files=(feats.scp text segments)
  # wav indexed by utts only if segments does not exist.
  [ -f "$data/segments" ] || utt_files+=(wav.scp)

  awk '{print $1}' <"$data/utt2spk" >"$tmpdir/utts"

  # Do a check.
  if ! sort <"$data/utt2spk" | cmp - "$data/utt2spk"; then
    echo "utt2spk is not in sorted order (fix this yourself)"
    exit 1
  fi
  if ! sort -k2 <"$data/utt2spk" | cmp - "$data/utt2spk"; then
    echo "utt2spk is not in sorted order when sorted first on speaker-id "
    echo "(fix this by making speaker-ids prefixes of utt-ids)"
    exit 1
  fi
  if ! sort <"$data/spk2utt" | cmp - "$data/spk2utt"; then
    echo "spk2utt is not in sorted order (fix this yourself)"
    exit 1
  fi

  # Intersect the utterance list with each utterance-indexed file in turn.
  for x in "${utt_files[@]}"; do
    if [ -f "$data/$x" ]; then
      utils/filter_scp.pl "$data/$x" "$tmpdir/utts" >"$tmpdir/utts.tmp"
      mv "$tmpdir/utts.tmp" "$tmpdir/utts"
    fi
  done

  if [ ! -s "$tmpdir/utts" ]; then
    echo "fix_data_dir.sh: no utterances remained: not proceeding further."
    rm "$tmpdir/utts"
    exit 1
  fi

  # Counts are taken before the files are filtered, for the summary below.
  # $(( )) strips any padding some wc implementations add around the count.
  nutts=$(( $(wc -l <"$tmpdir/utts") ))
  if [ -f "$data/feats.scp" ]; then
    nfeats=$(( $(wc -l <"$data/feats.scp") ))
  else
    nfeats=0
  fi
  # text is optional here just like feats.scp; avoid a spurious error when it
  # is missing.
  if [ -f "$data/text" ]; then
    ntext=$(( $(wc -l <"$data/text") ))
  else
    ntext=0
  fi
  if [ "$nutts" -ne "$nfeats" ] || [ "$nutts" -ne "$ntext" ]; then
    echo "fix_data_dir.sh: kept $nutts utterances, vs. $nfeats features and $ntext transcriptions."
  else
    echo "fix_data_dir.sh: kept all $nutts utterances."
  fi

  for x in utt2spk "${utt_files[@]}"; do
    if [ -f "$data/$x" ]; then
      # .backup is meant to hold the ORIGINAL contents, so never overwrite an
      # existing backup with the partially processed file; only create one if
      # it is missing.
      if [ ! -f "$data/.backup/$x" ]; then
        cp "$data/$x" "$data/.backup/$x"
      fi
      # Filter into scratch space first so a failure cannot truncate the file.
      if ! utils/filter_scp.pl "$tmpdir/utts" "$data/$x" >"$tmpdir/$x.filtered"; then
        echo "fix_data_dir.sh: failed to filter $data/$x" >&2
        exit 1
      fi
      mv "$tmpdir/$x.filtered" "$data/$x"
    fi
  done
}
# Driver. Recordings are filtered both before and after the utterance-level
# pass: the first call aligns segments with wav.scp so the utterance filter
# sees consistent input, and the second drops recordings whose utterances were
# removed by that filter.
filter_recordings
filter_speakers
filter_utts
filter_recordings

# utt2spk is treated as primary, so rebuild spk2utt from its final contents.
utils/utt2spk_to_spk2utt.pl "$data/utt2spk" > "$data/spk2utt"

echo "fix_data_dir.sh: old files are kept in $data/.backup"