make_sre_2008_test.sh
2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/bin/bash
# Copyright 2013 Daniel Povey
# Apache 2.0.
# Command-line handling: exactly two positional arguments are required.
#   $1 -> src : path to the LDC2011S08 corpus
#   $2 -> d   : path under which the output directories are written
# Any other argument count prints the usage text and exits with status 1.
case $# in
  2)
    src=$1
    d=$2
    ;;
  *)
    echo "Usage: $0 <path-to-LDC2011S08> <path-to-output>"
    echo "e.g. $0 /export/corpora5/LDC/LDC2011S08 data"
    exit 1
    ;;
esac
# Build one data directory per test condition under $d, then split it by
# gender.  For each condition this writes, into $d/sre08_test_<condition>:
#   wav.scp     - two entries per .sph file (side A = channel 1, B = channel 2)
#   utt2spk     - identity mapping (utterance id -> same id)
#   spk2utt     - identical content to utt2spk
#   spk2gender  - derived from the *-<condition>.ndx trial index files
# and then creates the sibling directories <data>_female and <data>_male.
# Reads the variables $src (corpus root) and $d (output root) set earlier.

# Scratch file for the per-gender speaker list.  It is created with mktemp
# (rather than a fixed name in the current directory) so that concurrent runs
# cannot clobber each other, and it is removed when the script exits.
spklist=$(mktemp) || exit 1
trap 'rm -f -- "$spklist"' EXIT

for condition in short3; do # could also add "10sec" and "summed" here.
  sph_src=$src/data/test/data/$condition
  if [ ! -d "$sph_src" ]; then
    echo "$0: expecting directory $sph_src to exist"
    exit 1
  fi

  data=$d/sre08_test_${condition}
  mkdir -p "$data" || exit 1

  for f in "$sph_src"/*.sph; do
    # If the glob matches nothing, bash leaves the pattern in place as a
    # literal string; skip it so that wav.scp stays empty and the check
    # below reports the problem instead of a bogus "*_A" entry slipping in.
    [ -e "$f" ] || continue
    # Strip the directory and the .sph suffix without forking.
    base=${f##*/}
    base=${base%.sph}
    for side in A B; do
      if [ "$side" == "A" ]; then
        channel=1
      else
        channel=2
      fi
      utt_id=${base}_${side} # e.g. thagc_B
      echo "${utt_id} sph2pipe -f wav -p -c ${channel} $f |"
    done
  done | LC_ALL=C sort > "$data/wav.scp"  # C-locale order, as Kaldi expects

  if ! [ -s "$data/wav.scp" ]; then
    echo "$0: Error creating wav.scp (empty output)"
    exit 1
  fi

  # We don't have speaker information here, so we just make the utt2spk a
  # one-to-one mapping (this file is required by certain Kaldi scripts).
  # With an identity mapping, spk2utt has exactly the same content.
  awk '{print $1, $1}' "$data/wav.scp" | tee "$data/spk2utt" > "$data/utt2spk"

  # Use the "trials" file to get the gender: rewrite "<x>sph:" in each line
  # to "_" and emit "<field 3> <field 2>", de-duplicated.
  # NOTE(review): assumes field 2 is the gender and field 3 the segment/side
  # in the .ndx files -- confirm against the corpus documentation.
  cat "$src"/data/trials/*-"${condition}".ndx \
    | sed 's/.sph:/_/' \
    | awk '{print $3, $2}' \
    | LC_ALL=C sort -u > "$data/spk2gender"

  # Note: not all of these utterances appear in the trials file, e.g. the
  # interview segments have a "B" side which is never used.  So the spk2gender
  # file is smaller than the spk2utt file.  When we do fix_data_dir.sh, the
  # un-needed ones get removed.
  utils/fix_data_dir.sh "$data" || exit 1
  utils/validate_data_dir.sh --no-text --no-feats "$data" || exit 1

  # Filter into male and female parts, since we have gender-dependent
  # processing.  A failed subset now aborts the script instead of being
  # silently ignored.
  echo "Creating female subset of $data"
  grep -w f "$data/spk2gender" > "$spklist"
  utils/subset_data_dir.sh --spk-list "$spklist" "$data" "${data}_female" || exit 1

  echo "Creating male subset of $data"
  grep -w m "$data/spk2gender" > "$spklist"
  utils/subset_data_dir.sh --spk-list "$spklist" "$data" "${data}_male" || exit 1
done
# Create the trial lists under $d/sre08_trials from the NIST short2-short3
# trial key, split them by gender, and verify that no trial was lost.
# Reads $src (corpus root) and $d (output root); exits 1 on any problem and
# 0 on success.
trials=$d/sre08_trials
key=$src/data/keys/NIST_SRE08_KEYS.v0.1/trial-keys/NIST_SRE08_short2-short3.trial.key

# Without this check a missing key file would give empty trial lists whose
# line counts trivially agree (0 == 0), and the script would report success.
if [ ! -f "$key" ]; then
  echo "$0: expecting file $key to exist"
  exit 1
fi
mkdir -p "$trials" || exit 1

# Skip the first line of the key file (presumably a header -- confirm), fold
# the ",a," / ",b," column into the preceding field as an "_A" / "_B" suffix,
# and turn the remaining commas into spaces.
tail -n +2 "$key" \
  | sed 's:,b,:_B,:; s:,a,:_A,:; s:,: :g' > "$trials/short2-short3.trials"

if ! [ -s "$trials/short2-short3.trials" ]; then
  echo "$0: Error creating $trials/short2-short3.trials (empty output)"
  exit 1
fi

# Per-gender trial lists: prepend field 2 as a lookup key, pass the lines
# through filter_scp.pl together with the gender subset's utt2spk, then drop
# the prepended key again.
# NOTE(review): filter_scp.pl is expected to keep only lines whose first
# field is listed in the given file -- confirm against utils/filter_scp.pl.
for gender in female male; do
  awk '{print $2, $0}' "$trials/short2-short3.trials" \
    | utils/filter_scp.pl "$d/sre08_test_short3_${gender}/utt2spk" \
    | cut -d ' ' -f 2- > "$trials/short2-short3-${gender}.trials"
done

# Every trial must land in exactly one of the two gender-specific lists.
n1=$(cat "$trials/short2-short3.trials" | wc -l)
n2=$(cat "$trials/short2-short3-male.trials" "$trials/short2-short3-female.trials" | wc -l)
if ! [ "$n1" -eq "$n2" ]; then
  echo "Error: length mismatch (missing data?) $n1 != $n2"
  exit 1
fi

exit 0