Blame view
egs/sre08/v1/local/make_sre_2008_test.sh
2.77 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
#!/bin/bash
# Copyright 2013   Daniel Povey
# Apache 2.0.
#
# Prepares the test data of the NIST SRE 2008 corpus (LDC2011S08).
#
# Usage: make_sre_2008_test.sh <path-to-LDC2011S08> <path-to-output>
#
# For every test condition handled below this writes
#   <path-to-output>/sre08_test_<condition>          (wav.scp, utt2spk,
#                                                     spk2utt, spk2gender)
#   <path-to-output>/sre08_test_<condition>_female
#   <path-to-output>/sre08_test_<condition>_male
# and then the short2-short3 trial lists (all / female / male) in
#   <path-to-output>/sre08_trials
#
# The utils/ scripts are invoked by relative path, so run this from a
# directory that contains utils/.  Exits 0 on success and 1 on any error.

if [ $# -ne 2 ]; then
  echo "Usage: $0 <path-to-LDC2011S08> <path-to-output>" >&2
  echo "e.g. $0 /export/corpora5/LDC/LDC2011S08 data" >&2
  exit 1
fi

# Let a pipeline report failure if any of its stages fails, so that the
# explicit "|| exit 1" checks below also catch errors in early stages.
set -o pipefail

src=$1
d=$2

# Scratch directory for the per-gender speaker lists; using a private
# directory (instead of a fixed file name in the current directory) keeps
# concurrent runs from clobbering each other.  Removed on any exit path.
tmpdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$tmpdir"' EXIT

for condition in short3; do  # could also add "10sec" and "summed" here.
  sph_src=$src/data/test/data/$condition
  if [ ! -d "$sph_src" ]; then
    echo "$0: expecting directory $sph_src to exist" >&2
    exit 1
  fi
  data=$d/sre08_test_${condition}
  mkdir -p "$data" || exit 1

  # One wav.scp entry per channel of every .sph file: side A is channel 1,
  # side B is channel 2.
  for f in "$sph_src"/*.sph; do
    # If the glob matched nothing, $f is the literal pattern; skip it so
    # wav.scp stays empty and the check below reports the problem.
    [ -e "$f" ] || continue
    base=${f##*/}       # strip the directory part
    base=${base%.sph}   # strip the literal ".sph" suffix
    for side in A B; do
      case $side in
        A) channel=1 ;;
        *) channel=2 ;;
      esac
      utt_id=${base}_${side}  # e.g. thagc_B
      echo "${utt_id} sph2pipe -f wav -p -c ${channel} $f |"
    done
  done | sort > "$data/wav.scp"

  if [ ! -s "$data/wav.scp" ]; then
    echo "$0: Error creating wav.scp (empty output)" >&2
    exit 1
  fi

  # We don't have speaker information here, so we just make the utt2spk a
  # one-to-one mapping (this file is required by certain Kaldi scripts).
  # Because the mapping is one-to-one, spk2utt has identical content.
  awk '{print $1, $1}' "$data/wav.scp" | tee "$data/spk2utt" > "$data/utt2spk"

  # Use the "trials" file to get the gender: ".sph:" in each line is replaced
  # by "_", then field 3 and field 2 are printed as "<id> <gender>".
  if ! cat "$src"/data/trials/*-"${condition}".ndx \
      | sed 's/\.sph:/_/' \
      | awk '{print $3, $2}' \
      | sort | uniq > "$data/spk2gender"; then
    echo "$0: Error creating $data/spk2gender" >&2
    exit 1
  fi

  # Note: not all of these utterances appear in the trials file, e.g. the
  # interview segments have a "B" side which is never used.  So the
  # spk2gender file is smaller than the spk2utt file.  When we do
  # fix_data_dir.sh, the un-needed ones get removed.
  utils/fix_data_dir.sh "$data" || exit 1
  utils/validate_data_dir.sh --no-text --no-feats "$data" || exit 1

  # Filter into male and female parts, since we have gender-dependent
  # processing.  Selection is on the gender column (field 2) only, so a
  # stray "f" or "m" word inside an id can never select a line.
  echo "Creating female subset of $data"
  awk '$2 == "f"' "$data/spk2gender" > "$tmpdir/spklist"
  utils/subset_data_dir.sh --spk-list "$tmpdir/spklist" \
    "$data" "${data}_female" || exit 1

  echo "Creating male subset of $data"
  awk '$2 == "m"' "$data/spk2gender" > "$tmpdir/spklist"
  utils/subset_data_dir.sh --spk-list "$tmpdir/spklist" \
    "$data" "${data}_male" || exit 1
done

trials=$d/sre08_trials
mkdir -p "$trials" || exit 1

key=$src/data/keys/NIST_SRE08_KEYS.v0.1/trial-keys/NIST_SRE08_short2-short3.trial.key
if [ ! -f "$key" ]; then
  # Without this check a missing key file would yield empty trial lists whose
  # lengths trivially agree, and the script would report success.
  echo "$0: expecting file $key to exist" >&2
  exit 1
fi

# Skip the first line of the key file (presumably a header -- confirm against
# the corpus), fold the ",a," / ",b," channel field into an "_A" / "_B"
# suffix on the preceding field, and turn the remaining commas into spaces.
tail -n +2 "$key" \
  | sed 's:,b,:_B,:; s:,a,:_A,:; s:,: :g' \
  > "$trials/short2-short3.trials" || exit 1

# Split the trials by gender: prefix each line with its second field as a
# key, filter on that key against the gender-specific utt2spk, then drop the
# key again.
for gender in female male; do
  if ! awk '{print $2, $0}' "$trials/short2-short3.trials" \
      | utils/filter_scp.pl "$d/sre08_test_short3_${gender}/utt2spk" \
      | cut -d ' ' -f 2- \
      > "$trials/short2-short3-${gender}.trials"; then
    echo "$0: Error creating $trials/short2-short3-${gender}.trials" >&2
    exit 1
  fi
done

# Every trial must have ended up in exactly one of the two gender lists.
n1=$(wc -l < "$trials/short2-short3.trials")
n2=$(cat "$trials"/short2-short3-{male,female}.trials | wc -l)
if [ "$n1" -ne "$n2" ]; then
  echo "Error: length mismatch (missing data?) $n1 != $n2" >&2
  exit 1
fi

exit 0