Blame view
egs/csj/s5/local/csj_eval_data_prep.sh
2.52 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
#!/bin/bash # Copyright 2015 Tokyo Institute of Technology (Authors: Takafumi Moriya and Takahiro Shinozaki) # 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe) # Apache 2.0 # Acknowledgement This work was supported by JSPS KAKENHI Grant Number 26280055. # Official evaluation data set (each set contains 10 speakers) preparation # To be run from one directory above this script. # The input is directory containing the official evaluation test set and transcripts. if [ $# -ne 2 ]; then echo "Usage: "`basename $0`" <transcription-dir> <eval_num>" echo "See comments in the script for more details" exit 1 fi tdir=$1 eval_num=$2 . ./path.sh dir=data/local/$eval_num mkdir -p $dir cat $tdir/$eval_num/*/*-wav.list | sort > $dir/wav.flist n=`cat $dir/wav.flist | wc -l` sed -e 's?.*/??' -e 's?.wav??' -e 's?\-[R,L]??' $dir/wav.flist | paste - $dir/wav.flist \ > $dir/wavflist.scp awk '{ printf("%s cat %s | ", $1, $2); }' < $dir/wavflist.scp | sort > $dir/wav.scp || exit 1; # Get segments file... # segments file format is: utt-id start-time end-time, e.g.: # A01F0055_00380213_00385.951 => A01F0055 00380.213 00385.951 awk '{ spkutt_id=$1; split(spkutt_id,T,"[_ ]"); name=T[1]; stime=$2; etime=$3; printf("%s_%07.0f_%07.0f",name, int(1000*stime), int(1000*etime)); for(i=4;i<=NF;i++) printf(" %s", tolower($i)); printf " " }' $tdir/$eval_num/*/*-trans.text | sort > $dir/transcripts_${eval_num}.txt # Remove option cat $dir/transcripts_${eval_num}.txt \ | perl -ane 's:\<s\>::gi; s:\<\/s\>::gi; print;' \ | awk '{if(NF > 1) { print; } } ' > $dir/text export LC_ALL=C; sort -c $dir/text || exit 1; # check it's sorted. ## Create segment file awk '{ segment=$1; split(segment,S,"[_]"); spkid=S[1]; startf=S[2]; endf=S[3]; print segment " " spkid " " startf/1000 " " endf/1000 }' < $dir/text > $dir/segments # create an utt2spk file that assumes each conversation side is a separate speaker. awk '{segment=$1; split(segment,S,"[_]"); spkid=S[1]; print $1 " " spkid}' $dir/segments > $dir/utt2spk || exit 1; sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1; dest=data/$eval_num mkdir -p $dest for x in wav.scp segments text utt2spk spk2utt; do cp $dir/$x $dest/$x done utils/fix_data_dir.sh $dest if [ $(wc -l < $dest/wav.scp) -ne 10 ]; then echo "$0: error: expected 10 lines in wav.scp, got $(wc -l < $dest/wav.scp)" exit 1; fi echo "Completed preparation evaluation set $eval_num" |