Blame view
egs/fisher_swbd/s5/local/swbd1_data_prep.sh
4.7 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
#!/bin/bash

# Switchboard-1 training data preparation customized for Edinburgh
# Author: Arnab Ghoshal (Jan 2013)
#
# Usage: local/swbd1_data_prep.sh /path/to/SWBD
#
# To be run from one directory above this script.
#
# The input is some directory containing the switchboard-1 release 2
# corpus (LDC97S62).  Note: we don't make many assumptions about how
# you unpacked this.  We are just doing a "find" command to locate
# the .sph files.
#
# Requires (checked below):
#   - ./path.sh, which must make $KALDI_ROOT available so sph2pipe is found
#   - data/local/train_swbd/swb_ms98_transcriptions/ already in place
#
# Produces, in data/local/train_swbd and then copied to data/train_swbd:
#   spk2utt utt2spk wav.scp text segments reco2file_and_channel

. ./path.sh || exit 1

# Exactly one argument: the corpus directory.
if [ $# != 1 ]; then
  echo "Usage: swbd1_data_prep.sh /path/to/SWBD" >&2
  exit 1
fi

SWBD_DIR=$1

dir=data/local/train_swbd
mkdir -p "$dir" || exit 1

# Audio data directory check
if [ ! -d "$SWBD_DIR" ]; then
  echo "Error: run.sh requires a directory argument" >&2
  exit 1
fi

sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  echo "Could not execute the sph2pipe program at $sph2pipe" >&2
  exit 1
fi

# Option A: SWBD dictionary file check
if [ ! -f "$dir/swb_ms98_transcriptions/sw-ms98-dict.text" ]; then
  echo "SWBD dictionary file does not exist" >&2
  exit 1
fi

# find sph audio files
find "$SWBD_DIR" -iname '*.sph' | sort > "$dir/sph.flist" || exit 1

# Only a warning: a partial corpus is still usable.
# Arithmetic comparison tolerates the padding some wc implementations emit.
n=$(wc -l < "$dir/sph.flist")
if (( n != 2435 )); then
  echo "Warning: expected 2435 data data files, found $n" >&2
fi

# (1a) Transcriptions preparation
# make basic transcription file (add segments info)
# **NOTE: In the default Kaldi recipe, everything is made uppercase, while we
# make everything lowercase here. This is because we will be using SRILM which
# can optionally make everything lowercase (but not uppercase) when mapping
# LM vocabs.
#
# Each input line is "<id> <start> <end> <words...>"; the first 6 characters
# of <id> name the conversation ("sw" is widened to "sw0") and the 7th is the
# side.  Times are converted to zero-padded centiseconds so that the
# utterance ids sort in time order.  One utterance per output line.
awk '{
       name=substr($1,1,6); gsub("^sw","sw0",name); side=substr($1,7,1);
       stime=$2; etime=$3;
       printf("%s-%s_%06.0f-%06.0f",
              name, side, int(100*stime+0.5), int(100*etime+0.5));
       for(i=4;i<=NF;i++) printf(" %s", $i);
       printf "\n"
}' "$dir"/swb_ms98_transcriptions/*/*/*-trans.text > "$dir/transcripts1.txt" \
  || exit 1

# test if trans. file is sorted
# LC_ALL=C is exported from here on so that every later sort is byte-wise.
export LC_ALL=C
sort -c "$dir/transcripts1.txt" || exit 1  # check it's sorted.

# Remove SILENCE, <B_ASIDE> and <E_ASIDE>.

# Note: we have [NOISE], [VOCALIZED-NOISE], [LAUGHTER], [SILENCE].
# removing [SILENCE], and the <B_ASIDE> and <E_ASIDE> markers that mark
# speech to someone; we will give phones to the other three (NSN, SPN, LAU).
# There will also be a silence phone, SIL.
# **NOTE: modified the pattern matches to make them case insensitive
# The trailing awk drops utterances that are left with no words at all.
perl -ane 's:\s\[SILENCE\](\s|$):$1:gi;
           s/<B_ASIDE>//gi;
           s/<E_ASIDE>//gi;
           print;' < "$dir/transcripts1.txt" \
  | awk '{if(NF > 1) { print; } } ' > "$dir/transcripts2.txt" || exit 1

# **NOTE: swbd1_map_words.pl has been modified to make the pattern matches
# case insensitive
local/swbd1_map_words.pl -f 2- "$dir/transcripts2.txt" > "$dir/text" \
  || exit 1  # final transcripts

# format acronyms in text; the unmapped text is kept as text_bk
python local/map_acronyms_transcripts.py -i "$dir/text" -o "$dir/text_map" \
  -M data/local/dict_nosp/acronyms_swbd.map || exit 1
cp "$dir/text" "$dir/text_bk" || exit 1
mv "$dir/text_map" "$dir/text" || exit 1

# (1c) Make segment files from transcript
# segments file format is: utt-id side-id start-time end-time, e.g.:
# sw02001-A_000098-001156 sw02001-A 0.98 11.56
awk '{
       segment=$1;
       split(segment,S,"[_-]");
       side=S[2]; audioname=S[1]; startf=S[3]; endf=S[4];
       print segment " " audioname "-" side " " startf/100 " " endf/100
}' < "$dir/text" > "$dir/segments" || exit 1

# sph.scp: "<basename without .sph> <full path>".  The extension pattern is
# escaped and anchored so only a literal trailing ".sph" is removed.
sed -e 's?.*/??' -e 's?\.sph$??' "$dir/sph.flist" | paste - "$dir/sph.flist" \
  > "$dir/sph.scp" || exit 1

# wav.scp: one piped sph2pipe command per conversation side.
# side A - channel 1, side B - channel 2
awk -v sph2pipe="$sph2pipe" '{
  printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
  printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
}' < "$dir/sph.scp" | sort > "$dir/wav.scp" || exit 1

# this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
# to the file name sw02001 and the A, e.g.
# sw02001-A  sw02001 A
# In this case it's trivial, but in other corpora the information might
# be less obvious.  Later it will be needed for ctm scoring.
awk '{print $1}' "$dir/wav.scp" \
  | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_";
               print "$1-$2 $1 $2\n"; ' \
  > "$dir/reco2file_and_channel" || exit 1

# The speaker id is the first 9 characters of the utterance id (e.g.
# sw02001-A).
awk '{spk=substr($1,1,9); print $1 " " spk}' "$dir/segments" \
  > "$dir/utt2spk" || exit 1
sort -k 2 "$dir/utt2spk" | utils/utt2spk_to_spk2utt.pl > "$dir/spk2utt" \
  || exit 1

# We assume each conversation side is a separate speaker. This is a very
# reasonable assumption for Switchboard. The actual speaker info file is at:
# http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary

# Copy stuff into its final locations [this has been moved from the format_data
# script]
mkdir -p data/train_swbd || exit 1
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
  cp "$dir/$f" "data/train_swbd/$f" || exit 1
done

echo Switchboard-1 data preparation succeeded.