Blame view
egs/hkust/s5/local/hkust_data_prep.sh
5.41 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
#!/bin/bash . ./path.sh || exit 1; if [ $# != 2 ]; then echo "Usage: $0 <audio-path> <text-path>" echo " $0 /export/corpora/LDC03S04 /export/corpora/LDC03T19" exit 1; fi hkust_audio_dir=$1 hkust_text_dir=$2 train_dir=data/local/train dev_dir=data/local/dev # transcripts normalization and segmentation # needs external tools python2 -c "import mmseg" 2>/dev/null || { echo "Python module mmseg is not found. To install it, run tools/extra/install_mmseg.sh"; exit 1; } mkdir -p $train_dir mkdir -p $dev_dir #data directory check if [ ! -d $hkust_audio_dir ] || [ ! -d $hkust_text_dir ]; then echo "Error: $0 requires two directory arguments" exit 1; fi #find sph audio file for train dev resp. find $hkust_audio_dir -iname "*.sph" | grep -i "audio/train" > $train_dir/sph.flist || exit 1; find $hkust_audio_dir -iname "*.sph" | grep -i "audio/dev" > $dev_dir/sph.flist || exit 1; n=`cat $train_dir/sph.flist $dev_dir/sph.flist | wc -l` [ $n -ne 897 ] && \ echo Warning: expected 897 data data files, found $n #Transcriptions preparation #collect all trans, convert encodings to utf-8, find $hkust_text_dir -iname "*.txt" | grep -i "trans/train" | xargs cat |\ iconv -f GBK -t UTF-8 | perl -e ' while (<STDIN>) { @A = split(" ", $_); if (@A <= 1) { next; } if ($A[0] eq "#") { $utt_id = $A[1]; } if (@A >= 3) { $A[2] =~ s:^([AB])\:$:$1:; printf "%s-%s-%06.0f-%06.0f", $utt_id, $A[2], 100*$A[0] + 0.5, 100*$A[1] + 0.5; for($n = 3; $n < @A; $n++) { print " $A[$n]" }; print " "; } } ' | sort -k1 > $train_dir/transcripts.txt || exit 1; find $hkust_text_dir -iname "*.txt" | grep -i "trans/dev" | xargs cat |\ iconv -f GBK -t UTF-8 | perl -e ' while (<STDIN>) { @A = split(" ", $_); if (@A <= 1) { next; } if ($A[0] eq "#") { $utt_id = $A[1]; } if (@A >= 3) { $A[2] =~ s:^([AB])\:$:$1:; printf "%s-%s-%06.0f-%06.0f", $utt_id, $A[2], 100*$A[0] + 0.5, 100*$A[1] + 0.5; for($n = 3; $n < @A; $n++) { print " $A[$n]" }; print " "; } } ' | sort -k1 > $dev_dir/transcripts.txt || exit 1; #transcripts normalization and segmentation cat $train_dir/transcripts.txt |\ sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\ sed -e 's/<\/foreign>/ /g' |\ sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\ sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ local/hkust_normalize.pl |\ local/hkust_segment.py |\ awk '{if (NF > 1) print $0;}' > $train_dir/text || exit 1; cat $dev_dir/transcripts.txt |\ sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\ sed -e 's/<\/foreign>/ /g' |\ sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\ sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\ local/hkust_normalize.pl |\ local/hkust_segment.py |\ awk '{if (NF > 1) print $0;}' > $dev_dir/text || exit 1; # some data is corrupted. Delete them cat $train_dir/text | grep -v 20040527_210939_A901153_B901154-A-035691-035691 | egrep -v "A:|B:" > tmp mv tmp $train_dir/text || exit 1; #Make segment files from transcript #segments file format is: utt-id side-id start-time end-time, e.g.: #sw02001-A_000098-001156 sw02001-A 0.98 11.56 awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4]; print segment " " audioname "-" side " " startf/100 " " endf/100}' <$train_dir/text > $train_dir/segments awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $train_dir/sph.flist > $train_dir/sph.scp awk '{ segment=$1; split(segment,S,"-"); side=S[2]; audioname=S[1];startf=S[3];endf=S[4]; print segment " " audioname "-" side " " startf/100 " " endf/100}' <$dev_dir/text > $dev_dir/segments awk '{name = $0; gsub(".sph$","",name); gsub(".*/","",name); print(name " " $0)}' $dev_dir/sph.flist > $dev_dir/sph.scp sph2pipe=`which sph2pipe` || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] && echo "Could not find the sph2pipe program at $sph2pipe" && exit 1; cat $train_dir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s | ", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s | ", $1, sph2pipe, $2);}' | \ sort > $train_dir/wav.scp || exit 1; cat $dev_dir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s | ", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s | ", $1, sph2pipe, $2);}' | \ sort > $dev_dir/wav.scp || exit 1; #side A - channel 1, side B - channel 2 # this file reco2file_and_channel maps recording-id (e.g. sw02001-A) # to the file name sw02001 and the A, e.g. # sw02001-A sw02001 A # In this case it's trivial, but in other corpora the information might # be less obvious. Later it will be needed for ctm scoring. cat $train_dir/wav.scp | awk '{print $1}' | \ perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2 "; ' \ > $train_dir/reco2file_and_channel || exit 1; cat $dev_dir/wav.scp | awk '{print $1}' | \ perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; print "$1-$2 $1 $2 "; ' \ > $dev_dir/reco2file_and_channel || exit 1; cat $train_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $train_dir/utt2spk || exit 1; cat $train_dir/utt2spk | sort -k 2 | utils/utt2spk_to_spk2utt.pl > $train_dir/spk2utt || exit 1; cat $dev_dir/segments | awk '{spk=substr($1,1,33); print $1 " " spk}' > $dev_dir/utt2spk || exit 1; cat $dev_dir/utt2spk | sort -k 2 | utils/utt2spk_to_spk2utt.pl > $dev_dir/spk2utt || exit 1; echo "$0: HKUST data preparation succeeded" exit 0 |