Blame view
egs/chime5/s5b/local/prepare_data.sh
4.7 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 |
#!/bin/bash
#
# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe, Yenda Trmal)
# Apache 2.0
#
# Builds a data directory (text, wav.scp, segments, utt2spk, spk2utt) from a
# directory of wav files and a directory of JSON transcripts.
#
# Usage:
#   local/prepare_data.sh [opts] <audio-dir> <json-transcript-dir> <output-dir>
#
# Options (handled by utils/parse_options.sh, which is not part of this file;
# presumably "--name value" overrides the variables declared below):
#   mictype  "worn" -> one wav.scp entry per channel (.L/.R) of S*_P*.wav files
#            "ref"  -> reference-array files, utterance ids tagged with .ENH
#            other  -> used as a case-insensitive path substring that selects
#                      the wav files; transcripts are replicated for CH1..CH4
#   cleanup  when "true", intermediate files in <output-dir> are removed

# Begin configuration section.
mictype=worn # worn, ref or others
cleanup=true
# End configuration section
. ./utils/parse_options.sh  # accept command-line overrides of the options above

. ./path.sh

echo >&2 "$0" "$@"
if [ $# -ne 3 ] ; then
  echo >&2 "$0" "$@"
  echo >&2 "$0: Error: wrong number of arguments"
  echo -e >&2 "Usage:\n  $0 [opts] <audio-dir> <json-transcript-dir> <output-dir>"
  echo -e >&2 "eg:\n  $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train"
  exit 1
fi

set -e -o pipefail

adir=$1
jdir=$2
dir=$3

# Sanity check: both input directories must contain the expected file types.
json_count=$(find -L "$jdir" -name "*.json" | wc -l)
wav_count=$(find -L "$adir" -name "*.wav" | wc -l)

if [ "$json_count" -eq 0 ]; then
  echo >&2 "We expect that the directory $jdir will contain json files."
  echo >&2 "That implies you have supplied a wrong path to the data."
  exit 1
fi
if [ "$wav_count" -eq 0 ]; then
  echo >&2 "We expect that the directory $adir will contain wav files."
  echo >&2 "That implies you have supplied a wrong path to the data."
  exit 1
fi

echo "$0: Converting transcription to text"

mkdir -p "$dir"
# Convert every JSON transcript and normalise a few tokens:
#   "[inaudible <digits/dashes>]" -> "[inaudible]", " - " -> " ", "mm-" -> "mm"
for file in "$jdir"/*json; do
  ./local/json2text.py --mictype "$mictype" "$file"
done | \
  sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" |\
  sed -e 's/ - / /g' |\
  sed -e 's/mm-/mm/g' > "$dir/text.orig"

echo "$0: Creating datadir $dir for type=\"$mictype\""

if [[ "$mictype" == "worn" ]]; then
  # convert the filenames to wav.scp format, use the basename of the file
  # as the wav.scp key, add .L and .R for left and right channel
  # i.e. each file will have two entries (left and right channel)
  find -L "$adir" -name "S[0-9]*_P[0-9]*.wav" | \
    perl -ne '{
      chomp;
      $path = $_;
      next unless $path;
      @F = split "/", $path;
      # strip only a literal, trailing ".wav" from the basename
      ($f = $F[@F-1]) =~ s/\.wav$//;
      @F = split "_", $f;
      print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n";
      print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n";
    }' | sort > "$dir/wav.scp"

  # generate the transcripts for both left and right channel
  # from the original transcript in the form
  # P09_S03-0006072-0006147 gimme the baker
  # create left and right channel transcript
  # P09_S03.L-0006072-0006147 gimme the baker
  # P09_S03.R-0006072-0006147 gimme the baker
  sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' "$dir/text.orig" | sort > "$dir/text"
elif [[ "$mictype" == "ref" ]]; then
  # fixed reference array

  # first get a text, which will be used to extract reference arrays
  perl -ne 's/-/.ENH-/;print;' "$dir/text.orig" | sort > "$dir/text"

  find -L "$adir" | grep "\.wav" | sort > "$dir/wav.flist"
  # Keep only the wav files whose path matches one of the "<field2>_<field3>"
  # pairs taken from the utterance ids in text. The patterns are handed to
  # grep as a pattern list (one per line) instead of word-split "-e" arguments,
  # so an empty list can never turn the file name into the pattern.
  grep -f <(cut -f 1 -d"-" "$dir/text" \
              | awk -F"_" '{print $2 "_" $3}' \
              | sed -e "s/\.ENH//" \
              | sort -u) \
    "$dir/wav.flist" > "$dir/wav.flist2"
  paste -d" " \
    <(awk -F "/" '{print $NF}' "$dir/wav.flist2" | sed -e "s/\.wav/.ENH/") \
    "$dir/wav.flist2" | sort > "$dir/wav.scp"
else
  # array mic case
  # convert the filenames to wav.scp format, use the basename of the file
  # as the wav.scp key
  find -L "$adir" -name "*.wav" -ipath "*${mictype}*" |\
    perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\
    sort -u > "$dir/wav.scp"

  # convert the transcripts from
  # P09_S03-0006072-0006147 gimme the baker
  # to the per-channel transcripts
  # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker
  # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker
  # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker
  # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker
  perl -ne '$l=$_;
    for($i=1; $i<=4; $i++) {
      ($x=$l)=~ s/-/.CH\Q$i\E-/;
      print $x;}' "$dir/text.orig" | sort > "$dir/text"
fi

if [[ "$cleanup" == "true" ]]; then
  rm -f -- "$dir"/text.* "$dir"/wav.scp.* "$dir/wav.flist"
fi

# Prepare 'segments', 'utt2spk', 'spk2utt'
# Each segments line is "<utt-id> <rec-id> <start> <end>": the rec-id is the
# part of the utterance id before the first "-", and start/end are the two
# numeric id fields divided by 100.
# The first sed expression rewrites the second "_<UPPERCASE>." occurrence on
# the line (the one inside the rec-id) to ".". For every mictype except
# "worn" the leading " Pxx_" prefix is additionally removed from the rec-id.
segment_fixups=(-e 's/_[A-Z]*\././2')
if [[ "$mictype" != "worn" ]]; then
  segment_fixups+=(-e 's/ P.._/ /')
fi
cut -d" " -f 1 "$dir/text" | \
  awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |\
  sed "${segment_fixups[@]}" > "$dir/segments"

# The speaker id is the utterance id up to (not including) the first "_".
cut -f 1 -d ' ' "$dir/segments" | \
  perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > "$dir/utt2spk"

utils/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt"

# Check that data dirs are okay!
utils/validate_data_dir.sh --no-feats "$dir" || exit 1