Blame view
egs/aspire/s5/local/fisher_data_prep.sh
7.02 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 |
#!/bin/bash

# Copyright 2013  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.
#
# Prepares the Fisher English corpus for training under data/train_all.
#
# Usage: fisher_data_prep.sh [--calldata] <fisher-dir-1> [<fisher-dir-2> ...]
#
# Every argument must be an absolute path.  Between them the directories must
# contain the fe_03_p{1,2}_sph{1..7} and fe_03_p{1,2}_tran sub-directories
# (fe_03_p1_sphN may alternatively be named fisher_eng_tr_sp_dN).
#
# Files written:
#   data/local/data/links/                 symlinks to the corpus sub-directories
#   data/local/data/{transcripts,sph,sph_abs}.flist, text.1, text.2, sph.scp
#   data/train_all/{reco2file_and_channel,text,utt2spk,spk2utt,segments,
#                   wav.scp,spk2gender}
#
# With --calldata, local/fisher_fix_speakerid.pl is run on the combined
# *calldata.tbl files and the resulting *.new files replace the originals
# (the previous versions are kept as *.old).
#
# Must be run from a directory containing utils/, local/ and path.sh.

# First stage to run; stages below this number are skipped.
# NOTE(review): presumably overridable through utils/parse_options.sh -- confirm.
stage=0
calldata=

# --calldata is consumed here, before parse_options.sh sees the arguments.
while [ $# -gt 0 ]; do
  case "$1" in
    --calldata) calldata=1 ;;
    *) break ;;
  esac
  shift
done

. utils/parse_options.sh

if [ $# -eq 0 ]; then
  echo "$0 [--calldata] <fisher-dir-1> [<fisher-dir-2> ...]"
  echo " e.g.: $0 /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19\\"
  echo " /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13"
  echo " (We also support a single directory that has the contents of all of them)"
  echo " If specified, --calldata will be used to map Kaldi speaker ID to real"
  echo " speaker PIN released with the Fisher corpus."
  exit 1
fi

# Check that the arguments are all absolute pathnames.
for dir in "$@"; do
  case "$dir" in
    /*) ;;
    *)
      echo "$0: all arguments must be absolute pathnames."
      exit 1
      ;;
  esac
done

tmpdir=$(pwd)/data/local/data
links=data/local/data/links

# Rebuild the link directory from scratch so stale links never survive.
rm -rf -- "$links"
mkdir -p "$links" || exit 1

# Check that every expected sub-directory exists in one of the given
# directories and link it under $links by its old-style name.
for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \
  fe_03_p2_sph1 fe_03_p2_sph3 fe_03_p2_sph5 fe_03_p2_sph7 fe_03_p1_sph2 \
  fe_03_p1_sph4 fe_03_p1_sph6 fe_03_p1_tran fe_03_p2_sph2 fe_03_p2_sph4 \
  fe_03_p2_sph6 fe_03_p2_tran; do
  found_subdir=false
  for dir in "$@"; do
    if [ -d "$dir/$subdir" ]; then
      src=$dir/$subdir
    else
      # Alternative naming: fe_03_p1_sphN -> fisher_eng_tr_sp_dN.
      src=$dir/${subdir/fe_03_p1_sph/fisher_eng_tr_sp_d}
      [ -d "$src" ] || continue
    fi
    ln -s "$src" "$links/$subdir" || exit 1
    found_subdir=true
    # Stop at the first match: a second "ln -s" onto an existing link would
    # either fail or create a stray link inside the linked corpus directory.
    break
  done
  if ! $found_subdir; then
    echo "$0: could not find the subdirectory $subdir in any of $*"
    exit 1
  fi
done

. ./path.sh # Needed for KALDI_ROOT

sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe"
  exit 1
fi

# Stage 0: list the transcript and audio files and check their counts.
if [ "$stage" -le 0 ]; then
  find "$links/fe_03_p1_tran/data" "$links/fe_03_p2_tran/data" -name '*.txt' \
    > "$tmpdir/transcripts.flist"

  for dir in fe_03_p{1,2}_sph{1,2,3,4,5,6,7}; do
    find "$links/$dir/" -name '*.sph'
  done > "$tmpdir/sph.flist"

  n=$(wc -l < "$tmpdir/transcripts.flist")
  if [ "$n" -ne 11699 ]; then
    echo "Expected to find 11699 transcript files in the Fisher data, found $n"
    exit 1
  fi
  n=$(wc -l < "$tmpdir/sph.flist")
  if [ "$n" -ne 11699 ]; then
    echo "Expected to find 11699 .sph files in the Fisher data, found $n"
    exit 1
  fi
fi

# Stage 1: collect all transcripts into text.1 (one utterance per line) and
# write reco2file_and_channel.  A transcript file starts with a header line
# naming the .sph file, followed by lines of the form
#   7.38 8.78 A: an- so the topic is
# i.e. <start-seconds> <end-seconds> <side>: <words>.
if [ "$stage" -le 1 ]; then
  mkdir -p data/train_all

  : > "$tmpdir/text.1" || exit 1

  perl -e '
    use File::Basename;
    ($tmpdir)=@ARGV;
    open(F, "<$tmpdir/transcripts.flist") || die "Opening list of transcripts";
    open(R, "|sort >data/train_all/reco2file_and_channel") || die "Opening reco2file_and_channel";
    open(T, ">$tmpdir/text.1") || die "Opening text output";
    while (<F>) {
      $file = $_;
      m:([^/]+)\.txt: || die "Bad filename $_";
      $call_id = $1;
      print R "$call_id-A $call_id A\n";
      print R "$call_id-B $call_id B\n";
      # Two-argument open on purpose: it strips the trailing newline of $file.
      open(I, "<$file") || die "Opening file $_";

      # The header line must name the same call as the file name does.
      $line1 = <I>;
      $line1 =~ m/# (.+)\.sph/ || die "Bad first line $line1 in file $file";
      $call_id eq $1 || die "Mismatch call-id $call_id vs $1\n";
      while (<I>) {
        if (m/([0-9.]+)\s+([0-9.]+) ([AB]):\s*(\S.+\S|\S)\s*$/) {
          # Times become zero-padded centiseconds inside the utterance id.
          $start = sprintf("%06d", $1 * 100.0);
          $end = sprintf("%06d", $2 * 100.0);
          length($end) > 6 && die "Time too long $end in file $file";
          $side = $3;
          $words = $4;
          $utt_id = "${call_id}-$side-$start-$end";
          # "or" (not "||") so that the check applies to print itself.
          print T "$utt_id $words\n" or die "Error writing to text file";
        }
      }
      close(I);
    }
    close(R) || die "Error closing reco2file_and_channel";
    close(T) || die "Error closing text output";
  ' "$tmpdir" || exit 1
fi

# Stage 2: normalise the text and derive utt2spk, segments and spk2utt.
if [ "$stage" -le 2 ]; then
  # Drop utterances containing "((" and utterances with no words, then map
  # the noise markers onto [laughter] / [noise].
  sort "$tmpdir/text.1" | grep -v '((' \
    | awk '{if (NF > 1){ print; }}' \
    | sed -e 's:\[laugh\]:[laughter]:g' \
          -e 's:\[sigh\]:[noise]:g' \
          -e 's:\[cough\]:[noise]:g' \
          -e 's:\[mn\]:[noise]:g' \
          -e 's:\[breath\]:[noise]:g' \
          -e 's:\[lipsmack\]:[noise]:g' > "$tmpdir/text.2"
  cp "$tmpdir/text.2" data/train_all/text || exit 1

  # utt2spk: the speaker id is <call-id>-<side>.
  if ! perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;";
                  print "$1-$2-$3 $1-$2\n"; ' \
       < data/train_all/text > data/train_all/utt2spk; then
    echo "Error producing utt2spk file"
    exit 1
  fi

  # segments: convert the centisecond fields of the utterance id to seconds.
  perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die;
             $utt = $1; $reco = $2;
             $s = sprintf("%.2f", 0.01*$3);
             $e = sprintf("%.2f", 0.01*$4);
             print "$utt $reco $s $e\n"; ' \
    < data/train_all/text > data/train_all/segments || exit 1

  utils/utt2spk_to_spk2utt.pl < data/train_all/utt2spk > data/train_all/spk2utt
fi

# Stage 3: build wav.scp with one sph2pipe command per recording side.
if [ "$stage" -le 3 ]; then
  # Convert each path to an absolute path.  The list is read on descriptor 3
  # so the helper cannot consume it through stdin.
  while IFS= read -r f <&3; do
    utils/make_absolute.sh "$f"
  done 3< "$tmpdir/sph.flist" > "$tmpdir/sph_abs.flist"

  perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' \
    < "$tmpdir/sph_abs.flist" > "$tmpdir/sph.scp"

  # Channel 1 is side A, channel 2 is side B.
  awk -v sph2pipe="$sph2pipe" '{
      printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
      printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
    }' < "$tmpdir/sph.scp" \
    | sort -k1,1 -u > data/train_all/wav.scp || exit 1
fi

# Stage 4: get the spk2gender information.  This is not a standard part of
# our file formats.
if [ "$stage" -le 4 ]; then
  # First try the file tables, whose lines look like
  #   fe_03_p2_sph1 fe_03_05852.sph ff
  cat "$links"/fe_03_p1_sph{1,2,3,4,5,6,7}/filetable.txt \
      "$links"/fe_03_p2_sph{1,2,3,4,5,6,7}/docs/filetable2.txt \
    | perl -ane 'm:^\S+ (\S+)\.sph ([fm])([fm]): || die "bad line $_;";
                 print "$1-A $2\n", "$1-B $3\n"; ' \
    | sort | uniq | utils/filter_scp.pl data/train_all/spk2utt \
    > data/train_all/spk2gender

  # An empty result triggers the fallback to the tab-separated file lists
  # shipped with the transcripts.
  if [ ! -s data/train_all/spk2gender ]; then
    echo "It looks like our first try at getting the spk2gender info did not work."
    echo "(possibly older distribution?) Trying something else."
    cat "$links/fe_03_p1_tran/doc/fe_03_p1_filelist.tbl" \
        "$links/fe_03_p2_tran/doc/fe_03_p2_filelist.tbl" \
      | perl -ane 'm:fe_03_p[12]_sph\d\t(\d+)\t([mf])([mf]): || die "Bad line $_";
                   print "fe_03_$1-A $2\n", "fe_03_$1-B $3\n"; ' \
      | sort | uniq | utils/filter_scp.pl data/train_all/spk2utt \
      > data/train_all/spk2gender
  fi
fi

# Optional: fix speaker IDs using the call data tables.
if [ -n "$calldata" ]; then
  cat "$links"/fe_03_p{1,2}_tran/doc/*calldata.tbl > "$tmpdir/combined-calldata.tbl"
  local/fisher_fix_speakerid.pl "$tmpdir/combined-calldata.tbl" data/train_all
  utils/utt2spk_to_spk2utt.pl data/train_all/utt2spk.new > data/train_all/spk2utt.new
  # Patch files: keep the previous version as *.old, install the *.new one.
  for f in spk2utt utt2spk text segments spk2gender; do
    cp "data/train_all/$f" "data/train_all/$f.old" || exit 1
    cp "data/train_all/$f.new" "data/train_all/$f" || exit 1
  done
  rm "$tmpdir/combined-calldata.tbl"
fi

echo "Data preparation succeeded"