Blame view
egs/fisher_swbd/s5/local/fisher_data_prep.sh
6.16 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
#!/bin/bash

# Copyright 2013  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# Prepares the Fisher English data for training.
#
# Usage: fisher_data_prep.sh [--stage N] <fisher-dir-1> [<fisher-dir-2> ...]
#
# Every argument must be an absolute path.  Between them the directories
# must contain the sub-directories fe_03_p{1,2}_sph{1..7} and
# fe_03_p{1,2}_tran (a "new-style" name fisher_eng_tr_sp_d<N> is accepted in
# place of fe_03_p1_sph<N>).
#
# Must be run from a directory that contains utils/ and path.sh, because both
# are referenced by relative path below.
#
# Stages (a stage runs when $stage is <= its number):
#   0  list transcript and .sph files; check there are 11699 of each
#   1  write reco2file_and_channel and the raw text file text.1
#   2  normalise noise tags; write text, utt2spk, segments, spk2utt
#   3  write wav.scp (one sph2pipe command line per channel)
#   4  write spk2gender
#
# Scratch files go to data/local/data_fisher, results to data/train_fisher.

stage=0

# NOTE(review): presumably this is what lets "--stage N" override the default
# above -- confirm against utils/parse_options.sh.
. utils/parse_options.sh

if [ $# -eq 0 ]; then
  echo "$0 <fisher-dir-1> [<fisher-dir-2> ...]"
  echo " e.g.: $0 /export/corpora3/LDC/LDC2004T19 /export/corpora3/LDC/LDC2005T19\\"
  echo " /export/corpora3/LDC/LDC2004S13 /export/corpora3/LDC/LDC2005S13"
  echo " (We also support a single directory that has the contents of all of them)"
  exit 1
fi

# Check that the arguments are all absolute pathnames.
for dir in "$@"; do
  case "$dir" in
    /*) ;;
    *)
      echo "$0: all arguments must be absolute pathnames."
      exit 1
      ;;
  esac
done

tmpdir=$(pwd)/data/local/data_fisher
links=data/local/data_fisher/links

# First check we have the right things in there...
#
# Start from an empty link directory so that a rerun does not trip over
# links left behind by a previous run.
rm -rf -- "$links"
mkdir -p "$links" || exit 1

for subdir in fe_03_p1_sph1 fe_03_p1_sph3 fe_03_p1_sph5 fe_03_p1_sph7 \
  fe_03_p2_sph1 fe_03_p2_sph3 fe_03_p2_sph5 fe_03_p2_sph7 fe_03_p1_sph2 \
  fe_03_p1_sph4 fe_03_p1_sph6 fe_03_p1_tran fe_03_p2_sph2 fe_03_p2_sph4 \
  fe_03_p2_sph6 fe_03_p2_tran; do
  # Alternative directory name; identical to $subdir when the prefix does
  # not occur in it.
  new_style_subdir=${subdir/fe_03_p1_sph/fisher_eng_tr_sp_d}
  found_subdir=false
  for dir in "$@"; do
    target=
    if [ -d "$dir/$subdir" ]; then
      target=$dir/$subdir
    elif [ -d "$dir/$new_style_subdir" ]; then
      target=$dir/$new_style_subdir
    fi
    if [ -n "$target" ]; then
      ln -s "$target" "$links/$subdir" || exit 1
      found_subdir=true
      # Stop at the first match: a second "ln -s" onto an existing link to a
      # directory would create a stray link inside the corpus directory.
      break
    fi
  done
  if ! $found_subdir; then
    echo "$0: could not find the subdirectory $subdir in any of $*"
    exit 1
  fi
done

. ./path.sh # Needed for KALDI_ROOT

sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe"
  exit 1
fi

# (1) Get transcripts in one file, and clean them up ...

if [ "$stage" -le 0 ]; then
  find "$links/fe_03_p1_tran/data" "$links/fe_03_p2_tran/data" -iname '*.txt' \
    > "$tmpdir/transcripts.flist"

  for dir in fe_03_p{1,2}_sph{1,2,3,4,5,6,7}; do
    find "$links/$dir/" -iname '*.sph'
  done > "$tmpdir/sph.flist"

  n=$(wc -l < "$tmpdir/transcripts.flist")
  if [ "$n" -ne 11699 ]; then
    echo "Expected to find 11699 transcript files in the Fisher data, found $n"
    exit 1
  fi
  n=$(wc -l < "$tmpdir/sph.flist")
  if [ "$n" -ne 11699 ]; then
    echo "Expected to find 11699 .sph files in the Fisher data, found $n"
    exit 1
  fi
fi

dir=data/train_fisher

if [ "$stage" -le 1 ]; then
  mkdir -p "$dir"

  # Shape of a transcript file, as the Perl below expects it: the first line
  # names the recording, later lines are "<start> <end> <A|B>: <words>".
  #
  ## fe_03_00004.sph
  ## Transcribed at the LDC
  #
  #7.38 8.78 A: an- so the topic is

  : > "$tmpdir/text.1" || exit 1

  # For every transcript: emit both channels to reco2file_and_channel and one
  # line "<call>-<side>-<start>-<end> <words>" per utterance to text.1.
  # Times are in units of 0.01 s, zero-padded to six digits.
  perl -e '
    use File::Basename;
    ($tmpdir)=@ARGV;
    open(F, "<$tmpdir/transcripts.flist") || die "Opening list of transcripts";
    open(R, "|sort >data/train_fisher/reco2file_and_channel") || die "Opening reco2file_and_channel";
    open(T, ">$tmpdir/text.1") || die "Opening text output";
    while (<F>) {
      $file = $_;
      m:([^/]+)\.txt: || die "Bad filename $_";
      $call_id = $1;
      print R "$call_id-A $call_id A\n";
      print R "$call_id-B $call_id B\n";
      open(I, "<$file") || die "Opening file $_";

      $line1 = <I>;
      $line1 =~ m/# (.+)\.sph/ || die "Bad first line $line1 in file $file";
      $call_id eq $1 || die "Mismatch call-id $call_id vs $1\n";
      while (<I>) {
        if (m/([0-9.]+)\s+([0-9.]+) ([AB]):\s*(\S.+\S|\S)\s*$/) {
          $start = sprintf("%06d", $1 * 100.0);
          $end = sprintf("%06d", $2 * 100.0);
          length($end) > 6 && die "Time too long $end in file $file";
          $side = $3;
          $words = $4;
          $utt_id = "${call_id}-$side-$start-$end";
          # "or", not "||": with "||" the die binds to the string argument
          # and a failed print would go unnoticed.
          print T "$utt_id $words\n" or die "Error writing to text file";
        }
      }
      close(I);
    }
    close(R); close(T) ' "$tmpdir" || exit 1
fi

if [ "$stage" -le 2 ]; then
  # Drop utterances containing "((" and lines with no words, then map the
  # non-speech tags onto [laughter] / [noise].
  sort "$tmpdir/text.1" | grep -v '((' \
    | awk '{if (NF > 1){ print; }}' \
    | sed -e 's:\[laugh\]:[laughter]:g' \
          -e 's:\[sigh\]:[noise]:g' \
          -e 's:\[cough\]:[noise]:g' \
          -e 's:\[mn\]:[noise]:g' \
          -e 's:\[breath\]:[noise]:g' \
          -e 's:\[lipsmack\]:[noise]:g' > "$tmpdir/text.2"
  cp "$tmpdir/text.2" "$dir/text"

  # create segments file and utt2spk file...
  # The speaker id is the "<call>-<side>" prefix of the utterance id.
  if ! perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2\n"; ' \
      < "$dir/text" > "$dir/utt2spk"; then
    echo "Error producing utt2spk file"
    exit 1
  fi

  # Turn the 0.01 s integer times embedded in the utterance id back into
  # seconds with two decimals.
  perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2;
             $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4);
             print "$utt $reco $s $e\n"; ' < "$dir/text" > "$dir/segments"

  utils/utt2spk_to_spk2utt.pl < "$dir/utt2spk" > "$dir/spk2utt"
fi

if [ "$stage" -le 3 ]; then
  while IFS= read -r f; do
    # convert to absolute path
    # (stdin is redirected so the helper cannot consume the file list)
    utils/make_absolute.sh "$f" < /dev/null
  done < "$tmpdir/sph.flist" > "$tmpdir/sph_abs.flist"

  # "<basename-without-.sph> <absolute path>"
  perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' \
    < "$tmpdir/sph_abs.flist" > "$tmpdir/sph.scp"

  # One piped sph2pipe command per channel: -c 1 for side A, -c 2 for side B.
  awk -v sph2pipe="$sph2pipe" '{
      printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
      printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
    }' < "$tmpdir/sph.scp" \
    | sort -k1,1 -u > "$dir/wav.scp" || exit 1
fi

if [ "$stage" -le 4 ]; then
  # get the spk2gender information.  This is not a standard part of our
  # file formats
  # The filetable files are expected to hold lines like
  #   fe_03_p2_sph1 fe_03_05852.sph ff
  # where the two trailing letters are the genders of sides A and B.
  cat "$links"/fe_03_p1_sph{1,2,3,4,5,6,7}/filetable.txt \
      "$links"/fe_03_p2_sph{1,2,3,4,5,6,7}/docs/filetable2.txt \
    | perl -ane 'm:^\S+ (\S+)\.sph ([fm])([fm]): || die "bad line $_;"; print "$1-A $2\n", "$1-B $3\n"; ' \
    | sort | uniq | utils/filter_scp.pl "$dir/spk2utt" > "$dir/spk2gender"

  # Empty result: fall back to the tab-separated file lists shipped with the
  # transcripts.
  if [ ! -s "$dir/spk2gender" ]; then
    echo "It looks like our first try at getting the spk2gender info did not work."
    echo "(possibly older distribution?) Trying something else."
    cat "$links/fe_03_p1_tran/doc/fe_03_p1_filelist.tbl" \
        "$links/fe_03_p2_tran/doc/fe_03_p2_filelist.tbl" \
      | perl -ane 'm:fe_03_p[12]_sph\d\t(\d+)\t([mf])([mf]): || die "Bad line $_"; print "fe_03_$1-A $2\n", "fe_03_$1-B $3\n"; ' \
      | sort | uniq | utils/filter_scp.pl "$dir/spk2utt" > "$dir/spk2gender"
  fi
fi

echo "Fisher data preparation succeeded"