Blame view
egs/wsj/s5/steps/resegment_text.sh
4.5 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2013.  Apache 2.0.

# This script takes two data directories that represent different
# segmentations of the same data (both must have "segments" files and
# the recording-ids must match), and it converts the text in one directory
# to correspond to the segmentation in the other.  Its output is the
# "text" file in the second directory.  To get the alignments, it
# must be provided an "alignment" directory where the training data
# from the first directory has been aligned.
#
# Stages:
#   0: call steps/get_train_ctm.sh to turn the alignments into $alidir/ctm.
#   1: rewrite the ctm so that each line starts with the recording-id
#      (looked up in <out-data-dir>/reco2file_and_channel); the result is
#      written to <temp/log-dir>/ctm_per_reco.
#   2: for every segment in <out-data-dir>/segments, collect the words whose
#      start and end times both fall inside the segment and write them to
#      <out-data-dir>/text.

# begin configuration section.
stage=0
cmd=run.pl
#end configuration section.

[ -f ./path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

# Make a failure anywhere in a pipeline (e.g. the perl script dying in
# "perl | sort") visible in the pipeline's exit status.
set -o pipefail

if [ $# -ne 5 ]; then
  echo "Usage: $0 [options] <in-data-dir> <lang> <ali-dir|model-dir> <out-data-dir> <temp/log-dir>"
  echo " Options:"
  echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
  echo " --stage (0|1|2) # start scoring script from part-way through."
  echo "e.g.:"
  echo "$0 data/train data/lang exp/tri3b_ali_all data/train_reseg exp/tri3b_resegment"
  exit 1;
fi

data=$1
lang=$2
alidir=$3
data_out=$4
dir=$5

mkdir -p $dir/log || exit 1;

# Check the required inputs up front so that we fail before doing any work.
for f in $data/feats.scp $lang/phones.txt $alidir/ali.1.gz $alidir/num_jobs \
    $alidir/final.mdl $data_out/reco2file_and_channel $data_out/segments; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done

if [ $stage -le 0 ]; then
  echo "$0: calling get_train_ctm.sh to produce ctms of the alignments."
  # Caution: this will produce logs in $alidir/log/get_ctm.log
  steps/get_train_ctm.sh --cmd "$cmd" $data $lang $alidir || exit 1;
fi

if [ $stage -le 1 ]; then
  if [ ! -s $alidir/ctm ]; then
    echo "$0: file $alidir/ctm does not exist or is empty."
    exit 1;
  fi
  echo "$0: converting ctm to a format where we have the recording-id ..."
  echo "$0: ... in place of the side and channel, e.g. sw02008-B instead of sw02008 B"
  # reco2file_and_channel lines are: <recording-id> <file> <channel>.
  # ctm lines are:                   <file> <channel> <start> <length> <word>.
  # Diagnostics go to stderr so that they cannot end up inside ctm_per_reco,
  # and a non-zero awk exit status aborts the script.
  awk -v r=$data_out/reco2file_and_channel \
    'BEGIN {
       while ((getline < r) > 0) {
         if (NF != 3) {
           print "bad line in " r ": " $0 > "/dev/stderr";
           exit(1);
         }
         map[$2 "&" $3] = $1;
       }
     }
     {
       if (NF != 5) {
         print "bad line " $0 > "/dev/stderr";
         exit(2);
       }
       reco = map[$1 "&" $2];
       if (length(reco) == 0) {
         print "Bad key " $1 "&" $2 > "/dev/stderr";
         exit(3);
       }
       print reco, $3, $4, $5;
     }' $alidir/ctm > $dir/ctm_per_reco || exit 1;
fi

if [ $stage -le 2 ]; then
  # The perl script reads the segments file on stdin and takes the
  # per-recording ctm as its only argument.  It prints one "<utt-id> <words>"
  # line per segment that contains at least one word.
  perl -e '
    use strict;
    use warnings;

    @ARGV == 1 || die "Expected exactly one argument (the per-recording ctm)";
    my $ctm_per_reco = shift @ARGV;
    my $chunk_size = 3;  # width of one time chunk, in the time units of the ctm.

    # Returns the index of the chunk that contains time $t.
    sub to_chunk {
      my ($t) = @_;
      return int($t / $chunk_size);
    }

    # We build up an associative array indexed by a pair of ids: $reco,$n
    # where $n is the index of a chunk of time of width $chunk_size.  Each
    # value is a reference to an array of [ $start, $length, $word ] entries
    # for the words that start inside that chunk, so that a segment only has
    # to look at the chunks it overlaps.
    my %reco2list;
    open(my $ctm_fh, "<", $ctm_per_reco)
      || die "opening ctm file $ctm_per_reco: $!";
    while (my $line = <$ctm_fh>) {
      my @A = split(" ", $line);
      @A == 4 || die "Bad line $line in $ctm_per_reco";
      my ($reco, $start, $length, $word) = @A;
      my $chunk = to_chunk($start);
      push @{ $reco2list{$reco,$chunk} }, [ $start, $length, $word ];
    }
    close($ctm_fh);

    my $num_utts = 0;
    my $num_empty = 0;
    while (my $line = <STDIN>) {
      my @A = split(" ", $line);
      @A == 4 || die "Bad line $line in stdin";
      my ($utt, $reco, $start, $end) = @A;
      my @text = ();
      for my $chunk (to_chunk($start) .. to_chunk($end)) {
        my $arrayref = $reco2list{$reco,$chunk};
        next unless defined $arrayref;
        foreach my $entry (@$arrayref) {
          # $entry is itself an arrayref: [ $start, $length, $word ].
          my ($word_start, $word_length, $word) = @$entry;
          my $word_end = $word_start + $word_length;
          # Keep the word only if it lies entirely inside the segment.
          if ($word_start >= $start && $word_start <= $end &&
              $word_end >= $start && $word_end <= $end) {
            push @text, $word;
          }
        }
      }
      $num_utts++;
      if (@text > 0) {
        print "$utt " . join(" ", @text) . "\n";
      } else {
        $num_empty++;
      }
    }
    print STDERR "Processed $num_utts utterances, of which $num_empty had no text.\n";
  ' $dir/ctm_per_reco < $data_out/segments | sort > $data_out/text || exit 1;

  # "wc" prints <lines> <words> ...; subtracting the line count removes the
  # utterance-ids from the word count.
  nw_old=`cat $data/text | wc | awk '{print $2 - $1}'`
  nw_new=`cat $data_out/text | wc | awk '{print $2 - $1}'`
  echo "Number of words of training text changed from $nw_old to $nw_new";
  if [ ! -s $data_out/text ]; then
    echo "$0: produced empty output. Something went wrong."
    exit 1;
  fi
fi