Blame view

egs/wsj/s5/steps/resegment_text.sh 4.5 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
  #!/bin/bash
  
  # Copyright Johns Hopkins University (Author: Daniel Povey) 2013.  Apache 2.0.
  
  # This script takes two data directories that represent different
  # segmentations of the same data (both must have "segments" files and
  # the recording-ids must match), and it converts the text in one directory
  # to correspond to the segmentation in the other.  Its output is the
  # "text" file in the second directory.  To get the alignments, it
  # must be provided an "alignment" directory where the training data
  # from the first directory has been aligned.
  
  # begin configuration section.
  # Defaults for the command-line options; they are set before
  # parse_options.sh is sourced (presumably it overrides them from
  # --stage / --cmd -- confirm against parse_options.sh).
  # stage: first stage to run; each stage below is guarded by "stage -le N".
  stage=0
  # cmd: job-dispatch wrapper, handed to steps/get_train_ctm.sh via --cmd.
  cmd=run.pl

  # end configuration section.

  # Pick up the environment from path.sh when it exists in the current directory.
  [ -f ./path.sh ] && . ./path.sh
  . parse_options.sh || exit 1;

  # Exactly five positional arguments must remain after option parsing.
  if [ $# -ne 5 ]; then
    echo "Usage: $0 [options] <in-data-dir> <lang> <ali-dir|model-dir> <out-data-dir> <temp/log-dir>"
    echo " Options:"
    echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
    echo "    --stage (0|1|2)                 # start this script from part-way through."
    echo "e.g.:"
    echo "$0 data/train data/lang exp/tri3b_ali_all data/train_reseg exp/tri3b_resegment"
    exit 1;
  fi

  data=$1       # input data directory (original segmentation)
  lang=$2       # lang directory (must contain phones.txt)
  alidir=$3     # alignment directory for the input data
  data_out=$4   # output data directory (new segmentation); receives "text"
  dir=$5        # temp/log directory
  
  
  # Create the temp/log directory up front; ctm_per_reco is written under $dir later.
  mkdir -p "$dir/log" || exit 1;

  # Fail early, naming the first missing input file, rather than part-way
  # through a stage.  All expansions are quoted so that directory names
  # containing whitespace or glob characters are tested as single paths.
  for f in "$data/feats.scp" "$lang/phones.txt" "$alidir/ali.1.gz" "$alidir/num_jobs" \
     "$alidir/final.mdl" "$data_out/reco2file_and_channel" "$data_out/segments"; do
    if [ ! -f "$f" ]; then
      echo "$0: no such file $f"
      exit 1;
    fi
  done
  
  
  # Stage 0: run get_train_ctm.sh on the aligned input data.  Stage 1 below
  # reads $alidir/ctm, which is presumably what this call writes -- confirm
  # against steps/get_train_ctm.sh.
  if [ "$stage" -le 0 ]; then
    echo "$0: calling get_train_ctm.sh to produce ctms of the alignments."
    # Caution: this will produce logs in $alidir/log/get_ctm.log
    # Arguments are quoted so directory names with spaces stay single words.
    steps/get_train_ctm.sh --cmd "$cmd" "$data" "$lang" "$alidir" || exit 1;
  fi
  
  
  #######################################
  # Rewrite a ctm so that each line is keyed by recording-id.
  # Arguments: $1 - reco2file_and_channel file; every line must have three
  #                 fields, read as: recording-id, file, channel.
  # Input:     stdin - ctm with five fields per line; the first two are
  #                    looked up as (file, channel), the last three are
  #                    copied through unchanged.
  # Outputs:   stdout - "<recording-id> <field3> <field4> <field5>" per line.
  #            stderr - a diagnostic for the first offending line.
  # Returns:   0 on success; 1 for a malformed map line, 2 for a malformed
  #            ctm line, 3 for a (file, channel) pair missing from the map.
  #######################################
  ctm_to_per_reco() {
    awk -v r="$1" '
      BEGIN {
        # Load the (file, channel) -> recording-id map before reading the ctm.
        while ((getline < r) > 0) {
          if (NF != 3) {
            print "bad line in " r ": " $0 > "/dev/stderr";
            exit(1);
          }
          map[$2 "&" $3] = $1;
        }
      }
      {
        if (NF != 5) { print "bad line " $0 > "/dev/stderr"; exit(2); }
        reco = map[$1 "&" $2];
        if (length(reco) == 0) {
          print "Bad key " $1 "&" $2 > "/dev/stderr";
          exit(3);
        }
        print reco, $3, $4, $5;
      }'
  }

  # Stage 1: convert $alidir/ctm into $dir/ctm_per_reco.
  if [ "$stage" -le 1 ]; then
    if [ ! -s "$alidir/ctm" ]; then
      # Report the file that was actually tested.
      echo "$0: file $alidir/ctm does not exist or is empty."
      exit 1;
    fi
    echo "$0: converting ctm to a format where we have the recording-id ..."
    echo "$0: ... in place of the side and channel, e.g. sw02008-B instead of sw02008 B"

    # Stop here on a malformed ctm or map instead of carrying on with a
    # truncated ctm_per_reco.
    ctm_to_per_reco "$data_out/reco2file_and_channel" \
      < "$alidir/ctm" > "$dir/ctm_per_reco" || exit 1;
  fi
  
  if [ $stage -le 2 ]; then
    cat $data_out/segments | perl -e '
       @ARGV == 1 || die;
       $ctm_per_reco = shift @ARGV;
       $chunk_size = 3;
       open(C, "<$ctm_per_reco") || die "opening ctm file $ctm_per_reco";
       # we build up an associative array indexed by a pair of ids: $reco,$n
       # where $n is a 5-second chunk of time.
       sub to_chunk { my $t = shift @_; return int($t / $chunk_size); }
       while (<C>) {
         @A = split;  @A == 4 || die "Bad line $_ in $ctm_per_reco";
         ($reco, $start, $length, $word) = @A;
         $chunk = to_chunk($start);
         if (! defined $reco2list{$reco,$chunk} ){ $reco2list{$reco,$chunk} = [ ]; } # new anonymous array
         $arrayref = $reco2list{$reco,$chunk};
         push @$arrayref, [ $start, $length, $word ]; # another level of anonymous array..
       }
       $num_utts = 0; $num_empty = 0;
       while(<STDIN>) {
         @A = split;  @A == 4 || die "Bad line $_ in stdin";
         ($utt, $reco, $start, $end) = @A;
         @text = ();
         for ($chunk = to_chunk($start); $chunk <= to_chunk($end); $chunk++) {
           $arrayref = $reco2list{$reco,$chunk};
           if (defined $arrayref) {
             foreach $entry ( @$arrayref ) { # note, $entry is itself an arrayref
                                             # to an array containing $start $end $word.
               $word_start = $$entry[0];
               if ($word_start >= $start && $word_start <= $end) {
                 $word_end = $$entry[1] + $word_start;
                 if ($word_end >= $start && $word_end <= $end) {
                   $word = $$entry[2]; defined $word || die;
                   push @text, $word;
                 }
               }
             }
           }
         }
         $num_utts++;
         if (@text > 0) { $t = join(" ", @text); print "$utt $t
  ";; }
         else { $num_empty++; }
       }
       print STDERR "Processed $num_utts utterances, of which $num_empty had no text.
  "; ' \
         $dir/ctm_per_reco | sort > $data_out/text || exit 1;
  
    nw_old=`cat $data/text | wc | awk '{print $2 - $1}'`
    nw_new=`cat $data_out/text | wc | awk '{print $2 - $1}'`
    echo "Number of words of training text changed from $nw_old to $nw_new";
  
    if [ ! -s $data_out/text ]; then
      echo "$0: produced empty output.  Something went wrong."
      exit 1;
    fi
  fi