Blame view

egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh 8.17 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
  #! /bin/bash
  
  # Copyright 2016  Vimal Manohar
  #           2016  Johns Hopkins University (author: Daniel Povey)
  # Apache 2.0
  
  # Abort on the first failing command, and treat a pipeline as failed when
  # any of its stages fails.
  set -eo pipefail

  # Option defaults.  The usage message below lists the command-line flags
  # that correspond to these; utils/parse_options.sh (sourced at the end of
  # this section) is what processes them.
  stage=0
  cmd=run.pl
  cleanup=true
  frame_shift=0.01
  print_silence=true      # Keep the silences in the ctm; this is what we want.
  special_symbol="***"    # Placeholder that gets aligned with inserted or
                          # deleted words.  It must not occur in your
                          # sentences.

  . ./path.sh
  . utils/parse_options.sh
  
  # Exactly four positional arguments are required; otherwise print the usage
  # message and exit with status 1.
  if [ $# -ne 4 ]; then
    echo "This script computes oracle paths for lattices (against a reference "
    echo "transcript) and does various kinds of processing of that, for use by "
    echo "steps/cleanup/cleanup_with_segmentation.sh."
    echo "Its main input is <latdir>/lat.*.gz."
    echo "This script outputs a human-readable word alignment of the oracle path"
    echo "through the lattice in <dir>/oracle_hyp.txt, and a time-aligned ctm version of"
    echo "the same in <dir>/ctm."
    echo "It also creates <dir>/edits.txt (the number of edits per utterance),"
    echo "<dir>/text (which is <data>/text but filtering out any utterances that"
    echo "were not decoded for some reason), and <dir>/length.txt, which is the length"
    echo "of the reference transcript, and <dir>/all_info.txt and <dir>/all_info.sorted.txt"
    echo "which contain all the info in a way that's easier to scan for humans."
    echo "Note: most of this is the same as is done in steps/cleanup/find_bad_utts.sh,"
    echo "except it runs from pre-existing lattices."
    echo ""
    echo "Usage: $0 <data> <lang> <latdir> <dir>"
    echo " e.g.: $0 data/train_si284 data/lang exp/tri4_bad_utts/lats exp/tri4_bad_utts/lattice_oracle"
    echo "Main options (for others, see top of script file)"
    echo "  --config <config-file>            # config containing options"
    echo "  --cleanup <true|false>            # set this to false to disable cleanup of "
    echo "                                    # temporary files (default: true)"
    echo "  --cmd <command-string>            # how to run jobs (default: run.pl)."
    echo "  --special-symbol <special-symbol> #  Symbol to pad with in insertions and deletions in the"
    # The closing parenthesis after the default value was missing here.
    echo "                                    # output produced in <dir>/analysis/ (default: '***')"
    echo "  --print-silence <true|false>      # Affects ctm generation; default is true (recommended)"
    echo "  --frame-shift <frame-shift>       # Frame shift in seconds; default: 0.01.  Affects ctm generation."
    exit 1
  fi
  
  # Positional arguments, in the order: <data> <lang> <latdir> <dir>.
  data=$1; lang=$2; latdir=$3; dir=$4

  # Each of these inputs has to exist as a regular file before we start.
  for required in $lang/oov.int $lang/words.txt $data/text \
                  $latdir/lat.1.gz $latdir/num_jobs; do
    if [ ! -f $required ]; then
      echo "$0: expected file $required to exist"
      exit 1
    fi
  done
  
  mkdir -p $dir/log

  # Locate the model: <dir>/final.mdl is preferred, then <dir>/../final.mdl.
  model=
  for candidate in $dir/final.mdl $dir/../final.mdl; do
    if [ -e $candidate ]; then
      model=$candidate
      break
    fi
  done
  if [ -z "$model" ]; then
    echo "$0: expected $dir/final.mdl or $dir/../final.mdl to exist"
    exit 1
  fi

  # The number of jobs is taken from the lattice directory, and the integer
  # id used for out-of-vocabulary words from the lang directory.
  nj=$(< $latdir/num_jobs)
  oov=$(< $lang/oov.int)

  # Stage 1 reads the per-job reference text from $sdata/JOB/text.
  sdata=$data/split${nj}
  utils/split_data.sh $data $nj
  
  # Stage 1: run lattice-oracle against the reference text.  Each job writes
  # the lattices produced by --write-lattices to $dir/lat.JOB.gz and the
  # word-level oracle hypothesis to $dir/oracle_hyp.JOB.txt; the per-job
  # hypotheses are then merged into $dir/oracle_hyp.txt.
  if [ $stage -le 1 ]; then
    $cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \
      lattice-oracle --write-lattices="ark:|gzip -c > $dir/lat.JOB.gz" \
      "ark:gunzip -c $latdir/lat.JOB.gz |" \
      "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \
      ark,t:- \| utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/oracle_hyp.JOB.txt || exit 1;

    # Sum the "<errors> / <words>" counts over all job logs to get the overall
    # oracle %WER.  The format string ends in an explicit \n escape rather than
    # a raw line break, so the output does not depend on how the continuation
    # line happens to be indented.
    echo -n "lattice_oracle_align.sh: overall oracle %WER is: "
    grep 'Overall %WER'  $dir/log/get_oracle.*.log  | \
      perl -e 'while (<>){ if (m: (\d+) / (\d+):) { $x += $1; $y += $2}}  printf("%.2f%%\n", $x*100.0/$y); ' | \
      tee $dir/log/oracle_overall_wer.log

    # The awk filter drops empty lines, so that partially-written files don't
    # confuse us.
    for x in $(seq $nj); do cat $dir/oracle_hyp.$x.txt; done | awk '{if(NF>=1){print;}}' > $dir/oracle_hyp.txt
    if $cleanup; then
      rm $dir/oracle_hyp.*.txt
    fi
  fi

  # Record the number of jobs in the output directory (done regardless of
  # the value of --stage).
  echo $nj > $dir/num_jobs
  
  
  # Stage 2: get the time-aligned ctm.  Each job writes $dir/ctm.JOB, and the
  # per-job files are then concatenated into $dir/ctm.
  if [ $stage -le 2 ]; then
    word_boundary=$lang/phones/word_boundary.int
    align_lexicon=$lang/phones/align_lexicon.int

    # One of the two files is needed in order to word-align the lattices.
    if [ ! -f $word_boundary ] && [ ! -f $align_lexicon ]; then
      echo "$0: neither $word_boundary nor $align_lexicon exists: cannot align."
      exit 1;
    fi

    if [ -f $word_boundary ]; then
      # word_boundary.int takes precedence when both files are present.
      $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
        set -o pipefail '&&' \
        lattice-align-words $word_boundary $model "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
        nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
        utils/int2sym.pl -f 5 $lang/words.txt '>' $dir/ctm.JOB || exit 1;
    else
      $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
        set -o pipefail '&&' \
        lattice-align-words-lexicon $align_lexicon $model  "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
        lattice-1best ark:- ark:- \| \
        nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
        utils/int2sym.pl -f 5 $lang/words.txt '>' $dir/ctm.JOB || exit 1;
    fi

    # Merge the per-job pieces in job order, then optionally remove them.
    for j in $(seq $nj); do
      cat $dir/ctm.$j
    done > $dir/ctm
    if $cleanup; then
      rm $dir/ctm.*
    fi
    echo "$0: oracle ctm is in $dir/ctm"
  fi
  
  
  # Stages below are really just to satisfy your curiosity; the output is the same
  # as that of find_bad_utts.sh.
  
  # Stage 3: per-utterance analysis.  Writes $dir/text, $dir/length.txt,
  # $dir/analysis/per_utt_details.txt, $dir/edits.txt, $dir/all_info.txt and
  # $dir/all_info.sorted.txt.
  if [ $stage -le 3 ]; then
    # in case any utterances failed to align, get filtered copy of $data/text
    utils/filter_scp.pl $dir/oracle_hyp.txt < $data/text  > $dir/text
    # length.txt: <utterance-id> <number of words in the reference>
    awk '{print $1, (NF-1);}' $dir/text > $dir/length.txt

    mkdir -p $dir/analysis

    # Both commands must be given the same special symbol: pass the configured
    # $special_symbol to wer_per_utt_details.pl too, rather than a hard-coded
    # "***", so that a non-default --special-symbol is handled consistently.
    align-text --special-symbol="$special_symbol"  ark:$dir/text ark:$dir/oracle_hyp.txt  ark,t:- | \
      utils/scoring/wer_per_utt_details.pl --special-symbol "$special_symbol" > $dir/analysis/per_utt_details.txt

    echo "$0: human-readable alignments are in $dir/analysis/per_utt_details.txt"

    # edits.txt: <utterance-id> <sum of fields 4, 5 and 6 of the "#csid" line>
    awk '{if ($2 == "#csid") print $1" "($4+$5+$6)}' $dir/analysis/per_utt_details.txt > $dir/edits.txt

    # All four files must have one line per utterance, otherwise the paste
    # below would combine lines that belong to different utterances.
    n1=$(wc -l < $dir/edits.txt)
    n2=$(wc -l < $dir/oracle_hyp.txt)
    n3=$(wc -l < $dir/text)
    n4=$(wc -l < $dir/length.txt)
    if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then
      echo "$0: mismatch in lengths of files:"
      wc $dir/edits.txt $dir/oracle_hyp.txt $dir/text $dir/length.txt
      exit 1;
    fi

    # note: the format of all_info.txt is:
    # <utterance-id>   <number of errors>  <reference-length>  <decoded-output>   <reference>
    # with the fields separated by tabs, e.g.
    # adg04_sr009_trn 1 	12	 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT	 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED

    paste $dir/edits.txt \
        <(awk '{print $2}' $dir/length.txt) \
        <(awk '{$1="";print;}' <$dir/oracle_hyp.txt) \
        <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt

    # Sort numerically on the second field (the number of errors), largest
    # first.
    sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt

    echo "$0: per-utterance details sorted from worst to best utts are in $dir/all_info.sorted.txt"
    echo "$0: format is: utt-id num-errs ref-length decoded-output (tab) reference"
  fi
  
  # Stage 4: statistics derived from analysis/per_utt_details.txt.
  if [ $stage -le 4 ]; then
    ###
    # These stats might help people figure out what is wrong with the data:
    # a) a human-friendly and machine-parsable alignment in per_utt_details.txt;
    # b) per-speaker performance, to possibly find speakers with distinctive
    #    accents, speech disorders and similar;
    # c) a global analysis of the Ins/Del/Sub operations, which might reveal a
    #    systematic issue with the lexicon, pronunciation or phonetic
    #    confusability.
    utt_details=$dir/analysis/per_utt_details.txt

    utils/scoring/wer_per_spk_details.pl $data/utt2spk \
      < $utt_details > $dir/analysis/per_spk_details.txt

    echo "$0: per-speaker details are in $dir/analysis/per_spk_details.txt"

    utils/scoring/wer_ops_details.pl --special-symbol "$special_symbol" \
      < $utt_details | \
      sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt

    echo "$0: per-word statistics [corr,sub,ins,del] are in $dir/analysis/ops_details.txt"
  fi
  
  # Stage 5: align the oracle hypothesis with the reference text and hand the
  # alignment, together with $dir/ctm, to get_ctm_edits.py, which writes
  # $dir/ctm_edits.
  if [ $stage -le 5 ]; then
    echo "$0: obtaining ctm edits"

    if ! $cmd $dir/log/get_ctm_edits.log \
      align-text ark:$dir/oracle_hyp.txt ark:$dir/text ark,t:-  \| \
        steps/cleanup/internal/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
         /dev/stdin $dir/ctm $dir/ctm_edits; then
      exit 1
    fi

    echo "$0: ctm with edits information appended is in $dir/ctm_edits"
  fi