Blame view
egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh
8.17 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 |
#!/bin/bash

# Copyright 2016  Vimal Manohar
#           2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# Computes oracle paths through existing lattices against the reference
# transcript in <data>/text, and writes a word-level oracle hypothesis
# (<dir>/oracle_hyp.txt) plus a time-aligned ctm (<dir>/ctm).
# Run with no arguments to see the full usage message.

set -e
set -o pipefail

# Begin configuration section (overridable via utils/parse_options.sh).
cleanup=true
stage=0
cmd=run.pl
special_symbol="***"  # Special symbol to be aligned with the inserted or
                      # deleted words. Your sentences should not contain this
                      # symbol.
print_silence=true    # True if we want the silences in the ctm.  We do.
frame_shift=0.01
# End configuration section.

. ./path.sh
. utils/parse_options.sh

if [ $# -ne 4 ]; then
  echo "This script computes oracle paths for lattices (against a reference "
  echo "transcript) and does various kinds of processing of that, for use by "
  echo "steps/cleanup/cleanup_with_segmentation.sh."
  echo "Its main input is <latdir>/lat.*.gz."
  echo "This script outputs a human-readable word alignment of the oracle path"
  echo "through the lattice in <dir>/oracle_hyp.txt, and a time-aligned ctm version of"
  echo "the same in <dir>/ctm."
  echo "It also creates <dir>/edits.txt (the number of edits per utterance),"
  echo "<dir>/text (which is <data>/text but filtering out any utterances that"
  echo "were not decoded for some reason), and <dir>/length.txt, which is the length"
  echo "of the reference transcript, and <dir>/all_info.txt and <dir>/all_info.sorted.txt"
  echo "which contain all the info in a way that's easier to scan for humans."
  echo "Note: most of this is the same as is done in steps/cleanup/find_bad_utts.sh,"
  echo "except it runs from pre-existing lattices."
  echo ""
  echo "Usage: $0 <data> <lang> <latdir> <dir>"
  echo " e.g.: $0 data/train_si284 data/lang exp/tri4_bad_utts/lats exp/tri4_bad_utts/lattice_oracle"
  echo "Main options (for others, see top of script file)"
  echo " --config <config-file> # config containing options"
  echo " --cleanup <true|false> # set this to false to disable cleanup of "
  echo " # temporary files (default: true)"
  echo " --cmd <command-string> # how to run jobs (default: run.pl)."
  echo " --special-symbol <special-symbol> # Symbol to pad with in insertions and deletions in the"
  echo " # output produced in <dir>/analysis/ (default: '***')"
  echo " --print-silence <true|false> # Affects ctm generation; default is true (recommended)"
  echo " --frame-shift <frame-shift> # Frame shift in seconds; default: 0.01. Affects ctm generation."
  exit 1
fi

data=$1
lang=$2
latdir=$3
dir=$4

# Fail early, naming the first missing input.
for f in $lang/oov.int $lang/words.txt $data/text $latdir/lat.1.gz $latdir/num_jobs; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

mkdir -p $dir/log

# The model is passed to the lattice-align-words* commands in stage 2; it is
# looked up in <dir> first and then in its parent directory.
if [ -e $dir/final.mdl ]; then
  model=$dir/final.mdl
elif [ -e $dir/../final.mdl ]; then
  model=$dir/../final.mdl
else
  echo "$0: expected $dir/final.mdl or $dir/../final.mdl to exist"
  exit 1
fi

nj=$(cat $latdir/num_jobs)
oov=$(cat $lang/oov.int)

utils/split_data.sh $data $nj

sdata=$data/split${nj}

if [ $stage -le 1 ]; then
  # Note: $cmd is deliberately left unquoted, since it may consist of several
  # words (a command plus its options).
  $cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \
    lattice-oracle --write-lattices="ark:|gzip -c > $dir/lat.JOB.gz" \
    "ark:gunzip -c $latdir/lat.JOB.gz |" \
    "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|" \
    ark,t:- \| utils/int2sym.pl -f 2- $lang/words.txt '>' $dir/oracle_hyp.JOB.txt || exit 1;

  # Sum the "<errors> / <total>" counts over all per-job logs.  The perl
  # snippet terminates the line started by 'echo -n' with a newline, and
  # refuses to divide by zero if no counts could be parsed from the logs.
  echo -n "lattice_oracle_align.sh: overall oracle %WER is: "
  grep 'Overall %WER' $dir/log/get_oracle.*.log | \
    perl -e 'while (<>){ if (m: (\d+) / (\d+):) { $x += $1; $y += $2}}
             die "no word counts found in the lattice-oracle logs\n" unless $y;
             printf("%.2f%%\n", $x*100.0/$y); ' | \
    tee $dir/log/oracle_overall_wer.log

  # the awk commands below are to ensure that partially-written files don't confuse us.
  for x in $(seq $nj); do cat $dir/oracle_hyp.$x.txt; done | awk '{if(NF>=1){print;}}' > $dir/oracle_hyp.txt
  if $cleanup; then
    rm $dir/oracle_hyp.*.txt
  fi
fi

echo $nj > $dir/num_jobs

if [ $stage -le 2 ]; then
  # The following command gets the time-aligned ctm as $dir/ctm.JOB.txt.
  # Word alignment uses word_boundary.int if the lang directory has it, and
  # falls back to align_lexicon.int otherwise.
  if [ -f $lang/phones/word_boundary.int ]; then
    $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
      set -o pipefail '&&' \
      lattice-align-words $lang/phones/word_boundary.int $model "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
      nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt '>' $dir/ctm.JOB || exit 1;
  elif [ -f $lang/phones/align_lexicon.int ]; then
    $cmd JOB=1:$nj $dir/log/get_ctm.JOB.log \
      set -o pipefail '&&' \
      lattice-align-words-lexicon $lang/phones/align_lexicon.int $model "ark:gunzip -c $dir/lat.JOB.gz|" ark:- \| \
      lattice-1best ark:- ark:- \| \
      nbest-to-ctm --frame-shift=$frame_shift --print-silence=$print_silence ark:- - \| \
      utils/int2sym.pl -f 5 $lang/words.txt '>' $dir/ctm.JOB || exit 1;
  else
    echo "$0: neither $lang/phones/word_boundary.int nor $lang/phones/align_lexicon.int exists: cannot align."
    exit 1;
  fi

  # Concatenate the per-job ctm files in job order.
  for j in $(seq $nj); do cat $dir/ctm.$j; done > $dir/ctm
  if $cleanup; then
    rm $dir/ctm.*
  fi
  echo "$0: oracle ctm is in $dir/ctm"
fi

# Stages below are really just to satisfy your curiosity; the output is the same
# as that of find_bad_utts.sh.

if [ $stage -le 3 ]; then
  # Stage 3: per-utterance analysis of the oracle hypothesis vs. the reference.
  # in case any utterances failed to align, get filtered copy of $data/text
  utils/filter_scp.pl $dir/oracle_hyp.txt < $data/text > $dir/text
  # length.txt: <utterance-id> <number of words in the reference>
  awk '{print $1, (NF-1);}' $dir/text > $dir/length.txt

  mkdir -p $dir/analysis

  # The padding symbol written by align-text must be the same one that
  # wer_per_utt_details.pl is told to expect, so both get $special_symbol.
  align-text --special-symbol="$special_symbol" ark:$dir/text ark:$dir/oracle_hyp.txt ark,t:- | \
    utils/scoring/wer_per_utt_details.pl --special-symbol "$special_symbol" > $dir/analysis/per_utt_details.txt

  echo "$0: human-readable alignments are in $dir/analysis/per_utt_details.txt"

  # edits.txt: <utterance-id> <sum of fields 4-6 of the "#csid" line>
  awk '{if ($2 == "#csid") print $1" "($4+$5+$6)}' $dir/analysis/per_utt_details.txt > $dir/edits.txt

  # Sanity check: all four per-utterance files must have the same number of lines.
  n1=$(wc -l < $dir/edits.txt)
  n2=$(wc -l < $dir/oracle_hyp.txt)
  n3=$(wc -l < $dir/text)
  n4=$(wc -l < $dir/length.txt)
  if [ $n1 -ne $n2 ] || [ $n2 -ne $n3 ] || [ $n3 -ne $n4 ]; then
    echo "$0: mismatch in lengths of files:"
    wc $dir/edits.txt $dir/oracle_hyp.txt $dir/text $dir/length.txt
    exit 1;
  fi

  # note: the format of all_info.txt is:
  # <utterance-id> <number of errors> <reference-length> <decoded-output> <reference>
  # with the fields separated by tabs, e.g.
  # adg04_sr009_trn 1 12 SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED AT SHOW THE GRIDLEY+S TRACK IN BRIGHT ORANGE WITH HORNE+S IN DIM RED

  paste $dir/edits.txt \
    <(awk '{print $2}' $dir/length.txt) \
    <(awk '{$1="";print;}' <$dir/oracle_hyp.txt) \
    <(awk '{$1="";print;}' <$dir/text) > $dir/all_info.txt

  sort -nr -k2 $dir/all_info.txt > $dir/all_info.sorted.txt

  echo "$0: per-utterance details sorted from worst to best utts are in $dir/all_info.sorted.txt"
  echo "$0: format is: utt-id num-errs ref-length decoded-output (tab) reference"
fi

if [ $stage -le 4 ]; then
  ###
  # These stats might help people figure out what is wrong with the data
  # a)human-friendly and machine-parsable alignment in the file per_utt_details.txt
  # b)evaluation of per-speaker performance to possibly find speakers with
  #   distinctive accents/speech disorders and similar
  # c)Global analysis on (Ins/Del/Sub) operation, which might be used to figure
  #   out if there is systematic issue with lexicon, pronunciation or phonetic confusability

  utils/scoring/wer_per_spk_details.pl $data/utt2spk \
    < $dir/analysis/per_utt_details.txt > $dir/analysis/per_spk_details.txt

  echo "$0: per-speaker details are in $dir/analysis/per_spk_details.txt"

  utils/scoring/wer_ops_details.pl --special-symbol "$special_symbol" \
    < $dir/analysis/per_utt_details.txt | \
    sort -i -b -k1,1 -k4,4nr -k2,2 -k3,3 > $dir/analysis/ops_details.txt

  echo "$0: per-word statistics [corr,sub,ins,del] are in $dir/analysis/ops_details.txt"
fi

if [ $stage -le 5 ]; then
  echo "$0: obtaining ctm edits"

  # Here align-text is run without --special-symbol, so it uses its own
  # default padding symbol; its output is piped into get_ctm_edits.py together
  # with the ctm from stage 2.
  $cmd $dir/log/get_ctm_edits.log \
    align-text ark:$dir/oracle_hyp.txt ark:$dir/text ark,t:- \| \
    steps/cleanup/internal/get_ctm_edits.py --oov=$oov --symbol-table=$lang/words.txt \
    /dev/stdin $dir/ctm $dir/ctm_edits || exit 1

  echo "$0: ctm with edits information appended is in $dir/ctm_edits"
fi