Blame view
egs/wsj/s5/steps/compare_alignments.sh
9.29 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 |
#!/bin/bash

# Copyright 2018  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0.

# Diagnostic script: compares two alignment directories frame by frame, at the
# phone level and at the word level, and writes human-readable reports to
# <work-dir>:
#   phones_correct.txt, phone_subs.txt      per-phone agreement / substitutions
#   word_stats.txt                          per-word disagreement statistics
#   utt_stats.{phones,words}.sorted         per-utterance disagreement, worst first
#   utt_stats.{phones,words}.percentiles    deciles of the per-utterance proportion
#   compare_phones_{worst,mid}.txt          side-by-side frame-level phone display
#
# Positional arguments are either
#   <lang> <data> <ali-dir1> <ali-dir2> <work-dir>
# or
#   <lang1> <lang2> <data1> <data2> <ali-dir1> <ali-dir2> <work-dir>
# Exits with status 1 on a usage error or a missing input file; because of
# `set -e`, it also exits as soon as a (non-pipeline-internal) command fails.

set -e

stage=0
cmd=run.pl   # We use this only for get_ctm.sh, which can be a little slow.
num_to_sample=1000  # We sample this many utterances for human-readable display,
                    # starting from the worst and then starting from the middle.
cleanup=true

if [ -f ./path.sh ]; then . ./path.sh; fi

. ./utils/parse_options.sh

if [ $# -ne 5 ] && [ $# -ne 7 ]; then
  cat <<EOF
  This script compares two directories containing data alignments, and creates statistics
  showing how much the phone and word alignments differ, including breakdown by phones
  and words; and which utterances differ the most.  This is intended for diagnostic
  purposes.  Both alignment directories should be for the same data (or at least
  the data sets should overlap).  The word alignment stats may not be correctly
  obtained if the data-dirs are not the same.

  Usage: $0 [options] <lang-directory> <data-directory> <ali-dir1> <ali-dir2> <work-dir>
     or: $0 [options] <lang1> <lang2> <data1> <data2> <ali-dir1> <ali-dir2> <work-dir>
   e.g.: $0 data/lang data/train exp/tri2_ali exp/tri3_ali exp/compare_ali_2_3

  Options:
    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes.
                                    # (passed through to get_train_ctm.sh)
    --cleanup <true|false>          # Specify --cleanup false to prevent
                                    # cleanup of temporary files.
    --num-to-sample <n>             # Number of utterances to display in each of
                                    # the 'worst' and 'mid' reports (default: 1000).
    --stage <n>                     # Enables you to run part of the script.
EOF
  exit 1
fi

if [ $# -eq 5 ]; then
  lang1=$1
  lang2=$1
  data1=$2
  data2=$2
  ali_dir1=$3
  ali_dir2=$4
  dir=$5
else
  lang1=$1
  lang2=$2
  data1=$3
  data2=$4
  ali_dir1=$5
  ali_dir2=$6
  dir=$7
fi

# Every alignment directory has at least job 1, so we test ali.1.gz in both
# (testing ali.2.gz would wrongly reject a directory created with one job).
# num_jobs is read unconditionally below and final.mdl is needed by
# ali-to-phones; checking them here gives a clear message instead of a silent
# empty archive (the pipelines below do not run with pipefail).
for f in "$lang1/phones.txt" "$lang2/phones.txt" "$data1/utt2spk" "$data2/utt2spk" \
         "$ali_dir1/ali.1.gz" "$ali_dir2/ali.1.gz" \
         "$ali_dir1/num_jobs" "$ali_dir2/num_jobs" \
         "$ali_dir1/final.mdl" "$ali_dir2/final.mdl"; do
  if [ ! -f "$f" ]; then
    echo "$0: expected file $f to exist"
    exit 1
  fi
done

# This will exit if the phone symbol id's are different, due to
# `set -e` above.
utils/lang/check_phones_compatible.sh "$lang1/phones.txt" "$lang2/phones.txt"

nj1=$(cat "$ali_dir1/num_jobs")
nj2=$(cat "$ali_dir2/num_jobs")

mkdir -p "$dir/log"


if [ "$stage" -le 0 ]; then
  echo "$0: converting alignments to phones."

  # Concatenate all jobs of each alignment dir and convert to one phone-id per frame.
  for j in $(seq "$nj1"); do gunzip -c "$ali_dir1/ali.$j.gz"; done | \
    ali-to-phones --per-frame=true "$ali_dir1/final.mdl" ark:- ark:- | gzip -c > "$dir/phones1.gz"

  for j in $(seq "$nj2"); do gunzip -c "$ali_dir2/ali.$j.gz"; done | \
    ali-to-phones --per-frame=true "$ali_dir2/final.mdl" ark:- ark:- | gzip -c > "$dir/phones2.gz"
fi

if [ "$stage" -le 1 ]; then
  echo "$0: getting comparison stats and utterance stats."
  # stdout: per-utterance stats; stderr: log whose tail is a summary; plus the
  # phone confusion matrix in text form.
  compare-int-vector --binary=false --write-confusion-matrix="$dir/conf.mat" \
     "ark:gunzip -c $dir/phones1.gz|" "ark:gunzip -c $dir/phones2.gz|" 2>"$dir/log/compare_phones.log" > "$dir/utt_stats.phones"
  tail -n 8 "$dir/log/compare_phones.log"
fi

if [ "$stage" -le 3 ]; then
  # Strip the '[' line and the closing ']' of the text matrix, then emit one
  # line per nonzero cell:
  #   COR <phone> <count> <percent-correct>%
  #   SUB <sort-key> <phone1> <phone2> <count> <prob-wrong>% <reverse-prob-wrong>%
  # Row/column r of the matrix corresponds to integer phone-id r-1.
  grep -v -F '[' "$dir/conf.mat" | sed 's/]//' | \
    awk '{ n = NF;
           for (k = 1; k <= n; k++) { conf[NR,k] = $k; row_tot[NR] += $k; col_tot[k] += $k; } }
         END {
           for (row = 1; row <= n; row++) for (col = 1; col <= n; col++) {
             val = conf[row,col]; this_row_tot = row_tot[row]; this_col_tot = col_tot[col];
             rval = conf[col,row];
             min_tot = (this_row_tot < this_col_tot ? this_row_tot : this_col_tot);
             if (val != 0) {
               phone1 = row - 1; phone2 = col - 1;
               if (row == col) {
                 printf("COR %d %d %.2f%%\n", phone1, val, (val * 100 / this_row_tot));
               } else {
                 # heuristic for sorting.
                 norm_prob = val * val / min_tot;
                 printf("SUB %d %d %d %d %.2f%% %.2f%%\n", norm_prob, phone1, phone2, val,
                        (val * 100 / min_tot), (rval * 100 / min_tot));
               }
             }
           }
         }' > "$dir/phone_stats.all"

  (
    echo "# Format: <phone> <frame-count> <percent-correct>"
    grep '^COR' "$dir/phone_stats.all" | sort -n -k4,4 | awk '{print $2, $3, $4}' | utils/int2sym.pl -f 1 "$lang1/phones.txt"
  ) > "$dir/phones_correct.txt"

  (
    echo "#Format: <phone1> <phone2> <num-frames> <prob-wrong%> <reverse-prob-wrong%>"
    echo "# <num-frames> is the number of frames that were labeled <phone1> in the first"
    echo "# set of alignments and <phone2> in the second."
    echo "# <prob-wrong> is <num-frames> divided by the smaller of the total num-frames of"
    echo "# phone1 or phone2, whichever is smaller; expressed as a percentage."
    echo "#<reverse-prob-wrong> is the same but for the reverse substitution, from"
    echo "#<phone2> to <phone1>; the comparison with <prob-wrong> the substitutions are)."
    grep '^SUB' "$dir/phone_stats.all" | sort -nr -k2,2 | awk '{print $3,$4,$5,$6,$7}' | utils/int2sym.pl -f 1-2 "$lang1/phones.txt"
  ) > "$dir/phone_subs.txt"
fi

if [ "$stage" -le 4 ]; then
  echo "$0: getting CTMs"
  # --frame-shift 1.0 makes the CTM times come out in units of frames.
  steps/get_train_ctm.sh --use-segments false --print-silence true --cmd "$cmd" --frame-shift 1.0 "$data1" "$lang1" "$ali_dir1" "$dir/ctm1"
  steps/get_train_ctm.sh --use-segments false --print-silence true --cmd "$cmd" --frame-shift 1.0 "$data2" "$lang2" "$ali_dir2" "$dir/ctm2"
fi

if [ "$stage" -le 5 ]; then
  oov=$(cat "$lang1/oov.int")
  # Note: below, we use $lang1 for both setups; this is by design as compare-int-vector
  # assumes they use the same symbol table.
  for n in 1 2; do
    # Expand each CTM entry into one word-id per frame, giving one text-format
    # int-vector line per utterance: "<utt-id> w w w ...".
    utils/sym2int.pl --map-oov "$oov" -f 5 "$lang1/words.txt" < "$dir/ctm${n}/ctm" | \
      awk 'BEGIN { utt_id = ""; }
           { if (utt_id != $1) { if (utt_id != "") printf("\n"); utt_id = $1; printf("%s ", utt_id); }
             t_start = int($3); t_end = t_start + int($4); word = $5;
             for (t = t_start; t < t_end; t++) printf("%s ", word); }
           END { printf("\n"); }' | \
      copy-int-vector ark:- ark:- | gzip -c > "$dir/words${n}.gz"
  done
fi

if [ "$stage" -le 5 ]; then
  compare-int-vector --binary=false --write-tot-counts="$dir/words_tot.vec" --write-diff-counts="$dir/words_diff.vec" \
     "ark:gunzip -c $dir/words1.gz|" "ark:gunzip -c $dir/words2.gz|" 2>"$dir/log/compare_words.log" >"$dir/utt_stats.words"
  tail -n 8 "$dir/log/compare_words.log"
fi

if [ "$stage" -le 6 ]; then
  # The two .vec files are text vectors "[ v0 v1 ... ]"; fields 2..NF-1 are the
  # values, and value number i (0-based) belongs to word-id i.  The first
  # column printed by the middle awk is only a sort key and is dropped again.
  # NOTE(review): the third output column is taken from words_tot.vec, which
  # looks like a total count rather than a correct count as the header says --
  # confirm against compare-int-vector before relabeling.
  ( echo "# Word stats. Format:";
    echo "<proportion-of-wrong-frames> <num-wrong-frames> <num-correct-frames> <word>"
    paste <(awk '{for (n=2;n<NF;n++) print $n;}' <"$dir/words_diff.vec") \
          <(awk '{for (n=2;n<NF;n++) print $n;}' <"$dir/words_tot.vec") | \
      awk '{ if ($2 > 0) print $1*$1/$2, $1/$2, $1, $2, (NR-1)}' | utils/int2sym.pl -f 5 "$lang1/words.txt" | \
      sort -nr | awk '{print $2, $3, $4, $5;}'
  ) > "$dir/word_stats.txt"
fi

if [ "$stage" -le 7 ]; then
  for type in phones words; do
    num_utts=$(wc -l <"$dir/utt_stats.$type")

    # Header first, then the utterances sorted by proportion changed (worst
    # first).  The header is kept out of the sort so it cannot land among the
    # data rows.  The proportion is reported as 0 for zero-length utterances
    # rather than dividing by zero.
    {
      echo "Utterance-id proportion-$type-changed num-frames num-wrong-frames"
      awk '{ print $1, ($2 > 0 ? $3 * 1.0 / $2 : 0), $2, $3; }' "$dir/utt_stats.$type" | sort -nr -k2,2
    } > "$dir/utt_stats.$type.sorted"

    (
      echo "$0: Percentiles 100, 90, .. 0 of proportion-$type-changed distribution (over utterances) are:"
      # Print every k-th data row.  k is clamped to 1 so that small data sets
      # neither divide by zero (k == 0) nor print nothing (r % 1 is never 1).
      awk -v n="$num_utts" '
        BEGIN { k = int((n - 1) / 10); if (k < 1) k = 1; }
        $1 == "Utterance-id" { next; }
        { r++; if (k == 1 || r % k == 1) printf("%s ", $2); }
        END { print ""; }' "$dir/utt_stats.$type.sorted"
    ) | tee "$dir/utt_stats.$type.percentiles"
  done
fi

if [ "$stage" -le 8 ]; then
  # Display the 1000 worst utterances, and 1000 utterances from the middle of the pack, in a readable format.

  # Utterance-ids in worst-first order, without the header row (which must not
  # be treated as an utterance by the map/filter steps below).
  awk '$1 != "Utterance-id" { print $1 }' "$dir/utt_stats.words.sorted" > "$dir/utt_ids.all"
  num_utts=$(wc -l <"$dir/utt_ids.all")
  half_num_utts=$((num_utts / 2))
  if [ "$num_to_sample" -gt "$half_num_utts" ]; then
    num_to_sample=$half_num_utts
  fi
  head -n "$num_to_sample" "$dir/utt_ids.all" > "$dir/utt_ids.worst"
  tail -n "+$half_num_utts" "$dir/utt_ids.all" | head -n "$num_to_sample" > "$dir/utt_ids.mid"

  for suf in worst mid; do
    for n in 1 2; do
      gunzip -c "$dir/phones${n}.gz" | copy-int-vector ark:- ark,t:- | utils/filter_scp.pl "$dir/utt_ids.$suf" >"$dir/temp"
      # the next command reorders them, and duplicates the utterance-id, which
      # we'll later use to display the word sequence.
      awk '{print $1,$1,$1}' <"$dir/utt_ids.$suf" | utils/apply_map.pl -f 3 "$dir/temp" > "$dir/phones${n}.$suf"
      rm "$dir/temp"
    done
    # the stuff with 0 and <eps> below is a kind of hack so that if the phones are the same, we end up
    # with just the phone, but if different, we end up with p1/p2.
    # The apply_map.pl stuff is to put the transcript there.
    (
      echo "# Format: <utterance-id> <word1> <word2> ... <wordN> <frame1-phone> ... <frameN-phone>"
      echo "# If the two alignments have the same phone, just that phone will be printed;"
      echo "# otherwise the two phones will be printed, as in 'phone1/phone2'. So '/' is present"
      echo "# whenever there is a mismatch."
      paste "$dir/phones1.$suf" "$dir/phones2.$suf" | perl -ane '
          @A = split("\t", $_); @A1 = split(" ", $A[0]); @A2 = split(" ", $A[1]);
          $utt = shift @A1; shift @A2; print $utt, " ";
          for ($n = 0; $n < @A1 && $n < @A2; $n++) {
            $a1 = $A1[$n]; $a2 = $A2[$n];
            if ($a1 eq $a2) { print "$a1 "; } else { print "$a1 0 $a2 "; }
          }
          print "\n"; ' | utils/int2sym.pl -f 3- "$lang1/phones.txt" | sed 's: <eps> :/:g' | \
        utils/apply_map.pl -f 2 "$data1/text"
    ) > "$dir/compare_phones_${suf}.txt"
  done
fi

if [ "$stage" -le 9 ] && [ "$cleanup" = true ]; then
  # -f: when only the later stages were run (or on a re-run after a previous
  # cleanup) some of these files do not exist; that must not abort the script.
  rm -f "$dir"/phones{1,2}.gz "$dir"/words{1,2}.gz "$dir"/ctm*/ctm "$dir"/*.vec "$dir/conf.mat" \
     "$dir"/utt_ids.* "$dir"/phones{1,2}.{mid,worst} "$dir"/utt_stats.{phones,words} \
     "$dir/phone_stats.all"
fi

# clean up
exit 0