Blame view
egs/wsj/s5/steps/diagnostic/analyze_lats.sh
3.46 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
#!/bin/bash # # Copyright Johns Hopkins University (Author: Daniel Povey) 2016. Apache 2.0. # This script does the same type of diagnostics as analyze_alignments.sh, except # it starts from lattices (so it has to convert the lattices to alignments # first). # begin configuration section. iter=final cmd=run.pl acwt=0.1 #end configuration section. echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh . parse_options.sh || exit 1; if [ $# -ne 2 ]; then echo "Usage: $0 [options] (<lang-dir>|<graph-dir>) <decode-dir>" echo " Options:" echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." echo " --acwt <acoustic-scale> # Acoustic scale for getting best-path (default: 0.1)" echo "e.g.:" echo "$0 data/lang exp/tri4b/decode_dev" echo "This script writes some diagnostics to <decode-dir>/log/alignments.log" exit 1; fi lang=$1 dir=$2 model=$dir/../${iter}.mdl for f in $lang/words.txt $model $dir/lat.1.gz $dir/num_jobs; do [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; done num_jobs=$(cat $dir/num_jobs) || exit 1 mkdir -p $dir/log rm $dir/phone_stats.*.gz 2>/dev/null || true # this writes two archives of depth_tmp and ali_tmp of (depth per frame, alignment per frame). $cmd JOB=1:$num_jobs $dir/log/lattice_best_path.JOB.log \ lattice-depth-per-frame "ark:gunzip -c $dir/lat.JOB.gz|" "ark,t:|gzip -c > $dir/depth_tmp.JOB.gz" ark:- \| \ lattice-best-path --acoustic-scale=$acwt ark:- ark:/dev/null "ark,t:|gzip -c >$dir/ali_tmp.JOB.gz" || exit 1 $cmd JOB=1:$num_jobs $dir/log/get_lattice_stats.JOB.log \ ali-to-phones --write-lengths=true "$model" "ark:gunzip -c $dir/ali_tmp.JOB.gz|" ark,t:- \| \ perl -ne 'chomp;s/^\S+\s*//;@a=split /\s;\s/, $_;$count{"begin ".$a[$0]." "}++; if(@a>1){$count{"end ".$a[-1]." "}++;}for($i=0;$i<@a;$i++){$count{"all ".$a[$i]." "}++;} END{for $k (sort keys %count){print "$count{$k} $k"}}' \| \ gzip -c '>' $dir/phone_stats.JOB.gz || exit 1 $cmd $dir/log/analyze_alignments.log \ gunzip -c "$dir/phone_stats.*.gz" \| \ steps/diagnostic/analyze_phone_length_stats.py $lang || exit 1 grep WARNING $dir/log/analyze_alignments.log echo "$0: see stats in $dir/log/analyze_alignments.log" # note: below, some things that would be interpreted by the shell have to be # escaped since it needs to be passed to $cmd. # the 'paste' command will paste together the phone-indexes and the depths # so that one line will be like utt-id1 phone1 phone2 phone3 .. utt-id1 depth1 depth2 depth3 ... # the following command computes counts of pairs (phone, lattice-depth) and outputs lines # containing 3 integers representing: # phone lattice_depth, count[phone,lattice_depth] $cmd JOB=1:$num_jobs $dir/log/lattice_best_path.JOB.log \ ali-to-phones --per-frame=true "$model" "ark:gunzip -c $dir/ali_tmp.JOB.gz|" ark,t:- \| \ paste /dev/stdin '<(' gunzip -c $dir/depth_tmp.JOB.gz ')' \| \ perl -ane '$half=@F/2;for($i=1;$i<$half;$i++){$j=$i+$half;$count{$F[$i]." ".$F[$j]}++;} END{for $k (sort keys %count){print "$k $count{$k} "}}' \| \ gzip -c '>' $dir/depth_stats_tmp.JOB.gz $cmd $dir/log/analyze_lattice_depth_stats.log \ gunzip -c "$dir/depth_stats_tmp.*.gz" \| \ steps/diagnostic/analyze_lattice_depth_stats.py $lang || exit 1 grep Overall $dir/log/analyze_lattice_depth_stats.log echo "$0: see stats in $dir/log/analyze_lattice_depth_stats.log" rm $dir/phone_stats.*.gz rm $dir/depth_tmp.*.gz rm $dir/depth_stats_tmp.*.gz rm $dir/ali_tmp.*.gz exit 0 |