Blame view

egs/wsj/s5/steps/diagnostic/analyze_lats.sh 3.46 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
  #!/bin/bash
  #
  # Copyright Johns Hopkins University (Author: Daniel Povey) 2016.  Apache 2.0.
  
  # This script does the same type of diagnostics as analyze_alignments.sh, except
  # it starts from lattices (so it has to convert the lattices to alignments
  # first).
  
  # begin configuration section.
  # These defaults are set before parse_options.sh is sourced; the usage text
  # below documents the matching --<name> <value> command-line overrides.
  iter=final   # model iteration; selects <decode-dir>/../<iter>.mdl
  cmd=run.pl   # launcher used to run the sub-processes
  acwt=0.1     # acoustic scale handed to lattice-best-path
  #end configuration section.

  echo "$0 $@"  # Print the command line for logging

  [ -f ./path.sh ] && . ./path.sh
  . parse_options.sh || exit 1;

  # Exactly two positional arguments are required; otherwise print usage.
  # The usage text names the two log files this script actually writes.
  if [ $# -ne 2 ]; then
    echo "Usage: $0 [options] (<lang-dir>|<graph-dir>) <decode-dir>"
    echo " Options:"
    echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
    echo "    --acwt <acoustic-scale>         # Acoustic scale for getting best-path (default: 0.1)"
    echo "    --iter <iter>                   # Model to use is <decode-dir>/../<iter>.mdl (default: final)"
    echo "e.g.:"
    echo "$0 data/lang exp/tri4b/decode_dev"
    echo "This script writes some diagnostics to <decode-dir>/log/analyze_alignments.log"
    echo "and <decode-dir>/log/analyze_lattice_depth_stats.log"
    exit 1;
  fi
  
  # Positional arguments: the lang (or graph) directory, then the decode directory.
  lang=$1
  dir=$2

  # The model is looked up one level above the decode directory.
  model=$dir/../${iter}.mdl

  # Stop early unless every required input is a regular file.
  for required in $lang/words.txt $model $dir/lat.1.gz $dir/num_jobs; do
    [ -f $required ] || { echo "$0: expecting file $required to exist" && exit 1; }
  done

  # Number of parallel jobs, read from the decode directory.
  if ! num_jobs=$(cat $dir/num_jobs); then
    exit 1
  fi

  mkdir -p $dir/log

  # Best-effort removal of phone statistics left over from an earlier run;
  # it is not an error if there are none.
  rm $dir/phone_stats.*.gz 2>/dev/null || true
  
  # For every job, write two gzipped text archives into $dir: depth_tmp.JOB.gz
  # (lattice depth per frame) and ali_tmp.JOB.gz (best-path alignment per frame).
  # The specifier strings are named here only for readability; $cmd receives
  # exactly the same arguments as if they were written inline.
  lat_rspecifier="ark:gunzip -c $dir/lat.JOB.gz|"
  depth_wspecifier="ark,t:|gzip -c > $dir/depth_tmp.JOB.gz"
  ali_wspecifier="ark,t:|gzip -c >$dir/ali_tmp.JOB.gz"

  $cmd JOB=1:$num_jobs $dir/log/lattice_best_path.JOB.log \
    lattice-depth-per-frame "$lat_rspecifier" "$depth_wspecifier" ark:- \| \
    lattice-best-path --acoustic-scale=$acwt ark:- ark:/dev/null "$ali_wspecifier" || exit 1
  
  # For every job, turn the best-path alignment into counts, written to
  # $dir/phone_stats.JOB.gz.  The perl program drops the first field of each
  # line (presumably the utterance id), splits the rest on " ; " and counts each
  # resulting entry under "all", the first entry under "begin" and, when there
  # is more than one entry, the last entry under "end".  Output lines have the
  # form "<count> <begin|end|all> <entry>".
  # The first entry is indexed as $a[0]: the previous $a[$0] used perl's
  # program-name variable as the index, which only worked because "-e"
  # evaluates to 0 in numeric context.
  # Line breaks inside the perl strings are written as \n escapes so that the
  # hash keys do not pick up the indentation of this script.
  # The pipe and redirection are escaped/quoted so that they are passed to $cmd
  # rather than interpreted by this shell.
  $cmd JOB=1:$num_jobs $dir/log/get_lattice_stats.JOB.log \
    ali-to-phones --write-lengths=true "$model" "ark:gunzip -c $dir/ali_tmp.JOB.gz|" ark,t:- \| \
    perl -ne 'chomp;s/^\S+\s*//;@a=split /\s;\s/, $_;$count{"begin ".$a[0]."\n"}++;
    if(@a>1){$count{"end ".$a[-1]."\n"}++;}for($i=0;$i<@a;$i++){$count{"all ".$a[$i]."\n"}++;}
    END{for $k (sort keys %count){print "$count{$k} $k"}}' \| \
    gzip -c '>' $dir/phone_stats.JOB.gz || exit 1
  
  # Feed the per-job phone statistics to the phone-length analysis script.  The
  # glob is quoted so that it reaches $cmd unexpanded; the report is the log.
  phone_length_log=$dir/log/analyze_alignments.log

  $cmd $phone_length_log \
    gunzip -c "$dir/phone_stats.*.gz" \| \
    steps/diagnostic/analyze_phone_length_stats.py $lang || exit 1

  # Echo any warnings from the report, then say where the full report is.
  grep WARNING $phone_length_log
  echo "$0: see stats in $phone_length_log"
  
  
  # note: below, some things that would be interpreted by the shell have to be
  # escaped since it needs to be passed to $cmd.
  # the 'paste' command will paste together the phone-indexes and the depths
  # so that one line will be like utt-id1 phone1 phone2 phone3 .. utt-id1 depth1 depth2 depth3 ...
  # the following command computes counts of pairs (phone, lattice-depth) and outputs lines
  # containing 3 integers representing:
  #   phone lattice_depth, count[phone,lattice_depth]
  # This stage logs to its own get_lattice_depth_stats.JOB.log; it previously
  # reused lattice_best_path.JOB.log and so overwrote the logs of the earlier
  # lattice-best-path stage.
  # The line break in the perl print string is written as a \n escape so that
  # output lines do not pick up the indentation of this script.
  # A failing job now aborts the script, as in the other stages, instead of
  # letting the analysis below run on incomplete statistics.
  $cmd JOB=1:$num_jobs $dir/log/get_lattice_depth_stats.JOB.log \
    ali-to-phones --per-frame=true "$model" "ark:gunzip -c $dir/ali_tmp.JOB.gz|" ark,t:- \| \
    paste /dev/stdin '<(' gunzip -c $dir/depth_tmp.JOB.gz  ')'  \| \
    perl -ane '$half=@F/2;for($i=1;$i<$half;$i++){$j=$i+$half;$count{$F[$i]." ".$F[$j]}++;}
    END{for $k (sort keys %count){print "$k $count{$k}\n"}}' \| \
    gzip -c '>' $dir/depth_stats_tmp.JOB.gz || exit 1
  
  # Feed the per-job (phone, depth) counts to the lattice-depth analysis script.
  # The glob is quoted so that it reaches $cmd unexpanded; the report is the log.
  depth_log=$dir/log/analyze_lattice_depth_stats.log

  $cmd $depth_log \
    gunzip -c "$dir/depth_stats_tmp.*.gz" \| \
    steps/diagnostic/analyze_lattice_depth_stats.py $lang || exit 1

  # Echo the "Overall" lines of the report, then say where the full report is.
  grep Overall $depth_log
  echo "$0: see stats in $depth_log"
  
  
  # Remove the intermediate per-job archives, one family at a time, in the same
  # order as they were listed originally.
  for stem in phone_stats depth_tmp depth_stats_tmp ali_tmp; do
    rm $dir/$stem.*.gz
  done

  exit 0