#!/bin/bash

# Copyright 2012-2015  Johns Hopkins University (Author: Daniel Povey).
# Apache 2.0.

# This script obtains phone posteriors from a trained chain model, using either
# the xent output or the forward-backward posteriors from the denominator fst.
# The phone posteriors will be in matrices where the column index can be
# interpreted as phone-index - 1.

# You may want to mess with the compression options.  Be careful: with the current
# settings, you might sometimes get exact zeros as the posterior values.

# CAUTION!  This script isn't very suitable for dumping features from recurrent
# architectures such as LSTMs, because it doesn't support setting the chunk size
# and left and right context.  (Those would have to be passed into nnet3-compute
# or nnet3-chain-compute-post).

# Begin configuration section.
stage=0
nj=1       # Number of jobs to run.
cmd=run.pl
remove_word_position_dependency=false
use_xent_output=false
online_ivector_dir=
use_gpu=false
count_smoothing=1.0  # this should be some small number, I don't think it's critical;
                     # it will mainly affect the probability we assign to phones that
                     # were never seen in training. note: this is added to the raw
                     # transition-id occupation counts, so 1.0 means, add a single
                     # frame's count to each transition-id's counts.
# End configuration section.

set -e -u

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# != 5 ]; then
  echo "Usage: $0 <chain-tree-dir> <chain-model-dir> <lang-dir> <data-dir> <phone-post-dir>"
  echo " e.g.: $0 --remove-word-position-dependency true --online-ivector-dir exp/nnet3/ivectors_test_eval92_hires \\"
  echo "    exp/chain/tree_a_sp exp/chain/tdnn1a_sp data/lang data/test_eval92_hires exp/chain/tdnn1a_sp_post_eval92"
  echo " ... you'll normally want to set the --nj and --cmd options as well."
  echo ""
  echo "Main options (for others, see top of script file)"
  echo "  --cmd (run.pl|queue.pl|... <queue opts>)  # how to run jobs."
  echo "  --config <config-file>     # config containing options"
  echo "  --stage <stage>            # stage to do partial re-run from."
  echo "  --nj <N>                   # Number of parallel jobs to run, default:1"
  echo "  --remove-word-position-dependency <bool>  # If true, remove word-position-dependency"
  echo "                             # info when dumping posteriors (default: false)"
  echo "  --use-xent-output <bool>   # If true, use the cross-entropy output of the"
  echo "                             # neural network when dumping posteriors"
  echo "                             # (default: false, will use chain denominator FST)"
  echo "  --online-ivector-dir <dir> # Directory where we dumped online-computed"
  echo "                             # ivectors corresponding to the data in <data>"
  echo "  --use-gpu <bool>           # Set to true to use GPUs (not recommended as the"
  echo "                             # binary is very poorly optimized for GPU use)."
  exit 1;
fi

tree_dir=$1
model_dir=$2
lang=$3
data=$4
dir=$5

# All of these inputs are required before we can do anything.
for f in $tree_dir/tree $tree_dir/final.mdl $tree_dir/ali.1.gz $tree_dir/num_jobs \
         $model_dir/final.mdl $model_dir/frame_subsampling_factor $model_dir/den.fst \
         $data/feats.scp $lang/phones.txt; do
  # fixed: the error message previously said "train_sat.sh" (copy-paste from
  # another script); use $0 so the failing script is identifiable.
  [ ! -f $f ] && echo "$0: no such file $f" && exit 1;
done

sdata=$data/split${nj}utt
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || \
  split_data.sh --per-utt $data $nj || exit 1;

use_ivector=false

cmvn_opts=$(cat $model_dir/cmvn_opts)
feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"

if [ ! -z "$online_ivector_dir" ]; then
  steps/nnet2/check_ivectors_compatible.sh $model_dir $online_ivector_dir || exit 1;
  ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1;
  ivector_feats="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $online_ivector_dir/ivector_online.scp |"
  ivector_opts="--online-ivector-period=$ivector_period --online-ivectors='$ivector_feats'"
else
  ivector_opts=
fi

if $use_gpu; then
  # fixed: this was 'gpu_queue_opt=' (no 's'), but the commands below expand
  # $gpu_queue_opts, so the '--gpu 1' queue option was silently dropped (and,
  # under 'set -u', expansion of the never-set variable would abort the script).
  gpu_queue_opts="--gpu 1"
  gpu_opt="--use-gpu=yes"
  if ! cuda-compiled; then
    echo "$0: WARNING: you are running with one thread but you have not compiled"
    echo "   for CUDA.  You may be running a setup optimized for GPUs.  If you have"
    echo "   GPUs and have nvcc installed, go to src/ and do ./configure; make"
    exit 1
  fi
else
  gpu_queue_opts=
  gpu_opt="--use-gpu=no"
fi

frame_subsampling_factor=$(cat $model_dir/frame_subsampling_factor)

mkdir -p $dir/log
cp $model_dir/frame_subsampling_factor $dir/

if [ $stage -le 0 ]; then
  if [ ! -f $dir/tacc ] || [ $dir/tacc -ot $tree_dir/ali.1.gz ]; then
    echo "$0: obtaining transition-id counts in $dir/tacc"
    # Obtain counts for each transition-id, from the alignments.
    this_nj=$(cat $tree_dir/num_jobs)

    $cmd JOB=1:$this_nj $dir/log/acc_taccs.JOB.log \
      ali-to-post "ark:gunzip -c $tree_dir/ali.JOB.gz|" ark:- \| \
      post-to-tacc $tree_dir/final.mdl ark:- $dir/tacc.JOB

    input_taccs=$(for n in $(seq $this_nj); do echo $dir/tacc.$n; done)
    $cmd $dir/log/sum_taccs.log \
      vector-sum --binary=false $input_taccs $dir/tacc
    rm $dir/tacc.*
  else
    echo "$0: skipping creation of $dir/tacc since it already exists."
  fi
fi

if [ $stage -le 1 ] && $remove_word_position_dependency; then
  echo "$0: creating $dir/phone_map.int"
  utils/lang/get_word_position_phone_map.pl $lang $dir
else
  # Either way, $dir/phones.txt will be a symbol table for the phones that
  # we are dumping (although the matrices we dump won't contain anything
  # for symbol 0 which is <eps>).
  grep -v '^#' $lang/phones.txt > $dir/phones.txt
fi

if [ $stage -le 1 ]; then
  # we want the phones in integer form as it's safer for processing by script.
  # $data/fake_phones.txt will just contain e.g. "0 0 1 1 ....", it's used
  # to force show-transitions to print the phones as integers.
  awk '{print $2,$2}' <$lang/phones.txt >$dir/fake_phones.txt

  # The format of the 'show-transitions' command below is like the following:
  #show-transitions tempdir/phone_map.int exp/chain/tree_a_sp/final.mdl
  #Transition-state 1: phone = 1 hmm-state = 0 forward-pdf = 0 self-loop-pdf = 51
  # Transition-id = 1 p = 0.5 [self-loop]
  # Transition-id = 2 p = 0.5 [0 -> 1]
  #Transition-state 2: phone = 10 hmm-state = 0 forward-pdf = 0 self-loop-pdf = 51
  # Transition-id = 3 p = 0.5 [self-loop]
  # Transition-id = 4 p = 0.5 [0 -> 1]
  # The following inline script processes that info about the transition model
  # into the file $dir/phones_and_pdfs.txt, which has a line for each transition-id
  # (starting from number 1), and the format of each line is
  # <phone-id> <pdf-id>
  # The first regex matches older models with a single 'pdf' per transition-state;
  # the second matches newer 'chain' models with separate forward/self-loop pdfs.
  # note: the prints below must end in "\n" (one line per transition-id), or the
  # 'wc -l' consistency check further down would fail.
  show-transitions $dir/fake_phones.txt $tree_dir/final.mdl | \
    perl -ane ' if (m/Transition-state.* phone = (\d+) pdf = (\d+)/) {
        $phone = $1; $forward_pdf = $2; $self_loop_pdf = $2;
      }
      if (m/Transition-state.* phone = (\d+) .* forward-pdf = (\d+) self-loop-pdf = (\d+)/) {
        $phone = $1; $forward_pdf = $2; $self_loop_pdf = $3;
      }
      if (m/Transition-id/) {
        if (m/self-loop/) { print "$phone $self_loop_pdf\n"; }
        else { print "$phone $forward_pdf\n"; }
      } ' > $dir/phones_and_pdfs.txt

  # The following command just separates the 'tacc' file into a similar format
  # to $dir/phones_and_pdfs.txt, with one count per line, and a line per transition-id
  # starting from number 1.  We skip the first two fields which are "[ 0" (the 0 is
  # for transition-id=0, since transition-ids are 1-based), and the last field which is "]".
  awk '{ for (n=3;n<NF;n++) print $n; }' <$dir/tacc >$dir/transition_counts.txt

  # Sanity check: both files must have exactly one line per transition-id.
  num_lines1=$(wc -l <$dir/phones_and_pdfs.txt)
  num_lines2=$(wc -l <$dir/transition_counts.txt)
  if [ $num_lines1 -ne $num_lines2 ]; then
    echo "$0: mismatch in num-lines between phones_and_pdfs.txt and transition_counts.txt: $num_lines1 vs $num_lines2"
    exit 1
  fi

  # after 'paste', the format of the data will be
  # <phone-id> <pdf-id> <data-count>
  # we add the count smoothing at this point.
  paste $dir/phones_and_pdfs.txt $dir/transition_counts.txt | \
    awk -v s=$count_smoothing '{print $1, $2, (s+$3);}' > $dir/combined_info.txt

  if $remove_word_position_dependency; then
    # map the phones to word-position-independent phones; you can see $dir/phones.txt
    # to interpret the final output.
    utils/apply_map.pl -f 1 $dir/phone_map.int <$dir/combined_info.txt > $dir/temp.txt
    mv $dir/temp.txt $dir/combined_info.txt
  fi

  # Build $dir/transform.mat: a (num-pdfs x num-phones) matrix (in text form,
  # transposed relative to the loops below) whose entry is
  # count(pdf,phone) / count(pdf), i.e. p(phone | pdf).  Multiplying a row of
  # pdf posteriors by this matrix yields phone posteriors.
  awk 'BEGIN{num_phones=1;num_pdfs=1;} { phone=$1; pdf=$2; count=$3;
       pdf_count[pdf] += count; counts[pdf,phone] += count;
       if (phone>num_phones) num_phones=phone;
       if (pdf>=num_pdfs) num_pdfs = pdf + 1; }
       END{ print "[ ";
         for (phone=1;phone<=num_phones;phone++) {
           for (pdf=0;pdf<num_pdfs;pdf++)
             printf("%.3f ", counts[pdf,phone]/pdf_count[pdf]);
           print "";
         }
         print "]"; }' <$dir/combined_info.txt >$dir/transform.mat
fi

if [ $stage -le 2 ]; then
  # note: --compression-method=3 is kTwoByteAuto: Each element is stored in two
  # bytes as a uint16, with the representable range of values chosen
  # automatically with the minimum and maximum elements of the matrix as its
  # edges.
  compress_opts="--compress=true --compression-method=3"

  if $use_xent_output; then
    # This block uses the 'output-xent' output of the nnet: rename it to
    # 'output' on the fly so nnet3-compute evaluates it, then map the pdf
    # posteriors to phone posteriors with transform.mat.
    model="nnet3-copy '--edits-config=echo remove-output-nodes name=output; echo rename-node old-name=output-xent new-name=output|' $model_dir/final.mdl -|"

    $cmd $gpu_queue_opts JOB=1:$nj $dir/log/get_phone_post.JOB.log \
      nnet3-compute $gpu_opt $ivector_opts \
      --frame-subsampling-factor=$frame_subsampling_factor --apply-exp=true \
      "$model" "$feats" ark:- \| \
      transform-feats $dir/transform.mat ark:- ark:- \| \
      copy-feats $compress_opts ark:- ark,scp:$dir/phone_post.JOB.ark,$dir/phone_post.JOB.scp
  else
    # This block is when we are using the 'chain' output (recommended as the posteriors
    # will be much more accurate).
    $cmd $gpu_queue_opts JOB=1:$nj $dir/log/get_phone_post.JOB.log \
      nnet3-chain-compute-post $gpu_opt $ivector_opts --transform-mat=$dir/transform.mat \
      --frame-subsampling-factor=$frame_subsampling_factor \
      $model_dir/final.mdl $model_dir/den.fst "$feats" ark:- \| \
      copy-feats $compress_opts ark:- ark,scp:$dir/phone_post.JOB.ark,$dir/phone_post.JOB.scp
  fi

  sleep 5
  # Make a single .scp file, for convenience.
  for n in $(seq $nj); do cat $dir/phone_post.$n.scp; done > $dir/phone_post.scp
fi