Blame view
egs/wsj/s5/steps/combine_ali_dirs.sh
7.7 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
#!/bin/bash
# Copyright 2016  Xiaohui Zhang  Apache 2.0.
# Copyright 2019  SmartAction (kkm)

# This script combines alignment directories, such as exp/tri4a_ali, and
# validates matching of the utterances and alignments after combining.
#
# Positional arguments: <data> <dest-dir> <src-dir1> <src-dir2> ...
# The variables in the configuration section below can be overridden from the
# command line as --<name> <value>; they are processed by parse_options.sh.

# Begin configuration section.
cmd=run.pl
nj=4
combine_lat=true
combine_ali=true
tolerance=10
# End configuration section.
echo "$0 $@"  # Print the command line for logging.

[[ -f path.sh ]] && . ./path.sh
. parse_options.sh || exit 1

# Byte-wise collation, so that 'sort' and 'join' used later agree on ordering.
export LC_ALL=C

if [[ $# -lt 3 ]]; then
  cat >&2 <<EOF
Usage: $0 [options] <data> <dest-dir> <src-dir1> <src-dir2> ...
 e.g.: $0 --nj 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp/tri3_ali_2
Options:
 --nj <nj>             # number of jobs to split combined archives [4]
 --combine_ali false   # merge ali.*.gz if present [true]
 --combine_lat false   # merge lat.*.gz if present [true]
 --tolerance <int,%>   # maximum percentage of missing alignments or lattices
                       # w.r.t. total utterances in <data> before error is
                       # reported [10]

The script checks that certain important files are present and compatible in all
source directories (phones.txt, tree); other are copied from the first source
(cmvn_opts, final.mdl) without much checking.

Both --combine_ali and --combine_lat are true by default, but the script
proceeds with a warning if directories do not contain either alignments or
alignment lattices. Check for files ali.1.gz and/or lat.1.gz in the <dest-dir>
after the script completes if additional programmatic check is required.
EOF
  exit 1
fi

# The options hold the words 'true'/'false', which are both non-empty strings,
# so they must be compared by value: '[[ ! $var ]]' only tests for emptiness
# and would never fire here.
if [[ $combine_lat != true && $combine_ali != true ]]; then
  echo "$0: at least one of --combine_lat and --combine_ali must be true"
  exit 1
fi

data=$1
dest=$2
shift 2
# After the shift, the positional parameters are the source directories only.
first_src=$1

# do_ali/do_lat start as requested by the options and are later used as
# commands ('true'/'false') to decide what actually gets combined.
do_ali=$combine_ali
do_lat=$combine_lat

# Check if alignments and/or lattices are present. Since we combine both,
# whichever present, issue a warning only. Also verify that the target is
# different from any source; we cannot combine in-place, and a lot of damage
# could result.
for src in "$@"; do
  # Check existence first: if neither the source nor the target exists, both
  # 'cd ... && pwd' substitutions below yield an empty string and would be
  # misreported as "source is same as target".
  if [[ ! -d $src ]]; then
    echo "$0: error: Source directory $src does not exist."
    exit 1
  fi
  # Compare physical (symlink-resolved) paths of the source and the target.
  if [[ "$(cd 2>/dev/null -P -- "$src" && pwd)" = \
        "$(cd 2>/dev/null -P -- "$dest" && pwd)" ]]; then
    echo "$0: error: Source $src is same as target $dest."
    exit 1
  fi
  # A single source lacking ali.1.gz / lat.1.gz turns the corresponding
  # combining off for the whole run.
  if $do_ali && [[ ! -f $src/ali.1.gz ]]; then
    echo "$0: warning: Alignments (ali.*.gz) are not present in $src, not" \
         "combining. Consider '--combine_ali false' to suppress this warning."
    do_ali=false
  fi
  if $do_lat && [[ ! -f $src/lat.1.gz ]]; then
    echo "$0: warning: Alignment lattices (lat.*.gz) are not present in $src," \
         "not combining. Consider '--combine_lat false' to suppress this warning."
    do_lat=false
  fi
done

if ! $do_ali && ! $do_lat; then
  echo "$0: error: Cannot combine directories."
  exit 1
fi

# Verify that required files are present in the first directory.
for f in cmvn_opts final.mdl num_jobs phones.txt tree; do
  if [[ ! -f $first_src/$f ]]; then
    echo "$0: error: Required source file $first_src/$f is missing."
    exit 1
  fi
done

# Verify that phones and trees are compatible in all directories, and that
# num_jobs files are present, too.
for src in "$@"; do
  # The right-hand side is quoted so that it is compared literally instead of
  # being interpreted as a glob pattern.
  if [[ $src != "$first_src" ]]; then
    if [[ ! -f $src/num_jobs ]]; then
      echo "$0: error: Required source file $src/num_jobs is missing."
      exit 1
    fi
    if ! cmp -s "$first_src/tree" "$src/tree"; then
      echo "$0: error: tree $src/tree is either missing or not the" \
           "same as $first_src/tree."
      exit 1
    fi
    if [[ ! -f $src/phones.txt ]]; then
      echo "$0: error: Required source file $src/phones.txt is missing."
      exit 1
    fi
    utils/lang/check_phones_compatible.sh "$first_src/phones.txt" \
                                          "$src/phones.txt" || exit 1
  fi
done

# All checks passed, ok to prepare directory. Copy model and other files from
# the first source, as they either checked to be compatible, or we do not care
# if they are.
mkdir -p "$dest" || exit 1
# Remove leftovers of a previous run before copying fresh files in.
rm -f "$dest"/{cmvn_opts,final.mdl,num_jobs,phones.txt,tree}
$do_ali && rm -f "$dest"/ali.*.{gz,scp}
$do_lat && rm -f "$dest"/lat.*.{gz,scp}
cp "$first_src"/{cmvn_opts,final.mdl,phones.txt,tree} "$dest"/ || exit 1
cp "$first_src/frame_subsampling_factor" "$dest"/ 2>/dev/null  # If present.
echo "$nj" > "$dest/num_jobs" || exit 1

# Make temporary directory, delete on signal, but not on 'exit 1'.
temp_dir=$(mktemp -d "$dest/temp.XXXXXX") || exit 1
cleanup() { rm -rf "$temp_dir"; }
# Exit after cleaning up: without the explicit exit the script would resume
# after the signal with its temporary directory gone.
trap 'cleanup; exit 1' HUP INT TERM
echo "$0: note: Temporary directory $temp_dir will not be deleted in case of" \
     "script failure, so you could examine it for troubleshooting."

# This function may be called twice, once to combine alignments and the second
# time to combine lattices. The two invocations are as follows:
#   do_combine ali alignments copy-int-vector $@
#   do_combine lat lattices   lattice-copy    $@
# where 'ali'/'lat' is a prefix to archive name, 'alignments'/'lattices' go into
# log messages and logfile names, and 'copy-int-vector'/'lattice-copy' is the
# program used to copy corresponding objects.
#
# Globals read: cmd, nj, data, dest, temp_dir, tolerance.
# Globals written: exported variables src_arks_<N>, one per source directory.
# Returns 0 on success; calls 'exit 1' on any failure.
do_combine() {
  local ark=$1 entities=$2 copy_program=$3
  shift 3

  local src nj_src src_id=0
  local n_utt n_ali n_ali_no_utt n_utt_no_ali n_utt_no_ali_pct

  echo "$0: Gathering $entities from each source directory."
  # Assign all source gzipped archive names to an exported variable, one each
  # per source directory, so that we can copy archives in a job per source.
  for src in "$@"; do
    src_id=$((src_id + 1))
    nj_src=$(cat "$src/num_jobs") || exit 1
    # Create and export variable src_arks_${src_id} for the job runner.
    # Each numbered variable will contain the list of archives, e. g.:
    #   src_arks_1="exp/tri3_ali/ali.1.gz exp/tri3_ali/ali.2.gz ..."
    # ('printf' repeats its format as long as there are more arguments).
    # $(seq ...) is intentionally unquoted: each number is a separate argument.
    printf -v "src_arks_${src_id}" "$src/$ark.%d.gz " $(seq $nj_src)
    export "src_arks_${src_id}"
  done

  # Gather archives in parallel jobs. $cmd is intentionally unquoted, as it
  # may carry options for the job runner. The escaped \${src_arks_JOB} is
  # left for the job's shell to expand after JOB has been substituted.
  $cmd JOB=1:$src_id "$dest/log/gather_$entities.JOB.log" \
    "$copy_program" \
      "ark:gunzip -c \${src_arks_JOB} |" \
      "ark,scp:$temp_dir/$ark.JOB.ark,$temp_dir/$ark.JOB.scp" || exit 1

  # Merge (presumed already sorted) scp's into a single script. The glob
  # needs two dots, so it does not match the output file $ark.scp itself.
  sort -m "$temp_dir/$ark".*.scp > "$temp_dir/$ark.scp" || exit 1

  echo "$0: Splitting combined $entities into $nj archives on speaker boundary."
  $cmd JOB=1:$nj "$dest/log/chop_combined_$entities.JOB.log" \
    "$copy_program" \
      "scp:utils/split_scp.pl --utt2spk=$data/utt2spk --one-based -j $nj JOB $temp_dir/$ark.scp |" \
      "ark:| gzip -c > $dest/$ark.JOB.gz" || exit 1

  # Get some interesting stats, and signal an error if error threshold exceeded.
  n_utt=$(wc -l <"$data/utt2spk") || exit 1
  # Guard the percentage computation below against division by zero.
  if (( n_utt == 0 )); then
    echo "$0: error: $data/utt2spk is empty."
    exit 1
  fi
  n_ali=$(wc -l <"$temp_dir/$ark.scp")
  # -v2: scp entries with no utterance; -v1: utterances with no scp entry.
  n_ali_no_utt=$(join -j1 -v2 "$data/utt2spk" "$temp_dir/$ark.scp" | wc -l)
  n_utt_no_ali=$(join -j1 -v1 "$data/utt2spk" "$temp_dir/$ark.scp" | wc -l)
  # Percentage rounded to the nearest integer.
  n_utt_no_ali_pct=$(perl -e "print int($n_utt_no_ali/$n_utt * 100 + .5);")
  echo "$0: Combined $n_ali $entities for $n_utt utterances." \
       "There were $n_utt_no_ali utterances (${n_utt_no_ali_pct}%) without" \
       "$entities, and $n_ali_no_utt $entities not matching any utterance."

  if (( n_utt_no_ali_pct >= tolerance )); then
    echo "$0: error: Percentage of utterances missing $entities," \
         "${n_utt_no_ali_pct}%, is at or above error tolerance ${tolerance}%."
    exit 1
  fi

  return 0
}

# Do the actual combining. Do not check returned exit code, as
# the function always calls 'exit 1' on failure.
$do_ali && do_combine ali 'alignments' copy-int-vector "$@"
$do_lat && do_combine lat 'lattices'   lattice-copy    "$@"

# Delete the temporary directory on success.
cleanup

what=
$do_ali && what+='alignments '
$do_ali && $do_lat && what+='and '
$do_lat && what+='lattices '
echo "$0: Stored combined ${what}in $dest"  # No period, interferes with
                                            # copy/paste from tty emulator.
exit 0