Blame view

egs/wsj/s5/steps/combine_ali_dirs.sh 7.7 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
  #!/bin/bash
  # Copyright 2016  Xiaohui Zhang  Apache 2.0.
  # Copyright 2019  SmartAction (kkm)
  
  # This script combines alignment directories, such as exp/tri4a_ali, and
  # validates matching of the utterances and alignments after combining.
  
  # Begin configuration section.
  # Defaults for the command-line options; each may be overridden with
  # --<name> <value> (see the usage text below).
  # cmd: job runner, invoked later as '$cmd JOB=1:N <logfile> <command...>'.
  cmd=run.pl
  # nj: number of archives the combined output is split into; this value is
  # also written to <dest-dir>/num_jobs.
  nj=4
  # combine_lat / combine_ali: whether lat.*.gz / ali.*.gz are merged. Both hold
  # the word 'true' or 'false'; the derived do_lat/do_ali flags are later run
  # as commands.
  combine_lat=true
  combine_ali=true
  # tolerance: percentage of utterances in <data> that may lack an alignment or
  # lattice; reaching this percentage is reported as an error.
  tolerance=10
  # End configuration section.
  echo "$0 $@"  # Print the command line for logging.
  
  # Source the optional local environment setup, then the option parser.
  # NOTE(review): parse_options.sh is not part of this file; presumably it maps
  # '--name value' arguments onto the variables above and removes them from the
  # positional parameters -- confirm against that script.
  [[ -f path.sh ]] && . ./path.sh
  . parse_options.sh || exit 1
  
  # NOTE(review): 'sort -m' and 'join' are applied to scp/utt2spk files further
  # down; presumably the C locale is forced so that they agree on byte-order
  # collation -- confirm.
  export LC_ALL=C
  
  # At least <data>, <dest-dir> and one source directory are required.
  # NOTE(review): the 'e.g.' line in the usage text shows 'exp_tri3_ali_2';
  # presumably 'exp/tri3_ali_2' was meant.
  if [[ $# -lt 3 ]]; then
    cat >&2 <<EOF
  Usage: $0 [options] <data> <dest-dir> <src-dir1> <src-dir2> ...
   e.g.: $0 --nj 32 data/train exp/tri3_ali_combined exp/tri3_ali_1 exp_tri3_ali_2
  Options:
   --nj <nj>              # number of jobs to split combined archives [4]
   --combine_ali false    # merge ali.*.gz if present [true]
   --combine_lat false    # merge lat.*.gz if present [true]
   --tolerance <int,%>    # maximum percentage of missing alignments or lattices
                          # w.r.t. total utterances in <data> before error is
                          # reported [10]
  
  The script checks that certain important files are present and compatible in all
  source directories (phones.txt, tree); other are copied from the first source
  (cmvn_opts, final.mdl) without much checking.
  
  Both --combine_ali and --combine_lat are true by default, but the script
  proceeds with a warning if directories do not contain either alignments or
  alignment lattices. Check for files ali.1.gz and/or lat.1.gz in the <dest-dir>
  after the script completes if additional programmatic check is required.
  EOF
    exit 1;
  fi
  
  # At least one of alignments / lattices must be requested. Both options hold
  # the word 'true' or 'false' and are executed as commands, the same way the
  # do_ali/do_lat flags are tested further down. (A '[[ ! $var ]]' test only
  # checks for an empty string, so it can never fire for the value 'false'.)
  if ! $combine_lat && ! $combine_ali; then
    echo "$0: at least one of --combine_lat and --combine_ali must be true"
    exit 1
  fi
  
  # Positional arguments: <data> <dest-dir> <src-dir1> [<src-dir2> ...].
  # After the shift, "$@" holds only the source directories.
  data=$1
  dest=$2
  shift 2
  first_src=$1
  
  # Working copies of the options; either one is switched off below as soon as
  # a source directory lacks the corresponding archives.
  do_ali=$combine_ali
  do_lat=$combine_lat
  
  # Physical path of the target, empty if it does not exist yet. Resolved once,
  # since it does not change between loop iterations.
  dest_real=$(cd 2>/dev/null -P -- "$dest" && pwd)
  
  # Check if alignments and/or lattices are present. Since we combine both,
  # whichever present, issue a warning only. Also verify that the target is
  # different from any source; we cannot combine in-place, and a lot of damage
  # could result.
  for src in "$@"; do
    src_real=$(cd 2>/dev/null -P -- "$src" && pwd)
    # An unresolvable source would otherwise compare equal to a not yet
    # existing target (both empty) and be misreported as "same as target".
    if [[ -z $src_real ]]; then
      echo "$0: error: Source $src is not an accessible directory."
      exit 1
    fi
    if [[ $src_real = "$dest_real" ]]; then
      echo "$0: error: Source $src is same as target $dest."
      exit 1
    fi
    if $do_ali && [[ ! -f $src/ali.1.gz ]]; then
      echo "$0: warning: Alignments (ali.*.gz) are not present in $src, not" \
           "combining. Consider '--combine_ali false' to suppress this warning."
      do_ali=false
    fi
    if $do_lat && [[ ! -f $src/lat.1.gz ]]; then
      echo "$0: warning: Alignment lattices (lat.*.gz) are not present in $src,"\
        "not combining. Consider '--combine_lat false' to suppress this warning."
      do_lat=false
    fi
  done
  
  # Nothing left to combine: neither kind of archive is present in all sources.
  if ! $do_ali && ! $do_lat; then
    echo "$0: error: Cannot combine directories."
    exit 1
  fi
  
  # Verify that required files are present in the first directory. '[[ ]]' is
  # used because it does not word-split the path; with '[ ]' a path containing
  # whitespace makes the test itself fail, which silently skips the check.
  for f in cmvn_opts final.mdl num_jobs phones.txt tree; do
    if [[ ! -f $first_src/$f ]]; then
      echo "$0: error: Required source file $first_src/$f is missing."
      exit 1
    fi
  done
  
  # Verify that phones and trees are compatible in all directories, and that
  # num_jobs files are present, too. The first source is the reference and is
  # skipped; the comparison is literal (quoted right-hand side), not a glob
  # pattern match.
  for src in "$@"; do
    [[ $src == "$first_src" ]] && continue
    if [[ ! -f $src/num_jobs ]]; then
      echo "$0: error: Required source file $src/num_jobs is missing."
      exit 1
    fi
    # 'cmp -s' also fails when either file cannot be read.
    if ! cmp -s "$first_src/tree" "$src/tree"; then
      echo "$0: error: tree $src/tree is either missing or not the" \
           "same as $first_src/tree."
      exit 1
    fi
    if [[ ! -f $src/phones.txt ]]; then
      echo "$0: error: Required source file $src/phones.txt is missing."
      exit 1
    fi
    utils/lang/check_phones_compatible.sh "$first_src/phones.txt" \
                                          "$src/phones.txt" || exit 1
  done
  
  # All checks passed, ok to prepare directory. Copy model and other files from
  # the first source, as they either checked to be compatible, or we do not care
  # if they are. All paths are quoted so that directory names containing
  # whitespace or glob characters are handled as single arguments; the brace
  # lists and the 'ali.*' / 'lat.*' globs stay outside the quotes so that they
  # still expand.
  mkdir -p -- "$dest" || exit 1
  # Remove leftovers of a previous run. Archives are removed only for the
  # kinds that are about to be regenerated.
  rm -f -- "$dest"/{cmvn_opts,final.mdl,num_jobs,phones.txt,tree}
  if $do_ali; then
    rm -f -- "$dest"/ali.*.{gz,scp}
  fi
  if $do_lat; then
    rm -f -- "$dest"/lat.*.{gz,scp}
  fi
  cp -- "$first_src"/{cmvn_opts,final.mdl,phones.txt,tree} "$dest/" || exit 1
  # Optional file: failure to copy is deliberately ignored and kept silent.
  cp -- "$first_src/frame_subsampling_factor" "$dest/" 2>/dev/null  # If present.
  # The combined archives are split into $nj pieces, not the sources' count.
  echo "$nj" > "$dest/num_jobs" || exit 1
  
  # Make temporary directory, delete on signal, but not on 'exit 1'.
  temp_dir=$(mktemp -d "$dest/temp.XXXXXX") || exit 1
  # Removes the temporary directory; also called explicitly on success.
  cleanup() { rm -rf -- "$temp_dir"; }
  # A handler that merely returns would let the script carry on after the
  # signal with its working directory gone, so terminate explicitly.
  trap 'cleanup; exit 1' HUP INT TERM
  echo "$0: note: Temporary directory $temp_dir will not be deleted in case of" \
       "script failure, so you could examine it for troubleshooting."
  
  
  # Combine one kind of archive (alignments or lattices) from all source
  # directories into $nj archives in $dest. It may be called twice:
  #   do_combine ali alignments copy-int-vector "$@"
  #   do_combine lat lattices   lattice-copy    "$@"
  # Arguments:
  #   $1    - archive name prefix ('ali' or 'lat'), as in ali.1.gz
  #   $2    - plural noun for log messages and log file names
  #   $3    - program used to copy the corresponding objects
  #   $4... - source directories, each containing a num_jobs file
  # Globals read:     cmd, nj, data, dest, temp_dir, tolerance
  # Globals written:  src_arks_1, src_arks_2, ... (exported, one per source)
  # Returns 0 on success; calls 'exit 1' on any failure, so callers need not
  # check the return code.
  do_combine() {
    local ark=$1 entities=$2 copy_program=$3
    shift 3
    local src nj_src job arks src_id=0
    local n_utt n_ali n_ali_no_utt n_utt_no_ali n_utt_no_ali_pct
  
    echo "$0: Gathering $entities from each source directory."
    # Assign all source gzipped archive names to an exported variable, one each
    # per source directory, so that we can copy archives in a job per source.
    for src in "$@"; do
      src_id=$((src_id + 1))
      nj_src=$(cat "$src/num_jobs") || exit 1
      # Build the list of archives of this source, e. g.:
      #   src_arks_1="exp/tri3_ali/ali.1.gz exp/tri3_ali/ali.2.gz ..."
      # The list is assembled by concatenation, so the source path is never
      # interpreted as a printf format string.
      arks=
      for job in $(seq "$nj_src"); do
        arks+="$src/$ark.$job.gz "
      done
      if [[ -z $arks ]]; then
        echo "$0: error: Invalid number of jobs '$nj_src' in $src/num_jobs."
        exit 1
      fi
      # The variable is exported so that the job runner's child can expand
      # \${src_arks_JOB}.
      printf -v "src_arks_${src_id}" '%s' "$arks"
      export "src_arks_${src_id}"
    done
  
    # Gather archives in parallel jobs, one job per source directory. $cmd is
    # left unquoted on purpose: presumably it may carry runner options.
    $cmd JOB=1:$src_id "$dest/log/gather_$entities.JOB.log" \
      "$copy_program" \
        "ark:gunzip -c \${src_arks_JOB} |" \
        "ark,scp:$temp_dir/$ark.JOB.ark,$temp_dir/$ark.JOB.scp" || exit 1
  
    # Merge (presumed already sorted) scp's into a single script.
    sort -m "$temp_dir/$ark".*.scp > "$temp_dir/$ark.scp" || exit 1
  
    echo "$0: Splitting combined $entities into $nj archives on speaker boundary."
    $cmd JOB=1:$nj "$dest/log/chop_combined_$entities.JOB.log" \
      "$copy_program" \
        "scp:utils/split_scp.pl --utt2spk=$data/utt2spk --one-based -j $nj JOB $temp_dir/$ark.scp |" \
        "ark:| gzip -c > $dest/$ark.JOB.gz" || exit 1
  
    # Get some interesting stats, and signal an error if error threshold exceeded.
    n_utt=$(wc -l < "$data/utt2spk") || exit 1
    n_ali=$(wc -l < "$temp_dir/$ark.scp") || exit 1
    n_ali_no_utt=$(join -j1 -v2 "$data/utt2spk" "$temp_dir/$ark.scp" | wc -l)
    n_utt_no_ali=$(join -j1 -v1 "$data/utt2spk" "$temp_dir/$ark.scp" | wc -l)
    # Without utterances the percentage is undefined; report it instead of
    # dividing by zero and then passing the tolerance check by accident.
    if (( n_utt == 0 )); then
      echo "$0: error: No utterances found in $data/utt2spk."
      exit 1
    fi
    # Percentage rounded half up, i. e. int(missing / total * 100 + .5), in
    # exact integer arithmetic.
    n_utt_no_ali_pct=$(( (200 * n_utt_no_ali + n_utt) / (2 * n_utt) ))
    echo "$0: Combined $n_ali $entities for $n_utt utterances." \
         "There were $n_utt_no_ali utterances (${n_utt_no_ali_pct}%) without" \
         "$entities, and $n_ali_no_utt $entities not matching any utterance."
  
    if (( n_utt_no_ali_pct >= tolerance )); then
      echo "$0: error: Percentage of utterances missing $entities," \
           "${n_utt_no_ali_pct}%, is at or above error tolerance ${tolerance}%."
      exit 1
    fi
  
    return 0
  }
  
  # Run the combining step for each kind of archive that is still enabled.
  # The return code is not inspected: do_combine terminates the script with
  # 'exit 1' itself when anything goes wrong.
  if $do_ali; then
    do_combine ali 'alignments' copy-int-vector "$@"
  fi
  if $do_lat; then
    do_combine lat 'lattices' lattice-copy "$@"
  fi
  
  # Getting here means success, so the temporary directory can go.
  cleanup
  
  # Describe what was stored; note the trailing space of each variant.
  if $do_ali && $do_lat; then
    what='alignments and lattices '
  elif $do_ali; then
    what='alignments '
  elif $do_lat; then
    what='lattices '
  else
    what=
  fi
  # No period at the end of the message: it interferes with copy/paste of the
  # directory name from a tty emulator.
  printf '%s\n' "$0: Stored combined ${what}in $dest"
  exit 0