Blame view

egs/wsj/s5/steps/subset_ali_dir.sh 1.68 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
  #!/bin/bash
  
  # Copyright 2017  Vimal Manohar
  # Apache 2.0.
  
  # Default job launcher. It is assigned before option parsing so that the
  # --cmd option advertised in the usage text can replace it (presumably
  # parse_options.sh overrides pre-set variables; confirm against that file).
  cmd=run.pl

  # path.sh is optional: it is sourced only when present in the working
  # directory.
  if [ -f ./path.sh ]; then
    . ./path.sh
  fi

  # Stop right away when the option parser cannot be loaded or reports a
  # failure, rather than continuing with unparsed command-line options.
  . ./utils/parse_options.sh || exit 1
  
  # The script needs exactly four positional arguments here; with any other
  # count it prints the usage text below to stdout and exits with status 1.
  if [ $# -ne 4 ]; then
    cat <<EOF
    This script creates an alignment directory containing a subset of 
    utterances contained in <subset-data-dir> from the 
    original alignment directory containing alignments for utterances in
    <full-data-dir>.
  
    The number of split jobs in the output alignment directory is 
    equal to the number of jobs in the original alignment directory, 
    unless the subset data directory has too few speakers.
  
    Usage: $0 [options] <full-data-dir> <subset-data-dir> <ali-dir> <subset-ali-dir>
     e.g.: $0 data/train_sp data/train exp/tri3_ali_sp exp/tri3_ali
  
    Options: 
        --cmd (utils/run.pl|utils/queue.pl <queue opts>)  # how to run jobs.
  EOF
    exit 1
  fi
  
  # Positional arguments, in the order listed in the usage text.
  data=$1         # <full-data-dir>
  subset_data=$2  # <subset-data-dir>
  ali_dir=$3      # <ali-dir>: existing alignment directory (input)
  dir=$4          # <subset-ali-dir>: alignment directory to create (output)

  # num_jobs in the source directory gives the range of ali.JOB.gz files that
  # are read later, so it must be readable and hold a positive integer.
  nj=$(cat "$ali_dir/num_jobs") || exit 1
  case $nj in
    '' | *[!0-9]* | 0)
      echo "$0: $ali_dir/num_jobs must hold a positive integer, got '$nj'" >&2
      exit 1
      ;;
  esac

  # NOTE(review): no later step in this script reads the split of the full
  # data directory. The call is kept, with its status unchecked as before,
  # in case callers rely on its side effects.
  utils/split_data.sh "$data" "$nj"

  mkdir -p "$dir" || exit 1

  # Best effort: copy whichever of final.mdl, *.mat, *_opts and tree exist.
  # Patterns that match nothing make cp complain and return non-zero, which
  # is tolerated on purpose.
  cp "$ali_dir"/{final.mdl,*.mat,*_opts,tree} "$dir/" || true
  # The phones directory is optional too; its absence is silenced.
  cp -r "$ali_dir/phones" "$dir" 2>/dev/null || true
  
  # Decompress every ali.JOB.gz of the source directory and rewrite it as a
  # temporary archive plus an scp index inside $dir, one pair per job.
  # $cmd is left unquoted on purpose: per the usage text it may carry the
  # launcher together with its options (e.g. "queue.pl <queue opts>").
  $cmd JOB=1:"$nj" "$dir/log/copy_alignments.JOB.log" \
    copy-int-vector "ark:gunzip -c $ali_dir/ali.JOB.gz |" \
    "ark,scp:$dir/ali_tmp.JOB.ark,$dir/ali_tmp.JOB.scp" || exit 1

  # Merge the per-job indexes into a single scp, in job order. A missing
  # per-job index aborts the script instead of silently producing a
  # truncated combined index.
  for ((n = 1; n <= nj; n++)); do
    cat "$dir/ali_tmp.$n.scp" || exit 1
  done > "$dir/ali_tmp.scp"
  
  # The subset may not use more jobs than spk2utt has lines (the usage text
  # describes this as the subset having "too few speakers").
  if [ ! -f "$subset_data/spk2utt" ]; then
    echo "$0: expected file $subset_data/spk2utt to exist" >&2
    exit 1
  fi
  num_spk=$(wc -l < "$subset_data/spk2utt") || exit 1
  # Some wc implementations pad the count with blanks; arithmetic expansion
  # reduces it to a bare integer.
  num_spk=$((num_spk))
  if [ "$num_spk" -lt 1 ]; then
    # A zero count would otherwise turn into nj=0 and an empty job range.
    echo "$0: $subset_data/spk2utt contains no lines" >&2
    exit 1
  fi
  if [ "$num_spk" -lt "$nj" ]; then
    nj=$num_spk
  fi
  
  # Split the subset data directory into $nj pieces. The per-job utt2spk
  # files under split${nj}/JOB are read by the next command, so a failed
  # split is fatal.
  utils/split_data.sh "$subset_data" "$nj" || exit 1

  # For each job, keep only the entries of the combined index that
  # filter_scp.pl selects using that job's utt2spk, and write them gzipped
  # to ali.JOB.gz in the output directory.
  # $cmd is left unquoted on purpose: it may carry the launcher together
  # with its options.
  $cmd JOB=1:"$nj" "$dir/log/filter_alignments.JOB.log" \
    copy-int-vector \
    "scp:utils/filter_scp.pl $subset_data/split${nj}/JOB/utt2spk $dir/ali_tmp.scp |" \
    "ark:| gzip -c > $dir/ali.JOB.gz" || exit 1

  # Record the (possibly reduced) job count next to the new alignments.
  echo "$nj" > "$dir/num_jobs" || exit 1
  
  # Remove the temporary per-job archives and indexes and the combined
  # index. $dir is quoted and "--" ends option parsing, so an unusual
  # directory name cannot be split into extra arguments or read as an
  # option by rm. The globs stay outside the quotes so they still expand.
  rm -- "$dir"/ali_tmp.*.ark "$dir"/ali_tmp.*.scp "$dir/ali_tmp.scp"

  exit 0