Blame view

egs/wsj/s5/steps/segmentation/post_process_sad_to_segments.sh 1.77 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
  #!/bin/bash
  
  # Copyright 2015-17  Vimal Manohar
  # Apache 2.0.
  
  # This script post-processes the output of steps/segmentation/decode_sad.sh,
  # which is in the form of frame-level alignments, into a 'segments' file.
  # The alignments must be speech activity detection marks i.e. 1 for silence 
  # and 2 for speech.
  
  # Abort on the first failing command, on a failure anywhere in a pipeline,
  # and on any reference to an unset variable.
  set -euo pipefail

  # Pull in the recipe's environment setup from the current directory.
  . ./path.sh

  # Option defaults.  They are assigned before utils/parse_options.sh is
  # sourced below.
  stage=-10     # the SAD-to-segments conversion runs only when stage <= 0
  nj=18         # replaced further down by the contents of <vad-dir>/num_jobs
  cmd=run.pl    # launcher that runs the per-job conversion command

  # Durations handed to sad_to_segments.py; all of them are in seconds.
  merge_consecutive_max_dur=0
  min_segment_dur=0
  segment_padding=0.2
  frame_shift=0.01

  . utils/parse_options.sh
  
  # Exactly three positional arguments are required.  Anything else prints the
  # help text and exits with status 1.  The example line lists all three
  # arguments so that it can be copied as-is.
  if [ $# -ne 3 ]; then
    echo "This script post-processes the output of steps/segmentation/decode_sad.sh, "
    echo "which is in the form of frame-level alignments, into kaldi segments. "
    echo "The alignments must be speech activity detection marks i.e. 1 for silence "
    echo "and 2 for speech."
    echo "Usage: $0 <data-dir> <vad-dir> <segmentation-dir>"
    echo " e.g.: $0 data/dev_aspire_whole exp/vad_dev_aspire exp/segmentation_dev_aspire"
    exit 1
  fi
  
  # Positional arguments.
  data_dir=$1   # Data directory; passed to split_data.sh and get_utt2dur.sh below
  vad_dir=$2    # Alignment directory containing frame-level SAD labels
  dir=$3        # Output directory; receives log/, segments.*, num_jobs and segments

  mkdir -p "$dir"

  # The first alignment archive and the job count must both be present before
  # any work is started.
  for f in "$vad_dir/ali.1.gz" "$vad_dir/num_jobs"; do
    if [ ! -f "$f" ]; then
      # Two separate statements: with 'echo ... && exit 1' a failing echo
      # would skip the exit and let the loop carry on.
      echo "$0: Could not find file $f"
      exit 1
    fi
  done

  # The number of jobs is dictated by how the alignments were written, so it
  # replaces whatever value nj had before this point.
  nj=$(cat "$vad_dir/num_jobs") || exit 1
  utils/split_data.sh "$data_dir" "$nj"

  utils/data/get_utt2dur.sh "$data_dir"
  
  # Stage 0: turn each job's frame-level alignments into a per-job segments
  # file.  Skipped when the script is run with a stage greater than 0.
  #
  # $cmd receives the job range JOB=1:$nj, a log path, and a command line that
  # decompresses $vad_dir/ali.JOB.gz with gunzip -c, feeds it through
  # copy-int-vector to stdout (ark,t:-) and pipes that into sad_to_segments.py,
  # which is given "-" as its input and $dir/segments.JOB as its last argument.
  # The pipe is written as \| so that this shell hands it to $cmd as a literal
  # argument instead of building the pipeline itself.
  # $cmd and the other expansions are left unquoted here.
  # NOTE(review): $cmd presumably may hold a launcher plus its own options and
  # is expected to substitute JOB in every argument -- confirm against run.pl.
  if [ $stage -le 0 ]; then
    $cmd JOB=1:$nj $dir/log/segmentation.JOB.log \
      copy-int-vector "ark:gunzip -c $vad_dir/ali.JOB.gz |" ark,t:- \| \
      steps/segmentation/internal/sad_to_segments.py \
        --frame-shift=$frame_shift --segment-padding=$segment_padding \
        --min-segment-dur=$min_segment_dur --merge-consecutive-max-dur=$merge_consecutive_max_dur \
        --utt2dur=$data_dir/utt2dur - $dir/segments.JOB
  fi
  
  #######################################
  # Record the job count in <out-dir>/num_jobs and concatenate the per-job
  # files <out-dir>/segments.1 .. <out-dir>/segments.<num-jobs>, in job order,
  # into <out-dir>/segments.
  # Arguments: $1 - output directory (must already exist)
  #            $2 - number of jobs (non-negative integer)
  # Outputs:   writes <out-dir>/num_jobs and <out-dir>/segments; prints a
  #            message to stdout when an argument is rejected
  # Returns:   0 on success, 1 on a bad argument or an unreadable per-job file
  #######################################
  combine_job_segments() {
    local out_dir=$1
    local num_jobs=$2
    local n

    if [ ! -d "$out_dir" ]; then
      echo "$0: Could not find directory $out_dir"
      return 1
    fi

    # A failing seq inside $(...) would not stop the script, and the loop
    # would then silently produce an empty segments file; reject bad counts
    # up front instead.
    case "$num_jobs" in
      '' | *[!0-9]*)
        echo "$0: Invalid number of jobs '$num_jobs'"
        return 1
        ;;
    esac

    echo "$num_jobs" > "$out_dir/num_jobs" || return 1

    for n in $(seq "$num_jobs"); do
      cat "$out_dir/segments.$n" || return 1
    done > "$out_dir/segments"
  }

  combine_job_segments "$dir" "$nj"