Blame view

egs/bn_music_speech/v1/local/make_bn.sh 1.25 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
  #!/bin/bash
  # Copyright 2015   David Snyder
  # Apache 2.0.
  #
  # This script, called by ../run.sh, creates the HUB4 Broadcast News
  # data directory. The required datasets can be found at:
  #   https://catalog.ldc.upenn.edu/LDC97S44
  #   https://catalog.ldc.upenn.edu/LDC97T22
  
  set -e

  # Usage: local/make_bn.sh <sph-dir> <transcript-dir> <data-dir>
  #   <sph-dir>         passed to local/make_bn.py as the audio source directory
  #   <transcript-dir>  passed to local/make_annotations_bn.py
  #   <data-dir>        parent directory; the output is written to <data-dir>/bn
  if [ $# -ne 3 ]; then
    echo "Usage: $0 <sph-dir> <transcript-dir> <data-dir>" >&2
    exit 1
  fi

  sph_dir=$1
  transcript_dir=$2
  data_dir=$3
  # Scratch directory shared by the three helper scripts below; it is wiped
  # at the start and removed again once the results have been copied out.
  tmp_dir=local/bn.tmp

  # These parameters are used when refining the annotations.
  # A higher frames_per_sec provides better resolution at the
  # frame boundaries. Set min_seg to control the minimum length of the
  # final segments. It seems that the original annotations for segments
  # below half a second are not very accurate, so we test only on segments
  # longer than this.
  frames_per_sec=100
  min_seg=0.5

  # Start from a clean scratch directory so stale files from a previous
  # (possibly failed) run cannot leak into this one.
  rm -rf "${tmp_dir}"
  mkdir -p "${tmp_dir}"

  echo "$0: preparing annotations..."
  local/make_annotations_bn.py "${transcript_dir}" "${tmp_dir}"
  echo "$0: Removing overlapping annotations..."
  local/refine_annotations_bn.py "${tmp_dir}" "${frames_per_sec}" "${min_seg}"
  echo "$0: Preparing broadcast news data directories ${data_dir}/bn..."
  # make_bn.py is expected to leave wav.scp, utt2spk and segments in tmp_dir;
  # they are copied into the final data directory below.
  local/make_bn.py "${sph_dir}" "${tmp_dir}"

  mkdir -p "${data_dir}/bn"
  cp "${tmp_dir}/wav.scp" "${data_dir}/bn/"
  cp "${tmp_dir}/utt2spk" "${data_dir}/bn/"
  cp "${tmp_dir}/segments" "${data_dir}/bn/"
  rm -rf "${tmp_dir}"
  # Validate/fix the directory that was actually written (honours <data-dir>
  # rather than assuming it is always "data").
  utils/fix_data_dir.sh "${data_dir}/bn"