make_bn.sh
1.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#!/bin/bash
# Copyright 2015 David Snyder
# Apache 2.0.
#
# This script, called by ../run.sh, creates the HUB4 Broadcast News
# data directory. The required datasets can be found at:
# https://catalog.ldc.upenn.edu/LDC97S44
# https://catalog.ldc.upenn.edu/LDC97T22
set -e

# Usage: local/make_bn.sh <sph-dir> <transcript-dir> <data-dir>
#   <sph-dir>         handed to local/make_bn.py (presumably the LDC97S44
#                     audio directory -- confirm against make_bn.py)
#   <transcript-dir>  handed to local/make_annotations_bn.py (presumably the
#                     LDC97T22 transcripts -- confirm against that script)
#   <data-dir>        output root; the result is written to <data-dir>/bn
# All helper paths (local/, utils/) are relative, so the script must be run
# from the directory that contains them.
if [ $# -lt 3 ]; then
  echo "Usage: $0 <sph-dir> <transcript-dir> <data-dir>" >&2
  exit 1
fi

sph_dir=$1
transcript_dir=$2
data_dir=$3
tmp_dir=local/bn.tmp

# These parameters are used when refining the annotations.
# A higher frames_per_second provides better resolution at the
# frame boundaries. Set min_seg to control the minimum length of the
# final segments. It seems that the original annotations for segments
# below half a second are not very accurate, so we test only on segments
# longer than this.
frames_per_sec=100
min_seg=0.5

# Start from an empty scratch directory so that files left behind by an
# earlier (possibly failed) run cannot leak into this one.
rm -rf "${tmp_dir}"
mkdir "${tmp_dir}"

echo "$0: preparing annotations..."
local/make_annotations_bn.py "${transcript_dir}" "${tmp_dir}"

echo "$0: Removing overlapping annotations..."
local/refine_annotations_bn.py "${tmp_dir}" "${frames_per_sec}" "${min_seg}"

echo "$0: Preparing broadcast news data directories ${data_dir}/bn..."
local/make_bn.py "${sph_dir}" "${tmp_dir}"

# Publish only the three files that make up the data directory.
mkdir -p "${data_dir}/bn"
cp "${tmp_dir}/wav.scp" "${data_dir}/bn/"
cp "${tmp_dir}/utt2spk" "${data_dir}/bn/"
cp "${tmp_dir}/segments" "${data_dir}/bn/"

# The scratch directory is removed only on success; because of `set -e` a
# failing step exits before this point and leaves it in place for inspection.
rm -rf "${tmp_dir}"

# Validate the directory that was just written. This must follow the
# <data-dir> argument rather than a fixed "data/bn" path.
utils/fix_data_dir.sh "${data_dir}/bn"