#!/bin/bash
# Copyright 2016  Allen Guo
#           2017  Xiaohui Zhang
# Apache License 2.0
# This script creates the data directories that will be used during training.
# This is discussed fully in README.md, but the gist of it is that the data for each
# stage will be located at data/{MULTI}/{STAGE}, and every training stage is individually
# configurable for maximum flexibility.
# Note: The $stage if-blocks use -eq in this script, so running with --stage 4 will
# run only the stage 4 prep.
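#
# Example usage (assuming this script sits in a standard Kaldi egs directory,
# next to utils/):
#   ./make_partitions.sh --multi multi_a --stage 1
#   # or run all stages in order:
#   for s in $(seq 1 10); do ./make_partitions.sh --multi multi_a --stage $s || exit 1; done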
multi=multi_a # This defines the "variant" we're using; see README.md
stage=1
. utils/parse_options.sh
data_dir=data/$multi
mkdir -p $data_dir
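# In the symlinks created below, data/$multi/{MODEL} holds the training data for
# that model, and data/$multi/{MODEL}_ali holds the data that model then aligns,
# which typically becomes the training data for the next model in the sequence.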
# swbd 10k short (nodup)
if [ $stage -eq 1 ]; then
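  # Take the 100k shortest swbd utterances, cap any repeated transcript at 10
  # copies, then take a 10k subset; short utterances make monophone bootstrapping easier.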
  utils/subset_data_dir.sh --shortest data/swbd/train 100000 data/swbd/train_100kshort
  utils/data/remove_dup_utts.sh 10 data/swbd/train_100kshort data/swbd/train_100kshort_nodup
  utils/subset_data_dir.sh data/swbd/train_100kshort_nodup 10000 data/swbd/train_10k_nodup
  ln -nfs ../swbd/train_10k_nodup $data_dir/mono
fi
# swbd 30k (nodup)
if [ $stage -eq 2 ]; then
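  # 30k utterances selected along speaker boundaries (--speakers), with repeated
  # transcripts capped at 200 copies; reused for mono alignment and tri1a/tri1b training.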
  utils/subset_data_dir.sh --speakers data/swbd/train 30000 data/swbd/train_30k
  utils/data/remove_dup_utts.sh 200 data/swbd/train_30k $data_dir/mono_ali
  ln -nfs mono_ali $data_dir/tri1a
  ln -nfs mono_ali $data_dir/tri1a_ali
  ln -nfs mono_ali $data_dir/tri1b
fi
# swbd 100k (nodup)
if [ $stage -eq 3 ]; then
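  # Same recipe at 100k utterances; used to align tri1b and to train tri2.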
  utils/subset_data_dir.sh --speakers data/swbd/train 100000 data/swbd/train_100k
  utils/data/remove_dup_utts.sh 200 data/swbd/train_100k $data_dir/train_100k_nodup
  ln -nfs train_100k_nodup $data_dir/tri1b_ali
  ln -nfs train_100k_nodup $data_dir/tri2
fi
# whole swbd
if [ $stage -eq 4 ]; then
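  # No subsetting from here on: use the full swbd train set for tri2 alignment
  # and tri3a training.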
  ln -nfs ../swbd/train $data_dir/tri2_ali
  ln -nfs tri2_ali $data_dir/tri3a
fi
# whole fisher + swbd (nodup)
if [ $stage -eq 5 ]; then
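  # Combine the full Fisher and Switchboard train sets, then dedup (max 300
  # copies of any transcript).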
  utils/combine_data.sh data/fisher_swbd \
    data/{swbd,fisher}/train \
    || { echo "Failed to combine data"; exit 1; }
  utils/data/remove_dup_utts.sh 300 data/fisher_swbd $data_dir/tri3a_ali
  ln -nfs tri3a_ali $data_dir/tri3b
fi
# whole fisher + swbd + tedlium (nodup)
if [ $stage -eq 6 ]; then
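  # Same as stage 5, plus the cleaned TED-LIUM train set.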
  utils/combine_data.sh $data_dir/fisher_swbd_tedlium \
    data/{fisher,swbd}/train $data_dir/tedlium_cleaned/train \
    || { echo "Failed to combine data"; exit 1; }
  utils/data/remove_dup_utts.sh 300 $data_dir/fisher_swbd_tedlium $data_dir/tri3b_ali
  ln -nfs tri3b_ali $data_dir/tri4
fi
# whole fisher + swbd + tedlium + wsj + hub4_en (nodup)
if [ $stage -eq 7 ]; then
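  # Add the WSJ and HUB4 English train sets to the previous combination.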
  utils/combine_data.sh $data_dir/fisher_swbd_tedlium_wsj_hub4 \
    $data_dir/fisher_swbd_tedlium data/{wsj,hub4_en}/train \
    || { echo "Failed to combine data"; exit 1; }
  utils/data/remove_dup_utts.sh 300 $data_dir/fisher_swbd_tedlium_wsj_hub4 $data_dir/tri4_ali
  ln -nfs tri4_ali $data_dir/tri5a
fi
# whole fisher + swbd + tedlium + wsj + hub4_en + librispeech460 (nodup)
if [ $stage -eq 8 ]; then
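  # Add the 100h and 360h LibriSpeech subsets (460h of LibriSpeech in total).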
  utils/combine_data.sh $data_dir/fisher_swbd_tedlium_wsj_hub4_libri460 \
    $data_dir/fisher_swbd_tedlium_wsj_hub4 data/{librispeech_100,librispeech_360}/train \
    || { echo "Failed to combine data"; exit 1; }
  utils/data/remove_dup_utts.sh 300 $data_dir/fisher_swbd_tedlium_wsj_hub4_libri460 $data_dir/tri5a_ali
  ln -nfs tri5a_ali $data_dir/tri5b
fi
# whole fisher + swbd + tedlium + wsj + hub4_en + librispeech960 (nodup)
if [ $stage -eq 9 ]; then
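  # Add the remaining 500h LibriSpeech subset, for the full 960h of LibriSpeech.
  # tri6a is both trained and aligned on this final combination.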
  utils/combine_data.sh $data_dir/fisher_swbd_tedlium_wsj_hub4_libri960 \
    $data_dir/fisher_swbd_tedlium_wsj_hub4_libri460 data/librispeech_500/train \
    || { echo "Failed to combine data"; exit 1; }
  utils/data/remove_dup_utts.sh 300 $data_dir/fisher_swbd_tedlium_wsj_hub4_libri960 $data_dir/tri5b_ali
  ln -nfs tri5b_ali $data_dir/tri6a
  ln -nfs tri5b_ali $data_dir/tri6a_ali
fi
# sampled data for ivector extractor training, etc.
if [ $stage -eq 10 ]; then
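  # The TDNN trains on the same data as tri6a; the 100k and 30k subsets
  # sampled here feed the i-vector extractor training and related steps.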
  ln -nfs tri6a $data_dir/tdnn
  utils/subset_data_dir.sh $data_dir/tdnn 100000 $data_dir/tdnn_100k
  utils/subset_data_dir.sh $data_dir/tdnn 30000 $data_dir/tdnn_30k
fi