data_split.sh
3.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/bin/bash
# Copyright (c) 2013, Ondrej Platek, Ufal MFF UK <oplatek@ufal.mff.cuni.cz>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License. #
#
# Makes train/test splits
# local/voxforge_data_prep.sh --nspk_test ${nspk_test} ${SELECTED} || exit 1
# Creates these files (TYPE=train|test):
# a) ${TYPE}_trans.txt: ID transcription, capitalized! No punctuation
# b) ${TYPE}_wav.scp: ID path2ID.wav
# c) $TYPE.utt2spk: ID-recording ID-speaker
# d) $TYPE.spk2utt
# e) $TYPE.spk2gender: all speakers are labelled male
# We have ID-recording = ID-speaker
# The Vystadial data are specific in having the following marks in transcriptions:
# _INHALE_
# _LAUGH_
# _EHM_HMM_
# _NOISE_
# _EHM_HMM_
# _SIL_
# renice 20 $$
# Default: keep every recording. The usage text advertises an --every-n
# option, so utils/parse_options.sh (sourced below) presumably overrides this
# variable from the command line -- confirm against that helper.
every_n=1
[ -f path.sh ] && . ./path.sh # source the path.
. utils/parse_options.sh || exit 1;
# Exactly four positional arguments are consumed. The usage text used to list
# a fifth <tgt-dir>, which could never be passed (the count check rejects it)
# and whose value was overwritten before any use, so it is no longer read.
if [ $# -ne 4 ] ; then
  echo "Usage: local/data_split.sh [--every-n 30] <data-directory> <local-directory> <LMs> <Test-Sets>";
  exit 1;
fi
DATA=$1; shift       # root directory holding one sub-directory per data set
locdata=$1; shift    # directory receiving the intermediate file lists
LMs=$1; shift        # whitespace-separated list of language models
test_sets=$1; shift  # whitespace-separated list of test set names
echo "LMs $LMs test_sets $test_sets"
echo "=== Starting initial Vystadial data preparation ..."
echo "--- Making test/train data split from $DATA taking every $every_n recording..."
#######################################
# Build the file lists for one data set.
# Every file in <data-root>/<set> whose name ends in "wav" is a candidate;
# every <step>-th candidate (in glob order) is kept. For each kept recording
# one line is appended to wav.scp, utt2spk, spk2utt and trans.txt in
# <out-root>/<set>, and one line to <out-root>/spk2gender. The four per-set
# lists are then sorted in place with duplicates removed.
# Arguments:
#   $1 - data root directory (contains one sub-directory per set)
#   $2 - output root directory
#   $3 - set name (sub-directory of both roots)
#   $4 - step: keep every $4-th recording
# Outputs:
#   Writes/extends the list files described above.
#######################################
split_one_set() {
  local data_root=$1 out_root=$2 set_name=$3 step=$4
  local src_dir=$data_root/$set_name
  local out_dir=$out_root/$set_name
  # The counter starts at zero for every set, so the selection inside one set
  # does not depend on how many recordings the previous sets contained.
  local count=0
  local pwav wav trn f

  mkdir -p "$out_dir"
  # Glob instead of parsing `ls`; "*wav" selects the same names as the former
  # sed filter /.*wav$/ (anything ending in "wav").
  for pwav in "$src_dir"/*wav; do
    [[ -e "$pwav" ]] || continue  # no match: the pattern is left unexpanded
    count=$((count + 1))
    (( count >= step )) || continue
    count=0
    wav=${pwav##*/}
    # The transcription lives next to the recording as <name>.trn.
    trn=$(cat "$pwav.trn")
    printf '%s %s\n' "$wav" "$pwav" >> "$out_dir/wav.scp"
    # Recording ID doubles as speaker ID, hence the identical columns.
    printf '%s %s\n' "$wav" "$wav" >> "$out_dir/utt2spk"
    printf '%s %s\n' "$wav" "$wav" >> "$out_dir/spk2utt"
    printf '%s %s\n' "$wav" "$trn" >> "$out_dir/trans.txt"
    # Ignoring gender -> label all recordings as male
    printf '%s M\n' "$wav" >> "$out_root/spk2gender"
  done

  for f in wav.scp utt2spk spk2utt trans.txt; do
    sort -k1 -u -o "$out_dir/$f" "$out_dir/$f" # sort in place
  done
}

mkdir -p "$locdata"
# $test_sets is deliberately unquoted: it is a whitespace-separated list.
for s in $test_sets train; do
  split_one_set "$DATA" "$locdata" "$s" "$every_n"
done # for in $test_sets train
echo "Set 1:1 relation for spk2utt: spk in $test_sets AND train, sort in place"
# -u: the file is only ever appended to, so a rerun would otherwise leave
# duplicate speaker lines (the per-set lists are already sorted with -u).
sort -k1 -u -o "$locdata/spk2gender" "$locdata/spk2gender"
echo "--- Distributing the file lists to train and ($test_sets x $LMs) directories ..."
# WORK is not assigned anywhere in this script; presumably it is exported by
# the caller or set in path.sh. Abort instead of silently writing to
# /train and /<set>_<lm> when it is missing.
: "${WORK:?WORK must be set to the working directory receiving the data dirs}"

#######################################
# Copy the file lists of one split into a target data directory.
# trans.txt is stored under the name "text"; spk2gender is produced by
# filtering the global $locdata/spk2gender with the target's spk2utt.
# Globals:
#   locdata (read)
# Arguments:
#   $1 - source directory holding wav.scp, trans.txt, spk2utt, utt2spk
#   $2 - target directory (created if missing)
# Returns:
#   Does not return on failure: the whole script exits with status 1.
#######################################
distribute_lists() {
  local src_dir=$1 dst_dir=$2
  mkdir -p "$dst_dir" || exit 1
  cp "$src_dir/wav.scp" "$dst_dir/wav.scp" || exit 1
  cp "$src_dir/trans.txt" "$dst_dir/text" || exit 1
  cp "$src_dir/spk2utt" "$dst_dir/spk2utt" || exit 1
  cp "$src_dir/utt2spk" "$dst_dir/utt2spk" || exit 1
  utils/filter_scp.pl "$dst_dir/spk2utt" "$locdata/spk2gender" \
    > "$dst_dir/spk2gender" || exit 1
}

distribute_lists "$locdata/train" "$WORK/train"
# $test_sets and $LMs are deliberately unquoted: both are whitespace-separated
# lists. Every test set gets one directory per language model, named
# <set>_<basename of the LM>.
for s in $test_sets; do
  for lm in $LMs; do
    tgt_dir=$WORK/${s}_$(basename "$lm")
    distribute_lists "$locdata/$s" "$tgt_dir"
  done
done