#!/bin/bash
# Copyright 2009-2012  Microsoft Corporation; Johns Hopkins University (Author: Daniel Povey)
# Copyright 2013-2014  Mirsk Digital ApS (Author: Andreas Kirkedal)
# Copyright 2016 KTH Royal Institute of Technology (Author: Emelie Kullmann)
# Apache 2.0.
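#
# Downloads the Swedish sprakbanken speech corpus (16 kHz) from www.nb.no,
# unpacks it under data/local/data/download, and prepares Kaldi-style file
# lists, LM training text and wav.scp/utt2spk/text for the train and test sets.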
dir=$(pwd)/data/local/data
lmdir=$(pwd)/data/local/transcript_lm
traindir=$(pwd)/data/local/trainsrc
testdir=$(pwd)/data/local/testsrc
rm -rf $lmdir $traindir $testdir
mkdir -p $dir $lmdir $traindir $testdir
local=$(pwd)/local
utils=$(pwd)/utils
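# path.sh sets up KALDI_ROOT and puts the Kaldi binaries on PATH;
# KALDI_ROOT is used below as a fallback location for sph2pipe.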
. ./path.sh
# Check that python3 is available on the system; this recipe relies on
# python3 because it uses UTF-8 as its internal string representation.
if ! which python3 >&/dev/null; then
echo "python3 is not installed; to install it you should probably do:"
echo "  sudo apt-get install python3"
exit 1
fi
mkdir -p $dir/download/0467-1 $dir/download/0467-2 $dir/download/0467-3
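# Fetch each part of the corpus unless its tarball is already present, so the
# script can be re-run without downloading everything again.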
echo "Downloading and unpacking sprakbanken to $dir/download. This will take a while."
if [ ! -f $dir/download/sve.16khz.0467-1.tar.gz ]; then
( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-1.tar.gz --directory-prefix=$dir/download )
fi
if [ ! -f $dir/download/sve.16khz.0467-2.tar.gz ]; then
( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-2.tar.gz --directory-prefix=$dir/download )
fi
if [ ! -f $dir/download/sve.16khz.0467-3.tar.gz ]; then
( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-3.tar.gz --directory-prefix=$dir/download )
fi
if [ ! -f $dir/download/sve.16khz.0468.tar.gz ]; then
( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0468.tar.gz --directory-prefix=$dir/download )
fi
echo "Corpus files downloaded."
if [ ! -d $dir/download/0468 ]; then
echo "Unpacking files."
mkdir -p $dir/download/0468
tar -xzf $dir/download/sve.16khz.0467-1.tar.gz -C $dir/download/0467-1
tar -xzf $dir/download/sve.16khz.0467-2.tar.gz -C $dir/download/0467-2
tar -xzf $dir/download/sve.16khz.0467-3.tar.gz -C $dir/download/0467-3
tar -xzf $dir/download/sve.16khz.0468.tar.gz -C $dir/download/0468
echo "Corpus unpacked successfully."
fi
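# Locate sph2pipe, either on PATH or in the Kaldi tools directory; it is
# passed on to data_prep.py below because the corpus "wav" files are really
# sph files.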
sph2pipe=$(which sph2pipe) || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe . Did you run 'make' in the tools directory?";
exit 1;
fi
echo "Found sph2pipe."
echo "Converting downloaded files to a format consumable by Kaldi scripts."
rm -rf $dir/corpus_processed
mkdir -p $dir/corpus_processed/training/0467-1 $dir/corpus_processed/training/0467-2 $dir/corpus_processed/training/0467-3
# Create parallel file lists and text files, but keep sound files in the same location to save disk space
# Writes the lists to data/local/data (~ 310h)
echo "Creating parallel data for training data."
python3 $local/sprak2kaldi.py $dir/download/0467-1 $dir/corpus_processed/training/0467-1 # ~140h
python3 $local/sprak2kaldi.py $dir/download/0467-2 $dir/corpus_processed/training/0467-2 # ~125h
python3 $local/sprak2kaldi.py $dir/download/0467-3 $dir/corpus_processed/training/0467-3 # ~128h
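# One session directory in 0467-1 contains a space in its name; rename it and
# its transcript files so that whitespace-separated file lists stay intact.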
mv $dir/corpus_processed/training/0467-1/'r4670118.791213 8232' $dir/corpus_processed/training/0467-1/'r4670118.791213_8232'
for f in $dir/corpus_processed/training/0467-1/r4670118.791213_8232/*.txt; do
mv "$f" "${f// /_}";
done
(
# Ditto test set (~ 93h)
echo "Creating parallel data for test data."
rm -rf $dir/corpus_processed/test/0468
mkdir -p $dir/corpus_processed/test/0468
python3 $local/sprak2kaldi.py $dir/download/0468 $dir/corpus_processed/test/0468
)
# Create the LM training data
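# Concatenate all training transcripts, normalise them and keep the unique
# sentences as LM training text in $lmdir/transcripts.uniq.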
(
echo "Writing the LM text to file and normalising."
cat $dir/corpus_processed/training/0467-1/txtlist $dir/corpus_processed/training/0467-2/txtlist $dir/corpus_processed/training/0467-3/txtlist | while read -r l; do cat "$l"; done > $lmdir/lmsents
python3 local/normalize_transcript.py $lmdir/lmsents $lmdir/lmsents.norm
sort -u $lmdir/lmsents.norm > $lmdir/transcripts.uniq
)
# Combine training file lists
echo "Combine file lists."
cat $dir/corpus_processed/training/0467-1/txtlist $dir/corpus_processed/training/0467-2/txtlist $dir/corpus_processed/training/0467-3/txtlist > $dir/traintxtfiles
cat $dir/corpus_processed/training/0467-1/sndlist $dir/corpus_processed/training/0467-2/sndlist $dir/corpus_processed/training/0467-3/sndlist > $dir/trainsndfiles
# Move test file lists to the right location
cp $dir/corpus_processed/test/0468/txtlist $dir/testtxtfiles
cp $dir/corpus_processed/test/0468/sndlist $dir/testsndfiles
# Write wav.scp, utt2spk and text.unnormalised for the train and test sets.
# sph2pipe is used because the wav files are actually sph files.
echo "Creating wav.scp, utt2spk and text.unnormalised for train and test"
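# data_prep.py takes the text and sound file lists plus sph2pipe and writes
# wav.scp, utt2spk and text.unnormalised into the given output directory.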
python3 $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe
python3 $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe
# Create the main data sets
local/create_datasets.sh $testdir data/test
local/create_datasets.sh $traindir data/train
echo "Data preparation succeeded"