prepare_data.sh
3.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/bin/bash
# Copyright 2018 John Morgan
# Apache 2.0.
# configuration variables
tmpdir=data/local/tmp
download_dir=$(pwd)
tmp_tunis=$tmpdir/tunis
tmp_libyan=$tmpdir/libyan
data_dir=$download_dir/Tunisian_MSA/data
# location of test data
libyan_src=$data_dir/speech/test/Libyan_MSA
# end of configuration variable settings
# process the Tunisian MSA devtest data
# get list of wav files
for s in devtest/CTELLONE/Recordings_Arabic/6 devtest/CTELLTHREE/Recordings_Arabic/10; do
echo "$0: looking for wav files for $s."
mkdir -p $tmp_tunis/$s
find $data_dir/speech/$s -type f \
-name "*.wav" | grep Recordings_Arabic > $tmp_tunis/$s/wav.txt
local/devtest_recordings_make_lists.pl \
$data_dir/transcripts/devtest/recordings.tsv $s tunis
mkdir -p data/devtest
for x in wav.scp utt2spk text; do
cat $tmp_tunis/$s/$x | tr " " " " >> data/devtest/$x
done
done
utils/utt2spk_to_spk2utt.pl data/devtest/utt2spk | sort > data/devtest/spk2utt
utils/fix_data_dir.sh data/devtest
# training data consists of 2 parts: answers and recordings (recited)
answers_transcripts=$data_dir/transcripts/train/answers.tsv
recordings_transcripts=$data_dir/transcripts/train/recordings.tsv
# location of test data
cls_rec_tr=$libyan_src/cls/data/transcripts/recordings/cls_recordings.tsv
lfi_rec_tr=$libyan_src/lfi/data/transcripts/recordings/lfi_recordings.tsv
srj_rec_tr=$libyan_src/srj/data/transcripts/recordings/srj_recordings.tsv
mbt_rec_tr=$data_dir/transcripts/test/mbt/recordings/mbt_recordings.tsv
# make acoustic model training lists
mkdir -p $tmp_tunis
# get wav file names
# for recited speech
# the data collection laptops had names like CTELLONE CTELLTWO ...
for machine in CTELLONE CTELLTWO CTELLTHREE CTELLFOUR CTELLFIVE; do
find $data_dir/speech/train/$machine -type f -name "*.wav" | grep Recordings \
>> $tmp_tunis/recordings_wav.txt
done
# get file names for Answers
for machine in CTELLONE CTELLTWO CTELLTHREE CTELLFOUR CTELLFIVE; do
find $data_dir/speech/train/$machine -type f \
-name "*.wav" \
| grep Answers >> $tmp_tunis/answers_wav.txt
done
# make separate transcription lists for answers and recordings
export LC_ALL=en_US.UTF-8
local/answers_make_lists.pl $answers_transcripts
utils/fix_data_dir.sh $tmp_tunis/answers
local/recordings_make_lists.pl $recordings_transcripts
utils/fix_data_dir.sh $tmp_tunis/recordings
# consolidate lists
# acoustic models will be trained on both recited and prompted speech
mkdir -p $tmp_tunis/lists
for x in wav.scp utt2spk text; do
cat $tmp_tunis/answers/$x $tmp_tunis/recordings/$x > $tmp_tunis/lists/$x
done
utils/fix_data_dir.sh $tmp_tunis/lists
# get training lists
mkdir -p data/train
for x in wav.scp utt2spk text; do
sort $tmp_tunis/lists/$x | tr " " " " > data/train/$x
done
utils/utt2spk_to_spk2utt.pl data/train/utt2spk | sort > data/train/spk2utt
utils/fix_data_dir.sh data/train
# process the Libyan MSA data
mkdir -p $tmp_libyan
for s in cls lfi srj; do
mkdir -p $tmp_libyan/$s
# get list of wav files
find $libyan_src/$s -type f \
-name "*.wav" \
| grep recordings > $tmp_libyan/$s/recordings_wav.txt
echo "$0: making recordings list for $s"
local/test_recordings_make_lists.pl \
$libyan_src/$s/data/transcripts/recordings/${s}_recordings.tsv $s libyan
done
# process the Tunisian MSA test data
mkdir -p $tmp_tunis/mbt
# get list of wav files
find $data_dir/speech/test/mbt -type f \
-name "*.wav" \
| grep recordings > $tmp_tunis/mbt/recordings_wav.txt
echo "$0: making recordings list for mbt"
local/test_recordings_make_lists.pl \
$data_dir/transcripts/test/mbt/recordings/mbt_recordings.tsv mbt tunis
mkdir -p data/test
# get the Libyan files
for s in cls lfi srj; do
for x in wav.scp utt2spk text; do
cat $tmp_libyan/$s/recordings/$x | tr " " " " >> data/test/$x
done
done
for x in wav.scp utt2spk text; do
cat $tmp_tunis/mbt/recordings/$x | tr " " " " >> data/test/$x
done
utils/utt2spk_to_spk2utt.pl data/test/utt2spk | sort > data/test/spk2utt
utils/fix_data_dir.sh data/test