gale_data_prep_split.sh
2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/bin/bash
# Copyright 2014 (author: Ahmed Ali, Hainan Xu)
# Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal)
# Apache 2.0
# Expects exactly one argument: the GALE folder. Later steps read the files
# all, utt2spk and wav.scp from it and write intermediate lists back into it.
if [ $# -ne 1 ]; then
  # Diagnostics belong on stderr so they are not mistaken for regular output.
  echo "Arguments should be the <gale folder>" >&2
  exit 1
fi

# Stop at the first failing command, including a failing stage of a pipeline.
set -e -o pipefail

# The train/dev data directories are created under data/local.
# "$1" is quoted so a folder name containing spaces or glob characters is
# passed through unchanged.
galeData=$(utils/make_absolute.sh "$1")
mkdir -p data/local
dir=$(utils/make_absolute.sh data/local)
# The transcript index $galeData/all may list the same utterance id (field 2)
# more than once with different transcriptions. Collect every id that occurs
# more than once into dup.list ...
awk '{print $2}' "$galeData/all" \
  | sort | uniq -c | awk '{if($1!="1")print$2}' > "$galeData/dup.list"
# ... and filter on field 2 with --exclude against that list (presumably
# removing every line whose utterance id is listed in dup.list).
utils/filter_scp.pl --exclude -f 2 \
  "$galeData/dup.list" "$galeData/all" > "$galeData/all.nodup"
# Keep the unfiltered index as all.orig; the filtered one takes over the
# name "all", which is what the following steps read.
mv "$galeData/all" "$galeData/all.orig"
mv "$galeData/all.nodup" "$galeData/all"
# Split the index into dev and train. Lines matching any pattern found in the
# local/test.LDC* files go to all.dev, all remaining lines go to all.train.
# In both cases lines containing any fixed string from local/bad_utts are
# dropped.
grep -f <(cat local/test.LDC*) "$galeData/all" \
  | grep -v -F -f local/bad_utts > "$galeData/all.dev"
grep -v -f <(cat local/test.LDC*) "$galeData/all" \
  | grep -v -F -f local/bad_utts > "$galeData/all.train"

# Field 2 of each index line is the utterance id; save the ids per split.
awk '{print $2}' "$galeData/all.dev" > "$galeData/dev_utt_list"
awk '{print $2}' "$galeData/all.train" > "$galeData/train_utt_list"
# For each split, create its data directory, filter $galeData/utt2spk on
# field 1 against the split's utterance-id list, and derive spk2utt from the
# resulting utt2spk.
for x in dev train; do
  mkdir -p "$dir/$x"
  utils/filter_scp.pl -f 1 "$galeData/${x}_utt_list" "$galeData/utt2spk" \
    > "$dir/$x/utt2spk"
  utils/utt2spk_to_spk2utt.pl "$dir/$x/utt2spk" | sort -u > "$dir/$x/spk2utt"
done
# Build the segments and text files of each split from its index file
# $galeData/all.<split>.
for x in dev train; do
  outdir=$dir/$x
  file=$galeData/all.$x
  mkdir -p "$outdir"
  # segments: index fields 2, 1, 3, 4 in that order (utterance id first, then
  # the id that is later looked up in wav.scp; fields 3 and 4 are presumably
  # the segment start and end times).
  awk '{print $2 " " $1 " " $3 " " $4}' "$file" | sort -u > "$outdir/segments"
  # text: utterance id followed by fields 5..NF (the transcription).
  # The data is passed as a printf ARGUMENT with an explicit "%s " format;
  # using it as the format string itself would misinterpret any token that
  # contains a '%'. Each token keeps its trailing space, as before.
  awk '{printf "%s ", $2; for (i=5; i<=NF; i++) {printf "%s ", $i} printf "\n"}' "$file" \
    | sort -u > "$outdir/text"
done
# Field 2 of segments is the id looked up in wav.scp; collect the distinct
# ids used by each split.
awk '{print $2}' "$dir/dev/segments" | sort -u > "$galeData/dev.wav.list"
awk '{print $2}' "$dir/train/segments" | sort -u > "$galeData/train.wav.list"

# Filter $galeData/wav.scp on field 1 against each split's id list.
# train/wav.scp used to be written a second time by an awk filter over
# train/segments that selected the same lines; that redundant pass is gone.
utils/filter_scp.pl -f 1 "$galeData/dev.wav.list" "$galeData/wav.scp" > "$dir/dev/wav.scp"
utils/filter_scp.pl -f 1 "$galeData/train.wav.list" "$galeData/wav.scp" > "$dir/train/wav.scp"

echo data prep split succeeded