create_splits.sh
1.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/bin/bash
# Copyright 2018 Desh Raj (Johns Hopkins University)
# This script reads the extracted Bentham database files and creates
# the following files for each data subset (train, val, test):
# text, utt2spk, spk2utt, images.scp.
# Positional arguments:
#   $1 - directory holding the extracted database (its gt/ subtree is read)
#   $2 - directory under which train/, val/ and test/ are created
# Refuse to continue without both: an empty $2 would make the mkdir below
# target /train, /val and /test.
if [ $# -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
  echo "Usage: $0 <download-dir> <save-dir>" >&2
  exit 1
fi
download_dir=$1
save_dir=$2
# The variable is quoted so a path containing whitespace stays one word; the
# brace lists are left outside the quotes so they still expand.
mkdir -p "$save_dir"/{train,val,test}
touch "$save_dir"/{train,val,test}/{text,images.scp,utt2spk,spk2utt}
# Input locations inside the download directory (each keeps its trailing slash).
partition_dir=$download_dir"/gt/Partitions/"
lines_dir=$download_dir"/gt/Images/Lines/"
text_dir=$download_dir"/gt/Transcriptions/"
# Builds the data files for one subset from a partition list.
#
# Globals read: save_dir, partition_dir, text_dir, lines_dir
# Arguments:
#   $1 - subset name; output goes to $save_dir/$1
#   $2 - list file name inside $partition_dir, one line id per row
# Outputs (inside $save_dir/$1):
#   text       - "<id> <transcription>" taken from $text_dir/<id>*
#   images.scp - "<id> <lines_dir>/<id>.png"
#   utt2spk    - "<id> <first 11 characters of id>"
#   spk2utt    - produced by utils/utt2spk_to_spk2utt.pl from utt2spk
# Note: text, images.scp and utt2spk are appended to, so running this on an
# already populated directory adds the entries a second time.
function split {
  local subset=$1
  local list_name=$2
  local split_dir=$save_dir/$subset
  local line_file=$partition_dir/$list_name
  local name spkid

  echo "Creating $subset split"
  # The `|| [[ -n ... ]]` clause keeps a final entry that has no trailing newline.
  while read -r name || [[ -n "$name" ]]; do
    # An empty id would turn the glob below into "every transcription file".
    [[ -n "$name" ]] || continue
    spkid=${name:0:11}
    # Prefix the transcription with the id; the glob is left outside the quotes
    # so the id still matches its transcription file by prefix.
    printf '%s ' "$name" | cat - "$text_dir/$name"* >> "$split_dir/text"
    # Terminate the record in case the transcription lacks a final newline;
    # any blank line this produces is removed by the perl filter below.
    echo >> "$split_dir/text"
    printf '%s\n' "$name $lines_dir/$name.png" >> "$split_dir/images.scp"
    printf '%s\n' "$name $spkid" >> "$split_dir/utt2spk"
  done < "$line_file"
  # Drop whitespace-only lines from the three files in place.
  perl -i -ne 'print if /\S/' "$split_dir/images.scp" "$split_dir/text" "$split_dir/utt2spk"
  utils/utt2spk_to_spk2utt.pl "$split_dir/utt2spk" > "$split_dir/spk2utt"
}
# Run the split function defined in this file once per subset, in the order
# train, val, test. Each entry is "<subset>:<partition list file>".
for subset_spec in train:TrainLines.lst val:ValidationLines.lst test:TestLines.lst; do
  split "${subset_spec%%:*}" "${subset_spec#*:}"
done