farsdat_data_prep.sh
6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/bin/bash
#
# Copyright 2014 University of Tehran (Author: Bagher BabaAli)
# 2014 Brno University of Technology (Karel Vesely)
# 2014 Johns Hopkins University (Daniel Povey)
#
# farsdat, description of the database:
# http://www.assta.org/sst/SST-94-Vol-ll/cache/SST-94-VOL2-Chapter15-p20.pdf
# Usage: farsdat_data_prep.sh <farsdat-dir>
# Exactly one argument is required: the FARSDAT corpus directory.
if [ $# -ne 1 ]; then
  echo "Argument should be the farsdat directory, see ../run.sh for example." >&2
  exit 1
fi

# Output locations, all anchored at the directory the script is started from.
dir=$(pwd)/data/local/data
lmdir=$(pwd)/data/local/nist_lm
mkdir -p "$dir" "$lmdir" || exit 1

local=$(pwd)/local
utils=$(pwd)/utils
conf=$(pwd)/conf

. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin

# The three speaker lists drive the train/dev/test split further down, so stop
# right away when one of them is missing.  (The earlier code called an
# 'error_exit' helper with a '$PROG' prefix, neither of which is defined in
# this script, so a missing list only produced "command not found".)
[ -f "$conf/test_spk.list" ] || { echo "$0: Eval-set speaker list not found." >&2; exit 1; }
[ -f "$conf/dev_spk.list" ] || { echo "$0: dev-set speaker list not found." >&2; exit 1; }
[ -f "$conf/train_spk.list" ] || { echo "$0: train-set speaker list not found." >&2; exit 1; }
# First check that the two disc directories exist under the corpus root
# (their names can be either upper- or lower-cased: CD1/CD2 or cd1/cd2).
# "$1" is quoted so that a corpus path containing whitespace still works.
if [[ ! -d "$1/CD1" || ! -d "$1/CD2" ]] && [[ ! -d "$1/cd1" || ! -d "$1/cd2" ]]; then
  echo "farsdat_data_prep.sh: Spot check of command line argument failed" >&2
  echo "Command line argument must be absolute pathname to Farsdat directory" >&2
  echo "with name like /export/corpora5/ELRA/farsdat" >&2
  exit 1
fi

# Now check what case the directory structure is; lower case is the default
# and upper case is selected as soon as CD1 exists.
uppercased=false
cd1_dir=cd1
cd2_dir=cd2
if [ -d "$1/CD1" ]; then
  uppercased=true
  cd1_dir=CD1
  cd2_dir=CD2
fi
# Scratch directory for intermediate lists; it is removed when the script
# exits.  Abort if it cannot be created: with an empty $tmpdir every later
# "$tmpdir/..." redirection would point at the filesystem root.
tmpdir=$(mktemp -d /tmp/kaldi.XXXX) || { echo "$0: mktemp failed" >&2; exit 1; }
trap 'rm -rf "$tmpdir"' EXIT

# Build $dir/segments from the *.SNT files of both discs.
# The recording id is derived from the file name S<1|2><number>.SNT and
# formatted as <number, zero-padded to 3 digits>_<1|2>.  Each line of an SNT
# file yields one segment "<rec_id>_<col1> <rec_id> <start> <end>", where
# columns 2 and 3 are divided by 2*22050 (presumably byte offsets into 16-bit
# 22.05 kHz audio converted to seconds -- confirm against the corpus docs).
find "$1/$cd1_dir/SENTENCE/" "$1/$cd2_dir/SENTENCE/" -iname '*.SNT' -print |
  while IFS= read -r filename; do
    rec_id=$(echo "$filename" | sed -e 's:.*/S\([1-2]\)\(.*\)\.SNT$:\2 \1:i' |
      awk '{printf("%03d_%d\n",$1,$2);}') || exit 1
    awk -v rec_id="$rec_id" \
      '{printf "%s_%s %s %f %f\n",rec_id,$1,rec_id,$2/(2*22050),$3/(2*22050)}' \
      < "$filename" || exit 1
  done > "$dir/segments" || exit 1

# Build $dir/wav.scp: one line per *.WAV file, keyed by the same recording id,
# whose value is a sox pipe that emits 16 kHz mono wav on stdout.
find "$1/$cd1_dir/wave" "$1/$cd2_dir/wave" -iname '*.WAV' -print \
  > "$tmpdir/wav.flist" || exit 1
sed -e 's:.*/S\([1-2]\)\(.*\)\.WAV$:\2 \1:i' "$tmpdir/wav.flist" |
  awk '{printf("%03d_%d\n",$1,$2);}' > "$tmpdir/wav.uttids" || exit 1
paste "$tmpdir/wav.uttids" "$tmpdir/wav.flist" |
  awk '{printf("%s sox %s -t wav -r 16000 -c 1 - |\n", $1, $2);}' |
  sort -k1,1 > "$dir/wav.scp" || exit 1
# Now, convert the transcripts into our format (no normalization yet).
# Get the transcripts: each line of the output contains an utterance
# ID followed by the transcript.
find "$1/$cd1_dir/PHONEME" "$1/$cd2_dir/PHONEME" -iname 'PH*.*' -print \
  > "$tmpdir/phn.flist" || exit 1

# Utterance id from the file name PH<1|2><number>.<ext>:
# <number, zero-padded to 3 digits>_<1|2>_<ext as an integer>.
sed -e 's:.*/PH\([1-2]\)\(.*\)\.\(.*\)$:\2 \1 \3:i' "$tmpdir/phn.flist" |
  awk '{printf("%03d_%d_%d\n",$1,$2,$3);}' > "$tmpdir/phn.uttids" || exit 1

# One output line per transcription file: the first character of every input
# line, joined by single spaces, with trailing spaces replaced by a newline.
# The missing-file case is reported here directly because this script does
# not define the 'error_exit' helper the check used to call.
while IFS= read -r line; do
  [ -f "$line" ] || { echo "$0: Cannot find transcription file '$line'" >&2; exit 1; }
  cut -c1 "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;' || exit 1
done < "$tmpdir/phn.flist" > "$tmpdir/phn.trans" || exit 1

paste "$tmpdir/phn.uttids" "$tmpdir/phn.trans" | sort -k1,1 > "$dir/trans" || exit 1

# Do normalization steps.
"$local/farsdat_norm_trans.sh" "$dir/trans" | sort > "$dir/text" || exit 1
# Prepare gender mapping: "<speaker code, zero-padded to 3 digits> <column 3>"
# for every line of the two Speaker.txt tables that does not contain "Code"
# (which drops the header line).
# The tables are checked up front: in a pipeline only the last command's
# status is tested, so a missing table would otherwise go unnoticed.
for spk_info in "$1/$cd1_dir/Information/Speaker.txt" \
                "$1/$cd2_dir/Information/Speaker.txt"; do
  [ -f "$spk_info" ] || { echo "$0: missing speaker table '$spk_info'" >&2; exit 1; }
done
sed '/Code/d' "$1/$cd1_dir/Information/Speaker.txt" "$1/$cd2_dir/Information/Speaker.txt" |
  awk '{printf("%03d %s\n",$1,$3)}' > "$dir/spk2gender" || exit 1
# Build the dev and test sentence lists, $tmpdir/dev.sent and $tmpdir/test.sent.
# For each set the speaker codes from $conf/<set>_spk.list are zero-padded to
# three digits, the $dir/segments lines whose first '_'-separated field is one
# of those codes are kept, and for every speaker the third '_'-separated field
# of each kept line is appended to a per-speaker list.  In the END block each
# list is split on spaces, sorted with asort (a gawk extension) and its first
# 8 entries are printed; the combined output is sorted numerically and
# de-duplicated.
# NOTE(review): with -F'_' the third field of a segments line also contains
# the text up to the next '_' (the start of the recording id), so speaker
# codes end up in the per-speaker lists next to the sentence ids -- confirm
# this is harmless for the corpus.
# NOTE(review): sent[] is keyed by the zero-padded field text (e.g. "001")
# while the END loop probes the integer keys 1..304, so speakers with a code
# below 100 may never be found -- confirm against the dev/test speaker lists.
for x in dev test; do
# Zero-padded copy of the speaker list for this set.
cat $conf/${x}_spk.list | awk '{printf("%03d\n",$1);}' > \
$tmpdir/${x}_spk.list || exit 1;
# Per-speaker sentence selection (see the notes above the loop).
awk -F'_' 'NR==FNR{a[$1]++;next} (a[$1])' $tmpdir/${x}_spk.list $dir/segments |\
sort -k1 | awk -F'_' '{sent[$1]=sent[$1] " " $3 }
END {
for(i=1; i<=304; ++i)
{ split(sent[i],sent_split," ");
asort(sent_split,sent_sort);
for(j=1; j<=8;j++)
{
print sent_sort[j];
}
}
}' | sort -n | uniq > $tmpdir/${x}.sent || exit 1;
done
# Zero-pad the training speaker codes to three digits, the same form the
# speaker field has in $dir/segments.
awk '{printf("%03d\n",$1);}' < "$conf/train_spk.list" \
  > "$tmpdir/train_spk.list" || exit 1

# Union of the dev and test sentence lists.  'sort -u' is used instead of a
# bare 'uniq -u': on the unsorted concatenation 'uniq -u' would drop BOTH
# copies of a sentence whenever the last dev line equals the first test line,
# which would let that sentence slip into the training list below.
sort -u "$tmpdir/dev.sent" "$tmpdir/test.sent" > "$tmpdir/dev+test.sent" || exit 1

# Training sentences: ids 1..404 without 400, minus everything used by dev or
# test (whole-line, fixed-string match).
seq 1 404 | sed '/400/d' | grep -F -x -v -f "$tmpdir/dev+test.sent" - \
  > "$tmpdir/train.sent" || exit 1
# Assemble the Kaldi data directories data/train, data/dev and data/test.
# Every table is filtered twice: first by the set's speaker list (first
# '_'-separated field), then by the set's sentence list (the part of the third
# '_'-separated field that precedes the first space).
for x in train dev test; do
  set=data/$x
  mkdir -p "$set"

  # segments
  awk -F'_' 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/${x}_spk.list" "$dir/segments" |
    sort -k1 > "$tmpdir/segments" || exit 1
  awk -F'_' 'NR==FNR{a[$1]++;next} (a[substr($3,1,index($3," ")-1)])' \
    "$tmpdir/${x}.sent" "$tmpdir/segments" | sort -k1 > "$set/segments" || exit 1

  # text
  awk -F'_' 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/${x}_spk.list" "$dir/text" |
    sort -k1 > "$tmpdir/text" || exit 1
  awk -F'_' 'NR==FNR{a[$1]++;next} (a[substr($3,1,index($3," ")-1)])' \
    "$tmpdir/${x}.sent" "$tmpdir/text" | sort -k1 > "$set/text" || exit 1

  # wav.scp: keep this set's speakers, then only the recordings
  # (<speaker>_<session>) that are actually referenced by the kept segments.
  awk -F'_' 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/${x}_spk.list" "$dir/wav.scp" \
    > "$tmpdir/wav.scp" || exit 1
  awk -F'_' '{printf("%03d_%d\n",$1,$2)}' < "$set/segments" \
    > "$tmpdir/spk_session" || exit 1
  awk -F' ' 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/spk_session" "$tmpdir/wav.scp" |
    sort -k1 > "$set/wav.scp" || exit 1

  # spk2gender, lower-cased.
  awk 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/${x}_spk.list" "$dir/spk2gender" |
    tr '[:upper:]' '[:lower:]' > "$set/spk2gender" || exit 1

  # Make the utt2spk and spk2utt files (the speaker is the first '_' field of
  # the utterance id).
  cut -d' ' -f1 "$set/segments" | awk -F'_' '{print $0,$1}' > "$set/utt2spk" || exit 1
  utils/utt2spk_to_spk2utt.pl < "$set/utt2spk" > "$set/spk2utt" || exit 1

  # Prepare STM file for sclite.
  # The getline loops test for "> 0": getline returns -1 when a file cannot be
  # read, and a bare 'while(getline < file)' would then never terminate.
  awk -v txt="$set/text" -v sex="$set/spk2gender" \
    'BEGIN{
       while((getline < txt) > 0) { ref[$1]=substr($0, index($0,$2)); }
       while((getline < sex) > 0) { gender[$1]=$2; }
       print ";; LABEL \"O\" \"Overall\" \"Overall\"";
       print ";; LABEL \"F\" \"Female\" \"Female speakers\"";
       print ";; LABEL \"M\" \"Male\" \"Male speakers\"";
     }
     { spk_id=substr($2,1,3);
       printf("%s 1 %s %s %s <O,%s> %s\n", $1, spk_id, $3, $4, toupper(gender[spk_id]), ref[$1]);
     }' "$set/segments" > "$set/stm" || exit 1

  # Create dummy GLM file for sclite (two text lines followed by an empty one).
  printf '%s\n' \
    ';; empty.glm' \
    '[FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token' \
    '' > "$set/glm"

  # Check that data dirs are okay!
  utils/validate_data_dir.sh --no-feats "$set" || exit 1
done

echo "Data preparation succeeded"