csj_eval_data_prep.sh
2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/bin/bash
# Copyright 2015 Tokyo Institute of Technology (Authors: Takafumi Moriya and Takahiro Shinozaki)
# 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
# Apache 2.0
# Acknowledgement This work was supported by JSPS KAKENHI Grant Number 26280055.
# Official evaluation data set (each set contains 10 speakers) preparation
# To be run from one directory above this script.
# The input is the directory containing the official evaluation test set and transcripts.
# Command line: exactly two arguments are required.
#   $1  transcription directory; it must contain a sub-directory named after $2
#   $2  name of the evaluation set; also names the work dir data/local/<eval_num>
if [ $# -ne 2 ]; then
  echo "Usage: $(basename "$0") <transcription-dir> <eval_num>"
  echo "See comments in the script for more details"
  exit 1
fi

tdir=$1
eval_num=$2

# Load the recipe environment. Check for the file first instead of testing the
# status of the "." command, because that status is simply the status of the
# last command inside path.sh.
if [ ! -f ./path.sh ]; then
  echo "$0: error: ./path.sh not found (run this script from the recipe directory)"
  exit 1
fi
. ./path.sh

# Fail early with a clear message rather than later on an unmatched glob.
if [ ! -d "$tdir/$eval_num" ]; then
  echo "$0: error: no such directory $tdir/$eval_num"
  exit 1
fi

# Work directory for all intermediate files.
dir=data/local/$eval_num
mkdir -p "$dir" || exit 1
# Merge every *-wav.list found one level below <tdir>/<eval_num> into a single
# sorted list of wav paths. An unmatched glob makes sort fail, which stops the
# script here instead of silently producing an empty list.
sort "$tdir/$eval_num"/*/*-wav.list > "$dir/wav.flist" || exit 1

# Derive an id from each path by dropping the directory part, the ".wav"
# extension (the dot is escaped so only a literal ".wav" is removed) and a
# "-R" / "-L" suffix, then put the id next to the path it came from.
sed -e 's?.*/??' -e 's?\.wav??' -e 's?-[RL]??' "$dir/wav.flist" \
  | paste - "$dir/wav.flist" > "$dir/wavflist.scp" || exit 1

# Write one wav.scp line per recording in the form "<id> cat <path> |".
awk '{ printf("%s cat %s |\n", $1, $2); }' "$dir/wavflist.scp" \
  | sort > "$dir/wav.scp" || exit 1
# Build the transcript file, one utterance per line.
# Each input line of *-trans.text is read as
#   <id> <start-sec> <end-sec> <word> ...
# and rewritten as
#   <name>_<start-ms>_<end-ms> <lower-cased words>
# where <name> is the part of <id> before the first "_" and both times are
# zero-padded to 7 digits, e.g. start 380.213 and end 385.951 give
#   A01F0055_0380213_0385951
#
# csj_format_transcripts [file ...]
#   Reads the named files (or stdin when none are given) and writes the
#   reformatted lines to stdout.
csj_format_transcripts() {
  awk '{
    split($1, T, "[_ ]");
    name = T[1]; stime = $2; etime = $3;
    # Round to the nearest millisecond. A plain int(1000*t) truncates, and
    # since e.g. 1.001*1000 evaluates to 1000.9999999999999 in floating
    # point it would produce 1000 instead of 1001.
    printf("%s_%07.0f_%07.0f", name, int(1000 * stime + 0.5), int(1000 * etime + 0.5));
    for (i = 4; i <= NF; i++) printf(" %s", tolower($i));
    printf "\n"
  }' "$@"
}

# Sort in the C locale so the order agrees with the "sort -c" check that is
# run under LC_ALL=C further down in this script.
csj_format_transcripts "$tdir/$eval_num"/*/*-trans.text \
  | LC_ALL=C sort > "$dir/transcripts_${eval_num}.txt"
# Delete the tags <s> and </s> (in any letter case) from the transcripts and
# keep only the lines that still have at least one field after the utterance id.
perl -pe 's/<s>//gi; s/<\/s>//gi;' $dir/transcripts_${eval_num}.txt \
  | awk 'NF > 1' > $dir/text

# Switch to the C locale for the remainder of the script.
export LC_ALL=C

# Abort unless the text file is already in sorted order.
if ! sort -c $dir/text; then
  exit 1
fi
## Create segment file
# The utterance ids in the text file have the form <name>_<start-ms>_<end-ms>.
# Each output line is
#   <utt-id> <name> <start-sec> <end-sec>
#
# csj_text_to_segments [file ...]
#   Reads text lines from the named files (or stdin when none are given) and
#   writes the segment lines to stdout.
csj_text_to_segments() {
  awk '{
    split($1, S, "_");
    # Print the times with a fixed three decimals. Letting awk convert the
    # quotient to a string on its own uses "%.6g", which keeps only six
    # significant digits and turns e.g. 1234567/1000 into 1234.57.
    printf("%s %s %.3f %.3f\n", $1, S[1], S[2] / 1000, S[3] / 1000);
  }' "$@"
}

csj_text_to_segments < "$dir/text" > "$dir/segments"
# utt2spk: the speaker of an utterance is taken to be the part of its id that
# precedes the first underscore (the whole id when it has no underscore).
awk '{
  utt = $1
  cut = index(utt, "_")
  spk = (cut > 0) ? substr(utt, 1, cut - 1) : utt
  print utt " " spk
}' $dir/segments > $dir/utt2spk || exit 1;

# spk2utt: order utt2spk by its second column and pass it through the helper
# utils/utt2spk_to_spk2utt.pl (helper not shown here).
if ! sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt; then
  exit 1
fi
# Copy the prepared files from the work directory to data/<eval_num> and then
# run utils/fix_data_dir.sh on the result (helper not shown here).
dest=data/$eval_num
mkdir -p "$dest" || exit 1
for x in wav.scp segments text utt2spk spk2utt; do
  # Stop at the first failed copy instead of carrying on with a partial
  # data directory.
  cp "$dir/$x" "$dest/$x" || exit 1
done
utils/fix_data_dir.sh "$dest" || exit 1
# Sanity check: the finished data directory must hold exactly 10 wav.scp lines.
# Count once, and treat an unreadable wav.scp as an error. With an empty count
# the numeric test itself fails, the "if" is skipped, and the script would
# wrongly report success.
if ! num_wav=$(wc -l < "$dest/wav.scp"); then
  echo "$0: error: cannot read $dest/wav.scp"
  exit 1
fi
if [ "$num_wav" -ne 10 ]; then
  echo "$0: error: expected 10 lines in wav.scp, got $num_wav"
  exit 1
fi
echo "Completed preparation evaluation set $eval_num"