csj_data_prep.sh
4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/bin/bash
# Copyright 2015 Tokyo Institute of Technology (Authors: Takafumi Moriya and Takahiro Shinozaki)
# 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
# Apache 2.0
# Acknowledgement This work was supported by JSPS KAKENHI Grant Number 26280055.
# CSJ 291 hours training data preparation.
# "Academic lecture" 270 hours (Exclude the speakers in the evaluation set from all speech data)
# + "Other" 21 hours
# The actual amount of training data is 240 hours, because R-tagged utterances and silence sections are excluded.
# To be run from one directory above this script.
## The input is a directory that contains the CSJ corpus.
## Note: If necessary, rewrite the "cat" commands used below
## so that they locate the .wav file paths.
# Load the recipe environment. path.sh is looked up in the current working
# directory, so the script has to be started from the recipe root.
. ./path.sh
set -e # exit on error

# Command-line check: <csj-data dir> is mandatory, <mode_number> is optional.
# The synopsis is a diagnostic, so it goes to stderr.
if [ $# -ne 1 ] && [ $# -ne 2 ]; then
  echo "Usage: csj_data_prep.sh <csj-data dir> [<mode_number>]" >&2
  echo " mode_number can be 0, 1, 2, 3, (0=default using academic lecture and other data, 1=using academic lecture data," >&2
  echo " 2=using all data except for dialog data, 3=using all data)" >&2
  exit 1;
fi

# CSJ  : root directory of the CSJ corpus (first argument).
# mode : data-selection mode; a missing or empty second argument means 0.
CSJ=$1
mode=${2:-0}

# Reject anything that is not one of the documented modes. Without this check
# an invalid value makes every later "[ $mode -eq N ]" test fail with an
# "integer expression expected" error and silently behave like mode 0.
case "$mode" in
  0|1|2|3) ;;
  *)
    echo "Error: invalid mode_number '$mode' (expected 0, 1, 2 or 3)" >&2
    exit 1
    ;;
esac
# Working directory that receives every intermediate file of this script.
dir=data/local/train

# Audio data directory check. This is done before anything is created on disk.
# The quotes matter: with an empty argument the unquoted form collapses to
# "[ ! -d ]", which evaluates to false and lets the check pass.
if [ ! -d "$CSJ" ]; then
  echo "Error: csj_data_prep.sh requires an existing CSJ data directory, got '$CSJ'" >&2
  exit 1;
fi

mkdir -p "$dir"

# CSJ dictionary file check: keep a lexicon that is already in $dir, otherwise
# copy the one found under <csj-data dir>/lexicon.
if [ ! -f "$dir/lexicon.txt" ]; then
  cp "$CSJ/lexicon/lexicon.txt" "$dir" || exit 1;
fi
### Config of using wav data that relates with acoustic model training ###
# Concatenate the "*-wav.list" files selected by $mode into $dir/wav.flist
# (sorted). Errors from cat are discarded on purpose: a glob that matches
# nothing is passed to cat literally and would otherwise print a
# "No such file" message.
# expected_n is only set for the default selection, the only one for which a
# reference file count is known in this script.
expected_n=
if [ "$mode" -eq 3 ]; then
  # Using All data
  cat "$CSJ"/*/*/*-wav.list 2>/dev/null | sort > "$dir/wav.flist"
elif [ "$mode" -eq 2 ]; then
  # Using All data except for "dialog" data
  cat "$CSJ"/*/{A*,M*,R*,S*}/*-wav.list 2>/dev/null | sort > "$dir/wav.flist"
elif [ "$mode" -eq 1 ]; then
  # Using "Academic lecture" data
  cat "$CSJ"/*/A*/*-wav.list 2>/dev/null | sort > "$dir/wav.flist"
else
  # Using "Academic lecture" and "other" data
  cat "$CSJ"/*/{A,M}*/*-wav.list 2>/dev/null | sort > "$dir/wav.flist"
  expected_n=986
fi

# Number of listed files. The arithmetic expansion strips the padding that
# some wc implementations put in front of the count.
n=$(( $(wc -l < "$dir/wav.flist") ))

# An empty list means none of the globs matched; nothing useful can be
# produced from here on, so stop with a clear message.
if [ "$n" -eq 0 ]; then
  echo "Error: no *-wav.list entries found under '$CSJ' (mode $mode)" >&2
  exit 1
fi

# The reference count applies to the default selection only, so the warning is
# not printed for the other modes.
if [ -n "$expected_n" ] && [ "$n" -ne "$expected_n" ]; then
  echo "Warning: expected 986 data files (Case : Using 'Academic lecture' and 'Other' data), found $n."
fi
# Use byte-wise collation for every sort from here on. This must be in effect
# BEFORE the first sort below: "sort -c" verifies the order in the C locale,
# so a file sorted under another locale could fail the check.
# (path.sh may already export this; that file is not visible from here.)
export LC_ALL=C;

# (1a) Transcriptions preparation
# make basic transcription file (add segments info)
##e.g A01F0055_0172 00380.213 00385.951 => A01F0055_0380213_0385951
## for CSJ
# Field 1 is "<name>_<index>", fields 2 and 3 are start/end times in seconds,
# the remaining fields are the words, which are lower-cased.
# NOTE(review): int() truncates, so a product that lands just below an integer
# in floating point (e.g. 1000*1.001) comes out 1 ms low. Left unchanged here
# because rounding instead would alter utterance IDs — confirm what is wanted.
awk '{
  spkutt_id=$1;
  split(spkutt_id,T,"[_ ]");
  name=T[1]; stime=$2; etime=$3;
  printf("%s_%07.0f_%07.0f",name, int(1000*stime), int(1000*etime));
  for(i=4;i<=NF;i++) printf(" %s", tolower($i)); printf "\n"
}' "$CSJ"/*/*/*-trans.text | sort > "$dir/transcripts1.txt" # This data is for training language models
# Except evaluation set (30 speakers)

# test if trans. file is sorted
sort -c "$dir/transcripts1.txt" || exit 1; # check it's sorted.

# Remove Option.
# **NOTE: modified the pattern matches to make them case insensitive
# Strip the <s> and </s> markers, then drop lines that have no word left
# after the utterance ID.
perl -ne 's:\<s\>::gi;
  s:\<\/s\>::gi;
  print;' "$dir/transcripts1.txt" \
  | awk '{if(NF > 1) { print; } } ' | sort > "$dir/text"
# (1c) Make segments files from transcript
# segments file format is: utt-id recording-id start-time end-time, e.g.:
# A01F0055_0380213_0385951 => A01F0055_0380213_0385951 A01F0055 380.213 385.951
# The times are recovered from the millisecond fields embedded in the utt-id.
# They are printed with an explicit %.3f: string concatenation in awk converts
# non-integer numbers with CONVFMT (%.6g by default), which would turn
# 1234.567 into 1234.57 for any time of 1000 seconds or more.
awk '{
  segment = $1;
  split(segment, S, "[_]");
  spkid = S[1]; startf = S[2]; endf = S[3];
  printf("%s %s %.3f %.3f\n", segment, spkid, startf/1000, endf/1000);
}' < "$dir/text" > "$dir/segments"

# Derive a recording ID from every listed wav path (strip the directory, the
# ".wav" extension and a "-R"/"-L" marker) and pair it with the full path.
# The dot is escaped so only a literal ".wav" is removed, and the bracket
# expression no longer contains a comma.
sed -e 's?.*/??' -e 's?\.wav??' -e 's?-[RL]??' "$dir/wav.flist" \
  | paste - "$dir/wav.flist" > "$dir/wavflist.scp"

# wav.scp: "<recording-id> cat <path> |", sorted by recording ID.
awk '{
  printf("%s cat %s |\n", $1, $2);
}' < "$dir/wavflist.scp" | sort > "$dir/wav.scp" || exit 1;

# utt2spk: the speaker ID is the part of the utt-id before the first "_".
awk '{segment=$1; split(segment,S,"[_]"); spkid=S[1]; print $1 " " spkid}' \
  "$dir/segments" > "$dir/utt2spk" || exit 1;

# spk2utt is generated from utt2spk ordered by speaker.
sort -k 2 "$dir/utt2spk" | utils/utt2spk_to_spk2utt.pl > "$dir/spk2utt" || exit 1;
# Copy stuff into its final locations [this has been moved from the format_data script]
mkdir -p data/train
for f in spk2utt utt2spk wav.scp text segments; do
  cp "$dir/$f" data/train/ || exit 1;
done

# Run the data-directory fix-up on the result first; success is only reported
# once this last step has completed without error.
utils/fix_data_dir.sh data/train || exit 1;
echo "CSJ data preparation succeeded."