prepare_data.sh
4.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/bin/bash
#
# Copyright 2017 Johns Hopkins University (Author: Shinji Watanabe, Yenda Trmal)
# Apache 2.0
# Begin configuration section.
# These defaults are assigned before parse_options.sh is sourced below.
# mictype selects the data-preparation branch: "worn" and "ref" are handled
# explicitly; any other value is used as a case-insensitive path filter when
# searching for the wav files.
mictype=worn # worn, ref or others
# When true, intermediate files in the output directory are removed after
# wav.scp and text have been written.
cleanup=true
# End configuration section
# NOTE(review): parse_options.sh is not visible here; presumably it lets
# command-line options (the "[opts]" of the usage message) override the
# defaults above -- confirm.
. ./utils/parse_options.sh # accept options
# NOTE(review): path.sh is not visible here; presumably it sets up the
# environment for the tools called later in this script -- confirm.
. ./path.sh
# Log the complete invocation on stderr.
echo "$0" "$@" >&2

# Exactly three positional arguments are required.  Otherwise repeat the
# invocation, report the problem, print the usage with an example, and stop
# with exit status 1.  Everything in the group goes to stderr.
if (( $# != 3 )); then
  {
    echo "$0" "$@"
    echo "$0: Error: wrong number of arguments"
    echo -e "Usage:\n $0 [opts] <audio-dir> <json-transcript-dir> <output-dir>"
    echo -e "eg:\n $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train"
  } >&2
  exit 1
fi
# From here on: abort on the first failing command, and treat a pipeline as
# failed when any of its stages fails.
set -e -o pipefail

# Positional arguments, in the order given by the usage message.
adir=$1 # directory searched for *.wav files
jdir=$2 # directory searched for *.json transcripts
dir=$3  # output data directory

# Count the input files (recursively, following symlinks).  The directory
# names are quoted so that paths containing spaces or glob characters are
# passed to find as a single argument, and an empty argument is reported by
# find instead of silently searching the current directory.
json_count=$(find -L "$jdir" -name "*.json" | wc -l)
wav_count=$(find -L "$adir" -name "*.wav" | wc -l)

# Refuse to continue when either directory holds no matching file.
if [ "$json_count" -eq 0 ]; then
  echo >&2 "We expect that the directory $jdir will contain json files."
  echo >&2 "That implies you have supplied a wrong path to the data."
  exit 1
fi
if [ "$wav_count" -eq 0 ]; then
  echo >&2 "We expect that the directory $adir will contain wav files."
  echo >&2 "That implies you have supplied a wrong path to the data."
  exit 1
fi
echo "$0: Converting transcription to text"
mkdir -p "$dir"

# Run json2text.py on every file matching $jdir/*json and normalise the
# concatenated output before writing it to $dir/text.orig:
#   - "[inaudible<digits, spaces, dashes>]" becomes "[inaudible]"
#   - " - " becomes a single space
#   - "mm-" becomes "mm"
# All expansions are quoted so that paths with spaces survive and an empty
# $mictype cannot cause the file name to be taken as the option value.
# The three substitutions run in one sed process, applied in the same order
# as before.
for file in "$jdir"/*json; do
  ./local/json2text.py --mictype "$mictype" "$file"
done |
  sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" \
      -e 's/ - / /g' \
      -e 's/mm-/mm/g' > "$dir/text.orig"
echo "$0: Creating datadir $dir for type=\"$mictype\""

# Build $dir/wav.scp and $dir/text.  The layout depends on $mictype:
#   worn   - two entries per wav file (left / right channel)
#   ref    - only the wav files whose path matches an id found in the text
#   others - every wav file whose path contains $mictype (case-insensitive)
if [[ "$mictype" == "worn" ]]; then
  # Convert the file names to wav.scp format.  The basename S<..>_P<..>
  # (without the .wav extension) is split on "_" and its first two parts are
  # swapped to form the key; ".L" and ".R" are appended for the left and
  # right channel, so each file yields two entries that extract one channel
  # with "sox ... remix 1" / "remix 2".
  # The extension is stripped with an escaped, anchored pattern so that only
  # a literal trailing ".wav" is removed.
  find -L "$adir" -name "S[0-9]*_P[0-9]*.wav" |
    perl -ne '{
      chomp;
      $path = $_;
      next unless $path;
      @F = split "/", $path;
      ($f = $F[@F-1]) =~ s/\.wav$//;
      @F = split "_", $f;
      print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n";
      print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n";
    }' | sort > "$dir/wav.scp"

  # Generate the transcripts for both channels: trailing spaces are removed
  # and every line of text.orig is emitted twice, with ".L" and ".R"
  # inserted before its first dash, e.g.
  #   P09_S03-0006072-0006147 gimme the baker
  # becomes
  #   P09_S03.L-0006072-0006147 gimme the baker
  #   P09_S03.R-0006072-0006147 gimme the baker
  sed -n 's/ *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' "$dir/text.orig" |
    sort > "$dir/text"
elif [[ "$mictype" == "ref" ]]; then
  # Fixed reference array.
  # First create the text (".ENH" inserted before the first dash of every
  # line); it is used below to select the reference-array wav files.
  perl -ne 's/-/.ENH-/;print;' "$dir/text.orig" | sort > "$dir/text"
  find -L "$adir" | grep "\.wav" | sort > "$dir/wav.flist"

  # Keep only the wav paths matching one of the "<field2>_<field3>" strings
  # taken from the part of each text line before its first dash (split on
  # "_", with ".ENH" removed).  The patterns are handed to grep through
  # -f so that they are neither word-split nor glob-expanded by the
  # shell, and an empty pattern list cannot leave grep reading from stdin.
  grep -f <(cut -f 1 -d"-" "$dir/text" |
              awk -F"_" '{print $2 "_" $3}' |
              sed -e "s/\.ENH//" |
              sort -u) \
    "$dir/wav.flist" > "$dir/wav.flist2"

  # wav.scp key: basename of the wav path with ".wav" replaced by ".ENH".
  paste -d" " \
    <(awk -F "/" '{print $NF}' "$dir/wav.flist2" | sed -e "s/\.wav/.ENH/") \
    "$dir/wav.flist2" | sort > "$dir/wav.scp"
else
  # Array mic case.
  # Convert the file names to wav.scp format, using the basename of the file
  # (without ".wav") as the wav.scp key.
  find -L "$adir" -name "*.wav" -ipath "*${mictype}*" |
    perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |
    sort -u > "$dir/wav.scp"

  # Convert the transcripts to per-channel transcripts: every line of
  # text.orig is emitted four times, with ".CH1" ... ".CH4" inserted before
  # its first dash, e.g.
  #   <id>-0006072-0006147 gimme the baker
  # becomes
  #   <id>.CH1-0006072-0006147 gimme the baker
  #   <id>.CH2-0006072-0006147 gimme the baker
  #   <id>.CH3-0006072-0006147 gimme the baker
  #   <id>.CH4-0006072-0006147 gimme the baker
  perl -ne '$l=$_;
    for($i=1; $i<=4; $i++) {
      ($x=$l)=~ s/-/.CH\Q$i\E-/;
      print $x;}' "$dir/text.orig" | sort > "$dir/text"
fi
# Optionally remove intermediate files from the output directory.  $cleanup
# is executed as a command, so it is expected to hold "true" or "false".
# $dir is quoted so that an output path with spaces is handled; the glob
# suffixes stay outside the quotes so they still expand.
# NOTE(review): $dir/wav.flist2 (written in the "ref" case) is not matched by
# these patterns and is left behind -- confirm whether that is intended.
if $cleanup; then
  rm -f "$dir"/text.* "$dir"/wav.scp.* "$dir/wav.flist"
fi
# Prepare 'segments', 'utt2spk', 'spk2utt'
#
# segments gets one line per utterance id (first column of $dir/text):
#   <utt-id> <recording-id> <start> <end>
# awk splits the utterance id on "-": the part before the first dash becomes
# the recording id, and the next two fields are divided by 100 and printed
# with the %08.2f format as start and end.
# The first sed expression replaces the second match of "_[A-Z]*\." on each
# line with "." (presumably dropping a location tag from the recording-id
# column -- confirm against the ids produced by json2text.py).
# For every mictype other than "worn", the first " P.._" on the line is
# additionally reduced to a single space (presumably removing the speaker
# prefix from the recording id -- confirm against the wav.scp keys).
# The "ref" and array branches were identical, so a single pipeline with an
# optional second expression replaces the three copies.
segments_sed=(-e "s/_[A-Z]*\././2")
if [[ "$mictype" != "worn" ]]; then
  segments_sed+=(-e 's/ P.._/ /')
fi

cut -d" " -f 1 "$dir/text" |
  awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |
  sed "${segments_sed[@]}" > "$dir/segments"
# utt2spk: for every utterance id (first column of segments) the speaker id
# is the utterance id with everything from its first underscore on removed.
# Paths are quoted so that an output directory with spaces is handled.
cut -f 1 -d ' ' "$dir/segments" |
  perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > "$dir/utt2spk"

# spk2utt is derived from utt2spk by the helper script.
utils/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt"

# Check that data dirs are okay!
utils/validate_data_dir.sh --no-feats "$dir" || exit 1