ami_ihm_data_prep.sh
3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/bash
###########################################################################################
# This script was copied from egs/ami/s5/local/ami_ihm_data_prep.sh
# The source commit was e69198c3dc5633f98eb88e1cdf20b2521a598f21
# Changes made:
# - Specified path to path.sh
# - Modified paths to match multi_en naming conventions
# - Changed wav.scp to downsample to 8 kHz
###########################################################################################
# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
# AMI Corpus training data preparation
# Apache 2.0
# To be run from one directory above this script.
# Load the recipe environment. The script is meant to be started from the
# directory that holds path.sh (one level above this script), so a missing
# path.sh means we were launched from the wrong place.
if [ ! -f ./path.sh ]; then
  echo "Error: ./path.sh not found (run from one directory above this script)." >&2
  exit 1
fi
. ./path.sh

# Exactly one argument is accepted: the root directory of the AMI corpus.
if [ $# != 1 ]; then
  echo "Usage: ami_ihm_data_prep.sh /path/to/AMI" >&2
  exit 1
fi

AMI_DIR=$1                                  # corpus root, searched for wavs
SEGS=data/local/ami/annotations/train.txt   # normalised training transcripts
dir=data/local/ami/ihm_train                # working dir for intermediate files
mkdir -p "$dir" || exit 1

# Audio data directory check.
# Quoted so that a path containing blanks is tested as one word; unquoted,
# `[` fails with "too many arguments" and the check is silently skipped.
if [ ! -d "$AMI_DIR" ]; then
  echo "Error: $AMI_DIR directory does not exists." >&2
  exit 1
fi

# And transcripts check
if [ ! -f "$SEGS" ]; then
  echo "Error: File $SEGS no found (run ami_text_prep.sh)." >&2
  exit 1
fi
# find headset wav audio files only
# The match is case-insensitive and the resulting list is stored sorted by
# path. $AMI_DIR is quoted so the corpus root is passed to find as one word.
find "$AMI_DIR" -iname '*.Headset-*.wav' | sort > "$dir/wav.flist"

# Report how many recordings were found; a count other than 687 only
# triggers a warning, it does not stop the preparation.
n=$(wc -l < "$dir/wav.flist")
echo "In total, $n headset files were found."
if [ "$n" -ne 687 ]; then
  echo "Warning: expected 687 (168 mtgs x 4 mics + 3 mtgs x 5 mics) data files, found $n"
fi
# (1a) Transcriptions preparation
# Input lines of $SEGS are read as: meeting channel speaker start end word...
# Each one becomes "<utt-id> word..." where the utt id follows the convention
#   AMI_MEETING_CHAN_SPK_STIME_ETIME
#   AMI_ES2011a_H00_FEE041_0003415_0003484
# STIME/ETIME are the times multiplied by 100, rounded to the nearest integer
# and zero-padded to 7 digits.
# Some (rare) entries are doubled in the transcripts, hence sort + uniq.
awk '{
  utt_id = sprintf("AMI_%s_%s_%s_%07.0f_%07.0f",
                   $1, $2, $3, int(100 * $4 + 0.5), int(100 * $5 + 0.5))
  transcript = ""
  for (w = 6; w <= NF; w++)
    transcript = transcript " " $w
  print utt_id transcript
}' "$SEGS" \
  | sort \
  | uniq > "$dir/text"
# (1b) Make segment files from transcript
# The utterance id (first column of text) is split on "_". Pieces 1-3 form
# the recording id, pieces 5 and 6 are the start/end times in hundredths of a
# second and are converted to seconds. Every output line is
#   <utt-id> <recording-id> <start> <end>
# followed by a single trailing blank.
awk '{
  utt = $1
  split(utt, part, "_")
  reco = part[1] "_" part[2] "_" part[3]
  start_sec = part[5] / 100
  end_sec = part[6] / 100
  print utt " " reco " " start_sec " " end_sec " "
}' < "$dir/text" > "$dir/segments"
# (1c) Make wav.scp file.
# For every path in wav.flist derive the recording id AMI_<meeting>_H0<mic>
# from the file name (<meeting>.<anything>-<digit>[.wav]) and write
# "<id><TAB><path>" to wav1.scp. Id and path are produced in the same pass so
# they cannot get out of step, only a real ".wav" suffix is stripped, and a
# file name that does not fit the pattern aborts the script instead of
# silently reusing the captures of the previous line.
perl -ne '
  chomp;
  my $path = $_;
  (my $name = $path) =~ s{.*/}{};
  $name =~ s{\.wav$}{}i;
  $name =~ m/(.*)\..*\-([0-9])/ or die "bad wav name $path\n";
  print "AMI_${1}_H0${2}\t$path\n";
' "$dir/wav.flist" > "$dir/wav1.scp" || exit 1

# Keep only train part of waves
# join needs both inputs ordered on the join field. wav1.scp is in path
# order, so it is re-sorted on the id column before joining with the
# recording ids that occur in segments.
awk '{print $2}' "$dir/segments" | sort -u \
  | join - <(sort -k1,1 "$dir/wav1.scp") > "$dir/wav2.scp"

# Replace each path with a sox command piped to the reader. The command
# declares one channel of signed-integer wav input and asks for 8000 Hz wav
# output on stdout. "-e signed-integer" is the current spelling of the
# deprecated "-s" flag.
awk '{print $1" sox -c 1 -t wavpcm -e signed-integer "$2" -r 8000 -t wavpcm - |"}' \
  "$dir/wav2.scp" > "$dir/wav.scp"
# (1d) reco2file_and_channel
# From each wav.scp line take the recording id (ending in H00..H04) and the
# base name of the wav file (starting with I, E, T or B) and emit
#   <recording-id> <wav-basename> A
# Any line that does not fit the pattern aborts with "bad label".
perl -ne '
  $_ =~ m:^(\S+)(H0[0-4])\s+.*\/([IETB].*)\.wav.*$: or die "bad label $_";
  print "$1$2 $3 A\n";
' "$dir/wav.scp" > "$dir/reco2file_and_channel" || exit 1

# utt2spk: the speaker id is the utterance id cut off right after the speaker
# code (F or M, up to two capitals, three digits, optional capitals), e.g.
# AMI_ES2011a_H00_FEE041_0003415_0003484 -> AMI_ES2011a_H00_FEE041
awk '{print $1}' "$dir/segments" \
  | perl -ne '
      $_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: or die "bad label $_";
      print "$1$2$3 $1$2\n";
    ' > "$dir/utt2spk" || exit 1

# spk2utt: invert the mapping after ordering it by speaker.
sort -k 2 "$dir/utt2spk" \
  | utils/utt2spk_to_spk2utt.pl > "$dir/spk2utt" || exit 1
# Copy stuff into its final location
# Each generated file is copied individually; the first failed copy stops the
# script with status 1.
final_dir=data/ami_ihm/train
mkdir -p "$final_dir"
for name in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
  if ! cp "$dir/$name" "$final_dir/$name"; then
    exit 1
  fi
done

# Validate the finished data directory (no features exist yet, hence
# --no-feats) and only then report success.
if ! utils/validate_data_dir.sh --no-feats "$final_dir"; then
  exit 1
fi
echo "AMI IHM data preparation succeeded."