ami_sdm_data_prep.sh
#!/bin/bash
###########################################################################################
# This script was copied from egs/ami/s5/local/ami_sdm_data_prep.sh
# The source commit was e69198c3dc5633f98eb88e1cdf20b2521a598f21
# Changes made:
# - Specified path to path.sh
# - Modified paths to match multi_en naming conventions
# - Changed wav.scp to downsample to 8 kHz
###########################################################################################
# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski)
# AMI Corpus training data preparation
. ./path.sh
# check arguments
if [ $# != 2 ]; then
  echo "Usage: ami_sdm_data_prep.sh <path/to/AMI> <dist-mic-num>"
  exit 1;
fi
AMI_DIR=$1
MICNUM=$2
DSET="sdm$MICNUM"
SEGS=data/local/ami/annotations/train.txt
dir=data/local/ami/${DSET}_train
mkdir -p $dir
# Audio data directory check
if [ ! -d $AMI_DIR ]; then
  echo "Error: $AMI_DIR directory does not exist."
  exit 1;
fi
# And transcripts check
if [ ! -f $SEGS ]; then
  echo "Error: File $SEGS not found (run ami_text_prep.sh first)."
  exit 1;
fi
# as the SDM signal we use a single microphone (number $MICNUM) from the first array
find $AMI_DIR -iname "*.Array1-0$MICNUM.wav" | sort > $dir/wav.flist
n=`cat $dir/wav.flist | wc -l`
echo "In total, $n files were found."
[ $n -ne 169 ] && \
  echo "Warning: expected 169 audio files, found $n"
# (1a) Transcriptions preparation
# here we start with already normalised transcripts, just make the ids
# Note, we set here SDM rather than, for example, SDM1 as we want to easily use
# the same alignments across different mics
awk '{meeting=$1; channel="SDM"; speaker=$3; stime=$4; etime=$5;
printf("AMI_%s_%s_%s_%07.0f_%07.0f", meeting, channel, speaker, int(100*stime+0.5), int(100*etime+0.5));
for(i=6;i<=NF;i++) printf(" %s", $i); printf "\n"}' $SEGS | sort | uniq > $dir/text
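# each line of text is "<utt-id> <normalised transcript words>"; the utt-id encodes
# meeting, channel (SDM), speaker, and start/end times in hundredths of a second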
# (1c) Make segment files from transcript
# segments file format is: utt-id recording-id start-time end-time; utt-ids look like:
# AMI_ES2011a_SDM_FEE041_0003415_0003484
awk '{
  segment=$1;
  split(segment,S,"[_]");
  audioname=S[1]"_"S[2]"_"S[3]; startf=S[5]; endf=S[6];
  print segment " " audioname " " startf/100 " " endf/100
}' < $dir/text > $dir/segments
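# e.g.: AMI_ES2011a_SDM_FEE041_0003415_0003484 AMI_ES2011a_SDM 34.15 34.84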
#EN2001a.Array1-01.wav
sed -e 's?.*/??' -e 's?.wav??' $dir/wav.flist | \
perl -ne 'split; $_ =~ m/(.*)\..*/; print "AMI_$1_SDM\n"' | \
paste - $dir/wav.flist > $dir/wav1.scp
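# wav1.scp maps recording-ids to audio paths, e.g. "AMI_EN2001a_SDM <path>/EN2001a.Array1-01.wav"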
#Keep only training part of waves
awk '{print $2}' $dir/segments | sort -u | join - $dir/wav1.scp | sort -o $dir/wav2.scp
# two distant-microphone recordings are missing, so restrict segments to recordings present in wav.scp
awk '{print $1}' $dir/wav2.scp | join -2 2 - $dir/segments | \
awk '{print $2" "$1" "$3" "$4" "$5}' > $dir/s; mv $dir/s $dir/segments
#...and text with segments
awk '{print $1}' $dir/segments | join - $dir/text > $dir/t; mv $dir/t $dir/text
# replace the path with a sox command that selects a single channel and downsamples to 8 kHz
awk '{print $1" sox -c 1 -t wavpcm -s "$2" -r 8000 -t wavpcm - |"}' $dir/wav2.scp > $dir/wav.scp
# reco2file_and_channel maps each recording-id to the original file name and channel (always "A" here)
cat $dir/wav.scp | \
perl -ane '$_ =~ m:^(\S+SDM)\s+.*\/([IETB].*)\.wav.*$: || die "bad label $_";
print "$1 $2 A\n"; ' > $dir/reco2file_and_channel || exit 1;
# Assumption: for SDM we adapt to the session only, so the speaker-id is just the session prefix
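# e.g. utterance AMI_ES2011a_SDM_FEE041_0003415_0003484 maps to speaker AMI_ES2011a_SDM_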
awk '{print $1}' $dir/segments | \
perl -ane '$_ =~ m:^(\S+)([FM][A-Z]{0,2}[0-9]{3}[A-Z]*)(\S+)$: || die "bad label $_";
print "$1$2$3 $1\n";' | sort > $dir/utt2spk || exit 1;
sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
# Copy stuff into its final locations
mkdir -p data/ami_$DSET/train
for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
  cp $dir/$f data/ami_$DSET/train/$f || exit 1;
done
utils/validate_data_dir.sh --no-feats data/ami_$DSET/train
echo AMI $DSET data preparation succeeded.