ami_text_prep.sh
1.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/bin/bash
###########################################################################################
# This script was copied from egs/ami/s5/local/ami_text_prep.sh
# The source commit was e69198c3dc5633f98eb88e1cdf20b2521a598f21
# Changes made:
# - Modified paths to match multi_en naming conventions
###########################################################################################
# Copyright 2015, Brno University of Technology (Author: Karel Vesely)
# Copyright 2014, University of Edinburgh (Author: Pawel Swietojanski), 2014, Apache 2.0
if [ $# -ne 1 ]; then
echo "Usage: $0 <ami-dir>"
echo " <ami-dir> is download space."
exit 1;
fi
set -eux
dir=$1
mkdir -p $dir
echo "Downloading annotations..."
amiurl=http://groups.inf.ed.ac.uk/ami
annotver=ami_public_manual_1.6.1
annot="$dir/$annotver"
logdir=data/local/ami/downloads; mkdir -p $logdir/log
[ ! -f $annot.zip ] && wget -nv -O $annot.zip $amiurl/AMICorpusAnnotations/$annotver.zip &> $logdir/log/download_ami_annot.log
if [ ! -d $dir/annotations ]; then
mkdir -p $dir/annotations
unzip -o -d $dir/annotations $annot.zip &> /dev/null
fi
[ ! -f "$dir/annotations/AMI-metadata.xml" ] && echo "$0: File AMI-Metadata.xml not found under $dir/annotations." && exit 1;
# extract text from AMI XML annotations,
local/ami_xml2text.sh $dir
wdir=data/local/ami/annotations
[ ! -f $wdir/transcripts1 ] && echo "$0: File $wdir/transcripts1 not found." && exit 1;
echo "Preprocessing transcripts..."
local/ami_split_segments.pl $wdir/transcripts1 $wdir/transcripts2 &> $wdir/log/split_segments.log
# make final train/dev/eval splits
for dset in train eval dev; do
[ ! -f local/ami_split_$dset.final ] && cp local/ami_split_$dset.orig local/ami_split_$dset.final
grep -f local/ami_split_$dset.final $wdir/transcripts2 > $wdir/$dset.txt
done