ami_xml2text.sh
2.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/bash
###########################################################################################
# This script was copied from egs/ami/s5/local/ami_xml2text.sh
# The source commit was e69198c3dc5633f98eb88e1cdf20b2521a598f21
# Changes made:
# - Modified paths to match multi_en naming conventions
###########################################################################################
# Copyright, University of Edinburgh (Pawel Swietojanski and Jonathan Kilgour)
if [ $# -ne 1 ]; then
echo "Usage: $0 <ami-dir>"
exit 1;
fi
adir=$1
wdir=data/local/ami/annotations
[ ! -f $adir/annotations/AMI-metadata.xml ] && echo "$0: File $adir/annotations/AMI-metadata.xml no found." && exit 1;
mkdir -p $wdir/log
JAVA_VER=$(java -version 2>&1 | sed 's/java version "\(.*\)\.\(.*\)\..*"/\1\2/; 1q')
if [ "$JAVA_VER" -ge 15 ]; then
if [ ! -d $wdir/nxt ]; then
echo "Downloading NXT annotation tool..."
wget -O $wdir/nxt.zip http://sourceforge.net/projects/nite/files/nite/nxt_1.4.4/nxt_1.4.4.zip
[ ! -s $wdir/nxt.zip ] && echo "Downloading failed! ($wdir/nxt.zip)" && exit 1
unzip -d $wdir/nxt $wdir/nxt.zip &> /dev/null
fi
if [ ! -f $wdir/transcripts0 ]; then
echo "Parsing XML files (can take several minutes)..."
nxtlib=$wdir/nxt/lib
java -cp $nxtlib/nxt.jar:$nxtlib/xmlParserAPIs.jar:$nxtlib/xalan.jar:$nxtlib \
FunctionQuery -c $adir/annotations/AMI-metadata.xml -q '($s segment)(exists $w1 w):$s^$w1' -atts obs who \
'@extract(($sp speaker)($m meeting):$m@observation=$s@obs && $m^$sp & $s@who==$sp@nxt_agent,global_name, 0)'\
'@extract(($sp speaker)($m meeting):$m@observation=$s@obs && $m^$sp & $s@who==$sp@nxt_agent, channel, 0)' \
transcriber_start transcriber_end starttime endtime '$s' '@extract(($w w):$s^$w & $w@punc="true", starttime,0,0)' \
1> $wdir/transcripts0 2> $wdir/log/nxt_export.log
fi
else
echo "$0. Java not found. Will download exported version of transcripts."
annots=ami_manual_annotations_v1.6.1_export
wget -O $wdir/$annots.gzip http://groups.inf.ed.ac.uk/ami/AMICorpusAnnotations/$annots.gzip
gunzip -c $wdir/${annots}.gzip > $wdir/transcripts0
fi
#remove NXT logs dumped to stdio
grep -e '^Found' -e '^Obs' -i -v $wdir/transcripts0 > $wdir/transcripts1
exit 0;