gale_data_prep_txt.sh
3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/bin/bash
# Copyright 2014 (author: Ahmed Ali, Hainan Xu)
# Copyright 2016 Johns Hopkins Univeersity (author: Jan "Yenda" Trmal)
# Apache 2.0
echo $0 "$@"
export LC_ALL=C
galeData=$(utils/make_absolute.sh "${@: -1}" );
length=$(($#-1))
args=${@:1:$length}
top_pwd=`pwd`
txtdir=$galeData/txt
mkdir -p $txtdir
cd $txtdir
for cdx in ${args[@]}; do
echo "Preparing $cdx"
if [[ $cdx == *.tgz ]] ; then
tar -xvf $cdx
elif [ -d "$cdx" ]; then
tgt=$(basename $cdx)
test -x $tgt || ln -s $cdx `basename $tgt`
else
echo "I don't really know what I shall do with $cdx " >&2
fi
done
find -L . -type f -name *.tdf | while read file; do
sed '1,3d' $file
done > all.tmp
perl -e '
($inFile,$idFile,$txtFile,$spk,$mapf)= split /\s+/, $ARGV[0];
open(IN, "$inFile");
open(ID, ">$idFile");
open(TXT, ">$txtFile");
open(SPK, ">$spk");
open(MAP, ">$mapf");
while (<IN>) {
@arr= split /\t/,$_;
$arr[4] =~ s/ //g;
$arr[4] = sprintf("%020s", $arr[4]);
$spkid = "$arr[0]_$arr[4]";
$spkfix = sprintf("%080s", $spkid);
$start=sprintf ("%0.3f",$arr[2]);
$rStart=$start;
$start=~s/\.//;
$start=~s/^0+$/0/;
$start=~s/^0+([^0])/$1/; # remove zeros at the beginning
$start = sprintf("%09s", $start);
$end=sprintf ("%0.3f",$arr[3]);
$rEnd=$end;
$end=~s/^0+([^0])/$1/;
$end=~s/\.//;
$end = sprintf("%09s", $end);
$id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n";
next if ($rStart == $rEnd);
$id =~ s/.sph//g;
print ID $id;
print TXT "$arr[7]\n";
print SPK "${spkfix}_$arr[0]_${start}_${end} ${spkfix}\n";
print MAP "$arr[0] ${spkfix}_$arr[0]\n";
}' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp"
perl -p -i -e 's=/.$==g' contentall.tmp
cd $top_pwd
pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'`
export PYTHONPATH=$PYTHONPATH:`pwd`/tools/mmseg-1.3.0/lib/python${pyver}/site-packages
if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then
echo "--- Downloading mmseg-1.3.0 ..."
echo "NOTE: it assumes that you have Python, Setuptools installed on your system!"
wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz
tar xf tools/mmseg-1.3.0.tar.gz -C tools
cd tools/mmseg-1.3.0
mkdir -p lib/python${pyver}/site-packages
CC=gcc CXX=g++ python setup.py build
python setup.py install --prefix=.
cd ../..
if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then
echo "mmseg is not found - installation failed?"
exit 1
fi
fi
cat $txtdir/contentall.tmp |\
sed -e 's/,//g' |\
sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\
sed -e 's/<\/foreign>/ /g' |\
perl -pe 's/<Event.*?>/ /g' |\
sed -e 's/\[NS\]//g' |\
sed -e 's/\[ns\]//g' |\
sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
local/gale_normalize.pl | \
python local/gale_segment.py \
> $txtdir/text
paste $txtdir/allid.tmp $txtdir/text | sed 's: $::' | awk '{if (NF>5) {print $0}}' > $txtdir/all_1.tmp
awk '{$1="";print $0}' $txtdir/all_1.tmp | sed 's:^ ::' > $txtdir/../all
cat $txtdir/utt2spk.tmp | sort -u > $txtdir/../utt2spk
cat $txtdir/map.tmp | sort -u > $txtdir/../map
sort -c $txtdir/../utt2spk
utils/utt2spk_to_spk2utt.pl $txtdir/../utt2spk | sort -u > $txtdir/../spk2utt
echo data prep text succeeded