resegment_text.sh
4.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/bin/bash
# Copyright Johns Hopkins University (Author: Daniel Povey) 2013. Apache 2.0.
# This script takes two data directories that represent different
# segmentations of the same data (both must have "segments" files and
# the recording-ids must match), and it converts the text in one directory
# to correspond to the segmentation in the other. Its output is the
# "text" file in the second directory. To get the alignments, it
# must be provided an "alignment" directory where the training data
# from the first directory has been aligned.
# begin configuration section.
# Defaults for the options listed in the usage message (--cmd, --stage);
# presumably parse_options.sh overrides them from the command line.
cmd=run.pl   # how sub-processes are run (handed on to get_train_ctm.sh)
stage=0      # first stage to execute; lower-numbered stages are skipped
# end configuration section.

# Source the local environment set-up when the working directory has one.
if [ -f ./path.sh ]; then
  . ./path.sh
fi

# Parse command-line options; abort if the option parser cannot be
# sourced or reports a failure.
. parse_options.sh || exit 1;
# Exactly five positional arguments are required; anything else prints the
# usage message and exits with status 1.
if [ $# -ne 5 ]; then
  echo "Usage: $0 [options] <in-data-dir> <lang> <ali-dir|model-dir> <out-data-dir> <temp/log-dir>"
  echo " Options:"
  echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes."
  # The help text used to say "scoring script", a copy-paste leftover; this
  # script re-segments text, it does not score anything.
  echo " --stage (0|1|2) # start the script from part-way through."
  echo "e.g.:"
  echo "$0 data/train data/lang exp/tri3b_ali_all data/train_reseg exp/tri3b_resegment"
  exit 1;
fi
# Positional arguments, in the order given by the usage message.
data=$1       # <in-data-dir>: data directory with the original segmentation
lang=$2       # <lang>: lang directory (phones.txt is required below)
alidir=$3     # <ali-dir|model-dir>: must hold ali.1.gz, num_jobs, final.mdl
data_out=$4   # <out-data-dir>: target segmentation; "text" is written here
dir=$5        # <temp/log-dir>: working directory for intermediate files

# All path expansions are quoted so that directories containing whitespace
# or glob characters are not word-split or expanded.
mkdir -p "$dir/log" || exit 1;

# Fail early if any required input file is missing.
for f in "$data/feats.scp" "$lang/phones.txt" "$alidir/ali.1.gz" "$alidir/num_jobs" \
  "$alidir/final.mdl" "$data_out/reco2file_and_channel" "$data_out/segments"; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1;
  fi
done
# Stage 0: produce a ctm from the alignments in $alidir by delegating to
# steps/get_train_ctm.sh.  Stage 1 of this script expects the result at
# $alidir/ctm.  Arguments are quoted so paths with whitespace stay intact.
if [ "$stage" -le 0 ]; then
  echo "$0: calling get_train_ctm.sh to produce ctms of the alignments."
  # Caution: this will produce logs in $alidir/log/get_ctm.log
  steps/get_train_ctm.sh --cmd "$cmd" "$data" "$lang" "$alidir" || exit 1;
fi
# Stage 1: rewrite $alidir/ctm so that each line starts with the recording-id
# instead of the (file, channel) pair, writing the result to
# $dir/ctm_per_reco.
#   reco2file_and_channel lines: <recording-id> <file> <channel>
#   input ctm lines:             <file> <channel> <start> <length> <word>
#   output lines:                <recording-id> <start> <length> <word>
if [ "$stage" -le 1 ]; then
  if [ ! -s "$alidir/ctm" ]; then
    # The file tested is $alidir/ctm, so that is the path reported.
    echo "$0: file $alidir/ctm does not exist or is empty."
    exit 1;
  fi
  echo "$0: converting ctm to a format where we have the recording-id ..."
  echo "$0: ... in place of the side and channel, e.g. sw02008-B instead of sw02008 B"
  # awk diagnostics go to stderr so they cannot end up inside ctm_per_reco,
  # and a non-zero awk exit status (1: malformed reco2file_and_channel line,
  # 2: malformed ctm line, 3: unknown file/channel pair) aborts the script.
  awk -v r="$data_out/reco2file_and_channel" '
    BEGIN {
      # Load the (file, channel) -> recording-id map.
      while ((getline < r) > 0) {
        if (NF != 3) {
          print "bad line in " r ": " $0 > "/dev/stderr";
          exit(1);
        }
        map[$2 "&" $3] = $1;
      }
    }
    {
      if (NF != 5) {
        print "bad line " $0 > "/dev/stderr";
        exit(2);
      }
      reco = map[$1 "&" $2];
      if (length(reco) == 0) {
        print "Bad key " $1 "&" $2 > "/dev/stderr";
        exit(3);
      }
      print reco, $3, $4, $5;
    }' < "$alidir/ctm" > "$dir/ctm_per_reco" || {
    echo "$0: error converting $alidir/ctm to $dir/ctm_per_reco"
    exit 1;
  }
fi
# Stage 2: build $data_out/text.  For every segment in $data_out/segments
# (<utt-id> <recording-id> <start> <end>) collect, from $dir/ctm_per_reco,
# the words of the same recording whose start AND end times both lie inside
# [start, end].  Segments that receive no words are left out of the output
# and only counted in the summary printed to stderr.
if [ "$stage" -le 2 ]; then
  perl -e '
    @ARGV == 1 || die;
    $ctm_per_reco = shift @ARGV;
    # Width, in seconds, of the time buckets used to index the ctm words.
    $chunk_size = 3;
    open(C, "<$ctm_per_reco") || die "opening ctm file $ctm_per_reco";
    # We build up an associative array indexed by a pair of ids: $reco,$n
    # where $n is the index of a $chunk_size-second chunk of time.  A word is
    # filed under the chunk that contains its start time.
    sub to_chunk { my $t = shift @_; return int($t / $chunk_size); }
    while (<C>) {
      @A = split; @A == 4 || die "Bad line $_ in $ctm_per_reco";
      ($reco, $start, $length, $word) = @A;
      $chunk = to_chunk($start);
      if (! defined $reco2list{$reco,$chunk} ) {
        $reco2list{$reco,$chunk} = [ ];  # new anonymous array
      }
      $arrayref = $reco2list{$reco,$chunk};
      push @$arrayref, [ $start, $length, $word ];  # another level of anonymous array..
    }
    $num_utts = 0; $num_empty = 0;
    while (<STDIN>) {
      @A = split; @A == 4 || die "Bad line $_ in stdin";
      ($utt, $reco, $start, $end) = @A;
      @text = ();
      # Visit every chunk that overlaps the segment.
      for ($chunk = to_chunk($start); $chunk <= to_chunk($end); $chunk++) {
        $arrayref = $reco2list{$reco,$chunk};
        if (defined $arrayref) {
          foreach $entry ( @$arrayref ) {
            # note, $entry is itself an arrayref to an array containing
            # the word start time, its length (duration) and the word.
            $word_start = $$entry[0];
            if ($word_start >= $start && $word_start <= $end) {
              $word_end = $$entry[1] + $word_start;
              if ($word_end >= $start && $word_end <= $end) {
                $word = $$entry[2]; defined $word || die;
                push @text, $word;
              }
            }
          }
        }
      }
      $num_utts++;
      if (@text > 0) { $t = join(" ", @text); print "$utt $t\n"; }
      else { $num_empty++; }
    }
    print STDERR "Processed $num_utts utterances, of which $num_empty had no text.\n"; ' \
    "$dir/ctm_per_reco" < "$data_out/segments" | sort > "$data_out/text"
  # The status of a pipeline is that of its last command, so a failing perl
  # stage (e.g. a die on a malformed line) would otherwise go unnoticed and
  # leave a truncated text file behind.  Check both stages explicitly.
  pipe_status=("${PIPESTATUS[@]}")
  if [ "${pipe_status[0]}" -ne 0 ] || [ "${pipe_status[1]}" -ne 0 ]; then
    echo "$0: error creating $data_out/text"
    exit 1;
  fi
  # wc prints "<lines> <words> <bytes>"; words minus lines is the number of
  # words not counting the one utterance-id at the start of each line.
  nw_old=$(wc < "$data/text" | awk '{print $2 - $1}')
  nw_new=$(wc < "$data_out/text" | awk '{print $2 - $1}')
  echo "Number of words of training text changed from $nw_old to $nw_new";
  if [ ! -s "$data_out/text" ]; then
    echo "$0: produced empty output. Something went wrong."
    exit 1;
  fi
fi