make_trans.pl
2.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# usage: make_trans.pl postfix in.flist input.snr out.txt out.scp
# postfix is last letters of the database "key" (rest are numeric)
# in.flist is just a list of filenames, probably of .sph files.
# input.snr is an snr format file from the RM dataset.
# out.txt is the output transcriptions in format "key word1 word2 ...\n"
# out.scp is the output scp file, which is as in.flist but has the
# database-key first on each line.
# Reads the transcriptions from input.snr, e.g. $rootdir/rm1_audio1/rm1/doc/al_sents.snr,
# and the list of audio files from in.flist.
# Writes the transcriptions to out.txt and the keyed file list to out.scp.
use strict;
use warnings;

# Open $path in $mode ('<' for reading, '>' for writing) and return the
# filehandle.  Dies with "$what $path: <OS error>" on failure.
# A path of "-" maps to STDIN/STDOUT, which is what the two-argument
# open("<-") / open(">-") form this replaces did.
sub open_or_die {
    my ($mode, $path, $what) = @_;
    if ($path eq '-') {
        return $mode eq '<' ? \*STDIN : \*STDOUT;
    }
    open(my $fh, $mode, $path) or die "$what $path: $!";
    return $fh;
}

if (@ARGV != 5) {
    die "usage: make_trans.pl postfix in.flist input.snr out.txt out.scp\n";
}
my ($postfix, $in_flist, $input_snr, $out_txt, $out_scp) = @ARGV;

# Pass 1: load the transcriptions.  Each non-comment line must look like
#   <words> (<sentence-id>)
# and is stored as sentence-id => words.  Lines starting with ';' are skipped.
my %trans_of;
my $snr_fh = open_or_die('<', $input_snr, "Opening SNOR file");
while (my $snr_line = <$snr_fh>) {
    next if $snr_line =~ m/^;/;
    $snr_line =~ m/(.+) \((.+)\)/ or die "bad line $snr_line";
    $trans_of{$2} = $1;
}
close($snr_fh);

# Pass 2: walk the file list and emit one transcription line and one scp line
# per audio file, both prefixed with the same database key.
my $flist_fh = open_or_die('<', $in_flist, "Opening file list");
my $txt_fh   = open_or_die('>', $out_txt,  "Open output transcription file");
my $scp_fh   = open_or_die('>', $out_scp,  "Open output scp file");

while (my $flist_line = <$flist_fh>) {
    # The last two path components are <speaker-dir>/<utterance>.sph.
    # \s* (not \s+) so a final line without a trailing newline is accepted.
    $flist_line =~ m:/(\w+)/(\w+)\.sph\s*$:i or die "bad scp line $flist_line";
    my ($spkname, $uttname) = ($1, $2);

    # Transcription ids are looked up in upper case.
    $uttname =~ tr/a-z/A-Z/;
    defined $trans_of{$uttname} or die "no trans for sent $uttname";

    $spkname =~ s/_//g;    # remove underscore from spk name to make key nicer.
    my $key = $spkname . "_" . $uttname . "_" . $postfix;
    $key =~ tr/A-Z/a-z/;   # Make it all lower case.

    print {$txt_fh} "$key $trans_of{$uttname}\n"
        or die "Error writing to transcription file $out_txt: $!";

    # Keep the input line verbatim, but make sure the scp entry is
    # newline-terminated even if the list's last line was not.
    $flist_line .= "\n" unless $flist_line =~ m/\n\z/;
    # 'or' (not '||') so the check applies to print's result, not the string.
    print {$scp_fh} "$key $flist_line"
        or die "Error writing to sph file list: $!";
}
close($flist_fh);

# Buffered write errors only surface at close, so check both outputs.
close($txt_fh) or die "Closing output $out_txt: $!";
close($scp_fh) or die "Closing output $out_scp: $!";