Blame view
egs/rm/s5/local/make_trans.pl
2.4 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
#!/usr/bin/env perl
# Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# usage: make_trans.pl postfix in.flist input.snr out.txt out.scp
#
#   postfix    string appended (after an underscore) to every database key.
#   in.flist   list of filenames, one per line, probably of .sph files.  Each
#              line must end in ".../<speaker>/<utterance>.sph" (the extension
#              is matched case-insensitively).
#   input.snr  snr format file from the RM dataset, e.g.
#              $rootdir/rm1_audio1/rm1/doc/al_sents.snr.  Each line looks like
#              "word1 word2 ... (ID)"; lines starting with ';' are skipped.
#   out.txt    output transcriptions, one "key word1 word2 ..." per line.
#   out.scp    output scp file: the lines of in.flist with the database key
#              prepended to each line.
#
# The key is "<speaker>_<utterance>_<postfix>", lower-cased, with underscores
# removed from the speaker name.  The script dies on a malformed input line,
# on an utterance that has no transcription, and on any I/O error.

use strict;
use warnings;

if (@ARGV != 5) {
    die "usage: make_trans.pl postfix in.flist input.snr out.txt out.scp\n";
}
my ($postfix, $in_flist, $input_snr, $out_txt, $out_scp) = @ARGV;

# Pass 1: load the transcriptions, indexed by the ID found in parentheses.
my %trans_of;
open(my $snr_fh, '<', $input_snr)
    or die "Opening SNOR file $input_snr: $!";
while (my $line = <$snr_fh>) {
    next if $line =~ m/^;/;    # ';' introduces a comment line
    $line =~ m/(.+) \((.+)\)/ or die "bad line $line";
    $trans_of{$2} = $1;
}
close($snr_fh);

# Pass 2: walk the file list and emit one transcription line and one scp
# line per audio file.
open(my $flist_fh, '<', $in_flist)
    or die "Opening file list $in_flist: $!";
open(my $txt_fh, '>', $out_txt)
    or die "Open output transcription file $out_txt: $!";
open(my $scp_fh, '>', $out_scp)
    or die "Open output scp file $out_scp: $!";

while (my $entry = <$flist_fh>) {
    # \s* (not \s+) so that a final line without a trailing newline is
    # still accepted.
    $entry =~ m{/(\w+)/(\w+)\.sph\s*\z}i or die "bad scp line $entry";
    my ($spkname, $uttname) = ($1, $2);

    # The transcription table is looked up with the upper-cased utterance
    # name (presumably the IDs in the SNOR file are upper case).
    $uttname =~ tr/a-z/A-Z/;
    defined $trans_of{$uttname} or die "no trans for sent $uttname";

    $spkname =~ s/_//g;    # remove underscore from spk name to make key nicer.
    my $key = join('_', $spkname, $uttname, $postfix);
    $key =~ tr/A-Z/a-z/;   # Make it all lower case.

    # Keep the scp output line-oriented even if the last input line had no
    # newline terminator.
    $entry .= "\n" unless $entry =~ m/\n\z/;

    # "or" (not "||") so the check applies to print's return value rather
    # than to the string being printed.
    print {$txt_fh} "$key $trans_of{$uttname}\n"
        or die "Error writing to transcription file $out_txt: $!";
    print {$scp_fh} "$key $entry"
        or die "Error writing to sph file list $out_scp: $!";
}

close($flist_fh);
# Buffered write errors only surface at close, so check both outputs.
close($txt_fh) or die "Closing output $out_txt: $!";
close($scp_fh) or die "Closing output $out_scp: $!";