Blame view
egs/wsj/s5/local/ndx2flist.pl
2.45 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This program takes as its standard input an .ndx file from the WSJ corpus
# that looks like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;;                         per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1
#
# and as command-line arguments it takes the names of the WSJ disk locations,
# e.g.:
#   /mnt/matylda2/data/WSJ0/11-1.1 /mnt/matylda2/data/WSJ0/11-10.1 ... etc.
#
# It outputs a list of absolute pathnames, one per line (it does this by
# replacing e.g. 11_1_1 with /mnt/matylda2/data/WSJ0/11-1.1).
# It also does a slight fix because one of the WSJ disks (WSJ1/13-16.1) was
# distributed with uppercase rather than lower case filenames.

use strict;
use warnings;

# Cached answer of _on_but_host(); undef until the first query.
my $is_but_host;

# Returns 1 when `hostname` reports a name containing "fit.vutbr.cz", else 0.
# The external command is run at most once per process: the answer cannot
# change while we run, and forking for every input line is wasteful.
sub _on_but_host {
    if (!defined $is_but_host) {
        my $hostname = `hostname`;
        $hostname = '' unless defined $hostname;    # command missing or failed
        $is_but_host = ($hostname =~ m/fit\.vutbr\.cz/) ? 1 : 0;
    }
    return $is_but_host;
}

# Splits one command-line disk location into (disk id, directory).
#   $dir  - path whose last component is the disk name, e.g. ".../11-10.1",
#           with or without a trailing slash.
# Returns the id as used in .ndx files (e.g. "11_10_1") and the directory
# with any final slash removed.  Dies if the last component does not consist
# solely of digits, '.' and '-', or if there is no leading directory part.
sub _parse_disk_dir {
    my ($dir) = @_;
    $dir =~ m:.+/([0-9\.\-]+)/?$:
        or die "Bad command-line argument $dir\n";
    my $disk_id = $1;
    $disk_id =~ tr/-\./__/;    # replace - and . with _ so 11-10.1 becomes 11_10_1
    $dir =~ s:/$::;            # Remove final slash, just in case it is present.
    return ($disk_id, $dir);
}

# Converts one non-comment .ndx line ("<disk id>:<relative file>") into an
# absolute pathname.
#   $line     - the raw input line (a trailing newline is fine).
#   $disk2fn  - hash ref mapping disk id => disk directory.
# Dies if the line cannot be parsed or names a disk that is not in $disk2fn.
sub _ndx_line_to_path {
    my ($line, $disk2fn) = @_;
    # $line normally still carries its newline, which ends the die message.
    $line =~ m/^([0-9_]+):\s*(\S+)$/
        or die "Could not parse line $line";
    my ($disk, $filename) = ($1, $2);    # $filename is relative to the disk root.
    if (!defined $disk2fn->{$disk}) {
        die "Disk id $disk not found";
    }
    if ($disk eq "13_16_1" && _on_but_host()) {
        # The disk 13-16.1 has been uppercased for some reason, on the
        # BUT system.  This is a fix specifically for that case.
        $filename =~ tr/a-z/A-Z/;
    }
    return "$disk2fn->{$disk}/$filename";
}

# Entry point: builds the disk-id table from @ARGV, then rewrites every
# non-comment line of STDIN as an absolute pathname on STDOUT.
sub main {
    my %disk2fn;
    foreach my $dir (@ARGV) {
        my ($disk_id, $root) = _parse_disk_dir($dir);
        $disk2fn{$disk_id} = $root;
    }

    while (my $line = <STDIN>) {
        next if $line =~ m/^;/;    # Comment.  Ignore it.
        print _ndx_line_to_path($line, \%disk2fn), "\n";
    }
    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# (e.g. by a test) without consuming STDIN.
main() unless caller;