Blame view

Scripts/utils/filter_scp.pl 1.49 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
  #!/usr/bin/perl
  # Copyright 2010-2012 Microsoft Corporation
  #                     Johns Hopkins University (author: Daniel Povey)
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  
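  # This script takes a list of utterance-ids (or any file whose first field
  # on each line is an utterance-id) and filters an scp file (or any file
  # whose first field is an utterance-id), read from in.scp or from standard
  # input, printing out only those lines whose first field is in id_list.
  # With --exclude it instead prints only the lines whose first field is
  # NOT in id_list.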
  
  use strict;
  use warnings;

  # Command line: filter_scp.pl [--exclude] id_list [in.scp] > out.scp
  #   $exclude == 0: print lines whose first field appears in id_list.
  #   $exclude == 1: print lines whose first field does NOT appear in id_list.
  my $exclude = 0;

  # Make sure an argument exists before comparing it, so that running with no
  # arguments reaches the usage message without comparing against undef.
  if (@ARGV > 0 && $ARGV[0] eq "--exclude") {
    $exclude = 1;
    shift @ARGV;
  }

  if (@ARGV < 1 || @ARGV > 2) {
    die "Usage: filter_scp.pl [--exclude] id_list [in.scp] > out.scp ";
  }

  my $idlist = shift @ARGV;

  # Open the id list with a lexical handle and three-argument open, so the
  # file name is never interpreted as a mode string.  The old two-argument
  # form treated "-" as standard input; keep that behaviour explicitly so
  # existing pipelines that pass "-" continue to work.
  my $id_fh;
  if ($idlist eq '-') {
    $id_fh = \*STDIN;
  } else {
    open($id_fh, '<', $idlist)
      || die "Could not open id-list file $idlist: $!";
  }

  # Record the first whitespace-separated field of every id-list line.
  my %seen;
  while (my $line = <$id_fh>) {
    my @fields = split(' ', $line);
    @fields >= 1 || die "Invalid id-list file line $line";
    $seen{$fields[0]} = 1;
  }
  # Do not close STDIN: the filtering loop below may still read from it.
  close($id_fh) unless $idlist eq '-';

  # Filter the remaining input (in.scp if given, otherwise standard input)
  # by the first field of each line.
  while (my $line = <>) {
    my @fields = split(' ', $line);
    @fields > 0 || die "Invalid scp file line $line";
    my $listed = exists $seen{$fields[0]};
    if ($exclude ? !$listed : $listed) {
      print $line;
    }
  }