Blame view

egs/wsj/s5/steps/cleanup/internal/ctm_to_text.pl 1.8 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
  #! /usr/bin/perl
  
  # Copyright 2016  Vimal Manohar
  # Apache 2.0.
  
  use strict;
  use warnings;
  
  # Validate the command line.  The script accepts either
  #   <ctm-file>
  # or
  #   --non-scored-words <file> <ctm-file>
  # i.e. exactly one or exactly three arguments.  Any other count prints the
  # usage text via die (standard error, non-zero exit status).
  if (@ARGV != 1 && @ARGV != 3) {
    # The usage text is assembled from a list of lines instead of a here-doc:
    # a plain <<END here-doc requires its terminator to start in column 0 and
    # copies any leading indentation of its body into the message, so it
    # breaks as soon as the surrounding code is indented.
    my $usage = join("\n",
      'This script converts a CTM into kaldi text format by concatenating the words',
      'belonging to the same utterance (or recording) and outputs the same to the',
      'standard output.',
      'If --non-scored-words list file is provided with one word per line, then ',
      'those words are not added to the text.',
      '',
      'The CTM format is <file> <channel> <start-time> <duration> <word> [<conf>].',
      'This script assumes the CTM to be in NIST sorted order given by UNIX',
      'sort command "sort +0 -1 +1 -2 +2nb -3"',
      '',
      'Usage: ctm_to_text.pl [--non-scored-words <file>] <ctm-file> > <text>',
    ) . "\n";   # trailing newline keeps perl from appending "at ... line N."
    die $usage;
  }
  
  # Optional "--non-scored-words <file>" handling.
  #
  # When more than one argument is present, the first one must be the
  # --non-scored-words option.  The option and its value are removed from
  # @ARGV so that only <ctm-file> remains.  $non_scored_words_list stays the
  # empty string when the option was not given.
  my $non_scored_words_list = "";
  if (@ARGV > 1) {
    # The message ends in an explicit "\n" escape: this suppresses perl's
    # "at <script> line N." suffix without embedding source indentation in
    # the text that is printed.
    die "Unknown option $ARGV[0]\n" if $ARGV[0] ne "--non-scored-words";

    shift @ARGV;                             # drop the option name
    $non_scored_words_list = shift @ARGV;    # its value: path of the word list
  }
  
  # Set of words that are left out of the output text.  Keys are the words;
  # every value is 1.  "<eps>" is always excluded, and the optional list file
  # (one word per line) adds further entries.
  my %non_scored_words;
  $non_scored_words{"<eps>"} = 1;

  if ($non_scored_words_list ne "") {
    # Three-argument open with an explicit read mode and a lexical handle:
    # the file name can no longer be interpreted as a mode or a pipe command,
    # and the handle is scoped to this block.  $! reports why the open failed.
    open(my $non_scored_fh, '<', $non_scored_words_list)
      or die "Failed to open $non_scored_words_list: $!";

    while (my $line = <$non_scored_fh>) {
      # Only the first whitespace-separated token of each line is used;
      # split ' ' also discards the trailing newline.
      my ($word) = split(' ', $line);

      # A blank line has no token; skip it instead of storing an undefined
      # key (which would warn and register the empty string as a word).
      next unless defined $word;

      $non_scored_words{$word} = 1;
    }

    close $non_scored_fh;
  }
  
  # Read the CTM and print one line per utterance to standard output:
  #   <utt> <word> <word> ...
  # Consecutive CTM lines sharing the same first field are grouped together,
  # so the lines of one utterance must be contiguous in the input.  Words
  # found in %non_scored_words are dropped, and an utterance with no
  # remaining words produces no output line.
  my $ctm_file = shift @ARGV;

  # Three-argument open so the file name cannot be taken as a mode or a pipe
  # command.  "-" is still accepted as standard input, which is what the
  # previous two-argument open did with that name.
  my $ctm_fh;
  if ($ctm_file eq "-") {
    $ctm_fh = \*STDIN;
  } else {
    open($ctm_fh, '<', $ctm_file) or die "Failed to open $ctm_file: $!";
  }

  my $prev_utt = "";   # utterance id of the words currently held in @text
  my @text;            # scored words collected for $prev_utt

  # Print the collected words of $prev_utt (if any) and start a new
  # collection.  Used on every utterance change and once more at end of input.
  my $flush_utterance = sub {
    print join(" ", $prev_utt, @text), "\n" if @text;
    @text = ();
  };

  while (my $line = <$ctm_fh>) {
    chomp $line;
    my @F = split ' ', $line;

    # Validate before touching any field, so that a blank or truncated line
    # is reported directly instead of first triggering "uninitialized value"
    # warnings.  A CTM line has 5 fields, or 6 with the optional confidence.
    # Note: a malformed line aborts before the previous utterance is flushed.
    if (@F < 5 || @F > 6) {
      die "Invalid line $line in CTM $ctm_file\n";
    }

    my ($utt, $word) = @F[0, 4];

    # A new utterance id closes the previous utterance.
    $flush_utterance->() if $utt ne $prev_utt && $prev_utt ne "";

    push @text, $word unless defined $non_scored_words{$word};

    $prev_utt = $utt;
  }

  close $ctm_fh;

  # Emit the last utterance, which no following line could close.
  $flush_utterance->();