Blame view
egs/wsj/s5/steps/cleanup/internal/ctm_to_text.pl
1.8 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
#! /usr/bin/perl

# Copyright 2016  Vimal Manohar
# Apache 2.0.

# Converts a CTM file into Kaldi text format.  The words (5th field) of
# consecutive CTM lines that share the same first field are joined with single
# spaces and written to standard output as "<id> <word> <word> ...", one id per
# line.  The word "<eps>" and every word listed in the optional
# --non-scored-words file are left out; an id whose words are all left out
# produces no output line.

use strict;
use warnings;

if (scalar @ARGV != 1 && scalar @ARGV != 3) {
  my $usage = <<END;
This script converts a CTM into kaldi text format by concatenating the words
belonging to the same utterance (or recording) and outputs the same to the
standard output.
If --non-scored-words list file is provided with one word per line, then
those words are not added to the text.

The CTM format is <file> <channel> <start-time> <duration> <word> [<conf>].
This script assumes the CTM to be in NIST sorted order given by UNIX
sort command "sort +0 -1 +1 -2 +2nb -3"

Usage: ctm_to_text.pl [--non-scored-words <file>] <ctm-file> > <text>
END
  die $usage;
}

# Opens $path for reading and returns the filehandle; dies if it cannot.
# Three-argument open is used so that a path beginning with '>' or '|' can
# never be interpreted as a write or pipe-to mode.  Two input forms that the
# previous two-argument open accepted are still honoured so that existing
# invocations keep working:
#   "-"          reads standard input
#   "command |"  reads the output of a command (run through the shell)
sub open_input {
  my ($path) = @_;

  return \*STDIN if $path eq "-";

  my $fh;
  if ($path =~ /^\s*(.*?)\s*\|\s*$/s) {
    my $command = $1;
    open($fh, '-|', $command) or die "Failed to open $path: $!";
  } else {
    open($fh, '<', $path) or die "Failed to open $path: $!";
  }
  return $fh;
}

# Prints one "<id> <words...>" record terminated by a newline.
# Nothing is printed when the word list is empty.
sub print_utterance {
  my ($utt, $words) = @_;

  return unless @$words;
  print $utt . " " . join(" ", @$words) . "\n";
}

# The only recognised option is --non-scored-words <file>; it can only be
# present when there are three arguments (checked above).
my $non_scored_words_list = "";
if (scalar @ARGV > 1) {
  if ($ARGV[0] eq "--non-scored-words") {
    shift @ARGV;
    $non_scored_words_list = shift @ARGV;
  } else {
    die "Unknown option $ARGV[0] ";
  }
}

# Words that must never appear in the output.  "<eps>" is always excluded.
my %non_scored_words = ("<eps>" => 1);
if ($non_scored_words_list ne "") {
  my $nonscored_fh = open_input($non_scored_words_list);
  while (my $line = <$nonscored_fh>) {
    my @fields = split(" ", $line);
    next unless @fields;    # skip blank lines instead of using undef as a key
    $non_scored_words{$fields[0]} = 1;
  }
  close($nonscored_fh);
}

my $ctm_file = shift @ARGV;
my $ctm_fh = open_input($ctm_file);

my $prev_utt = "";    # id of the record currently being accumulated
my @text;             # scored words collected so far for $prev_utt

while (my $line = <$ctm_fh>) {
  chomp $line;
  my @F = split(" ", $line);

  # Validate before touching any field so that blank or truncated lines are
  # reported cleanly rather than through uninitialized-value warnings.
  if (scalar @F < 5 || scalar @F > 6) {
    die "Invalid line $line in CTM $ctm_file ";
  }

  my $utt = $F[0];

  # The id changed: the previous record is complete, so emit it.
  if ($utt ne $prev_utt && $prev_utt ne "") {
    print_utterance($prev_utt, \@text);
    @text = ();
  }

  push @text, $F[4] unless exists $non_scored_words{$F[4]};
  $prev_utt = $utt;
}
close($ctm_fh);

# Emit the final record, which no id change follows.
print_utterance($prev_utt, \@text);