convert_ctm.pl 2.94 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96


#!/usr/bin/env perl

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.

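# This takes as standard input (or as an optional third command-line argument)
# a ctm file that's "relative to the utterance", i.e. times are measured
# relative to the beginning of the segments, and it uses a "segments" file
# (format:
# utterance-id recording-id start-time end-time
# ) and a "reco2file_and_channel" file (format:
# recording-id basename-of-file channel
# ) to write, on standard output, a ctm file whose first two fields are the
# file basename and channel and whose begin times are relative to the
# beginning of the recording.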
# Command-line handling.
#
# An optional leading "--skip-unknown" flag makes the ctm loop further down
# skip ctm lines whose utterance-id is missing from the segments file instead
# of dying on them.  After the flag, two or three positional arguments are
# expected: the segments file, the reco2file_and_channel file and, optionally,
# the utterance-relative ctm file (standard input is read when it is omitted).
my $skip_unknown = undef;
# Check that there is an argument at all before comparing it, so that an empty
# command line never compares undef against a string.
if (@ARGV > 0 && $ARGV[0] eq "--skip-unknown") {
  $skip_unknown = 1;
  shift @ARGV;
}

if (@ARGV < 2 || @ARGV > 3) {
  # The usage line lists the optional flag so that it is discoverable.
  print STDERR "Usage: convert_ctm.pl [--skip-unknown] <segments-file> <reco2file_and_channel-file> [<utterance-ctm>] > real-ctm\n";
  exit(1);
}

# Whatever remains in @ARGV afterwards (zero or one file name) is what the
# diamond operator reads the ctm from.
my $segments = shift @ARGV;
my $reco2file_and_channel = shift @ARGV;

# Read the segments file.  Every line must consist of exactly four
# whitespace-separated fields:
#   utterance-id recording-id start-time end-time
# For each utterance we remember the recording it belongs to and the start
# and end times of its segment.  Any other line shape is fatal.
my (%utt2reco, %begin, %end);
# Three-argument open with a lexical handle: the file name is taken literally
# (never parsed for mode characters), and $! says why the open failed.
open(my $segments_fh, "<", $segments)
  or die "opening segments file $segments: $!";
while (my $segment_line = <$segments_fh>) {
  my @fields = split(" ", $segment_line);
  @fields == 4 || die "Bad line in segments file: $segment_line";
  my ($utt, $recording_id, $begin_time, $end_time) = @fields;
  $utt2reco{$utt} = $recording_id;
  $begin{$utt} = $begin_time;
  $end{$utt} = $end_time;
}
close($segments_fh);
# Read the reco2file_and_channel file.  Every line must consist of exactly
# three whitespace-separated fields:
#   recording-id file channel
# For each recording we remember the file name and the channel that are
# written in front of every converted ctm line.  Any other line shape is fatal.
my (%reco2file, %reco2channel);
# Three-argument open with a lexical handle: the file name is taken literally
# (never parsed for mode characters), and $! says why the open failed.
open(my $reco_fh, "<", $reco2file_and_channel)
  or die "open reco2file_and_channel file $reco2file_and_channel: $!";
while (my $reco_line = <$reco_fh>) {
  my @fields = split(" ", $reco_line);
  @fields == 3 || die "Bad line in reco2file_and_channel file: $reco_line";
  my ($recording_id, $file, $channel) = @fields;
  $reco2file{$recording_id} = $file;
  $reco2channel{$recording_id} = $channel;
}
# Release the handle once the table is loaded.
close($reco_fh);


# Convert the utterance-relative ctm, read from the third command-line
# argument if one was given and from standard input otherwise.
#
# Input lines look like:
#   <utterance-id> 1 <begin-time> <length> <word> [ confidence ]
# Output lines look like:
#   <file> <channel> <begin-time> <length> <word> [ confidence ]
# where the begin time has the segment's start time added to it and both
# times are printed with two decimals.
my $converted = 0;
while (<>) {
  my @fields = split(" ", $_);
  die "Unexpected ctm format: $_" unless (@fields == 5 || @fields == 6);
  # The second field is not used; $conf stays undefined on 5-field lines.
  my ($utt, undef, $wbegin, $wlen, $w, $conf) = @fields;

  # Map the utterance to its recording; unknown utterances are either skipped
  # (when --skip-unknown was given) or fatal.
  my $reco = $utt2reco{$utt};
  unless (defined($reco)) {
    defined($skip_unknown) and next;
    die "Utterance-id $utt not defined in segments file $segments"; 
  }

  # Both the file name and the channel of the recording must be known.
  my ($file, $channel) = ($reco2file{$reco}, $reco2channel{$reco});
  unless (defined($file) && defined($channel)) {
    die "Recording-id $reco not defined in reco2file_and_channel file $reco2file_and_channel"; 
  }

  # Start and end time of the utterance's segment, taken from the segments file.
  my ($segment_begin, $segment_end) = ($begin{$utt}, $end{$utt});

  # Shift the word's begin time by the segment's start time and round both
  # times to two decimals.
  my $wbegin_r = sprintf("%.2f", $wbegin + $segment_begin);
  $wlen = sprintf("%.2f", $wlen);

  # The confidence column is only written when the input line carried one.
  my $line = defined($conf)
    ? "$file $channel $wbegin_r $wlen $w $conf\n"
    : "$file $channel $wbegin_r $wlen $w\n";

  # Warn (but still print the line) when the word ends more than 0.01 after
  # the segment's end time.
  print STDERR "Warning: word appears to be past end of recording; line is $line"
    if ($wbegin_r + $wlen > $segment_end + 0.01);

  print $line; # goes to stdout.
  $converted++;
}

# Exit status 1 when not a single line was converted, 0 otherwise.
exit($converted == 0 ? 1 : 0);

__END__

# Test example [also test it without the 0.5's]
echo utt reco 10.0 20.0 > segments
echo reco file A > reco2file_and_channel
echo utt 1 8.0 1.0 word 0.5 > ctm_in
echo file A 18.00 1.00 word 0.5 > ctm_out
utils/convert_ctm.pl segments reco2file_and_channel ctm_in | cmp - ctm_out || echo error
rm segments reco2file_and_channel ctm_in ctm_out