Blame view

egs/wsj/s5/steps/cleanup/internal/split_text_into_docs.pl 1.75 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
  #! /usr/bin/perl
  
  # Copyright 2017  Vimal Manohar
  # Apache 2.0.
  
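  # Splits long utterances in a Kaldi-style 'text' file into shorter documents.
  # Utterances with at most --max-words words are copied unchanged and keep
  # their own ID; longer utterances are split into documents named
  # <utterance-id>-<n>, with <n> counting from 0.
  #
  # If 'text' contains:
  #  utterance1 A B C D E
  #  utterance2 C B
  #  and you ran:
  #  split_text_into_docs.pl --max-words 2 text doc2text docs
  #  then 'doc2text' would contain:
  #  utterance1-0 utterance1
  #  utterance1-1 utterance1
  #  utterance1-2 utterance1
  #  utterance2 utterance2
  #  and 'docs' would contain:
  #  utterance1-0 A B
  #  utterance1-1 C D
  #  utterance1-2 E
  #  utterance2 C B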
  
  use warnings;
  use strict;
  
  # Maximum number of words allowed in a single document.  Utterances with more
  # words than this are split; the value can be overridden with --max-words.
  my $max_words = 1000;

  my $usage = "Usage: steps/cleanup/internal/split_text_into_docs.pl [--max-words <int>] text doc2text docs\n";

  # Consume leading options until only the three positional arguments
  # (text, doc2text, docs) are left.  Anything other than --max-words is an
  # error.
  while (@ARGV > 3) {
      if ($ARGV[0] eq "--max-words") {
          shift @ARGV;
          $max_words = shift @ARGV;
      } else {
          print STDERR $usage;
          exit(1);
      }
  }

  if (scalar @ARGV != 3) {
      print STDERR $usage;
      exit(1);
  }

  # The splitting code divides by $max_words, so 0 would die with "Illegal
  # division by zero" and negative or non-numeric values would give
  # meaningless splits.  Reject them up front with a clear message.
  if ($max_words !~ /\A[0-9]+\z/ || $max_words < 1) {
      print STDERR "$0: --max-words must be a positive integer, got '$max_words'\n";
      print STDERR $usage;
      exit(1);
  }
  
  # Return the smaller of two numbers.  When both compare equal, the first
  # argument is returned.
  sub min ($$) {
      my ($first, $second) = @_;
      return $first > $second ? $second : $first;
  }
  
  # Open the input transcript ($ARGV[0]) and the two outputs: the
  # document-to-utterance map ($ARGV[1]) and the documents themselves
  # ($ARGV[2]).  Three-argument open is used so that a file name can never be
  # interpreted as an open mode or a pipe.
  open(my $text_fh, "<", $ARGV[0])
    or die "$0: Could not open file $ARGV[0] for reading: $!\n";
  open(my $doc2text_fh, ">", $ARGV[1])
    or die "$0: Could not open file $ARGV[1] for writing: $!\n";
  open(my $docs_fh, ">", $ARGV[2])
    or die "$0: Could not open file $ARGV[2] for writing: $!\n";

  # Each input line is "<utterance-id> <word1> <word2> ...".
  while (my $line = <$text_fh>) {
    chomp $line;
    my @words = split " ", $line;

    # A blank line has no utterance ID; skip it instead of writing records
    # with an undefined ID.
    next unless @words;

    my $utt = shift @words;
    my $num_words = scalar @words;

    # Short enough: copy the line through unchanged and map the utterance to
    # itself.
    if ($num_words <= $max_words) {
      print {$docs_fh} "$line\n";
      print {$doc2text_fh} "$utt $utt\n";
      next;
    }

    # Ceiling division in both steps: the smallest number of documents that
    # respects $max_words, then the words spread as evenly as possible over
    # them.  Using int(N / M) + 1 here would over-count whenever N is an exact
    # multiple of M and produce a trailing document with no words.
    my $num_docs = int(($num_words + $max_words - 1) / $max_words);
    my $words_per_doc = int(($num_words + $num_docs - 1) / $num_docs);

    #print STDERR ("$utt num-words=$num_words num-docs=$num_docs words-per-doc=$words_per_doc\n");

    # Documents are named "<utt>-<i>" with <i> counting from 0.
    for my $i (0 .. $num_docs - 1) {
      my $st = $i * $words_per_doc;
      # The last document may be shorter than the others.
      my $end = min($st + $words_per_doc, $num_words) - 1;
      print {$docs_fh} "$utt-$i " . join(" ", @words[$st .. $end]) . "\n";
      print {$doc2text_fh} "$utt-$i $utt\n";
    }
  }

  close($text_fh);
  # Buffered write errors (e.g. a full disk) only surface at close time.
  close($doc2text_fh)
    or die "$0: Error closing file $ARGV[1]: $!\n";
  close($docs_fh)
    or die "$0: Error closing file $ARGV[2]: $!\n";