Blame view
egs/wsj/s5/steps/cleanup/internal/split_text_into_docs.pl
1.75 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
#! /usr/bin/perl

# Copyright 2017  Vimal Manohar
# Apache 2.0.

# Splits the transcripts in a Kaldi-style 'text' file into "documents" of at
# most --max-words words each, and records which utterance each document
# came from.
#
# Utterances that already fit within --max-words are copied through under
# their original id.  Longer utterances are cut into roughly equal pieces
# whose ids are "<utterance-id>-<index>", with the index starting at 0.
#
# If 'text' contains:
#  utterance1 A B C D
#  utterance2 C B
# and you ran:
#  split_text_into_docs.pl --max-words 2 text doc2text docs
# then 'doc2text' would contain:
#  utterance1-0 utterance1
#  utterance1-1 utterance1
#  utterance2 utterance2
# and 'docs' would contain:
#  utterance1-0 A B
#  utterance1-1 C D
#  utterance2 C B

use warnings;
use strict;

use List::Util qw(min);

# Cuts a list of words into consecutive chunks of at most $max_words words.
#
#   $words      - array reference holding the words of one utterance.
#   $max_words  - positive integer, upper bound on the words per chunk.
#
# Returns a list of array references, one per chunk, in order.  Every chunk
# is non-empty unless the input itself is empty, in which case a single
# empty chunk is returned.
sub _chunk_words {
  my ($words, $max_words) = @_;
  my $num_words = scalar @$words;

  return ([ @$words ]) if $num_words <= $max_words;

  # One more document than strictly needed, so that the pieces come out
  # roughly balanced instead of leaving a tiny remainder at the end.
  my $num_docs = int($num_words / $max_words) + 1;
  my $words_per_doc = int($num_words / $num_docs) + 1;

  my @chunks;
  for my $i (0 .. $num_docs - 1) {
    my $st = $i * $words_per_doc;
    # The rounding above can leave nothing for the trailing document(s);
    # stop instead of emitting empty documents.
    last if $st >= $num_words;
    my $end = min($st + $words_per_doc, $num_words) - 1;
    push @chunks, [ @{$words}[$st .. $end] ];
  }
  return @chunks;
}

# Parses the command line given in @_ (normally @ARGV), reads the text file
# and writes the doc2text and docs files.  Exits with status 1 on a usage
# error and dies if a file cannot be opened or an output cannot be closed.
sub _main {
  my @args = @_;
  my $max_words = 1000;

  my $usage = "Usage: steps/cleanup/internal/split_text_into_docs.pl [--max-words <int>] text doc2text docs\n";

  while (@args > 3) {
    if ($args[0] eq "--max-words") {
      shift @args;
      $max_words = shift @args;
    } else {
      print STDERR "$usage";
      exit (1);
    }
  }

  if (scalar @args != 3) {
    print STDERR "$usage";
    exit (1);
  }

  # Zero would divide by zero below and a negative value would silently
  # drop utterances, so reject anything that is not a positive integer.
  unless ($max_words =~ /\A\d+\z/ && $max_words > 0) {
    print STDERR "$0: --max-words must be a positive integer, got '$max_words'\n";
    print STDERR "$usage";
    exit (1);
  }

  my ($text_file, $doc2text_file, $docs_file) = @args;

  # NOTE: the two-argument form of open is kept deliberately for the input:
  # it lets callers pass "-" for stdin or a "command |" pipe, as before.
  # Do not pass untrusted strings as the text argument.
  open(my $text_fh, $text_file) or die "$0: Could not open file $text_file for reading\n";
  open(my $doc2text_fh, ">", $doc2text_file) or die "$0: Could not open file $doc2text_file for writing\n";
  open(my $docs_fh, ">", $docs_file) or die "$0: Could not open file $docs_file for writing\n";

  while (my $line = <$text_fh>) {
    chomp $line;
    my @words = split ' ', $line;
    my $utt = shift @words;

    # Blank lines carry no utterance id; skip them.
    next unless defined $utt;

    if (scalar @words <= $max_words) {
      # Short enough: copy the line through unchanged under its own id.
      print {$docs_fh} "$line\n";
      print {$doc2text_fh} "$utt $utt\n";
      next;
    }

    my @chunks = _chunk_words(\@words, $max_words);
    for my $i (0 .. $#chunks) {
      print {$docs_fh} ("$utt-$i " . join(" ", @{$chunks[$i]}) . "\n");
      print {$doc2text_fh} "$utt-$i $utt\n";
    }
  }

  close($text_fh);
  # Buffered write errors (e.g. disk full) only surface at close time.
  close($doc2text_fh) or die "$0: Error closing file $doc2text_file: $!\n";
  close($docs_fh) or die "$0: Error closing file $docs_file: $!\n";
  return;
}

# Run only when executed as a script, so the subs above can be loaded and
# exercised on their own.
_main(@ARGV) unless caller;

# True value so the file can also be loaded with require.
1;