Blame view
egs/heroico/s5/local/subs_prepare_data.pl
2.89 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
#!/usr/bin/env perl # Copyright 2017 John Morgan # Apache 2.0. # subs_prepare_data.pl - condition subs data for lm training use strict; use warnings; use Carp; use Encode; # set lower and upper bounds my $low_bound = 8; # only segments with at least $low_bound words will be written my $up_bound = 16; # only segments with fewer than $up_bound words will be written # input and output files my $corpus = "OpenSubtitles.en-es.es"; my $symbol_table = "data/lang/words.txt"; my $filtered = "data/local/tmp/subs/lm/es.txt"; my $oovs = "data/local/tmp/subs/lm/oovs.txt"; my $iv = "data/local/tmp/subs/lm/in_vocabulary.txt"; open my $C, '<', $corpus or croak "problems with $corpus $!"; system "mkdir -p data/local/tmp/subs/lm"; if ( -e $filtered ) { warn "$filtered already exists."; } else { open my $FLT, '+>:utf8', $filtered or croak "problems with $filtered $!"; LINE: while ( my $line = <$C> ) { $line = decode_utf8 $line; chomp $line; my @tokens = split /\s+/, $line; next LINE if ( ($#tokens < $low_bound) or ($#tokens > $up_bound )); # remove punctuation $line =~ s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg; #convert tabs to white space $line =~ s/\t/ /g; #hard to soft space $line =~ s/ / /g; #squeeze white space $line =~ s/\s+/ /g; #initial and final white space $line =~ s/^\p{Separator}+//; $line =~ s/\p{Separator}+$//; #down case $line = lc $line; print $FLT "$line "; } close $FLT; } close $C; # find out of vocabulary words # $symbol_table points to a file containing a map of symbols to integers # hash for word to integer map my %sym2int = (); open my $F, '<', $symbol_table or croak "problem with $symbol_table $!"; # store words to int map in hash while( my $line = <$F>) { chomp $line; my ($s,$i) = split /\s/, $line, 2; $sym2int{$s} = $i; } close $F; open my $I, '<', $filtered or croak "problem with $filtered $!"; open my $OOVS, '+>', $oovs or croak "problems with $oovs $!"; while ( my $line = <$I>) { chomp $line; my @A = split /\s/, $line; foreach my $a (@A) { if (!defined ($sym2int{$a})) { print $OOVS "$a "; } } } close $OOVS; close $I; # remove segments with OOVs # store OOVS in hash my %oov = (); open my $V, '<', $oovs or croak "problems with $oovs $!"; while ( my $line = <$V> ) { chomp $line; $oov{$line} = 1; } close $V; open my $L, '<', $filtered or croak "problems with $filtered $!"; open my $IV, '+>', $iv or croak "problems with $iv $!"; SEGMENT: while ( my $segment = <$L> ) { chomp $segment; my @words = split /\s+/, $segment; foreach my $word ( sort @words ) { next SEGMENT if ( $oov{$word} ); } print $IV "$segment "; } close $IV; close $L; |