Blame view
egs/tunisian_msa/s5/local/subs_prepare_data.pl
2.86 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
#!/usr/bin/env perl # Copyright 2018 John Morgan # Apache 2.0. # subs_prepare_data.pl - condition subs data for lm training use strict; use warnings; use Carp; use Encode; # set lower and upper bounds my $low_bound = 8; # only segments with at least $low_bound words will be written my $up_bound = 16; # only segments with fewer than $up_bound words will be written # input and output files my $corp = "subs.txt"; my $symtab = "data/lang/words.txt"; my $conditioned = "data/local/tmp/subs/lm/ar.txt"; my $oo = "data/local/tmp/subs/lm/oovs.txt"; my $iv = "data/local/tmp/subs/lm/in_vocabulary.txt"; open my $CORP, '<', $corp or croak "problems with $corp $!"; system "mkdir -p data/local/tmp/subs/lm"; open my $COND, '+>:utf8', $conditioned or croak "problems with $conditioned $!"; if ( -s $conditioned ) { croak "$conditioned already exists."; } else { LINE: while ( my $line = <$CORP> ) { $line = decode_utf8 $line; chomp $line; my @tokens = split /\s+/, $line; next LINE if ( ($#tokens < $low_bound) or ($#tokens > $up_bound )); # remove punctuation $line =~ s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg; #convert tabs to white space $line =~ s/\t/ /g; #hard to soft space $line =~ s/ / /g; #squeeze white space $line =~ s/\s+/ /g; #initial and final white space $line =~ s/^\p{Separator}+//; $line =~ s/\p{Separator}+$//; #down case $line = lc $line; print $COND "$line "; } }close $CORP; close $COND; # find out of vocabulary words # $symtab points to a file containing a map of symbols to integers # hash for word to integer map my %sym2int = (); open my $F, '<', $symtab or croak "problem with $symtab $!"; # store words to int map in hash while( my $line = <$F>) { chomp $line; my ($s,$i) = split /\s/, $line, 2; $sym2int{$s} = $i; } close $F; open my $I, '<', $conditioned or croak "problem with $conditioned $!"; open my $OO, '+>', $oo or croak "problems with $oo $!"; while ( my $line = <$I>) { chomp $line; my @A = split /\s/, $line; foreach my $a (@A) { if (!defined ($sym2int{$a})) { print $OO "$a "; } } } close $OO; close $I; # remove segments with OOVs # store OOVS in hash my %oov = (); open my $V, '<', $oo or croak "problems with $oo $!"; while ( my $line = <$V> ) { chomp $line; $oov{$line} = 1; } close $V; open my $L, '<', $conditioned or croak "problems with $conditioned $!"; open my $IV, '+>', $iv or croak "problems with $iv $!"; SEGMENT: while ( my $segment = <$L> ) { chomp $segment; my @words = split /\s+/, $segment; foreach my $word ( sort @words ) { next SEGMENT if ( $oov{$word} ); } print $IV "$segment "; } close $IV; close $L; |