subs_prepare_data.pl
2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env perl
# Copyright 2017 John Morgan
# Apache 2.0.
# subs_prepare_data.pl - condition subs data for lm training
use strict;
use warnings;
use Carp;
use Encode;
# set lower and upper bounds
my $low_bound = 8;
# only segments with at least $low_bound words will be written
my $up_bound = 16;
# only segments with fewer than $up_bound words will be written
# input and output files
my $corpus = "OpenSubtitles.en-es.es";
my $symbol_table = "data/lang/words.txt";
my $filtered = "data/local/tmp/subs/lm/es.txt";
my $oovs = "data/local/tmp/subs/lm/oovs.txt";
my $iv = "data/local/tmp/subs/lm/in_vocabulary.txt";
open my $C, '<', $corpus or croak "problems with $corpus $!";
system "mkdir -p data/local/tmp/subs/lm";
if ( -e $filtered ) {
warn "$filtered already exists.";
} else {
open my $FLT, '+>:utf8', $filtered or croak "problems with $filtered $!";
LINE: while ( my $line = <$C> ) {
$line = decode_utf8 $line;
chomp $line;
my @tokens = split /\s+/, $line;
next LINE if ( ($#tokens < $low_bound) or ($#tokens > $up_bound ));
# remove punctuation
$line =~ s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg;
#convert tabs to white space
$line =~ s/\t/ /g;
#hard to soft space
$line =~ s/ / /g;
#squeeze white space
$line =~ s/\s+/ /g;
#initial and final white space
$line =~ s/^\p{Separator}+//;
$line =~ s/\p{Separator}+$//;
#down case
$line = lc $line;
print $FLT "$line\n";
}
close $FLT;
}
close $C;
# find out of vocabulary words
# $symbol_table points to a file containing a map of symbols to integers
# hash for word to integer map
my %sym2int = ();
open my $F, '<', $symbol_table or croak "problem with $symbol_table $!";
# store words to int map in hash
while( my $line = <$F>) {
chomp $line;
my ($s,$i) = split /\s/, $line, 2;
$sym2int{$s} = $i;
}
close $F;
open my $I, '<', $filtered or croak "problem with $filtered $!";
open my $OOVS, '+>', $oovs or croak "problems with $oovs $!";
while ( my $line = <$I>) {
chomp $line;
my @A = split /\s/, $line;
foreach my $a (@A) {
if (!defined ($sym2int{$a})) {
print $OOVS "$a\n";
}
}
}
close $OOVS;
close $I;
# remove segments with OOVs
# store OOVS in hash
my %oov = ();
open my $V, '<', $oovs or croak "problems with $oovs $!";
while ( my $line = <$V> ) {
chomp $line;
$oov{$line} = 1;
}
close $V;
open my $L, '<', $filtered or croak "problems with $filtered $!";
open my $IV, '+>', $iv or croak "problems with $iv $!";
SEGMENT: while ( my $segment = <$L> ) {
chomp $segment;
my @words = split /\s+/, $segment;
foreach my $word ( sort @words ) {
next SEGMENT if ( $oov{$word} );
}
print $IV "$segment\n";
}
close $IV;
close $L;