subs_prepare_data.pl
2.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env perl
# Copyright 2018 John Morgan
# Apache 2.0.
# subs_prepare_data.pl - condition subs data for lm training
use strict;
use warnings;
use Carp;
use Encode;
# Segment length limits, counted in whitespace-separated tokens of the raw
# line (i.e. before punctuation is stripped).
# Only segments with at least $low_bound words will be written.
my $low_bound = 8;
# Only segments with fewer than $up_bound words will be written.
my $up_bound = 16;

# Input and output files.
my $lm_dir      = "data/local/tmp/subs/lm";
my $corp        = "subs.txt";
my $symtab      = "data/lang/words.txt";
my $conditioned = "$lm_dir/ar.txt";
my $oo          = "$lm_dir/oovs.txt";
my $iv          = "$lm_dir/in_vocabulary.txt";

# Refuse to clobber an existing, non-empty conditioned file.  This test must
# run before the file is opened for writing: opening truncates the file, after
# which -s could never be true and the previous contents would already be gone.
croak "$conditioned already exists." if -s $conditioned;

open my $corp_fh, '<', $corp or croak "problems with $corp $!";

# List form of system() bypasses the shell; a non-zero status is fatal because
# every output file of this script lives in this directory.
system( 'mkdir', '-p', $lm_dir ) == 0
    or croak "problems creating $lm_dir (exit status $?)";

open my $cond_fh, '>:utf8', $conditioned
    or croak "problems with $conditioned $!";

LINE: while ( my $line = <$corp_fh> ) {
    $line = decode_utf8($line);
    chomp $line;

    # Length filter.  split ' ' ignores leading whitespace, so the array size
    # is the number of tokens; note that $#tokens would be one less than that.
    my @tokens  = split ' ', $line;
    my $n_words = scalar @tokens;
    next LINE if ( $n_words < $low_bound || $n_words >= $up_bound );

    # Replace punctuation with a space.  \p{Punctuation} already includes the
    # dash, open, close, initial, final, connector and other subcategories.
    $line =~ s/\p{Punctuation}+/ /g;

    # Turn tabs and hard (no-break) spaces into ordinary spaces and squeeze
    # runs of white space.  The hard space is written as an escape so the
    # match does not depend on how this source file is encoded.
    $line =~ s/[\s\x{00A0}]+/ /g;

    # Strip initial and final white space.
    $line =~ s/\A\s+//;
    $line =~ s/\s+\z//;

    # A segment made only of punctuation is empty by now; do not write it.
    next LINE if $line eq '';

    # Down case.
    $line = lc $line;

    print {$cond_fh} "$line\n";
}

close $corp_fh;
# Buffered write errors only surface at close, so check it.
close $cond_fh or croak "problems closing $conditioned $!";
# Find out-of-vocabulary (OOV) words.
# $symtab points to a file containing a map of symbols to integers, one
# "symbol integer" pair per line.  Both this file and the conditioned text are
# read as raw bytes, so the words are compared byte for byte.

# Hash for the word to integer map.
my %sym2int = ();
open my $symtab_fh, '<', $symtab or croak "problem with $symtab $!";
while ( my $line = <$symtab_fh> ) {
    chomp $line;

    # A blank line carries no symbol; skipping it avoids an undef hash key.
    next if $line !~ /\S/;

    my ( $symbol, $id ) = split ' ', $line, 2;
    $sym2int{$symbol} = $id;
}
close $symtab_fh;

# Write every token of the conditioned text that is missing from the symbol
# table, one per line.  Repeated occurrences are written repeatedly.
open my $cond_in, '<', $conditioned or croak "problem with $conditioned $!";
open my $oov_out, '>', $oo or croak "problems with $oo $!";
while ( my $line = <$cond_in> ) {
    chomp $line;
    for my $word ( split ' ', $line ) {

        # exists, not defined: a symbol listed without an integer is still
        # part of the vocabulary.
        next if exists $sym2int{$word};
        print {$oov_out} "$word\n";
    }
}

# Buffered write errors only surface at close, so check it.
close $oov_out or croak "problems closing $oo $!";
close $cond_in;
# Remove segments with OOVs: copy to $iv every line of $conditioned that
# contains none of the words listed in $oo.

# Store the OOVs in a hash for constant-time lookup.
my %oov = ();
open my $oov_in, '<', $oo or croak "problems with $oo $!";
while ( my $line = <$oov_in> ) {
    chomp $line;
    $oov{$line} = 1;
}
close $oov_in;

open my $seg_in, '<', $conditioned or croak "problems with $conditioned $!";
open my $iv_out, '>', $iv or croak "problems with $iv $!";
SEGMENT: while ( my $segment = <$seg_in> ) {
    chomp $segment;

    # The order in which words are checked does not affect the outcome, so
    # the words are scanned as they come rather than sorted first.
    for my $word ( split ' ', $segment ) {
        next SEGMENT if exists $oov{$word};
    }
    print {$iv_out} "$segment\n";
}

# Buffered write errors only surface at close, so check it.
close $iv_out or croak "problems closing $iv $!";
close $seg_in;