Blame view

egs/heroico/s5/local/subs_prepare_data.pl 2.89 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
  #!/usr/bin/env perl
  
  # Copyright 2017 John Morgan
  # Apache 2.0.
  
  # subs_prepare_data.pl - condition subs data for lm training
  
  use strict;
  use warnings;
  use Carp;
  
  use Encode;
  
  # set lower and upper bounds
  my $low_bound = 8;
  # only segments with at least  $low_bound words will be written
  my $up_bound = 16;
  # only segments with fewer than $up_bound words will be written
  
  # input and output files
  
  my $corpus = "OpenSubtitles.en-es.es";
  my $symbol_table = "data/lang/words.txt";
  my $filtered = "data/local/tmp/subs/lm/es.txt";
  my $oovs = "data/local/tmp/subs/lm/oovs.txt";
  my $iv = "data/local/tmp/subs/lm/in_vocabulary.txt";
  
  open my $C, '<', $corpus or croak "problems with $corpus $!";
  
  system "mkdir -p data/local/tmp/subs/lm";
  
  if ( -e $filtered ) {
      warn "$filtered already exists.";
  } else {
    open my $FLT, '+>:utf8', $filtered or croak "problems with $filtered $!";
    LINE: while ( my $line = <$C> ) {
        $line = decode_utf8 $line;
        chomp $line;
  
        my @tokens = split /\s+/, $line;
  
        next LINE if ( ($#tokens < $low_bound) or ($#tokens > $up_bound ));
  
        # remove punctuation
        $line =~ s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[	 ]+)/ /msxg;
        #convert tabs to white space
        $line =~ s/\t/ /g;
        #hard to soft space
        $line =~ s/ / /g;
        #squeeze white space
        $line =~ s/\s+/ /g;
        #initial and final white space
        $line =~ s/^\p{Separator}+//;
        $line =~ s/\p{Separator}+$//;
        #down case
        $line = lc $line;
  
        print $FLT "$line
  ";
    }
    close $FLT;
  }
  close $C;
  
  
  # find out of vocabulary words
  
  # $symbol_table points to a file containing a map of symbols to integers
  
  # hash for word to integer map
  my %sym2int = ();
  
  open my $F, '<', $symbol_table or croak "problem with $symbol_table $!";
  
  # store words to int map in hash
  while( my $line = <$F>) {
      chomp $line;
      my ($s,$i) = split /\s/, $line, 2;
      $sym2int{$s} = $i;
  }
  close $F;
  
  open my $I, '<', $filtered or croak "problem with $filtered $!";
  open my $OOVS, '+>', $oovs or croak "problems with $oovs $!";
  
  while ( my $line = <$I>) {
      chomp $line;
      my @A = split /\s/, $line;
      foreach my $a (@A) {
  	if (!defined ($sym2int{$a})) {
              print $OOVS "$a
  ";
  	}
      }
  }
  close $OOVS;
  close $I;
  
  # remove segments with OOVs
  
  # store OOVS in hash
  my %oov = ();
  open my $V, '<', $oovs or croak "problems with $oovs $!";
  while ( my $line = <$V> ) {
      chomp $line;
      $oov{$line} = 1;
  }
  close $V;
  
  open my $L, '<', $filtered or croak "problems with $filtered $!";
  open my $IV, '+>', $iv or croak "problems with $iv $!";
  
  SEGMENT: while ( my $segment = <$L> ) {
      chomp $segment;
      my @words = split /\s+/, $segment;
      foreach my $word ( sort @words ) {
  	next SEGMENT if ( $oov{$word} );
      }
      print $IV "$segment
  ";
  }
  close $IV;
  close $L;