subs_prepare_data.pl 2.89 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120


#!/usr/bin/env perl

# Copyright 2017 John Morgan
# Apache 2.0.

# subs_prepare_data.pl - condition subs data for lm training

use strict;
use warnings;
use Carp;

use Encode;

# set lower and upper bounds
my $low_bound = 8;
# only segments with at least  $low_bound words will be written
my $up_bound = 16;
# only segments with fewer than $up_bound words will be written

# input and output files

my $corpus = "OpenSubtitles.en-es.es";
my $symbol_table = "data/lang/words.txt";
my $filtered = "data/local/tmp/subs/lm/es.txt";
my $oovs = "data/local/tmp/subs/lm/oovs.txt";
my $iv = "data/local/tmp/subs/lm/in_vocabulary.txt";

open my $C, '<', $corpus or croak "problems with $corpus $!";

system "mkdir -p data/local/tmp/subs/lm";

if ( -e $filtered ) {
    warn "$filtered already exists.";
} else {
  open my $FLT, '+>:utf8', $filtered or croak "problems with $filtered $!";
  LINE: while ( my $line = <$C> ) {
      $line = decode_utf8 $line;
      chomp $line;

      my @tokens = split /\s+/, $line;

      next LINE if ( ($#tokens < $low_bound) or ($#tokens > $up_bound ));

      # remove punctuation
      $line =~ s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[	 ]+)/ /msxg;
      #convert tabs to white space
      $line =~ s/\t/ /g;
      #hard to soft space
      $line =~ s/ / /g;
      #squeeze white space
      $line =~ s/\s+/ /g;
      #initial and final white space
      $line =~ s/^\p{Separator}+//;
      $line =~ s/\p{Separator}+$//;
      #down case
      $line = lc $line;

      print $FLT "$line\n";
  }
  close $FLT;
}
close $C;


# find out of vocabulary words

# $symbol_table points to a file containing a map of symbols to integers

# hash for word to integer map
my %sym2int = ();

open my $F, '<', $symbol_table or croak "problem with $symbol_table $!";

# store words to int map in hash
while( my $line = <$F>) {
    chomp $line;
    my ($s,$i) = split /\s/, $line, 2;
    $sym2int{$s} = $i;
}
close $F;

open my $I, '<', $filtered or croak "problem with $filtered $!";
open my $OOVS, '+>', $oovs or croak "problems with $oovs $!";

while ( my $line = <$I>) {
    chomp $line;
    my @A = split /\s/, $line;
    foreach my $a (@A) {
	if (!defined ($sym2int{$a})) {
            print $OOVS "$a\n";
	}
    }
}
close $OOVS;
close $I;

# remove segments with OOVs

# store OOVS in hash
my %oov = ();
open my $V, '<', $oovs or croak "problems with $oovs $!";
while ( my $line = <$V> ) {
    chomp $line;
    $oov{$line} = 1;
}
close $V;

open my $L, '<', $filtered or croak "problems with $filtered $!";
open my $IV, '+>', $iv or croak "problems with $iv $!";

SEGMENT: while ( my $segment = <$L> ) {
    chomp $segment;
    my @words = split /\s+/, $segment;
    foreach my $word ( sort @words ) {
	next SEGMENT if ( $oov{$word} );
    }
    print $IV "$segment\n";
}
close $IV;
close $L;