balance_priors_to_test.pl 1.94 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80


#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2014 David Snyder
# Apache 2.0.
#
# This script produces a vector used by logistic-regression-copy to 
# rescale the logistic regression model which reduces bias due to unbalanced
# classes. This script relies only on the distribution of the test data;
# alternatively, a uniform prior can be used (see run_logistic_regression.sh). 

# The scale parameter controls how sensitive the priors are to the
# distribution of the test data. Typically this ranges from 0.5
# to 1.0. Smaller values are less reliant on the test data distribution.
my ($train_file, $test_file, $lang_file, $scale, $priors_file) = @ARGV;
open(UTT2LANG_TRAIN, "<$train_file") or die "no utt2lang training file";

%train_count = ();
$train_tot = 0;
while(<UTT2LANG_TRAIN>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[1];
  if (not exists($train_count{$lang})) {
    $train_count{$lang} = 1;
  } else {
    $train_count{$lang} += 1;
  }
  $train_tot += 1;
}

open(UTT2LANG_TEST, "<$test_file");

%test_count = ();
$test_tot = 0;
while(<UTT2LANG_TEST>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[1];
  if (not exists($test_count{$lang})) {
    $test_count{$lang} = 1;
  } else {
    $test_count{$lang} += 1;
  }
  $test_tot += 1;
}

foreach my $key (keys %train_count) {
  if (not exists($test_count{$key})) {
    $test_count{$key} = 0;
  }
}

# load languages file
open(LANGUAGES, "<$lang_file");
@idx_to_lang = ();

$largest_idx = 0;
while(<LANGUAGES>) {
  $line = $_;
  chomp($line);
  @words = split(" ", $line);
  $lang = $words[0];
  $idx = $words[1];
  $idx_to_lang[$idx + 0] = $lang;
  if ($idx > $largest_idx) {
    $largest_idx = $idx;
  }
}

$priors = " [ ";
foreach $lang (@idx_to_lang) {
 $ratio = ((1.0*$test_count{$lang}) / $train_count{$lang})**($scale);
 $priors .= "$ratio ";
}

$priors .= " ]";
open(PRIORS, ">$priors_file");
print PRIORS $priors;