Blame view
egs/lre07/v1/lid/balance_priors_to_test.pl
1.94 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
#!/usr/bin/env perl
use strict;
use warnings; #sed replacement for -w perl parameter
# Copyright 2014 David Snyder
# Apache 2.0.
#
# This script produces a vector used by logistic-regression-copy to
# rescale the logistic regression model which reduces bias due to unbalanced
# classes. This script relies only on the distribution of the test data;
# alternatively, a uniform prior can be used (see run_logistic_regression.sh).
# The scale parameter controls how sensitive the priors are to the
# distribution of the test data. Typically this ranges from 0.5
# to 1.0. Smaller values are less reliant on the test data distribution.
#
# Usage:
#   balance_priors_to_test.pl <train-utt2lang> <test-utt2lang> <languages> \
#                             <scale> <priors-out>
#
#   <train-utt2lang>, <test-utt2lang>
#       Whitespace-separated text; the second field of each line is the
#       language label that gets counted.
#   <languages>
#       Whitespace-separated text; each line is "<language> <index>".
#   <scale>
#       Exponent applied to each (test count / train count) ratio.
#   <priors-out>
#       Output file. Receives " [ r0 r1 ... rN  ]" (no trailing newline),
#       where r_i is the ratio for the language whose index is i.

use Scalar::Util qw(looks_like_number);

# Reads a two-column file and returns a hash ref mapping each value found in
# the second column to the number of lines it appears on.
#   $path        - file to read
#   $description - short name of the file's role, used in the error message
# Dies if the file cannot be opened. Blank lines are ignored; lines with only
# one field are skipped with a warning because they carry no language label.
sub count_languages {
  my ($path, $description) = @_;
  open(my $fh, '<', $path)
    or die "no $description file '$path': $!\n";
  my %count;
  while (my $line = <$fh>) {
    chomp($line);
    my @words = split(" ", $line);
    next unless @words;
    if (@words < 2) {
      warn "$0: skipping line $. of $path: no language field\n";
      next;
    }
    $count{$words[1]}++;
  }
  close($fh);
  return \%count;
}

# Reads the languages file and returns a list in which position i holds the
# language name whose index is i (undef where no language has that index).
# Dies if the file cannot be opened or a non-blank line is not of the form
# "<language> <non-negative integer>".
sub load_languages {
  my ($path) = @_;
  open(my $fh, '<', $path)
    or die "cannot open languages file '$path': $!\n";
  my @idx_to_lang;
  while (my $line = <$fh>) {
    my ($lang, $idx) = split(" ", $line);
    # A blank line must not be stored: it would overwrite index 0 with undef.
    next unless defined($lang);
    unless (defined($idx) && $idx =~ /\A\d+\z/) {
      die "$0: bad line $. in $path: expected '<language> <index>'\n";
    }
    $idx_to_lang[$idx] = $lang;
  }
  close($fh);
  return @idx_to_lang;
}

if (@ARGV != 5) {
  die "Usage: $0 <train-utt2lang> <test-utt2lang> <languages> " .
      "<scale> <priors-out>\n";
}
my ($train_file, $test_file, $lang_file, $scale, $priors_file) = @ARGV;
looks_like_number($scale)
  or die "$0: scale must be a number, got '$scale'\n";

my $train_count = count_languages($train_file, "utt2lang training");
my $test_count = count_languages($test_file, "utt2lang test");
my @idx_to_lang = load_languages($lang_file);

# Build the vector in index order so that element i belongs to language i.
my $priors = " [ ";
for my $idx (0 .. $#idx_to_lang) {
  my $lang = $idx_to_lang[$idx];
  defined($lang)
    or die "$0: no language has index $idx in $lang_file\n";

  # A language that never occurs in the test data counts as zero there.
  my $n_test = exists($test_count->{$lang}) ? $test_count->{$lang} : 0;
  my $n_train = exists($train_count->{$lang}) ? $train_count->{$lang} : 0;

  # Without training utterances the ratio is undefined (division by zero).
  $n_train > 0
    or die "$0: language '$lang' has no training utterances in " .
           "$train_file; cannot compute its prior\n";

  my $ratio = ((1.0 * $n_test) / $n_train) ** $scale;
  $priors .= "$ratio ";
}
$priors .= " ]";

# The output is opened only after everything has been computed, so a failure
# above never leaves a truncated priors file behind.
open(my $out, '>', $priors_file)
  or die "cannot open priors file '$priors_file' for writing: $!\n";
print {$out} $priors
  or die "cannot write to priors file '$priors_file': $!\n";
# Buffered write errors only surface at close, so check it.
close($out)
  or die "cannot close priors file '$priors_file': $!\n";