#!/bin/bash
# Copyright 2014 David Snyder, Daniel Povey
# Apache 2.0.
#
# This script trains a logistic regression model on top of
# i-Vectors, and evaluates it on the NIST LRE07 closed-set
# evaluation.

. ./cmd.sh
. ./path.sh
set -e

train_dir=exp/ivectors_train
test_dir=exp/ivectors_lre07
model_dir=exp/ivectors_train
train_utt2lang=data/train_lr/utt2lang
test_utt2lang=data/lre07/utt2lang
prior_scale=1.0
apply_log=true # If true, the logistic-regression-eval binary outputs
               # log-posteriors; if false, it outputs probabilities.
conf=conf/logistic-regression.conf
languages=local/general_lr_closed_set_langs.txt

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
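
# All of the options above can be overridden on the command line via
# parse_options.sh, e.g. (assumed invocation; in the lre07 recipe this
# script normally lives under lid/):
#   lid/run_logistic_regression.sh --prior-scale 0.70
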
mkdir -p $model_dir/log
model=$model_dir/logistic_regression
model_rebalanced=$model_dir/logistic_regression_rebalanced
train_ivectors="ark:ivector-normalize-length \
scp:$train_dir/ivector.scp ark:- |";
test_ivectors="ark:ivector-normalize-length \
scp:$test_dir/ivector.scp ark:- |";
classes="ark:lid/remove_dialect.pl $train_utt2lang \
| utils/sym2int.pl -f 2 $languages - |"

# Alternative (commented out): inverse-frequency priors, i.e. weight each
# language by 1.0/count so that the effective prior is uniform.
#utils/sym2int.pl -f 2 $languages \
#  <(lid/remove_dialect.pl $train_utt2lang) | \
#  awk '{print $2}' | sort -n | uniq -c | \
#  awk 'BEGIN{printf(" [ ");} {printf("%s ", 1.0/$1); } END{print(" ]"); }' \
#  >$model_dir/inv_priors.vec

# Create priors to rebalance the model. The following script rebalances
# the languages as ( count(lang_test) / count(lang_train) )^(prior_scale).
lid/balance_priors_to_test.pl \
  <(lid/remove_dialect.pl <(utils/filter_scp.pl -f 1 \
      $train_dir/ivector.scp $train_utt2lang)) \
  <(lid/remove_dialect.pl $test_utt2lang) \
  $languages \
  $prior_scale \
  $model_dir/priors.vec
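# priors.vec should hold one scale per language id, in Kaldi vector text
# format; it is consumed by logistic-regression-copy below.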

logistic-regression-train --config=$conf "$train_ivectors" \
  "$classes" $model \
  2>$model_dir/log/logistic_regression.log

logistic-regression-copy --scale-priors=$model_dir/priors.vec \
  $model $model_rebalanced
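# --scale-priors rescales the model's class priors by priors.vec, folding the
# test-set rebalancing into $model_rebalanced without retraining.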

# Compute posteriors on the training data with the unrebalanced model.
logistic-regression-eval --apply-log=$apply_log $model \
  "$train_ivectors" ark,t:$train_dir/posteriors
cat $train_dir/posteriors | \
  awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max)
                          { max=$f; argmax=f; }}
        print $1, (argmax - 3); }' | \
  utils/int2sym.pl -f 2 $languages \
    >$train_dir/output

# Note: we treat each language label as a one-word sentence, so the WER/SER
# reported by compute-wer is exactly the language recognition error rate.
compute-wer --mode=present --text ark:<(lid/remove_dialect.pl $train_utt2lang) \
  ark:$train_dir/output
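# (--mode=present tells compute-wer to skip utterances that are missing a
# hypothesis rather than treating them as errors or aborting.)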

# Evaluate on the test data, here the NIST LRE07 closed-set evaluation,
# using the rebalanced model.
logistic-regression-eval --apply-log=$apply_log $model_rebalanced \
  "$test_ivectors" ark,t:$test_dir/posteriors
cat $test_dir/posteriors | \
  awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max)
                          { max=$f; argmax=f; }}
        print $1, (argmax - 3); }' | \
  utils/int2sym.pl -f 2 $languages \
    >$test_dir/output
compute-wer --text ark:<(lid/remove_dialect.pl $test_utt2lang) \
  ark:$test_dir/output