#!/bin/bash
# Copyright 2014 David Snyder, Daniel Povey
# Apache 2.0.
#
# An in-progress example script for training and evaluating a
# language-identification classifier using logistic regression.

. ./cmd.sh
. ./path.sh
set -e
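
# The config file passes options through to logistic-regression-train
# (for example --max-steps or --normalizer; the exact contents depend
# on the setup).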
config=conf/logistic-regression.conf
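
# Map each dialect-collapsed language name to a zero-based integer id.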
awk '{print $2}' <(lid/remove_dialect.pl data/train/utt2lang) | sort -u | \
  awk '{print $1, NR-1}' > exp/ivectors_train/languages.txt
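# languages.txt pairs each name with an id, one per line, e.g.
# (illustrative; the actual names depend on the data):
#   arabic 0
#   bengali 1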

model=exp/ivectors_train/logistic_regression
model_rebalanced=exp/ivectors_train/logistic_regression_rebalanced
train_ivectors="ark:ivector-normalize-length \
  scp:exp/ivectors_train/ivector.scp ark:- |";
classes="ark:lid/remove_dialect.pl data/train/utt2lang \
  | utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt - |"
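# train_ivectors and classes are Kaldi rspecifiers that run a pipeline
# on the fly: the i-vectors are length-normalized, and each utterance's
# language label is mapped to its integer id from languages.txt, e.g.
# "utt001 arabic" -> "utt001 0" (illustrative utterance id).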

# An alternative set of priors (left commented out): the inverse of each
# language's training-utterance count, which flattens the training-set
# distribution.
#utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt \
# <(lid/remove_dialect.pl data/train/utt2lang) | \
# awk '{print $2}' | sort -n | uniq -c | \
# awk 'BEGIN{printf(" [ ");} {printf("%s ", 1.0/$1); } END{print(" ]"); }' \
# >exp/ivectors_train/inv_priors.vec
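# For example (illustrative counts): with 100 training utterances of
# language 0 and 400 of language 1, inv_priors.vec would be [ 0.01 0.0025 ].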

# Create priors to rebalance the model. The following script rebalances
# the languages as count(lang_test) / (count(lang_test) + count(lang_train)).
lid/balance_priors_to_test.pl \
  <(lid/remove_dialect.pl <(utils/filter_scp.pl \
      exp/ivectors_train/ivector.scp data/train/utt2lang)) \
  <(lid/remove_dialect.pl data/lre07/utt2lang) \
  exp/ivectors_train/languages.txt \
  exp/ivectors_train/priors.vec
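# For example (illustrative counts): a language with 50 test utterances
# and 150 training utterances gets prior 50 / (50 + 150) = 0.25.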

logistic-regression-train --config=$config "$train_ivectors" \
  "$classes" $model \
  2>exp/ivectors_train/log/logistic_regression.log

logistic-regression-copy --scale-priors=exp/ivectors_train/priors.vec \
  $model $model_rebalanced
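# Scaling by priors.vec weights the rebalanced model's class scores
# toward the estimated test-set language distribution rather than the
# training-set counts.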

trials="lid/remove_dialect.pl data/train/utt2lang \
  | utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt - |"
scores="|utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt \
  >exp/ivectors_train/train_scores"

logistic-regression-eval $model "$train_ivectors" \
  ark,t:exp/ivectors_train/posteriors

logistic-regression-eval $model "ark:$trials" "$train_ivectors" "$scores"
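
# Pick the highest-scoring language per utterance. Each line of the
# posteriors file has the form (illustrative utterance id and values)
#   utt001  [ -0.03 -4.1 -5.3 ]
# so the per-language scores run from field 3 to field NF-1 (field NF is
# the closing bracket), and argmax-3 recovers the zero-based language id.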
cat exp/ivectors_train/posteriors | \
  awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max)
        { max=$f; argmax=f; }}
        print $1, (argmax - 3); }' | \
  utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt \
    >exp/ivectors_train/output

# Note: each language label is scored as a one-word sentence, so the
# WER/SER reported by compute-wer equals the identification error rate.
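# E.g. (illustrative): ref "utt001 arabic" vs. hyp "utt001 farsi" counts
# as one substitution, i.e. one misidentified utterance.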
compute-wer --mode=present --text ark:<(lid/remove_dialect.pl data/train/utt2lang) \
  ark:exp/ivectors_train/output
# %WER 4.73 [ 3389 / 71668, 0 ins, 0 del, 3389 sub ] [PARTIAL]
# %SER 4.73 [ 3389 / 71668 ]
# Scored 71668 sentences, 16 not present in hyp.
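
# Decode the lre07 test set with the rebalanced model, again taking the
# most likely language per utterance.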
logistic-regression-eval $model_rebalanced \
  'ark:ivector-normalize-length scp:exp/ivectors_lre07/ivector.scp ark:- |' ark,t:- | \
  awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max)
        { max=$f; argmax=f; }}
        print $1, (argmax - 3); }' | \
  utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt \
    >exp/ivectors_lre07/output

compute-wer --text ark:<(lid/remove_dialect.pl data/lre07/utt2lang) \
  ark:exp/ivectors_lre07/output
# %WER 33.04 [ 2487 / 7527, 0 ins, 0 del, 2487 sub ]
# %SER 33.04 [ 2487 / 7527 ]
# Scored 7527 sentences, 0 not present in hyp.

# General LR closed-set eval.
local/lre07_logistic_regression_eval.sh $model_rebalanced
# Duration (sec):    avg      3     10     30
#         ER (%):  33.04  53.21  29.55  16.37
#      C_avg (%):  17.65  29.53  15.64   7.79