#!/bin/bash
# Copyright 2014  David Snyder, Daniel Povey
# Apache 2.0.
#
# An in-progress example script for training and evaluating
# a language-ID classifier using logistic regression.

. ./cmd.sh
. ./path.sh
set -e

config=conf/logistic-regression.conf

# Map each language name in the training data (after removing dialect
# distinctions) to a zero-based integer id.
awk '{print $2}' <(lid/remove_dialect.pl data/train/utt2lang) | sort -u | \
  awk '{print $1, NR-1}' > exp/ivectors_train/languages.txt
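# languages.txt holds one "name id" pair per line; hypothetical contents:
#   arabic 0
#   bengali 1
#   ...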

model=exp/ivectors_train/logistic_regression
model_rebalanced=exp/ivectors_train/logistic_regression_rebalanced
train_ivectors="ark:ivector-normalize-length \
         scp:exp/ivectors_train/ivector.scp ark:- |"
classes="ark:lid/remove_dialect.pl data/train/utt2lang \
         | utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt - |"
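# Both strings above are Kaldi rspecifiers that run a pipeline on the fly:
# "$train_ivectors" length-normalizes the training i-vectors, and "$classes"
# maps each utterance's language label to its integer id in languages.txt.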

# An alternative prior.
#utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt \
#  <(lid/remove_dialect.pl data/train/utt2lang) | \
#  awk '{print $2}' | sort -n | uniq -c | \
#  awk 'BEGIN{printf(" [ ");} {printf("%s ", 1.0/$1); } END{print(" ]"); }' \
#   >exp/ivectors_train/inv_priors.vec
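# (Scaling the priors by these inverse training counts would weight every
# language equally, regardless of its frequency in the training data.)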

# Create priors to rebalance the model. The following script sets each
# language's prior to count(lang_test) / (count(lang_test) + count(lang_train)).
lid/balance_priors_to_test.pl \
    <(lid/remove_dialect.pl <(utils/filter_scp.pl \
        exp/ivectors_train/ivector.scp data/train/utt2lang)) \
    <(lid/remove_dialect.pl data/lre07/utt2lang) \
    exp/ivectors_train/languages.txt \
    exp/ivectors_train/priors.vec
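# Worked example with hypothetical counts: if "english" had 900 training
# utterances and 100 test utterances, its entry in priors.vec would be
# 100 / (100 + 900) = 0.1.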

# Train the logistic-regression model on the length-normalized i-vectors.
logistic-regression-train --config=$config "$train_ivectors" \
                          "$classes" $model \
   2>exp/ivectors_train/log/logistic_regression.log

logistic-regression-copy --scale-priors=exp/ivectors_train/priors.vec \
   $model $model_rebalanced
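# The rebalanced model is a copy of $model whose class priors have been
# scaled by priors.vec, so its posteriors reflect the expected test-set
# language distribution rather than the training-set one.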

trials="lid/remove_dialect.pl data/train/utt2lang \
        | utils/sym2int.pl -f 2 exp/ivectors_train/languages.txt -|"
scores="|utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt  \
        >exp/ivectors_train/train_scores"

# Write per-utterance posteriors over all languages for the training data.
logistic-regression-eval $model "$train_ivectors" \
  ark,t:exp/ivectors_train/posteriors

logistic-regression-eval $model "ark:$trials" "$train_ivectors" "$scores"

# Pick the most likely language for each utterance. Each text-form posterior
# line looks like "utt [ s0 s1 ... ]": field 2 is "[", the scores occupy
# fields 3 through NF-1, and field NF is "]", so (argmax - 3) recovers the
# zero-based language id from languages.txt.
awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max)
                        { max=$f; argmax=f; }}
                        print $1, (argmax - 3); }' \
  exp/ivectors_train/posteriors | \
  utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt \
    >exp/ivectors_train/output

# Note: each language label is treated as a one-word sentence, so the WER/SER
# reported by compute-wer equals the language recognition error rate.
compute-wer --mode=present --text ark:<(lid/remove_dialect.pl data/train/utt2lang) \
  ark:exp/ivectors_train/output
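# --mode=present scores only utterances present in both archives, which is
# why the log below reports some utterances "not present in hyp".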

# %WER 4.73 [ 3389 / 71668, 0 ins, 0 del, 3389 sub ] [PARTIAL]
# %SER 4.73 [ 3389 / 71668 ]
# Scored 71668 sentences, 16 not present in hyp.
# Classify the lre07 evaluation set with the rebalanced model.
logistic-regression-eval $model_rebalanced \
  'ark:ivector-normalize-length scp:exp/ivectors_lre07/ivector.scp ark:- |' ark,t:- | \
  awk '{max=$3; argmax=3; for(f=3;f<NF;f++) { if ($f>max) 
                          { max=$f; argmax=f; }}  
                          print $1, (argmax - 3); }' | \
  utils/int2sym.pl -f 2 exp/ivectors_train/languages.txt >exp/ivectors_lre07/output

compute-wer --text ark:<(lid/remove_dialect.pl data/lre07/utt2lang) \
  ark:exp/ivectors_lre07/output

# %WER 33.04 [ 2487 / 7527, 0 ins, 0 del, 2487 sub ]
# %SER 33.04 [ 2487 / 7527 ]
# Scored 7527 sentences, 0 not present in hyp.

# General closed-set language recognition evaluation on lre07, broken down
# by test segment duration.
local/lre07_logistic_regression_eval.sh $model_rebalanced
# Duration (sec):    avg      3     10     30
#         ER (%):  33.04  53.21  29.55  16.37
#      C_avg (%):  17.65  29.53  15.64   7.79