  #!/bin/bash
  # Copyright 2014  Gaurav Kumar.   Apache 2.0
  
  # To be run from one level above this directory
  # Generate the text for the LM training
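  #
  # Usage (illustrative): local/fsp_train_lms.sh <split-dir>
  # where <split-dir> is assumed to contain a file named "train" listing the
  # .sph conversations to keep for LM training (see the awk filter on the
  # segments file below).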
  tmp_dir=data/local/tmp
  train_all=data/local/data/train_all
  
  if [ $# -lt 1 ]; then
    echo "Specify the location of the split files"
    exit 1;
  fi
  
  splitFile=$1
  split=train
  # Train only
  if [ -d $tmp_dir/$split ]; then
    rm -r $tmp_dir/$split
  fi
  cp -r $train_all $tmp_dir/$split
  
  awk 'BEGIN {FS=" "}; FNR==NR { a[$1]; next } ((substr($2,0,length($2)-2) ".sph") in a)' \
  $splitFile/$split $train_all/segments > $tmp_dir/$split/segments
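  # The awk above keeps only those lines of the segments file whose recording
  # (field 2, minus the trailing channel suffix, with ".sph" appended) is listed
  # in $splitFile/$split; i.e. the split file is expected to contain one .sph
  # filename per line.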
  
  n=`awk 'BEGIN {FS = " "}; {print substr($2,0,length($2)-2)}' $tmp_dir/$split/segments | sort | uniq | wc -l`
  
  echo "$n conversations left in split $split"
  
  utils/fix_data_dir.sh $tmp_dir/$split
  # There is no feature file yet, use --no-feats switch
  utils/validate_data_dir.sh --no-feats $tmp_dir/$split
  
  # Now use this training text
  
  text=$tmp_dir/train/text
  lexicon=data/local/dict/lexicon.txt
  
  for f in "$text" "$lexicon"; do
    [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
  done
  
  # The rest of this script assumes you have already run
  # fisher_data_prep.sh and fisher_prepare_dict.sh
  # It takes as input the files
  #data/train_all/text
  #data/local/dict/lexicon.txt
  
  dir=`pwd`/data/local/lm
  mkdir -p $dir
  export LC_ALL=C # You'll get errors about things not being sorted if you
  # use a different locale.
  export PATH=$PATH:`pwd`/../../../tools/kaldi_lm
  ( # First make sure the kaldi_lm toolkit is installed.
   cd ../../../tools || exit 1;
   if [ -d kaldi_lm ]; then
     echo Not installing the kaldi_lm toolkit since it is already there.
   else
     echo Downloading and installing the kaldi_lm tools
     if [ ! -f kaldi_lm.tar.gz ]; then
       wget http://www.danielpovey.com/files/kaldi/kaldi_lm.tar.gz || exit 1;
     fi
     tar -xvzf kaldi_lm.tar.gz || exit 1;
     cd kaldi_lm
     make || exit 1;
     echo Done making the kaldi_lm tools
   fi
  ) || exit 1;
  
  mkdir -p $dir
  
  
  cleantext=$dir/text.no_oov
  
  cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
    {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<unk> ");} } printf("\n");}' \
    > $cleantext || exit 1;
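  # Note: the loop above starts at field 1, so the utterance-id in the first
  # column is also replaced by <unk> (it isn't in the lexicon); the later
  # commands all skip field 1, so this does not affect the counts.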
  
  
  cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
     sort -nr > $dir/word.counts || exit 1;
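  # word.counts has one "<count> <word>" entry per line, most frequent first.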
  
  
  # Get counts from acoustic training transcripts, and add  one-count
  # for each word in the lexicon (but not silence, we don't want it
  # in the LM-- we'll add it optionally later).
  cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
    cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
     sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
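  # Because of the extra copy of the lexicon appended above, every lexicon word
  # except !SIL gets at least a count of one, so no lexicon word is missing from
  # unigram.counts (and hence from the LM vocabulary).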
  
  # note: we probably won't really make use of <unk> as there aren't any OOVs
  cat $dir/unigram.counts  | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<unk>" > $dir/word_map \
     || exit 1;
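  # Roughly speaking, word_map assigns each word a short code that kaldi_lm
  # uses internally (it keeps train.gz small); the final ARPA LM is written
  # with the original words.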
  
  # note: ignore 1st field of train.txt, it's the utterance-id.
  cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
    { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
     || exit 1;
  
  train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
  
  # Perplexity over 88307.000000 words (excluding 691.000000 OOVs) is 71.241332
  
  # note: output is
  # data/local/lm/3gram-mincount/lm_unpruned.gz
  
  
  exit 0
  
  echo "Baseline"
  
  # From here is some commands to do a baseline with SRILM (assuming
  # you have it installed).
  heldout_sent=158126 # Don't change this if you want result to be comparable with
      # kaldi_lm results
  sdir=$dir/srilm # in case we want to use SRILM to double-check perplexities.
  mkdir -p $sdir
  cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
    head -$heldout_sent > $sdir/heldout
  cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
    tail -n +$heldout_sent > $sdir/train
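  # Note: "tail -n +$heldout_sent" starts at line $heldout_sent, so the last
  # heldout sentence is also included in the SRILM training text.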
  
  cat $dir/word_map | awk '{print $1}' | cat - <(echo "<s>"; echo "</s>" ) > $sdir/wordlist
  
  
  ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \
    -map-unk "<unk>" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz
  ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout
  
  # data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for <unk> in closed-vocabulary LM
  # file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs
  # 0 zeroprobs, logprob= -165170 ppl= 71.7609 ppl1= 123.258
  
  
  # Note: the perplexity SRILM gives to the Kaldi-LM model is similar to what kaldi-lm reports above.
  # Difference in WSJ must have been due to different treatment of <unk>.
  ngram -lm $dir/3gram-mincount/lm_unpruned.gz  -ppl $sdir/heldout
  
  # data/local/lm/srilm/srilm.o3g.kn.gz: line 71: warning: non-zero probability for <unk> in closed-vocabulary LM
  # file data/local/lm/srilm/heldout: 10000 sentences, 78998 words, 0 OOVs
  # 0 zeroprobs, logprob= -164990 ppl= 71.4278 ppl1= 122.614