Blame view

egs/voxforge/online_demo/run.sh 4.04 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
  #!/bin/bash
  
  # Copyright 2012 Vassil Panayotov
  # Apache 2.0
  
  # Note: you have to do 'make ext' in ../../../src/ before running this.
  
  # Put the helper scripts (../s5/utils) and the Kaldi binaries
  # (src/onlinebin, src/bin) in front of the existing search path.
  KALDI_ROOT="$(pwd)/../../.."
  PATH="$PWD/../s5/utils/:$KALDI_ROOT/src/onlinebin:$KALDI_ROOT/src/bin:$PATH"
  export PATH
  
  # Base name of the archive with the test models and audio (the file on disk
  # is ${data_file}.tar.bz2), and the URL it is downloaded from.
  data_file="online-data"
  data_url="http://sourceforge.net/projects/kaldi/files/online-data.tar.bz2"

  # Change this to "tri2a" if you like to test using a ML-trained model
  ac_model_type=tri2b_mmi

  # Alignments and decoding results are saved in this directory
  # (simulated decoding only)
  decode_dir="./work"

  # Change this to "live" either here or using command line switch like:
  # --test-mode live
  test_mode="simulated"

  # parse_options.sh is looked up via PATH and handles the command-line
  # switches. Stop here if it cannot be sourced or reports an error, rather
  # than silently carrying on with the defaults above.
  . parse_options.sh || exit 1

  # Locations derived from the settings above
  ac_model="${data_file}/models/${ac_model_type}"
  trans_matrix=""   # filled in further down when the model ships a matrix file
  audio="${data_file}/audio"
  
  # Fetch the archive with the test models and audio unless a non-empty copy
  # is already present in the current directory.
  archive="${data_file}.tar.bz2"
  if [ ! -s "$archive" ]; then
      echo "Downloading test models and data ..."
      # -O pins the local file name to the one tested here. A failed or
      # interrupted transfer is removed, so a truncated archive is not
      # mistaken for a complete one on the next run.
      if ! wget -T 10 -t 3 -O "$archive" "$data_url" || [ ! -s "$archive" ]; then
          rm -f -- "$archive"
          echo "Download of $data_file has failed!" >&2
          exit 1
      fi
  fi

  # Unpack the archive when the selected acoustic model directory is missing
  if [ ! -d "$ac_model" ]; then
      echo "Extracting the models and data ..."
      if ! tar xf "$archive"; then
          echo "Extraction of $archive has failed!" >&2
          exit 1
      fi
      # Guards against a model type that the archive does not contain
      if [ ! -d "$ac_model" ]; then
          echo "Model directory $ac_model not found after extraction!" >&2
          exit 1
      fi
  fi
  
  # Pick up the model's matrix file when it exists and is non-empty;
  # otherwise trans_matrix keeps whatever value it already had.
  matrix_file=$ac_model/matrix
  if test -s "$matrix_file"; then
      trans_matrix=$matrix_file
  fi
  
  # Run the decoder in the mode selected by $test_mode:
  #   live      - runs online-gmm-decode-faster (microphone demo)
  #   simulated - runs online-wav-gmm-decode-faster on the .wav files found
  #               under $audio; the input list, transcripts and alignments
  #               are written into $decode_dir
  # Any other value is reported as an error and the script exits with 1.
  case "$test_mode" in
      live)
          printf '%s\n' \
              "" \
              "  LIVE DEMO MODE - you can use a microphone and say something" \
              "" \
              "  The (bigram) language model used to build the decoding graph was" \
              "  estimated on an audio book's text. The text in question is" \
              "  \"King Solomon's Mines\" (http://www.gutenberg.org/ebooks/2166)." \
              "  You may want to read some sentences from this book first ..." \
              ""
          # ${trans_matrix:+...} passes the matrix argument only when one was
          # found; an empty value must not become an empty positional argument.
          online-gmm-decode-faster --rt-min=0.5 --rt-max=0.7 --max-active=4000 \
              --beam=12.0 --acoustic-scale=0.0769 "$ac_model/model" \
              "$ac_model/HCLG.fst" "$ac_model/words.txt" '1:2:3:4:5' \
              ${trans_matrix:+"$trans_matrix"}
          ;;

      simulated)
          printf '%s\n' \
              "" \
              "  SIMULATED ONLINE DECODING - pre-recorded audio is used" \
              "" \
              "  The (bigram) language model used to build the decoding graph was" \
              "  estimated on an audio book's text. The text in question is" \
              "  \"King Solomon's Mines\" (http://www.gutenberg.org/ebooks/2166)." \
              "  The audio chunks to be decoded were taken from the audio book read" \
              "  by John Nicholson(http://librivox.org/king-solomons-mines-by-haggard/)" \
              "" \
              "  NOTE: Using utterances from the book, on which the LM was estimated" \
              "        is considered to be \"cheating\" and we are doing this only for" \
              "        the purposes of the demo." \
              "" \
              "  You can type \"./run.sh --test-mode live\" to try it using your" \
              "  own voice!" \
              ""
          mkdir -p "$decode_dir" || exit 1

          # Make an input .scp file: one "<utterance-id> <wav-path>" line per
          # recording, the id being the file name without its .wav suffix.
          : > "$decode_dir/input.scp"
          for wav in "$audio"/*.wav; do
              # An unmatched glob is left as the literal pattern; skip it
              # instead of listing a file that does not exist.
              [ -e "$wav" ] || continue
              utt=${wav##*/}
              utt=${utt%.wav}
              printf '%s %s\n' "$utt" "$wav" >> "$decode_dir/input.scp"
          done
          if [ ! -s "$decode_dir/input.scp" ]; then
              echo "No .wav files found in $audio!" >&2
              exit 1
          fi

          # Stop on decoder failure so that the error-rate step further down
          # does not score stale or missing output.
          online-wav-gmm-decode-faster --verbose=1 --rt-min=0.8 --rt-max=0.85 \
              --max-active=4000 --beam=12.0 --acoustic-scale=0.0769 \
              "scp:$decode_dir/input.scp" "$ac_model/model" "$ac_model/HCLG.fst" \
              "$ac_model/words.txt" '1:2:3:4:5' "ark,t:$decode_dir/trans.txt" \
              "ark,t:$decode_dir/ali.txt" ${trans_matrix:+"$trans_matrix"} || {
              echo "Simulated decoding has failed!" >&2
              exit 1
          }
          ;;

      *)
          echo "Invalid test mode! Should be either \"live\" or \"simulated\"!" >&2
          exit 1
          ;;
  esac
  
  # Score the simulated decoding run against the reference transcripts
  if [[ "$test_mode" == "simulated" ]]; then
      ref_ids=$decode_dir/ref.txt
      hyp_ids=$decode_dir/hyp.txt

      # References: map the words (field 2 onwards) to IDs from words.txt
      sym2int.pl -f 2- "$ac_model/words.txt" < "$audio/trans.txt" > "$ref_ids"

      # Hypotheses: sed reduces each "test<N><suffix>" key to "test<N>", then
      # gawk joins all lines that share a key into a single line
      sed -e 's/^\(test[0-9]\+\)\([^ ]\+\)\(.*\)/\1 \3/' < "$decode_dir/trans.txt" |
          gawk '{key=$1; $1=""; arr[key]=arr[key] " " $0; } END { for (k in arr) { print k " " arr[k]} }' > "$hyp_ids"

      # Finally compute WER
      compute-wer --mode=present "ark,t:$ref_ids" "ark,t:$hyp_ids"
  fi