Blame view

egs/apiai_decode/s5/recognize-wav.sh 2.24 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
  #!/bin/bash
  # Copyright 2016 Api.ai (Author: Ilya Platonov)
  # Apache 2.0
  
  # This script demonstrates Kaldi decoding using a pretrained model. It decodes a list of wav files.
  #
  # IMPORTANT: wav files must be in 16kHz, 16 bit little-endian format.
  #
  # This script tries to follow what other scripts do in terms of directory structure and data handling.
  #
  # Use the ./download-model.sh script to download the ASR model.
  # See https://github.com/api-ai/api-ai-english-asr-model for details about the model and how to use it.
  
  # Load path.sh from the current directory into this shell.
  # NOTE(review): presumably this is what puts the Kaldi binaries on PATH --
  # confirm against path.sh itself.
  . ./path.sh

  # Locations used throughout the script, relative to the current directory:
  #   DATA_DIR  - data directory passed to the corpus and feature scripts
  #   MODEL_DIR - directory the model files are read from
  DATA_DIR='data/test-corpus'
  MODEL_DIR='exp/api.ai-model'
  
  # Print the input-format reminder on stdout, one line per argument.
  printf '%s\n' \
    "///////" \
    "// IMPORTANT: wav files must be in 16kHz, 16 bit little-endian format." \
    "//////"
  
  # Pre-flight check: every model file this script reads must exist as a
  # regular file under $MODEL_DIR. mfcc.conf is part of the list because it is
  # handed to steps/make_mfcc.sh as --mfcc-config further down.
  # On the first missing file, report it on stderr and exit with status 1.
  for file in final.mdl HCLG.fst words.txt frame_subsampling_factor mfcc.conf; do
    if [ ! -f "$MODEL_DIR/$file" ]; then
      echo >&2 "$MODEL_DIR/$file not found, use ./download-model.sh"
      exit 1
    fi
  done
  
  # Make sure each required executable can be resolved by the shell; stop with
  # status 1 and a message on stderr at the first one that cannot.
  for tool in nnet3-latgen-faster apply-cmvn lattice-scale; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      echo >&2 "$tool not found, is kaldi compiled?"
      exit 1
    fi
  done
  
  # Hand the data directory and this script's own arguments (the wav files to
  # decode, per the header comment) to local/create-corpus.sh.
  # "$@" is quoted so each argument is forwarded as exactly one word; unquoted,
  # a path containing spaces or glob characters would be split or expanded.
  # Abort with status 1 if the helper fails.
  local/create-corpus.sh "$DATA_DIR" "$@" || exit 1
  
  # Feature step: announce it, then run steps/make_mfcc.sh followed by
  # steps/compute_cmvn_stats.sh on $DATA_DIR. A failure of either script ends
  # the run with status 1; the MFCC failure also prints a hint on stdout.
  printf '%s\n' \
    "///////" \
    "// Computing mfcc and cmvn (cmvn is not really used)" \
    "//////"

  if ! steps/make_mfcc.sh --nj 1 --mfcc-config "$MODEL_DIR/mfcc.conf" --cmd "run.pl" \
      "$DATA_DIR" exp/make_mfcc exp/mfcc; then
    echo "Unable to calculate mfcc, ensure 16kHz, 16 bit little-endian wav format or see log"
    exit 1
  fi

  if ! steps/compute_cmvn_stats.sh "$DATA_DIR" exp/make_mfcc/ exp/mfcc; then
    exit 1
  fi
  
  # Decoding step: announce it, then run nnet3-latgen-faster on the features
  # listed in $DATA_DIR. Being the last command, its exit status becomes the
  # exit status of the script.
  printf '%s\n' \
    "///////" \
    "// Doing decoding (see log for results)" \
    "//////"

  # Value passed to --frame-subsampling-factor, read from the model directory.
  subsampling=$(cat $MODEL_DIR/frame_subsampling_factor)

  # Input: features piped through apply-cmvn with mean and variance
  # normalization both switched off.
  feats_in="ark,s,cs:apply-cmvn --norm-means=false --norm-vars=false --utt2spk=ark:$DATA_DIR/utt2spk scp:$DATA_DIR/cmvn.scp scp:$DATA_DIR/feats.scp ark:- |"

  # Output: lattices piped through lattice-scale and written to exp/lat.1.
  lats_out="ark:|lattice-scale --acoustic-scale=10.0 ark:- ark:-  >exp/lat.1"

  # Decoder options, in the order they are passed on the command line.
  # $subsampling is left unquoted so it expands exactly as a bare command-line
  # word would.
  decoder_opts=(
    --frame-subsampling-factor=$subsampling
    --frames-per-chunk=50
    --extra-left-context=0
    --extra-right-context=0
    --extra-left-context-initial=-1
    --extra-right-context-final=-1
    --minimize=false
    --max-active=7000
    --min-active=200
    --beam=15.0
    --lattice-beam=8.0
    --acoustic-scale=1.0
    --allow-partial=true
    --word-symbol-table=$MODEL_DIR/words.txt
  )

  # Positional arguments: model, decoding graph, feature input, lattice output.
  nnet3-latgen-faster "${decoder_opts[@]}" \
    $MODEL_DIR/final.mdl $MODEL_DIR//HCLG.fst "$feats_in" "$lats_out"