Blame view

egs/wsj/s5/local/wsj_format_local_lms.sh 2.77 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
  #!/bin/bash
  
  # Copyright Johns Hopkins University (Author: Daniel Povey) 2012
  #           Guoguo Chen 2014
  
  # Suffix inserted into the lang directory names used throughout this script
  # (data/lang${lang_suffix}_bd, data/lang${lang_suffix}_test_bd_*).  Empty by
  # default.  It is declared before utils/parse_options.sh is sourced,
  # presumably so that it can be overridden from the command line -- confirm
  # against utils/parse_options.sh.
  lang_suffix=

  echo "$0 $@"  # Print the command line for logging
  . ./path.sh
  . utils/parse_options.sh || exit 1;

  # data/lang${lang_suffix}_bd is copied to create every test directory later
  # in this script, so it has to exist already.  The message names the same
  # path that is tested.
  if [ ! -d "data/lang${lang_suffix}_bd" ]; then
    echo "Expect data/lang${lang_suffix}_bd to exist"
    exit 1
  fi
  
  # Directories holding the gzipped 3-gram and 4-gram ARPA LMs that are read
  # later in this script.
  lm_srcdir_3g=data/local/local_lm/3gram-mincount
  lm_srcdir_4g=data/local/local_lm/4gram-mincount

  # Both source directories are required; stop at the first one that is
  # missing.
  if [ ! -d "$lm_srcdir_3g" ]; then
    echo "No such dir $lm_srcdir_3g"
    exit 1
  fi
  if [ ! -d "$lm_srcdir_4g" ]; then
    echo "No such dir $lm_srcdir_4g"
    exit 1
  fi
  
  # Recreate each test lang directory as a fresh copy of
  # data/lang${lang_suffix}_bd.
  # The old directory must really be gone before copying: if it survived,
  # "cp -r" would place the copy *inside* it instead of replacing it.  "rm -rf"
  # succeeds when the directory does not exist yet, so any non-zero status here
  # is a real failure and aborts the script.
  for d in "data/lang${lang_suffix}_test_bd_"{tg,tgpr,tgconst,fg,fgpr,fgconst}; do
    rm -rf -- "$d" || exit 1
    cp -r -- "data/lang${lang_suffix}_bd" "$d" || exit 1
  done
  
  # Lang directory that supplies the symbol table (words.txt) and the OOV id
  # (oov.int) for the LM conversions later in this script.
  lang=data/lang${lang_suffix}_bd

  # Check a few files that we have to use.
  for f in words.txt oov.int; do
    if [[ ! -f $lang/$f ]]; then
      echo "$0: no such file $lang/$f"
      exit 1;
    fi
  done

  # Parameters needed for ConstArpaLm.
  #   unk - contents of oov.int
  #   bos - second column of the words.txt entry whose first column is <s>
  #   eos - second column of the words.txt entry whose first column is </s>
  # The symbol is compared against the first column exactly (and only the
  # first hit is used), so an entry that merely contains "<s>" cannot leak a
  # second id into the value.
  unk=$(cat "$lang/oov.int")
  bos=$(awk '$1 == "<s>" {print $2; exit}' "$lang/words.txt")
  eos=$(awk '$1 == "</s>" {print $2; exit}' "$lang/words.txt")
  if [[ -z $unk ]]; then
    echo "$0: $lang/oov.int is empty"
    exit 1;
  fi
  if [[ -z $bos || -z $eos ]]; then
    echo "$0: <s> and </s> symbols are not in $lang/words.txt"
    exit 1;
  fi
  
  # Convert the gzipped ARPA LMs into the files expected in the
  # data/lang${lang_suffix}_test_bd_* directories:
  #   tgpr, tg, fg, fgpr -> G.fst   (arpa2fst)
  #   tgconst, fgconst   -> G.carpa (arpa-to-const-arpa)
  #
  # Be careful: this time we dispense with the grep -v '<s> <s>' so this might
  # not work for LMs generated from all toolkits.

  # Abort with a clear message when a gzipped ARPA LM is missing.
  #   $1 - path to the gzipped ARPA LM
  require_lm() {
    if [[ ! -f $1 ]]; then
      echo "$0: no such file $1"
      exit 1
    fi
  }

  # Compile a gzipped ARPA LM into G.fst inside a test lang directory.
  #   $1 - path to the gzipped ARPA LM
  #   $2 - destination lang directory; G.fst is written there
  # Reads the symbol table from $lang/words.txt.
  # Exits the script with status 1 if any stage of the pipeline fails.
  make_g_fst() {
    local arpa_gz=$1
    local dest_dir=$2
    require_lm "$arpa_gz"
    # pipefail is enabled only inside this subshell, so that a gunzip failure
    # is not masked by the exit status of arpa2fst while the rest of the
    # script keeps its default pipeline semantics.
    (
      set -o pipefail
      gunzip -c "$arpa_gz" | \
        arpa2fst --disambig-symbol=#0 \
                 --read-symbol-table="$lang/words.txt" - "$dest_dir/G.fst"
    ) || exit 1
    # Diagnostic only: the exit status is not checked, so the script carries
    # on whatever fstisstochastic reports.
    fstisstochastic "$dest_dir/G.fst"
  }

  # Build a ConstArpaLm (G.carpa) from a gzipped ARPA LM.
  #   $1 - path to the gzipped ARPA LM
  #   $2 - destination lang directory; G.carpa is written there
  # Reads $lang/words.txt and the $bos, $eos and $unk symbol ids.
  # Exits the script with status 1 if any stage of the pipeline fails.
  make_const_arpa() {
    local arpa_gz=$1
    local dest_dir=$2
    require_lm "$arpa_gz"
    (
      set -o pipefail
      gunzip -c "$arpa_gz" | \
        utils/map_arpa_lm.pl "$lang/words.txt" | \
        arpa-to-const-arpa --bos-symbol="$bos" --eos-symbol="$eos" \
          --unk-symbol="$unk" - "$dest_dir/G.carpa"
    ) || exit 1
  }

  # 3-gram models: pruned G.fst, unpruned G.fst, unpruned ConstArpaLm.
  make_g_fst "$lm_srcdir_3g/lm_pr6.0.gz" "data/lang${lang_suffix}_test_bd_tgpr"
  make_g_fst "$lm_srcdir_3g/lm_unpruned.gz" "data/lang${lang_suffix}_test_bd_tg"
  make_const_arpa "$lm_srcdir_3g/lm_unpruned.gz" "data/lang${lang_suffix}_test_bd_tgconst"

  # 4-gram models: unpruned G.fst, unpruned ConstArpaLm, pruned G.fst.
  make_g_fst "$lm_srcdir_4g/lm_unpruned.gz" "data/lang${lang_suffix}_test_bd_fg"
  make_const_arpa "$lm_srcdir_4g/lm_unpruned.gz" "data/lang${lang_suffix}_test_bd_fgconst"
  make_g_fst "$lm_srcdir_4g/lm_pr7.0.gz" "data/lang${lang_suffix}_test_bd_fgpr"

  exit 0;