Blame view

egs/lre07/v2/local/dnn/fisher_create_test_lang.sh 2.25 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
  #!/bin/bash 
  #
  
  # Source path.sh from the current directory when it exists
  # (presumably sets up PATH for the tools used below -- confirm against path.sh).
  if [ -f path.sh ]; then . ./path.sh; fi

  # ARPA-format language model that G.fst is built from.  Stop right away
  # if it is missing, before touching data/lang_test.
  arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz
  if [ ! -f "$arpa_lm" ]; then
    echo "No such file $arpa_lm" >&2
    exit 1
  fi

  # data/lang_test starts out as a copy of data/lang.  Everything below reads
  # words.txt and L_disambig.fst from this copy, so a failed copy is fatal.
  mkdir -p data/lang_test
  cp -r data/lang/* data/lang_test || exit 1
  
  # Build data/lang_test/G.fst from the ARPA language model.
  #
  # The grep -v '<s> <s>' etc. filters are only for future-proofing this
  # script.  Our LM doesn't have these "invalid combinations".  These can cause
  # determinization failures of CLG [ends up being epsilon cycles].
  # Note: remove_oovs.pl takes a list of words in the LM that aren't in
  # our word list.  Since our LM doesn't have any, we just give it
  # /dev/null [we leave it in the script to show how you'd do it].
  #
  # The pipeline runs in a subshell with pipefail enabled so that a failure of
  # any stage (e.g. gunzip or arpa2fst) is detected instead of silently leaving
  # a broken G.fst behind; the subshell keeps the option from affecting the
  # rest of the script.
  (
    set -o pipefail
    gunzip -c "$arpa_lm" | \
      grep -v '<s> <s>' | \
      grep -v '</s> <s>' | \
      grep -v '</s> </s>' | \
      arpa2fst - | fstprint | \
      utils/remove_oovs.pl /dev/null | \
      utils/eps2disambig.pl | utils/s2eps.pl | \
      fstcompile --isymbols=data/lang_test/words.txt \
        --osymbols=data/lang_test/words.txt --keep_isymbols=false --keep_osymbols=false | \
      fstrmepsilon | fstarcsort --sort_type=ilabel > data/lang_test/G.fst
  ) || { echo "$0: error creating data/lang_test/G.fst from $arpa_lm" >&2; exit 1; }

  # Report how stochastic G is.  This is informational only: the exit status
  # of fstisstochastic is deliberately not checked here.
  echo  "Checking how stochastic G is (the first of these numbers should be small):"
  fstisstochastic data/lang_test/G.fst
  
  ## Check lexicon.
  ## Just have a look and make sure it seems sane.
  echo "First few lines of lexicon FST:"
  fstprint   --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst  | head

  echo Performing further checks

  # The checks below only print a message when something looks wrong;
  # none of them aborts the script.

  # Checking that G.fst is determinizable.
  fstdeterminize data/lang_test/G.fst /dev/null || echo Error determinizing G.

  # Checking that L_disambig.fst is determinizable.
  fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L.

  # Checking that disambiguated lexicon times G is determinizable.
  # Note: we do this with fstdeterminizestar not fstdeterminize, as
  # fstdeterminize was taking forever (presumably relates to a bug
  # in this version of OpenFst that makes determinization slow for
  # some case).
  fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
     fstdeterminizestar >/dev/null || echo Error determinizing LG.

  # Checking that LG is stochastic.  Uses the data/lang_test copy of
  # L_disambig.fst, the same file the determinization checks above read.
  fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \
     fstisstochastic || echo "[log:] LG is not stochastic"


  echo "$0 succeeded"