Blame view

egs/commonvoice/s5/local/prepare_dict.sh 2.74 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
  #!/bin/bash
  
  # Copyright 2012   Vassil Panayotov
  #           2017   Ewald Enzinger
  # Apache 2.0
  
  # Adapted from egs/voxforge/s5/local/voxforge_prepare_dict.sh (commit acb5439bf97a39398d5eeb926a2a5cfa71b5f72a)
  
  # Load the recipe environment first; nothing below is attempted if that fails.
  # (path.sh presumably defines KALDI_ROOT, which is used further down -- confirm.)
  if ! . path.sh; then
    exit 1
  fi

  # All dictionary artefacts are written below data/local/dict.
  locdata=data/local
  locdict="${locdata}/dict"
  cmudict_dir="${locdict}/cmudict"

  echo "=== Preparing the dictionary ..."

  # The CMU dictionary is checked out only when cmudict.0.7a is not already
  # present, so repeated runs reuse the existing working copy.
  [ -f "${cmudict_dir}/cmudict.0.7a" ] || {
    echo "--- Downloading CMU dictionary ..."
    mkdir -p "${locdict}"
    # A failed checkout aborts the whole script.
    svn co http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict "${cmudict_dir}" \
      || exit 1
  }
  
  # Filter (stdin -> stdout): remove a pronunciation-variant marker such as
  # "(2)" that directly follows the headword at the start of a line.  Lines
  # without such a marker pass through unchanged.
  #
  # Only POSIX BRE syntax is used.  Inside a bracket expression "\s" is NOT a
  # whitespace escape (it stands for a literal backslash or the letter "s"),
  # and "\s" / "\+" outside brackets are GNU extensions that other sed
  # implementations may reject, so [[:space:]] and \{1,\} are used instead.
  strip_variant_markers() {
    sed -e 's/^\([^[:space:](]\{1,\}\)([0-9]\{1,\})\([[:space:]]\{1,\}\)\(.*\)/\1\2\3/'
  }

  echo "--- Striping stress and pronunciation variant markers from cmudict ..."
  # make_baseform.pl ships with the cmudict checkout (it presumably handles
  # the stress marks -- confirm); it is told to write to /dev/stdout so its
  # output can be piped.  The variant markers are then stripped and the whole
  # line (word and pronunciation) is lower-cased.
  perl "$locdict/cmudict/scripts/make_baseform.pl" \
    "$locdict/cmudict/cmudict.0.7a" /dev/stdout |\
    strip_variant_markers | tr 'A-Z' 'a-z' > "$locdict/cmudict-plain.txt"
  
  # Print the lines of data file $3, selected by whether their first field
  # occurs as a first field in key file $2.
  #   $1 = "in"  -> keep lines whose first field IS in the key file
  #   $1 = "out" -> keep lines whose first field is NOT in the key file
  # The key file is recognised via FILENAME rather than the NR==FNR idiom:
  # with NR==FNR an empty key file makes awk treat the data file as the key
  # file and print nothing at all.
  filter_by_first_field() {
    awk -v mode="$1" '
      FILENAME == ARGV[1] { keys[$1]; next }
      (($1 in keys) ? "in" : "out") == mode
    ' "$2" "$3"
  }

  echo "--- Searching for OOV words ..."
  # Vocabulary entries that have no pronunciation in the plain dictionary.
  # Lines matching '<.?s>' (e.g. <s> and </s>) are dropped.
  # grep -E replaces the obsolescent egrep.
  filter_by_first_field out \
    "$locdict/cmudict-plain.txt" "$locdata/vocab-full.txt" |\
    grep -E -v '<.?s>' > "$locdict/vocab-oov.txt"

  # Dictionary entries whose word appears in the vocabulary.
  filter_by_first_field in \
    "$locdata/vocab-full.txt" "$locdict/cmudict-plain.txt" |\
    grep -E -v '<.?s>' > "$locdict/lexicon-iv.txt"

  # Report how many entries ended up in each list.
  wc -l "$locdict/vocab-oov.txt"
  wc -l "$locdict/lexicon-iv.txt"
  
  # Fetch the pre-trained Sequitur G2P model unless a non-empty copy is
  # already in place.
  #
  # wget -O creates/truncates its output file before any data arrives, so a
  # failed download leaves an empty file behind that would satisfy a plain
  # existence test (and would suppress the download on every later run).
  # Therefore: download to a temporary name, check wget's exit status and the
  # file size, and only then move the file to its final location.
  if [ ! -s conf/g2p_model ]; then
    echo "--- Downloading a pre-trained Sequitur G2P model ..."
    g2p_model_tmp=conf/g2p_model.download.$$
    if ! wget http://sourceforge.net/projects/kaldi/files/sequitur-model4 -O "$g2p_model_tmp" \
        || [ ! -s "$g2p_model_tmp" ]; then
      rm -f "$g2p_model_tmp"
      echo "Failed to download the g2p model!"
      exit 1
    fi
    mv "$g2p_model_tmp" conf/g2p_model || exit 1
  fi
  
  # On macOS (uname reports "Darwin") the script insists on 'greadlink' being
  # installed and aborts with a hint otherwise.
  # NOTE(review): bash only expands aliases in non-interactive shells when
  # expand_aliases is set, and no later line visible here calls readlink, so
  # the alias below probably has no effect -- confirm before relying on it.
  case "$(uname)" in
    Darwin)
      if ! command -v greadlink >/dev/null 2>&1; then
        echo "Mac OS X detected and 'greadlink' not found - please install using macports or homebrew"
        exit 1
      fi
      alias readlink=greadlink
      ;;
  esac
  
  # Make the Sequitur G2P installation under $KALDI_ROOT/tools reachable:
  # its bin/ directory is appended to PATH and its site-packages directory
  # (made absolute by utils/make_absolute.sh) is appended to PYTHONPATH.
  sequitur=$KALDI_ROOT/tools/sequitur-g2p
  export PATH="$PATH:$sequitur/bin"
  # Only the variable part is quoted; the python* glob must stay unquoted so
  # the shell can expand it to the actual versioned directory.
  export PYTHONPATH="$PYTHONPATH:$(utils/make_absolute.sh "$sequitur"/lib/python*/site-packages)"

  # Abort early with installation instructions when g2p.py cannot be found.
  # 'command -v' is the shell builtin lookup already used for greadlink in
  # this script; unlike 'which' it does not depend on an external program.
  if ! g2p=$(command -v g2p.py) ; then
    echo "The Sequitur was not found !"
    echo "Go to $KALDI_ROOT/tools and execute extras/install_sequitur.sh"
    exit 1
  fi
  
  echo "--- Preparing pronunciations for OOV words ..."
  # Run the G2P model over the OOV word list; its stdout becomes the OOV lexicon.
  oov_words="$locdict/vocab-oov.txt"
  oov_lexicon="$locdict/lexicon-oov.txt"
  g2p.py --model=conf/g2p_model --apply "$oov_words" > "$oov_lexicon"

  # The final lexicon is the sorted concatenation of the generated (OOV) and
  # the dictionary-derived (in-vocabulary) entries.
  cat "$oov_lexicon" "$locdict/lexicon-iv.txt" \
    | sort > "$locdict/lexicon.txt"

  # Remove a lexiconp.txt left over from an earlier run; a missing file is
  # not an error and any message from rm is discarded.
  rm "$locdict/lexiconp.txt" 2>/dev/null || :
  
  echo "--- Prepare phone lists ..."
  # Both silence lists consist of the single phone SIL.
  for phone_list in silence_phones optional_silence; do
    echo SIL > "$locdict/$phone_list.txt"
  done

  # Non-silence phones: every distinct token found from the second field
  # onward in lexicon lines that do not contain the word "sil", sorted.
  grep -v -w sil "$locdict/lexicon.txt" \
    | awk '{ for (i = 2; i <= NF; i++) seen[$i] = 1 }
           END { for (phone in seen) print phone }' \
    | sort > "$locdict/nonsilence_phones.txt"
  
  echo "--- Adding <unk> to the lexicon ..."
  # Append the unknown-word entry (tab-separated, mapped to SIL) after the
  # already sorted lexicon contents.
  printf '<unk>\tSIL\n' >> "$locdict/lexicon.txt"

  # extra_questions.txt has to exist for some downstream scripts, even when
  # there is nothing to put in it.
  touch "$locdict/extra_questions.txt"

  echo "*** Dictionary preparation finished!"