Blame view

egs/gale_arabic/s5/local/gale_prep_dict.sh 821 Bytes
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
  #!/bin/bash

  # Copyright 2014 QCRI (author: Ahmed Ali)
  # Apache 2.0

  # Builds the dictionary directory data/local/dict from the QCRI Arabic
  # lexicon. Files written:
  #   lexicon.txt            downloaded lexicon with "<UNK> SIL" prepended
  #   silence_phones.txt     the single silence phone SIL
  #   optional_silence.txt   the single optional-silence phone SIL
  #   nonsilence_phones.txt  sorted unique phones found in the lexicon
  #
  # Exits 1 if the download, decompression or any processing step fails.
  #
  # run this from ../

  # Without pipefail only the last stage of a pipeline is checked, so a
  # failing bzcat (corrupt or truncated download) would go unnoticed.
  set -o pipefail

  dir=data/local/dict
  lexicon_archive=ar-ar_lexicon_2014-03-17.txt.bz2
  mkdir -p "$dir" || exit 1

  # (1) Get QCRI dictionary
  # -O pins the output file name: otherwise, if a stale copy of the archive
  # already exists, wget saves the new one under a ".1" suffix and the old
  # copy would be the one unpacked below.
  wget -O "$dir/$lexicon_archive" \
    "http://alt.qcri.org//resources/speech/dictionary/$lexicon_archive" \
    || { rm -f "$dir/$lexicon_archive"; exit 1; }

  # The first three lines of the distributed file are dropped; the rest is
  # used as the lexicon (NOTE(review): presumably a header -- confirm).
  bzcat "$dir/$lexicon_archive" | sed '1,3d' > "$dir/lexicon.txt" \
    || { rm -f "$dir/$lexicon_archive"; exit 1; }
  rm -f "$dir/$lexicon_archive"

  # (2) Dictionary preparation:

  # silence phones, one per line.
  echo SIL > "$dir/silence_phones.txt"
  echo SIL > "$dir/optional_silence.txt"

  # nonsilence phones, one per line: every space-separated field after the
  # first on each lexicon line. This is computed before "<UNK> SIL" is added
  # to the lexicon, so SIL does not end up in the nonsilence list.
  # Empty entries (caused by leading/trailing spaces in the phone field) are
  # removed so that the list contains no blank line.
  cut -d ' ' -f2- "$dir/lexicon.txt" \
    | tr -s ' ' '\n' \
    | sed '/^$/d' \
    | sort -u > "$dir/nonsilence_phones.txt" || exit 1

  # Prepend the unknown-word entry as the first line of the lexicon. The
  # newline is written as an escape so that the inserted text does not
  # depend on how this script is indented.
  perl -i -pe 'print "<UNK> SIL\n" if $.==1' "$dir/lexicon.txt" || exit 1

  echo Dictionary preparation succeeded

  exit 0