Blame view

egs/spanish_dimex100/s5/local/lang_prep.sh 1.59 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
  #!/bin/bash

  ## Prepares the dictionary directory data/local/dict from the corpus
  ## dictionary found at <corpus-dir>/diccionarios/T22.full.dic.
  ##
  ## Only run this file from the example root directory
  ##      $ ./local/lang_prep.sh <corpus-dir>

  # The corpus location is mandatory; without it every path below would
  # silently resolve relative to "/".
  if [ "$#" -lt 1 ]; then
    echo "Usage: $0 <corpus-dir>" >&2
    exit 1
  fi

  CORPUS_DIR="$1"

  # Fail early, before any output is created, when the input dictionary or
  # the environment file is not where the script expects it.
  if [ ! -r "$CORPUS_DIR/diccionarios/T22.full.dic" ]; then
    echo "$0: cannot read $CORPUS_DIR/diccionarios/T22.full.dic" >&2
    exit 1
  fi
  if [ ! -f ./path.sh ]; then
    echo "$0: ./path.sh not found; run from the example root directory" >&2
    exit 1
  fi

  mkdir -p "data/local/dict" || exit 1

  source ./path.sh
  
  #############################
  # data/local/dict/lexicon.txt
  #############################

  # Byte-wise locale so that tr, sed and sort give reproducible results.
  # Exported, so it stays in effect for everything that runs afterwards.
  export LC_ALL=C

  # build_lexicon: reads the corpus dictionary on stdin and writes the
  # complete lexicon on stdout.
  #
  # Output starts with the two fixed entries "!SIL sil" and "<UNK> spn",
  # followed by the sorted, de-duplicated dictionary lines after these
  # rewrites (applied to every line, in this order):
  #   1. lower-case everything;
  #   2. drop parenthesised digit runs such as "(2)";
  #   3. turn "n~" into "n" when it directly follows a non-space character;
  #   4. drop "_7", "-7" and "_" when they follow one of the vowels a e i o u;
  #   5. cut the line from "_7n" onwards;
  #   6. five word-specific repairs (atletica, biologicas, electrico,
  #      grafico, solo);
  #   7. rename the remaining "n~" to "ni" and "r(" to "rh";
  #   8. replace tabs by spaces and drop empty lines.
  build_lexicon() {
    # printf with one argument per line: no reliance on `echo -e` and no
    # line break inside a quoted string that could pick up indentation.
    printf '%s\n' '!SIL sil' '<UNK> spn'

    tr '[:upper:]' '[:lower:]' \
      | sed \
          -e 's/([0123456789]*)//g' \
          -e 's/\([^ ]\)n\~/\1n/g' \
          -e 's/a_7/a/g' -e 's/e_7/e/g' -e 's/i_7/i/g' -e 's/o_7/o/g' -e 's/u_7/u/g' \
          -e 's/a-7/a/g' -e 's/e-7/e/g' -e 's/i-7/i/g' -e 's/o-7/o/g' -e 's/u-7/u/g' \
          -e 's/a_/a/g' -e 's/e_/e/g' -e 's/i_/i/g' -e 's/o_/o/g' -e 's/u_/u/g' \
          -e 's/_7n.*$//' \
          -e 's/atl_7tica/atletica/' \
          -e 's/biol_7gicas/biologicas/' \
          -e 's/elec_7ctrico/electrico/' \
          -e 's/gr_7afico/grafico/' \
          -e 's/s_7lo/solo/' \
          -e 's/n~/ni/g' -e 's/r(/rh/g' \
          -e 's/\t/ /g' -e '/^$/d' \
      | sort -u
  }

  # The input redirection comes first: if the dictionary cannot be opened the
  # command is not run and no partial lexicon is written.
  build_lexicon < "$CORPUS_DIR/diccionarios/T22.full.dic" \
    > data/local/dict/lexicon.txt
  
  
  #######################################
  # data/local/dict/silence_phones.txt
  # data/local/dict/optional_silence.txt
  # data/local/dict/nonsilence_phones.txt
  # data/local/dict/extra_questions.txt
  #######################################
  # NOTE(review): extra_questions.txt is listed above, but nothing in this
  # section writes it -- confirm that it is created elsewhere.

  # list_nonsilence_phones: reads a lexicon ("word phone phone ...", one entry
  # per line) on stdin and prints every distinct phone, one per line, sorted.
  # Lines containing "<UNK>" or "!SIL" are skipped, so their phones are not
  # listed.
  list_nonsilence_phones() {
    # awk splits on runs of whitespace, so doubled or trailing spaces never
    # yield an empty phone; field 1 is the word and is skipped.
    grep -v -e '<UNK>' -e '!SIL' \
      | awk '{ for (i = 2; i <= NF; i++) print $i }' \
      | sort -u
  }

  # One printf argument per output line keeps the file contents free of the
  # script's own indentation.
  printf '%s\n' 'sil' 'spn' > data/local/dict/silence_phones.txt
  printf '%s\n' 'sil' > data/local/dict/optional_silence.txt

  list_nonsilence_phones < data/local/dict/lexicon.txt \
    > data/local/dict/nonsilence_phones.txt