Blame view

egs/rm/s5/local/prepare_wsj_rm_lang.sh 3.08 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
  #!/bin/bash
  # Copyright 2017 Pegah Ghahremani
  
  # This script prepares a dictionary for the wsj-to-rm transfer learning
  # experiment, which uses the wsj phone set (phones.txt), lexicon (lexicon.txt)
  # and dict directory.
  # The new lexicon.txt is created for the rm words as follows:
  #   1) For words common to wsj and rm, the pronunciations are copied from the
  #      wsj lexicon.txt.
  #   2) Words in rm that are not in the wsj lexicon are added
  #      as oov to the new lexicon.txt.
  # The oov word "<SPOKEN_NOISE>" in wsj is also added to words.txt, and G.fst is
  # recompiled using the updated word list.
  
  # Pull in the environment setup when a path.sh sits in the current directory.
  if [ -f path.sh ]; then . ./path.sh; fi
  # Shared option parser (presumably consumes leading --option arguments and
  # leaves only the positional ones in "$@" -- confirm against the utility).
  . utils/parse_options.sh

  # Exactly three positional arguments are required. Stop here when they are
  # missing: every later step (including removal of the output directory)
  # would otherwise run with empty paths.
  if [ $# != 3 ]; then
    echo "Usage: local/prepare_wsj_rm_lang.sh <src-dict> <src-lang> <output-dir>"
    echo "e.g:"
    echo "$0 ../../wsj/s5/data/local/dict ../../wsj/s5/data/lang_nosp data/wsj_rm_dir"
    exit 1
  fi

  src_dict=$1    # source dict directory (lexicon.txt, *phones.txt, ...)
  src_lang=$2    # source lang directory (oov.txt, phones.txt)
  output_dir=$3  # directory that receives the new lang files
  
  # Every input file this script reads. Checking all of them up front, before
  # the output directory is wiped, makes a missing input fail fast instead of
  # leaving a half-built lang directory behind. The last two entries are the
  # rm lexicon and the grammar text file that are read further down.
  required_dict_files=(
    "$src_dict/lexicon.txt"
    "$src_dict/nonsilence_phones.txt"
    "$src_dict/silence_phones.txt"
    "$src_dict/optional_silence.txt"
    "$src_lang/oov.txt"
    "$src_lang/phones.txt"
    data/local/dict/lexicon.txt
    data/local/tmp/G.txt
  )
  for f in "${required_dict_files[@]}"; do
    if [ ! -f "$f" ]; then
      echo "$0: file $f that is required for preparing lang does not exist."
      exit 1
    fi
  done
  
  # Abort when the output directory is unset or empty: the commands below
  # delete and create paths derived from it, and with an empty value they
  # would operate on paths such as /local at the filesystem root.
  : "${output_dir:?output directory must not be empty}"

  # Start from a clean output directory. The removal is deliberately
  # best-effort because the directory may not exist yet.
  rm -r -- "$output_dir" 2>/dev/null || true
  mkdir -p -- "$output_dir/local"
  # Copy the whole source dict directory (this brings along the *phones.txt
  # files) into the output tree.
  cp -r -- "$src_dict" "$output_dir/local/dict"
  # Drop the copied source lexicon files; a new lexicon.txt is written below.
  rm -- "$output_dir"/local/dict/lexicon*.txt
  
  # The out-of-vocabulary word, as recorded in the source lang directory.
  oov_word=$(cat $src_lang/oov.txt)

  # Sorted first column of the rm lexicon. The sed expression is intended to
  # rewrite "+" as "'" so the entries can be compared with the source lexicon.
  tgt_words() {
    awk '{print $1}' data/local/dict/lexicon.txt | sed "s/\+/\'/g" | sort
  }

  # Sorted first column of the source lexicon.
  src_words() {
    awk '{print $1}' $src_dict/lexicon.txt | sort
  }

  # Words present in both lexicons, with "'" turned back into "+".
  comm -12 <(tgt_words) <(src_words) \
    | sed -r "s/'/+/g" \
    | sort > $output_dir/words_tmp.txt

  # Words present only in the rm lexicon, with "'" turned back into "+".
  comm -23 <(tgt_words) <(src_words) \
    | sed -r "s/'/+/g" \
    | sort > $output_dir/words_only_tgt.txt

  # Add the oov word to the shared word list, re-sort, and discard the
  # intermediate file.
  {
    echo "$oov_word"
    cat $output_dir/words_tmp.txt
  } | sort > $output_dir/words_tgt_src.txt
  rm $output_dir/words_tmp.txt
  
  # Generate the rm-wsj lexicon for the shared words from the wsj lexicon, so
  # that it uses the wsj phone set. More than 90% of words in RM are in
  # WSJ (950/994).
  # The word list is passed through the same "+" -> "'" rewrite before being
  # mapped through the source lexicon; the mapped output is then pasted next
  # to the unmodified word list, one word per line.
  sed "s/\+/\'/g" $output_dir/words_tgt_src.txt \
    | utils/apply_map.pl --permissive $src_dict/lexicon.txt \
    | paste $output_dir/words_tgt_src.txt - \
    > $output_dir/local/dict/lexicon_tgt_src.txt
  
  # Extend lexicon.txt by adding the target-only words as oov.
  # Look up the phone of the oov word with an exact match on the first field.
  # awk splits on any whitespace, so space- and tab-separated lexicons both
  # work, and only the first matching entry is used.
  oov_phone=$(awk -v w="$oov_word" '$1 == w { print $2; exit }' "$src_dict/lexicon.txt")
  if [ -z "$oov_phone" ]; then
    # Keep the previous hard-coded behaviour when the lookup finds nothing.
    echo "$0: warning: no entry for '$oov_word' in $src_dict/lexicon.txt; using SPN as the oov phone." >&2
    oov_phone=SPN
  fi
  # Each target-only word gets the oov phone as its pronunciation; the result
  # is merged with the shared-word lexicon and de-duplicated.
  cat "$output_dir/local/dict/lexicon_tgt_src.txt" \
      <(awk -v p="$oov_phone" '{ print $0 " " p }' "$output_dir/words_only_tgt.txt") \
    | sort -u > "$output_dir/local/dict/lexicon.txt"
  
  # Prepare the lang directory from the new lexicon.txt for RM-WSJ, reusing the
  # source phone symbol table. Stop on failure: the G.fst compilation that
  # follows needs the words.txt this step is expected to write.
  utils/prepare_lang.sh --phone-symbol-table "$src_lang/phones.txt" \
    "$output_dir/local/dict" "$oov_word" "$output_dir/local/lang_tmp" "$output_dir" || exit 1
  
  # Rebuild G.fst from the text-format grammar against the updated word list
  # (which now includes <SPOKEN_NOISE>), then sort its arcs by input label.
  words_txt=$output_dir/words.txt
  fstcompile --isymbols=$words_txt --osymbols=$words_txt \
      --keep_isymbols=false -keep_osymbols=false \
      data/local/tmp/G.txt \
    | fstarcsort --sort_type=ilabel > $output_dir/G.fst || exit 1