Blame view
egs/rm/s5/local/prepare_wsj_rm_lang.sh
3.08 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
#!/bin/bash
# Copyright 2017 Pegah Ghahremani

# This script prepares a dictionary for the wsj-to-rm transfer learning
# experiment, which uses the wsj phone set (phones.txt), the wsj lexicon
# (lexicon.txt) and the wsj dict directory.
# The new lexicon.txt is created for the words in the rm lexicon as follows:
#   1) Pronunciations are copied from the wsj lexicon.txt for words that are
#      common to wsj and rm.
#   2) Words in rm that are not in the wsj lexicon are added as oov to the
#      new lexicon.txt.
# The oov word "<SPOKEN_NOISE>" in wsj is also added to words.txt and G.fst is
# recompiled using the updated word list.
#
# Besides its three arguments, the script reads two files relative to the
# current directory: data/local/dict/lexicon.txt (the rm lexicon) and
# data/local/tmp/G.txt (the rm grammar in text form).

# Without pipefail only the last stage of a pipeline is checked, so e.g. a
# failing fstcompile would be hidden behind a successful fstarcsort.
set -o pipefail

if [ -f path.sh ]; then . ./path.sh; fi
. utils/parse_options.sh || exit 1

if [ $# != 3 ]; then
  echo "Usage: local/prepare_wsj_rm_lang.sh <src-dict> <src-lang> <output-dir>"
  echo "e.g:"
  echo "$0 ../../wsj/s5/data/local/dict ../../wsj/s5/data/lang_nosp data/wsj_rm_dir"
  # Stop here: continuing with empty arguments would run rm/mkdir on bogus paths.
  exit 1
fi

src_dict=$1
src_lang=$2
output_dir=$3

# rm-side inputs that this script reads from fixed locations.
tgt_lexicon=data/local/dict/lexicon.txt
tgt_grammar=data/local/tmp/G.txt

required_files=(
  "$src_dict/lexicon.txt"
  "$src_dict/nonsilence_phones.txt"
  "$src_dict/silence_phones.txt"
  "$src_dict/optional_silence.txt"
  "$src_lang/oov.txt"
  "$src_lang/phones.txt"
  "$tgt_lexicon"
  "$tgt_grammar"
)
for f in "${required_files[@]}"; do
  if [ ! -f "$f" ]; then
    echo "$0: file $f that is required for preparing lang does not exist." && exit 1
  fi
done

# Start from a clean output directory; it is fine if it does not exist yet.
rm -r -- "$output_dir" 2>/dev/null || true
mkdir -p "$output_dir/local" || exit 1

# copy *phones.txt from source to target; the lexicon files are regenerated
# below, so the copied ones are removed.
cp -r -- "$src_dict" "$output_dir/local/dict" || exit 1
rm -- "$output_dir"/local/dict/lexicon*.txt

oov_word=$(cat "$src_lang/oov.txt")

# The rm lexicon writes "+" where the wsj lexicon writes "'", so the rm words
# are converted to the wsj spelling for the comparison and converted back
# afterwards.
# common word list in rm lexicon with lexicon in wsj
comm -12 <(awk '{print $1}' "$tgt_lexicon" | tr '+' "'" | sort) \
         <(awk '{print $1}' "$src_dict/lexicon.txt" | sort) \
  | tr "'" '+' | sort > "$output_dir/words_tmp.txt"

# words that occur only in the rm lexicon
comm -23 <(awk '{print $1}' "$tgt_lexicon" | tr '+' "'" | sort) \
         <(awk '{print $1}' "$src_dict/lexicon.txt" | sort) \
  | tr "'" '+' | sort > "$output_dir/words_only_tgt.txt"

# add oov_word to word list
(echo "$oov_word"; cat "$output_dir/words_tmp.txt") | sort > "$output_dir/words_tgt_src.txt"
rm -- "$output_dir/words_tmp.txt"

# we use wsj lexicon and find common word list in rm and wsj to generate lexicon for rm-wsj
# using wsj phone sets. More than 90% of words in RM are in WSJ(950/994).
# The words are looked up in their wsj spelling, then pasted next to their rm
# spelling so that the resulting lexicon is keyed by the rm word forms.
tr '+' "'" < "$output_dir/words_tgt_src.txt" \
  | utils/apply_map.pl --permissive "$src_dict/lexicon.txt" \
  | paste "$output_dir/words_tgt_src.txt" - \
  > "$output_dir/local/dict/lexicon_tgt_src.txt" || exit 1

# extend lexicon.txt by adding only_tgt words as oov.
# The pronunciation is that of the oov word in the source lexicon (exact match
# on the first field); "SPN" is used if the oov word has no entry there.
oov_phone=$(awk -v w="$oov_word" \
  '$1 == w { $1 = ""; sub(/^[ \t]+/, ""); print; exit }' "$src_dict/lexicon.txt")
[ -n "$oov_phone" ] || oov_phone=SPN

cat "$output_dir/local/dict/lexicon_tgt_src.txt" \
    <(awk -v pron="$oov_phone" '{ print $0 " " pron }' "$output_dir/words_only_tgt.txt") \
  | sort -u > "$output_dir/local/dict/lexicon.txt"

# prepare dictionary using new lexicon.txt for RM-WSJ.
utils/prepare_lang.sh --phone-symbol-table "$src_lang/phones.txt" \
  "$output_dir/local/dict" "$oov_word" "$output_dir/local/lang_tmp" "$output_dir" || exit 1

# Generate new G.fst using updated words list with added <SPOKEN_NOISE>
fstcompile --isymbols="$output_dir/words.txt" --osymbols="$output_dir/words.txt" \
    --keep_isymbols=false --keep_osymbols=false "$tgt_grammar" \
  | fstarcsort --sort_type=ilabel > "$output_dir/G.fst" || exit 1