Blame view
egs/gp/s1/local/gp_format_lms_edin.sh
3.72 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
#!/bin/bash -u

# Copyright 2012  Arnab Ghoshal
# Copyright 2010-2011  Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Usage: gp_format_lms_edin.sh data_dir
#
# For each language sub-directory of data_dir (GE, PO, SP, SW) this script
# converts the gzipped ARPA language models found under <lang>/local into
# G.fst files, one test_<lm_suffix> directory per model, and writes a
# per-language log to <lang>/format_lms.log.

set -o errexit
# The shebang's -u is dropped when the script is started as "bash script.sh",
# so request it explicitly as well.
set -o nounset
#set -o pipefail

# Print all arguments to stderr (backslash escapes interpreted by echo -e)
# and terminate the script with status 1.
function error_exit () {
  echo -e "$@" >&2; exit 1;
}

# Print the absolute path of the directory given as $1.
# Exits with an error message if $1 is not a directory or cannot be entered.
function read_dirname () {
  [ -d "$1" ] || error_exit "Argument '$1' not a directory"
  # Declaration and assignment are separate so that a failing "cd" is not
  # hidden by the exit status of "local".
  local retval
  retval=$(cd "$1" 2>/dev/null && pwd) \
    || error_exit "Cannot change to directory '$1'"
  echo "$retval"
}

# Build the test directory for one language model.
# Arguments:
#   $1 - LM suffix; the model is read from $2/local/lm_<suffix>.arpa.gz
#   $2 - per-language working directory; must contain lang_test/ and local/
# Outputs:
#   Creates $2/test_<suffix>/ holding copies of the lang_test files plus G.fst.
#   Diagnostic output of the FST tools goes to stdout/stderr.
# Exits the script with status 1 if the compiled G.fst contains a cycle made
# only of empty words.
function format_lms () {
  local lm_suffix=$1
  local work_dir=$2
  local test=$work_dir/test_${lm_suffix}
  local tmp=tmpdir.g
  local f

  mkdir -p "$test"
  for f in phones.txt words.txt phones_disambig.txt L.fst L_disambig.fst \
    silphones.csl nonsilphones.csl; do
    cp "$work_dir/lang_test/$f" "$test"
  done

  # kkm: I am removing fstdeterminizelog from the following pipe, no point.
  # The decompressed ARPA text is piped into arpa2fst, which is given "-" as
  # its input argument and $test/G.fst as its output argument.
  gunzip -c "$work_dir/local/lm_${lm_suffix}.arpa.gz" \
    | arpa2fst --disambig-symbol=#0 \
        --read-symbol-table="$test/words.txt" - "$test/G.fst"

  # fstisstochastic is run only for its printed output; a non-zero exit
  # status must not abort the script under errexit.
  set +e
  fstisstochastic "$test/G.fst"
  set -e
  # The output is like:
  # 9.14233e-05 -0.259833
  # we do expect the first of these 2 numbers to be close to zero (the second is
  # nonzero because the backoff weights make the states sum to >1).
  # Because of the <s> fiasco for these particular LMs, the first number is not
  # as close to zero as it could be.

  # Everything below is only for diagnostic.
  # Checking that G has no cycles with empty words on them (e.g. <s>, </s>);
  # this might cause determinization failure of CLG.
  # #0 is treated as an empty word.
  mkdir -p "$tmp"
  # Lexicon lines with a single field become self-loop arcs on state 0, one
  # arc per line; "#0" is added as one more self-loop and state 0 is marked
  # final. The glob is left unquoted on purpose so that it expands.
  awk '{if(NF==1){ printf("0 0 %s %s\n", $1,$1); }}
       END{print "0 0 #0 #0"; print "0";}' \
    < "$work_dir"/local/lexicon_??.txt > "$tmp/select_empty.fst.txt"
  fstcompile --isymbols="$test/words.txt" --osymbols="$test/words.txt" \
      "$tmp/select_empty.fst.txt" \
    | fstarcsort --sort_type=olabel \
    | fstcompose - "$test/G.fst" > "$tmp/empty_words.fst"
  # fstinfo's "cyclic" line carrying a standalone 'y' means the composed FST
  # has a cycle. The temporary directory is left in place in that case.
  if fstinfo "$tmp/empty_words.fst" | grep cyclic | grep -w 'y'; then
    echo "Language model has cycles with empty words"
    exit 1
  fi
  rm -r "$tmp"
}

PROG=$(basename "$0")
usage="Usage: $PROG data_dir
Convert ARPA-format language models to FSTs for GlobalPhone langauges.
(Currently converts for German, Portuguese, Spanish & Swedish).
"

# Quoted so that the line breaks of the usage message survive.
if [ $# -ne 1 ]; then
  error_exit "$usage"
fi

WDIR=$(read_dirname "$1")

# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test directory.

echo "Preparing language models for test"

# German - 17K
{ format_lms GE17k_bg "$WDIR/GE";
  format_lms GE17k_tg "$WDIR/GE";
  format_lms GE17k_tg_pr "$WDIR/GE"; } >& "$WDIR/GE/format_lms.log"

# German - 60K (appended to the log started by the 17K models)
{ format_lms GE60k_bg "$WDIR/GE";
  format_lms GE60k_tg "$WDIR/GE";
  format_lms GE60k_tg_pr "$WDIR/GE"; } >> "$WDIR/GE/format_lms.log" 2>&1

# Portuguese - 60K
{ format_lms PO60k_bg "$WDIR/PO";
  format_lms PO60k_tg "$WDIR/PO";
  format_lms PO60k_tg_pr "$WDIR/PO"; } >& "$WDIR/PO/format_lms.log"

# Spanish - 23K
{ format_lms SP23k_bg "$WDIR/SP";
  format_lms SP23k_tg "$WDIR/SP";
  format_lms SP23k_tg_pr "$WDIR/SP"; } >& "$WDIR/SP/format_lms.log"

# Swedish - 24K
# TODO(arnab): Something going wrong with the Swedish trigram LM.
{ # format_lms SW24k_tg "$WDIR/SW";
  # format_lms SW24k_tg_pr "$WDIR/SW";
  format_lms SW24k_bg "$WDIR/SW"; } >& "$WDIR/SW/format_lms.log"

echo "Preparing test data"