Blame view
egs/spanish_dimex100/s5/local/lm_prep.sh
1.42 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
#!/bin/bash
#
# Build a trigram ARPA language model with SRILM from the train/test
# transcripts and hand it to utils/format_lm.sh.
#
# Requirements:
#   * SRILM installed in the `tools` directory (install_srilm.sh)
#   * Run only from the example root directory:
#       $ ./local/lm_prep.sh
#
# Reads:
#   data/train/text, data/test/text, data/local/dict/lexicon.txt
# Writes:
#   data/local/tmp/lm_text
#   data/local/tmp/vocab-full.txt
#   data/local/tmp/3gram_lm.arpa.kn.gz
#   whatever utils/format_lm.sh produces in data/lang
#
# Exit status: 0 on success, 1 if SRILM is missing or any step fails.
#
# NOTE: `set -e` / `set -u` are intentionally not enabled globally because
# ./path.sh is sourced and is not guaranteed to be clean under those
# options; each critical step is checked explicitly instead.

mkdir -p "data/local/tmp" "data/lang/tmp" || exit 1

source ./path.sh

# Locate the SRILM ngram-count binary. Test the executable itself rather
# than just its directory, so that an incomplete SRILM build is reported
# here instead of surfacing later as "command not found".
srilm_bin_root="../../../tools/srilm/bin"
ngram_count_exe=""
for srilm_arch in i686-m64 i686; do
  if [ -x "${srilm_bin_root}/${srilm_arch}/ngram-count" ]; then
    ngram_count_exe="${srilm_bin_root}/${srilm_arch}/ngram-count"
    break
  fi
done

if [ -z "$ngram_count_exe" ]; then
  # Diagnostics go to stderr so they are not mistaken for regular output.
  {
    echo
    echo "[!] Install SRILM in the 'tools' directory (install_srilm.sh)"
    echo
  } >&2
  exit 1
fi

########################
# data/local/tmp/lm_text
########################

# Text sentences input for language model generation,
# taken from data/[train|test]/text but with utterance IDs removed.
# `-f2-` drops the first space-separated field (same result as GNU's
# `-f1 --complement`, but portable to non-GNU cut).
cut -d' ' -f2- data/train/text data/test/text \
  > data/local/tmp/lm_text || exit 1

####################################
# data/local/tmp/3gram_lm.arpa.kn.gz
####################################

# NOTE(review): the output name carries a ".kn" suffix although the active
# discounting option is -wbdiscount; the Kneser-Ney variant is only kept
# below as a commented-out alternative. The filename is left unchanged
# because other scripts may refer to it.
#
# Alternative (disabled) discounting options:
#   -kndiscount1 -gt1min 0 -kndiscount2 -gt2min 2 \
#   -kndiscount3 -gt3min 3 -order 3 \
"$ngram_count_exe" -lm data/local/tmp/3gram_lm.arpa.kn.gz \
  -order 3 \
  -write-vocab data/local/tmp/vocab-full.txt \
  -sort \
  -wbdiscount \
  -unk \
  -map-unk "<UNK>" \
  -text data/local/tmp/lm_text || exit 1

#################
# data/lang/G.fst
#################

# Only reached when the ARPA model above was built successfully, so a
# stale or missing LM is never passed on.
utils/format_lm.sh data/lang \
  data/local/tmp/3gram_lm.arpa.kn.gz \
  data/local/dict/lexicon.txt \
  data/lang || exit 1