Blame view
egs/chime4/s5_1ch/local/chime4_train_lms.sh
3.78 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
#!/bin/bash

# Modified from the script for CHiME3 baseline
# Copyright 2015, Mitsubishi Electric Research Laboratories, MERL (Author: Takaaki Hori)
#
# Trains an n-gram language model with SRILM on the WSJ0 LM training text
# (restricted to the 5k vocabulary of the baseline trigram), reports its
# perplexity on the CHiME4 dev transcriptions, and converts it to G.fst under
# data/lang_test_<order>gkn_5k.
#
# Usage:   local/chime4_train_lms.sh [--order <n>] <Chime4 root directory>
# Needs:   data/lang and data/local/nist_lm/lm_tgpr_5k.arpa.gz to exist already,
#          and the SRILM tools `ngram` / `ngram-count` on PATH (via path.sh).

# Config:
order=5 # n-gram order

. utils/parse_options.sh || exit 1;

. ./path.sh || exit 1;

# From here on a pipeline fails if any of its stages fails, so that broken
# input data is not silently turned into an empty/truncated output file.
set -o pipefail

# Print a message on stderr and abort.
die() {
  echo "$*" >&2
  exit 1
}

if [ $# -ne 1 ]; then
  printf '\nUSAGE: %s <Chime4 root directory>\n\n' "$(basename "$0")"
  echo "Please specifies a Chime4 root directory"
  echo "If you use kaldi scripts distributed in the Chime4 data,"
  echo "It would be $(pwd)/../.."
  exit 1;
fi

# check data directories
chime4_data=$1
# directory of WSJ0 in Chime4. You can also specify your WSJ0 corpus directory
wsj0_data=$chime4_data/data/WSJ0
if [ ! -d "$chime4_data" ]; then
  die "$chime4_data does not exist. Please specify chime4 data root correctly"
fi
if [ ! -d "$wsj0_data" ]; then
  die "$wsj0_data does not exist. Please specify WSJ0 corpus directory"
fi
lm_train=$wsj0_data/wsj0/doc/lng_modl/lm_train/np_data

# check whether run_init is executed
if [ ! -d data/lang ]; then
  die "error, execute local/run_init.sh, first"
fi

# lm directories
dir=data/local/local_lm
srcdir=data/local/nist_lm
mkdir -p "$dir" || exit 1;

# check srilm ngram (both tools are used below)
for tool in ngram-count ngram; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    die "SRILM tools not installed, which are required for LM training"
  fi
done

# extract 5k vocabulary from a baseline language model
srclm=$srcdir/lm_tgpr_5k.arpa.gz
if [ ! -f "$srclm" ]; then
  die "Language model $srclm does not exist"
fi
echo "Getting vocabulary from a baseline language model";
ngram -lm "$srclm" -unk -map-unk '<UNK>' -write-vocab "$dir/vocab_5k.txt" \
  || exit 1;

# collect training data from WSJ0
# NOTE: the "already exists" test relies on the size reported by du (63 MB),
# which is how the original recipe recognises a complete train.gz.
touch "$dir/train.gz" || exit 1;
if [ "$(du -m "$dir/train.gz" | cut -f 1)" -eq 63 ]; then
  echo "Not getting training data again [already exists]";
else
  echo "Collecting training data from $lm_train";
  # gunzip exits with status 2 for mere warnings; only real errors are fatal.
  # The awk program drops markup lines (starting with "<"), upper-cases every
  # word, maps out-of-vocabulary words to <UNK>, and writes one sentence per
  # output line.
  { gunzip -c "$lm_train"/{87,88,89}/*.z || [ $? -eq 2 ]; } \
    | awk -v voc="$dir/vocab_5k.txt" '
      BEGIN{ while((getline<voc)>0) { invoc[$1]=1; }}
      /^</{next}{
        for (x=1;x<=NF;x++) {
          w=toupper($x);
          if (invoc[w]) { printf("%s ",w); } else { printf("<UNK> "); }
        }
        printf("\n");
      }' \
    | gzip -c > "$dir/train.gz" || exit 1;
fi

# get validation data from Chime4 dev set
# (same size-based "already exists" heuristic as above, here 68 KB)
touch "$dir/valid.gz" || exit 1;
if [ "$(du -k "$dir/valid.gz" | cut -f 1)" -eq 68 ]; then
  echo "Not getting validation data again [already exists]";
else
  echo "Collecting validation data from $chime4_data/data/transcriptions";
  # strip the first space-separated field of every transcription line
  cut -d" " -f2- "$chime4_data/data/transcriptions/dt05_real.trn_all" \
                 "$chime4_data/data/transcriptions/dt05_simu.trn_all" \
    | gzip -c > "$dir/valid.gz" || exit 1;
fi

# train a large n-gram language model
lm_suffix=${order}gkn_5k
lm=$dir/lm_${lm_suffix}.arpa.gz
if [ -f "$lm" ]; then
  echo "A $order-gram language model aready exists and is not constructed again"
  echo "To reconstruct, remove $dir/lm_${lm_suffix}.arpa.gz first"
else
  echo "Training a $order-gram language model"
  ngram-count -text "$dir/train.gz" -order "$order" \
              -vocab "$dir/vocab_5k.txt" -unk -map-unk "<UNK>" \
              -gt2min 1 -gt3min 1 -gt4min 2 -gt5min 2 \
              -interpolate -kndiscount \
              -lm "$lm" || exit 1;
fi
echo "Checking validation perplexity of $order-gram language model"
ngram -order "$order" -ppl "$dir/valid.gz" -lm "$lm" || exit 1;
# e.g. 5-gram perplexity:
# file data/local/local_lm/valid.txt: 3280 sentences, 54239 words, 3 OOVs
# 0 zeroprobs, logprob= -96775.5 ppl= 48.1486 ppl1= 60.8611

# convert arpa LM to G.fst
echo "Converting the $order-gram language model to G.fst"
lang_test=data/lang_test_${lm_suffix}
mkdir -p "$lang_test" || exit 1;
cp -r data/lang/* "$lang_test" || exit 1;

gunzip -c "$lm" \
  | arpa2fst --disambig-symbol='#0' \
             --read-symbol-table="$lang_test/words.txt" - "$lang_test/G.fst" \
  || exit 1;

utils/validate_lang.pl --skip-determinization-check "$lang_test" || exit 1;

echo "Succeeded in $order-gram LM training and conversion to G.fst"