Blame view
egs/csj/s5/local/csj_prepare_dict.sh
1.33 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
#!/usr/bin/env bash

# Making dictionary using CSJ data with morpheme analysis.
#
# Original author note (sentence appears truncated in the source; presumably
# "differs"): from the one in Kaldi s5 recipe in that it uses lower-case
#   --Arnab (Jan 2013)
#
# Usage: local/csj_prepare_dict.sh          (takes no arguments)
# To be run from one directory above this script.
#
# Input:
#   data/local/train/lexicon.txt   (assumes csj_data_prep.sh was done already)
# Output (all under data/local/dict_nosp):
#   lexicon1.txt           copy of the input lexicon
#   nonsilence_phones.txt  every phone seen in the lexicon except sp / spn
#   silence_phones.txt     sp, spn
#   optional_silence.txt   sp
#   extra_questions.txt    empty
#   lexicon2.txt           lexicon1.txt with '<sp> sp' and '<unk> spn' prepended
#   lexicon.txt            symlink -> lexicon2.txt

# NOTE(review): nothing below visibly depends on path.sh; it is sourced as in
# the original, presumably for consistency with the rest of the recipe.
. ./path.sh

# Enabled only after path.sh so that sourcing it is unaffected. Makes a failure
# in any pipeline stage (not just the last one) reach the '|| exit 1' checks.
set -o pipefail

if [ $# -ne 0 ]; then
  echo "Usage: local/csj_prepare_dict.sh" >&2
  exit 1
fi

srcdir=data/local/train
dir=data/local/dict_nosp
srcdict=$srcdir/lexicon.txt

mkdir -p "$dir" || exit 1

# assume csj_data_prep.sh was done already.
if [ ! -f "$srcdict" ]; then
  echo "No such file $srcdict" >&2
  exit 1
fi

# (2a) Dictionary preparation.
# No pre-processing is applied here: lexicon1.txt is a verbatim copy of the
# source lexicon.
cat "$srcdict" > "$dir/lexicon1.txt" || exit 1

# Collect the set of phones (fields 2..NF of every lexicon line) and drop the
# silence phones. -x matches whole lines only, so a phone that merely contains
# the substring "sp" is kept. grep exits non-zero when nothing is left, which
# aborts the script on an empty phone set.
awk '{ for (n = 2; n <= NF; n++) phones[$n] = 1 }
     END { for (p in phones) print p }' "$dir/lexicon1.txt" \
  | grep -vx -e sp -e spn > "$dir/nonsilence_phones.txt" || exit 1

#( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt
printf '%s\n' sp spn > "$dir/silence_phones.txt"

echo sp > "$dir/optional_silence.txt"

# No "extra questions" in the input to this setup, as we don't
# have stress or tone.
: > "$dir/extra_questions.txt"

# Add to the lexicon the silences, noises etc.
{
  printf '%s\n' '<sp> sp' '<unk> spn'
  cat "$dir/lexicon1.txt"
} > "$dir/lexicon2.txt" || exit 1

# Relative link target, so lexicon.txt resolves to lexicon2.txt inside $dir.
# Creating it by path (instead of pushd/popd) cannot drop the link into the
# wrong directory if changing directory fails.
ln -sf lexicon2.txt "$dir/lexicon.txt" || exit 1

echo "Prepared input dictionary and phone-sets for CSJ phase 1."