Blame view
egs/fisher_swbd/s5/local/swbd1_prepare_dict.sh
2.6 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
#!/bin/bash

# Formatting the Mississippi State dictionary for use in Edinburgh. Differs
# from the one in Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013)
#
# Usage: local/swbd1_prepare_dict.sh   (takes no arguments)
# To be run from one directory above this script.
#
# Reads:
#   data/local/train/swb_ms98_transcriptions/sw-ms98-dict.text
# Writes (all under data/local/dict):
#   lexicon1.txt            lower-cased source dictionary, comments removed
#   lexicon2.txt            lexicon1 plus silence/noise/<unk> entries
#   lexicon3.txt            lexicon2 after word mapping, sorted and de-duplicated
#   lexicon.txt             symlink to lexicon3.txt (the final lexicon)
#   nonsilence_phones.txt   phones seen in lexicon1 (minus anything with "sil")
#   silence_phones.txt      sil, spn, nsn, lau
#   optional_silence.txt    sil
#   extra_questions.txt     empty

. ./path.sh

# Enabled only after path.sh has been sourced, so that file is unaffected.
# Without this, "|| exit 1" after a pipeline only sees the status of its last
# stage, and a failing grep or a missing helper script would go unnoticed and
# leave empty output files behind.
set -o pipefail

# This script takes no arguments.
if [ $# -ne 0 ]; then
  echo "Usage: local/swbd1_prepare_dict.sh" >&2
  exit 1
fi

srcdir=data/local/train # This is where we downloaded some stuff..
dir=data/local/dict
mkdir -p "$dir" || exit 1
srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text

# assume swbd_p1_data_prep.sh was done already.
if [ ! -f "$srcdict" ]; then
  echo "No such file $srcdict" >&2
  exit 1
fi

#(2a) Dictionary preparation:
# Pre-processing (Lower-case, remove comments, drop empty lines)
grep -v '^#' "$srcdict" | tr 'A-Z' 'a-z' | awk 'NF>0' | sort \
  > "$dir/lexicon1.txt" || exit 1

# Collect every distinct symbol that appears after the word field.
# Note: "grep -v sil" is a substring match, so any phone whose name contains
# "sil" is dropped, not only the phone named exactly "sil".
awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' \
  "$dir/lexicon1.txt" \
  | grep -v sil > "$dir/nonsilence_phones.txt" || exit 1

( echo sil; echo spn; echo nsn; echo lau ) > "$dir/silence_phones.txt" || exit 1

echo sil > "$dir/optional_silence.txt" || exit 1

# No "extra questions" in the input to this setup, as we don't
# have stress or tone.
: > "$dir/extra_questions.txt" || exit 1

# Add to the lexicon the silences, noises etc.
( echo '!sil sil'; echo '[vocalized-noise] spn'; echo '[noise] nsn'; \
  echo '[laughter] lau'; echo '<unk> spn' ) \
  | cat - "$dir/lexicon1.txt" > "$dir/lexicon2.txt" || exit 1

# Map the words in the lexicon.  That is-- for each word in the lexicon, we map it
# to a new written form.  The transformations we do are:
# remove laughter markings, e.g.
# [LAUGHTER-STORY] -> STORY
# Remove partial-words, e.g.
# -[40]1K W AH N K EY
# becomes -1K
# and
# -[AN]Y IY
# becomes
# -Y
# -[A]B[OUT]- B
# becomes
# -B-
# Also, curly braces, which appear to be used for "nonstandard"
# words or non-words, are removed, e.g.
# {WOLMANIZED} W OW L M AX N AY Z D
# -> WOLMANIZED
# Also, mispronounced words, e.g.
#  [YEAM/YEAH] Y AE M
# are changed to just e.g. YEAM, i.e. the orthography
# of the mispronounced version.
# Note-- this is only really to be used in training.  The main practical
# reason is to avoid having tons of disambiguation symbols, which
# we otherwise would get because there are many partial words with
# the same phone sequences (most problematic: S).
# Also, map
# THEM_1 EH M -> THEM
# so that multiple pronunciations just have alternate entries
# in the lexicon.
local/swbd1_map_words.pl -f 1 "$dir/lexicon2.txt" | sort -u \
  > "$dir/lexicon3.txt" || exit 1

# This is the final lexicon.  The link target is relative, so it resolves
# inside $dir; creating it directly avoids changing directory (a failed
# directory change would otherwise put the link in the wrong place).
ln -sf lexicon3.txt "$dir/lexicon.txt" || exit 1

echo "Prepared input dictionary and phone-sets for Switchboard phase 1."