Blame view
egs/wsj/s5/local/wsj_prepare_char_dict.sh
1.18 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
#!/bin/bash
# Copyright 2017 Hossein Hadian
#
# Builds a character-based dictionary directory from the phone-based one:
# every word is transcribed as the sequence of its own characters, with a
# fixed set of punctuation characters left out.
#
# Reads:   $phone_dir/lexicon1_raw_nosil.txt
# Writes:  $phone_dir/lexicon2_raw_nosil.txt
#          $dir/lexicon2_raw_nosil.txt
#          $dir/silence_phones.txt
#          $dir/optional_silence.txt
#          $dir/lexicon.txt
#          $dir/nonsilence_phones.txt
#
# Takes no arguments. All paths are relative to the current directory.
# Exits with status 1 as soon as any step fails.

phone_dir=data/local/dict_nosp
dir=data/local/dict_char
mkdir -p "$dir" || exit 1

[ -f path.sh ] && . ./path.sh

# Make a failure in any stage of a pipeline (not only the last one) visible
# to the '|| exit 1' checks below.
set -o pipefail

# Simply transcribe each word with its comprising characters:
# We keep only one pronunciation for each word. Other alternative
# pronunciations are discarded (the first line seen for a word wins).
# The input is redirected rather than piped through cat so that a missing
# lexicon aborts the script instead of producing an empty output file.
perl -e 'while(<>){@A = split; if(! $seen{$A[0]}) {$seen{$A[0]} = 1; print $_;}}' \
  < "$phone_dir/lexicon1_raw_nosil.txt" \
  > "$phone_dir/lexicon2_raw_nosil.txt" || exit 1

# For each line emit "<word> <c1> <c2> ...", where the c_i are the characters
# of the word that are not in the punctuation list. print(r) with a single
# argument gives the same output under Python 2 and Python 3; the former
# 'print r' statement is a syntax error under Python 3.
python -c 'import sys
for l in sys.stdin:
  w = l.strip().split(" ")[0]
  r = w
  for c in w:
    if c not in "!~@#$%^&*()+=/\",;:?_{}-":
      r += " " + c
  print(r)
' < "$phone_dir/lexicon2_raw_nosil.txt" > "$dir/lexicon2_raw_nosil.txt" || exit 1

printf '%s\n' SIL SPN NSN > "$dir/silence_phones.txt" || exit 1

echo SIL > "$dir/optional_silence.txt" || exit 1

# Final lexicon: the four special entries plus the character lexicon,
# sorted with duplicate lines removed.
{
  echo '!SIL SIL'
  echo '<SPOKEN_NOISE> SPN'
  echo '<UNK> SPN'
  echo '<NOISE> NSN'
  cat "$dir/lexicon2_raw_nosil.txt"
} | sort | uniq > "$dir/lexicon.txt" || exit 1

# Get the set of non-silence phones: drop the word column, put every
# remaining symbol on its own line, and keep the unique ones.
cut -d' ' -f2- "$dir/lexicon2_raw_nosil.txt" | tr ' ' '\n' \
  | sort -u > "$dir/nonsilence_phones.txt" || exit 1

echo "Character-based dictionary preparation succeeded."