egs/fisher_swbd/s5/local/fisher_swbd_prepare_dict.sh
#!/bin/bash
#

# To be run from one directory above this script.

## The input is some directory containing the switchboard-1 release 2
## corpus (LDC97S62). Note: we don't make many assumptions about how
## you unpacked this. We are just doing a "find" command to locate
## the .sph files.

# for example /mnt/matylda2/data/SWITCHBOARD_1R2

. ./path.sh

# The parts of the output of this that will be needed are
# [in data/local/dict/ ]
# lexicon.txt
# extra_questions.txt
# nonsilence_phones.txt
# optional_silence.txt
# silence_phones.txt

# check existing directories
[ $# != 0 ] && echo "Usage: local/fisher_prepare_dict.sh" && exit 1;

dir=data/local/dict_nosp
mkdir -p $dir

echo "Getting CMU dictionary"
svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict $dir/cmudict

# silence phones, one per line.
for w in sil laughter noise oov; do echo $w; done > $dir/silence_phones.txt
echo sil > $dir/optional_silence.txt

# For this setup we're discarding stress.
cat $dir/cmudict/cmudict.0.7a.symbols | sed s/[0-9]//g | \
  tr '[A-Z]' '[a-z]' | perl -ane 's:\r::; print;' | sort | uniq > $dir/nonsilence_phones.txt

# An extra question will be added by including the silence phones in one class.
cat $dir/silence_phones.txt | awk '{printf("%s ", $1);} END{printf "\n";}' \
  > $dir/extra_questions.txt || exit 1;

grep -v ';;;' $dir/cmudict/cmudict.0.7a | tr '[A-Z]' '[a-z]' | \
  perl -ane 'if(!m:^;;;:){ s:(\S+)\(\d+\) :$1 :; s:  : :; print; }' | \
  sed s/[0-9]//g | sort | uniq > $dir/lexicon1_raw_nosil.txt || exit 1;

# Add prons for laughter, noise, oov
for w in `grep -v sil $dir/silence_phones.txt`; do
  echo "[$w] $w"
done | cat - $dir/lexicon1_raw_nosil.txt > $dir/lexicon2_raw.txt || exit 1;

# This is just for diagnostics:
cat data/train_fisher/text | \
  awk '{for (n=2;n<=NF;n++){ count[$n]++; } } END { for(n in count) { print count[n], n; }}' | \
  sort -nr > $dir/word_counts

# Between lexicon2_raw and lexicon3_expand we limit it to the words seen in
# the Fisher data, and also expand the vocab for acronyms like c._n._n. and other
# underscore-containing things.
cat $dir/lexicon2_raw.txt | \
  perl -e 'while(<STDIN>) { @A = split; $w = shift @A; $pron{$w} = join(" ", @A); }
    ($w) = @ARGV;
    open(W, "<$w") || die "Error opening word-counts from $w";
    while(<W>) { # reading in words we saw in training data..
      ($c, $w) = split;
      if (defined $pron{$w}) { print "$w $pron{$w}\n"; }
      else {
        @A = split("_", $w);
        if (@A > 1) {
          $this_pron = "";
          $pron_ok = 1;
          foreach $a (@A) {
            if (defined($pron{$a})) { $this_pron = $this_pron . "$pron{$a} "; }
            else { $pron_ok = 0; print STDERR "Not handling word $w, count is $c\n"; last; }
          }
          if ($pron_ok) { $this_pron =~ s/\s+$//; $new_pron{$w} = $this_pron; }
        }
      }
    }
    foreach $w (keys %new_pron) { print "$w $new_pron{$w}\n"; }' \
  $dir/word_counts > $dir/lexicon3_expand_v1.txt || exit 1;

cat $dir/word_counts | awk '{print $2}' > $dir/fisher_word_list
filter_scp.pl $dir/fisher_word_list $dir/lexicon2_raw.txt > $dir/lexicon3_expand_v2.txt
cat $dir/lexicon3_expand_v1.txt $dir/lexicon3_expand_v2.txt | sort -u > $dir/lexicon3_expand.txt

cat $dir/lexicon3_expand.txt \
  <( echo "mm m"
     echo "<unk> oov" ) > $dir/lexicon4_extra.txt

cp $dir/lexicon4_extra.txt $dir/lexicon_fisher.txt

awk '{print $1}' $dir/lexicon_fisher.txt | \
  perl -e '($word_counts) = @ARGV;
    open(W, "<$word_counts") || die "opening word-counts $word_counts";
    while(<STDIN>) { chop; $seen{$_} = 1; }
    while(<W>) {
      ($c, $w) = split;
      if (!defined $seen{$w}) { print; }
    }' $dir/word_counts > $dir/oov_counts.txt

echo "*Highest-count OOVs are:"
head -n 20 $dir/oov_counts.txt
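
# Illustration (comments only, not executed): the perl block above builds
# pronunciations for underscore-joined acronyms out of their parts. Assuming
# the processed cmudict ends up with entries for the pieces, roughly like
#   c. s iy
#   n. eh n
# a Fisher token "c._n._n." would be emitted into lexicon3_expand_v1.txt as
#   c._n._n. s iy eh n eh n
# If any piece has no pronunciation, the whole word is skipped and reported
# on stderr ("Not handling word ..., count is ...").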
"$pron{$a} "; } else { $pron_ok = 0; print STDERR "Not handling word $w, count is $c "; last; } } if ($pron_ok) { $this_pron =~ s/\s+$//; $new_pron{$w} = $this_pron; } }}} foreach $w (keys %new_pron) { print "$w $new_pron{$w} "; } ' \ $dir/word_counts > $dir/lexicon3_expand_v1.txt || exit 1; cat $dir/word_counts | awk '{print $2}' > $dir/fisher_word_list filter_scp.pl $dir/fisher_word_list $dir/lexicon2_raw.txt > $dir/lexicon3_expand_v2.txt cat $dir/lexicon3_expand_v1.txt $dir/lexicon3_expand_v2.txt | sort -u > $dir/lexicon3_expand.txt cat $dir/lexicon3_expand.txt \ <( echo "mm m" echo "<unk> oov" ) > $dir/lexicon4_extra.txt cp $dir/lexicon4_extra.txt $dir/lexicon_fisher.txt awk '{print $1}' $dir/lexicon_fisher.txt | \ perl -e '($word_counts)=@ARGV; open(W, "<$word_counts")||die "opening word-counts $word_counts"; while(<STDIN>) { chop; $seen{$_}=1; } while(<W>) { ($c,$w) = split; if (!defined $seen{$w}) { print; } } ' $dir/word_counts > $dir/oov_counts.txt echo "*Highest-count OOVs are:" head -n 20 $dir/oov_counts.txt # Preparing SWBD acronymns from its dictionary srcdir=data/local/train_swbd # This is where we downloaded some stuff.. dir=data/local/dict_nosp mkdir -p $dir srcdict=$srcdir/swb_ms98_transcriptions/sw-ms98-dict.text # assume swbd_p1_data_prep.sh was done already. [ ! -f "$srcdict" ] && echo "No such file $srcdict" && exit 1; rm $dir/lexicon0.txt 2>/dev/null cp $srcdict $dir/lexicon0.txt || exit 1; chmod +w $srcdict $dir/lexicon0.txt # Use absolute path in case patch reports the "Invalid file name" error (a bug with patch) patch <local/dict.patch `pwd`/$dir/lexicon0.txt || exit 1; #(2a) Dictionary preparation: # Pre-processing (remove comments) grep -v '^#' $dir/lexicon0.txt | awk 'NF>0' | sort > $dir/lexicon1_swbd.txt || exit 1; cat $dir/lexicon1_swbd.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{for (p in phones) print p;}' | \ grep -v SIL > $dir/nonsilence_phones_msu.txt || exit 1; local/swbd1_map_words.pl -f 1 $dir/lexicon1_swbd.txt | sort | uniq \ > $dir/lexicon2_swbd.txt || exit 1; cp conf/MSU_single_letter.txt $dir/MSU_single_letter.txt python local/format_acronyms_dict.py -i $dir/lexicon2_swbd.txt \ -o1 $dir/acronyms_lex_swbd.txt -o2 $dir/acronyms_lex_swbd_ori.txt \ -L $dir/MSU_single_letter.txt -M $dir/acronyms_raw.map cat $dir/acronyms_raw.map | sort -u > $dir/acronyms_swbd.map cat $dir/acronyms_lex_swbd.txt |\ sed 's/ ax/ ah/g' |\ sed 's/ en/ ah n/g' |\ sed 's/ el/ ah l/g' \ > $dir/acronyms_lex_swbd_cmuphones.txt cat $dir/acronyms_lex_swbd_cmuphones.txt $dir/lexicon_fisher.txt | sort -u > $dir/lexicon.txt echo Prepared input dictionary and phone-sets for Switchboard phase 1. utils/validate_dict_dir.pl $dir |