Blame view
egs/wsj/s5/steps/cleanup/make_utterance_fsts.pl
1.67 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
#!/usr/bin/env perl
use strict;
use warnings; #sed replacement for -w perl parameter

# Makes unigram decoding-graph FSTs specific to each utterance, where the
# supplied top-n-words list together with the supervision text of the
# utterance are combined.
#
# Inputs:
#   top-words-file (sole command-line argument): each line must have exactly
#     two whitespace-separated fields, "<weight> <word-id>", with weight > 0
#     and word-id a non-negative integer.  Weights of repeated word-ids are
#     summed.
#   stdin: one utterance per line, "<utterance-id> <word-id> <word-id> ...",
#     with at least one word-id.
#
# Output (stdout), for each utterance:
#   <utterance-id>
#   0 0 <word-id> <word-id> <cost>     (one line per distinct word)
#   0 <final-cost>
#   <empty line>
# where cost = -log(weight), the weight of a word being its top-words weight
# (if any) plus 1/num_words for every occurrence in the utterance, and
# final-cost = -log(1/num_words).
#
# Dies on malformed input.

if (@ARGV != 1) {
  print STDERR "** Warning: this script is deprecated and will be removed. See\n" .
    "** steps/cleanup/make_biased_lm_graphs.sh.\n" .
    "Usage: make_utterance_fsts.pl top-words-file.txt < text-archive > fsts-archive\n" .
    "e.g.: utils/sym2int.pl -f 2- data/lang/words.txt data/train/text | \\\n" .
    " make_utterance_fsts.pl exp/foo/top_words.int | compile-train-graphs-fsts ...\n";
  exit(1);
}

my ($top_words_file) = @ARGV;

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode, and the handle is scoped to this script.
open(my $top_words_fh, '<', $top_words_file)
  || die "opening $top_words_file: $!";

# Maps word-id -> accumulated weight from the top-words file.
my %top_word_probs = ( );

while (my $line = <$top_words_fh>) {
  my @A = split(" ", $line);
  (@A == 2 && $A[0] > 0.0) || die "Bad line $line in $top_words_file";
  $A[1] =~ m/^[0-9]+$/ || die "Expecting numeric word-ids in $top_words_file: $line\n";
  $top_word_probs{$A[1]} += $A[0];
}
close($top_words_fh) || die "closing $top_words_file: $!";

while (my $line = <STDIN>) {
  my @A = split(" ", $line);
  # An utterance with no words would make 1.0 / $num_words below a division
  # by zero (after part of the FST had already been printed), so reject it
  # up front with a message that identifies the offending line.
  @A >= 2 || die "Expecting an utterance-id followed by at least one word-id " .
    "on stdin (line $.): $line\n";
  my $utterance_id = shift @A;
  print "$utterance_id\n";
  my $num_words = scalar(@A);  # number of words in this utterance (>= 1)
  # Start from the top-words weights and add the utterance's own words.
  my %word_probs = %top_word_probs;
  foreach my $w (@A) {
    $w =~ m/^[0-9]+$/ || die "Expecting numeric word-ids as stdin: $line";
    $word_probs{$w} += 1.0 / $num_words;
  }
  # Sort numerically so that the arc order does not depend on Perl's hash
  # ordering and the output is reproducible from run to run.
  foreach my $w (sort { $a <=> $b } keys %word_probs) {
    my $prob = $word_probs{$w};
    $prob > 0.0 || die "Word $w with bad probability $prob, utterance-id = $utterance_id\n";
    my $cost = -log($prob);
    print "0 0 $w $w $cost\n";
  }
  my $final_cost = -log(1.0 / $num_words);
  print "0 $final_cost\n";
  print "\n"; # Empty line terminates the FST in the text-archive format.
}