Blame view
egs/hkust/s5/local/hkust_extract_subdict.pl
2.81 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
#!/usr/bin/env perl
# Copyright Hong Kong University of Science and Technology (Author: Ricky Chan) 2013.
#
# A script for dictionary generation with an input dict and a wordlist.
#
# Example of the dict format (one pronunciation per line; a word may repeat
# to list several pronunciations):
#   WORD1 ph1 ph2
#   WORD2 ph1 ph2 ph3
#   WORDX ph4
#   WORDY ph4 ph5
#   WORDZ ph3 ph1
#
# Example of the wordlist format (phrases of several words are supported):
#   WORD1
#   WORD2
#   WORDX WORDY
#   WORDX WORDY WORDZ
#
# Output (stdout), one record per line:
#   default:  "<phrase>\t<pronunciation>" for every combination of the
#             pronunciations of the words in the phrase;
#   --spron:  "<word> <word> ... \t<pronunciation>" using only the first
#             pronunciation listed in the dict for each word.
# Words missing from the dict are rendered as _NOT_FOUND_.

use strict;
use warnings;

# Marker printed in place of the phones of a word that is not in the dict.
my $NOT_FOUND = '_NOT_FOUND_';

# Entry point.
#   @args: dict file, wordlist file, and optionally the literal "--spron".
# Prints the usage text and exits when the arguments are malformed; dies when
# either input file cannot be opened.
sub main {
    my @args = @_;

    if (@args != 2 && @args != 3) {
        printUsage();
        exit;
    }

    my $use_single_pron = 0;
    if (@args == 3) {
        if ($args[2] ne '--spron') {
            printUsage();
            exit;
        }
        $use_single_pron = 1;
    }

    my ($dict_file, $wordlist_file) = @args;
    my $prons_of = _load_dictionary($dict_file);

    open(my $wordlist_fh, '<', $wordlist_file)
        or die "Can't open wordlist $wordlist_file: $!\n";
    while (my $phrase = <$wordlist_fh>) {
        chomp $phrase;

        # split ' ' ignores leading whitespace, so an indented phrase does not
        # yield a bogus empty first word.
        my @words = split ' ', $phrase;
        next if !@words;

        if ($use_single_pron) {
            print _single_pron_record($prons_of, @words);
            next;
        }

        # The phrase is echoed exactly as read; only the lookup uses the
        # whitespace-split words.
        for my $pron (_expand_pronunciations($prons_of, @words)) {
            print $phrase . "\t" . $pron . "\n";
        }
    }
    close($wordlist_fh);
    return;
}

# Read the dict file into a hash: word => [ pronunciation, ... ] in file order.
# Each pronunciation is stored as its phones, every phone preceded by a single
# space (e.g. " ph1 ph2"); the output formatting relies on that leading space.
sub _load_dictionary {
    my ($dict_file) = @_;
    my %prons_of;

    open(my $dict_fh, '<', $dict_file)
        or die "Can't open dict $dict_file: $!\n";
    while (my $entry = <$dict_fh>) {
        chomp $entry;
        my ($word, @phones) = split ' ', $entry;
        next if !defined $word;    # blank line: nothing to store
        push @{ $prons_of{$word} }, join('', map { " $_" } @phones);
    }
    close($dict_fh);

    return \%prons_of;
}

# Build the --spron record for one phrase: every word followed by a space, a
# tab, then the first pronunciation of each word (or " _NOT_FOUND_").
sub _single_pron_record {
    my ($prons_of, @words) = @_;

    my $record = join('', map { "$_ " } @words) . "\t";
    for my $word (@words) {
        $record .= exists $prons_of->{$word}
                 ? $prons_of->{$word}[0]
                 : " $NOT_FOUND";
    }
    return $record . "\n";
}

# Return every pronunciation of a phrase: the cartesian product of the
# pronunciations of its words, ordered with earlier words varying slowest.
# A missing word contributes the single alternative _NOT_FOUND_.
sub _expand_pronunciations {
    my ($prons_of, @words) = @_;

    my $first_word = shift @words;
    my @expanded = exists $prons_of->{$first_word}
                 ? @{ $prons_of->{$first_word} }
                 : ($NOT_FOUND);

    for my $word (@words) {
        if (!exists $prons_of->{$word}) {
            $_ .= " $NOT_FOUND" for @expanded;
            next;
        }
        my @variants = @{ $prons_of->{$word} };
        # Stored variants already start with a space, so the joined result has
        # two spaces between words' phones; kept for output compatibility.
        @expanded = map {
            my $prefix = $_;
            map { "$prefix $_" } @variants;
        } @expanded;
    }
    return @expanded;
}

# Print the command-line help to stdout.
sub printUsage {
    print "usage: perl hkust_extract_subdict.pl dict wordlist [--spron]\n";
    print "### this script handle multiple pronunciations for dict in default\n";
    print "### if you want to extract single(top) pronunciation from dict, please use the option --spron\n";
    return;
}

# Run only when executed as a script, so the file can also be loaded by tests.
main(@ARGV) unless caller;

1;