hkust_extract_subdict.pl 2.81 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130


#!/usr/bin/env perl
# Copyright Hong Kong University of Science and Technology (Author: Ricky Chan) 2013.
# 
# A script for dictionary generation with an input dict and a wordlist 
#
# Example of the dict format (one "WORD phone phone ..." entry per line):
# WORD1 ph1 ph2
# WORD2 ph1 ph2 ph3
# WORDX ph4
# WORDY ph4 ph5
# WORDZ ph3 ph1
#
# Example of the wordlist format (each line is a single word or a multi-word phrase):
# WORD1
# WORD2
# WORDX WORDY 
# WORDX WORDY WORDZ

use strict;
use warnings;

# ---------------------------------------------------------------------------
# Main script.
#
# Arguments: <dict> <wordlist> [--spron]
#   dict      file with one "WORD ph1 ph2 ..." entry per line; a word may
#             appear on several lines (multiple pronunciations).
#   wordlist  file with one word or whitespace-separated phrase per line.
#   --spron   emit only the first pronunciation listed in dict for each word.
#
# Output goes to STDOUT, one "<phrase>\t<pronunciation>" line per result.
# Words missing from the dict are rendered as _NOT_FOUND_.
# Exits with a non-zero status on a usage error so that calling shell
# scripts can detect it.
# ---------------------------------------------------------------------------
if (@ARGV != 2 && @ARGV != 3) {
  printUsage();
  exit 1;
}

my $usespron = 0;
if (@ARGV == 3) {
  if ($ARGV[2] ne "--spron") {
    printUsage();
    exit 1;
  }
  $usespron = 1;
}

my ($dictfile, $inputfile) = @ARGV;

# word => array ref of pronunciation strings, in dict order.  Each stored
# pronunciation keeps a leading space before every phone (" ph1 ph2").
my %dictionarylist;

# Three-argument open: the file name is always taken literally and is never
# interpreted as a mode or a pipe.
open(my $dict_fh, '<', $dictfile) or die("Can't open dict $dictfile: $!\n");
while (my $entry = <$dict_fh>) {
  chomp $entry;
  # split ' ' ignores leading whitespace, so an indented entry is not stored
  # under an empty-string key.
  my ($word, @phones) = split(' ', $entry);
  next unless defined $word;    # blank line: nothing to store
  push @{ $dictionarylist{$word} }, join('', map { " $_" } @phones);
}
close($dict_fh);

open(my $list_fh, '<', $inputfile) or die("Can't open wordlist $inputfile: $!\n");
while (my $phrase = <$list_fh>) {
  chomp $phrase;
  my @words = split(' ', $phrase);
  next unless @words;           # blank line: no output in either mode

  ## single pronunciation handling
  if ($usespron) {
    # Left column: every word followed by one space, then a tab.
    print "$_ " for @words;
    print "\t";

    # Right column: first listed pronunciation of each word.
    for my $word (@words) {
      if (exists $dictionarylist{$word}) {
        print $dictionarylist{$word}[0];
      }
      else {
        print " _NOT_FOUND_";
      }
    }
    print "\n";
    next;
  }

  ## multiple pronunciations handling
  # @pronlist holds every pronunciation of the phrase prefix processed so
  # far; each further word extends all of them.
  my @pronlist;
  for my $idx (0 .. $#words) {
    my $word  = $words[$idx];
    my $known = exists $dictionarylist{$word};

    if ($idx == 0) {
      # First word seeds the list (no leading space on the marker, as the
      # output format has always had).
      @pronlist = $known ? @{ $dictionarylist{$word} } : ('_NOT_FOUND_');
    }
    elsif (!$known) {
      $_ .= " _NOT_FOUND_" for @pronlist;
    }
    else {
      # Cartesian product, existing prefixes outermost so output order is
      # stable.  Stored pronunciations already start with a space, so the
      # join yields two spaces between words; kept for output compatibility.
      my @alternatives = @{ $dictionarylist{$word} };
      @pronlist = map {
        my $prefix = $_;
        map { "$prefix $_" } @alternatives;
      } @pronlist;
    }
  }

  # The phrase is echoed exactly as read (only the newline removed).
  print "$phrase\t$_\n" for @pronlist;
}
close($list_fh);

# Write the command-line synopsis and a short note on the --spron option
# to STDOUT.  Takes no arguments.
sub printUsage {
    my @usage_text = (
        "usage: perl hkust_extract_subdict.pl dict wordlist [--spron]\n\n",
        "### this script handle multiple pronunciations for dict in default\n",
        "### if you want to extract single(top) pronunciation from dict, please use the option --spron\n\n",
    );
    print(@usage_text);
}