Yannick Estève / ONTRAC-Kaldi

Blame view

egs/sprakbanken/s5/local/dict/get_rules.pl 7.55 KB
  #!/usr/bin/env perl
  
  # This program creates suggested suffix rules from a dictionary.
  # It outputs quadruples of the form:
  # suffix,base-suffix,psuffix,base-psuffix
  # where "suffix" is the suffix of the letters of a word, "base-suffix" is
  # the suffix of the letters of the base-word, "psuffix" is the suffix of the
  # pronunciation of the word (a space-separated list of phonemes), and
  # "base-psuffix" is the suffix of the pronunciation of the baseword.
  # As far as this program is concerned, there is no distinction between
  # "word" and "base-word".  To simplify things slightly, what it does
  # is return all tuples (a,b,c,d) [with a != b] such that there are
  # at least $min_suffix_count instances in the dictionary of
  # a (word-prefix, pron-prefix) pair where there exists (word,pron)
  # pairs of the form
  # ( word-prefix . a,  pron-prefix . c)
  # and 
  # ( word-prefix . b, pron-prefix . d)
  # For example if (a,b,c,d) equals (USLY,US,S L IY0,S)
  # then this quadruple will be output as long as there at least
  # e.g. 30 instances of prefixes like (FAM, F EY1 M AH0)
  # where there exist (word, pron) pairs like:
  # FAMOUS, F EY1 M AH0 S
  # FAMOUSLY  F EY1 M AH0 S L IY0
  #
  # There are some modifications to the picture above, for efficiency.
  # If $disallow_empty_suffix != 0, this program will not output 4-tuples where
  # the first element (the own-word suffix) is empty, as this would cause
  # efficiency problems in get_candidate_prons.pl.  If 
  # $ignore_prefix_stress != 0, this program will ignore stress markings
  # while evaluating whether prefixes are the same.
  # The minimum count for a quadruple to be output is $min_suffix_count
  # (e.g. 30).
  #
  # The function of this program is not to evaluate the accuracy of these rules;
  # it is mostly a pruning step, where we suggest rules that have large enough
  # counts to be suitable for our later procedure where we evaluate their
  # accuracy in predicting prons.
  
  $disallow_empty_suffix = 1; # Disallow rules where the suffix of the "own-word" is
     # empty.  This is for efficiency in later stages (e.g. get_candidate_prons.pl).
  $min_prefix_len = 3;  # this must match with get_candidate_prons.pl
  $ignore_prefix_stress = 1; # or 0 to take account of stress in prefix.
  $min_suffix_count = 20;
  
  # Takes in dictionary.
  
  print STDERR "Reading dict
  ";
  while(<>) {
    @A = split(" ", $_);
    my $word = shift @A;
    my $pron = join(" ", @A);
    if (!defined $prons{$word}) {
      $prons{$word} = $pron;
      push @words, $word;
    } else {
      $prons{$word} = $prons{$word} . ";" . $pron;
    }
  }
  
  # Get common suffixes (e.g., count >100).  Include empty suffix.
  
  print STDERR "Getting common suffix counts.
  ";
  {
    foreach $word (@words) {
      $len = length($word);
      for ($x = $min_prefix_len; $x <= $len; $x++) {
        $suffix_count{substr($word, $x)}++;
      }
    }
  
    foreach $suffix (keys %suffix_count) {
      if ($suffix_count{$suffix} >= $min_suffix_count) {
        $newsuffix_count{$suffix} = $suffix_count{$suffix};
      }
    }
    %suffix_count = %newsuffix_count;
    undef %newsuffix_count;
  
    foreach $suffix ( sort { $suffix_count{$b} <=> $suffix_count{$a} } keys %suffix_count ) {
      print STDERR "$suffix_count{$suffix} $suffix
  ";
    }
  }
  
  print STDERR "Getting common suffix pairs.
  ";
  
  {
    print STDERR " Getting map from prefix -> suffix-set.
  ";
  
    # Create map from prefix -> suffix-set.
    foreach $word (@words) {
      $len = length($word);
      for ($x = $min_prefix_len; $x <= $len; $x++) {
        $prefix = substr($word, 0, $x);
        $suffix = substr($word, $x);
        if (defined $suffix_count{$suffix}) { # Suffix is common...
          if (!defined $suffixes_of{$prefix}) {
            $suffixes_of{$prefix} = [ $suffix ]; # Create a reference to a new array with
            # one element.
          } else {
            push @{$suffixes_of{$prefix}}, $suffix; # Push $suffix onto array that the
            # hash member is a reference .
          }
        }
      }
    }
    my %suffix_set_count;
    print STDERR " Getting map from suffix-set -> count.
  ";
    while ( my ($key, $value) = each(%suffixes_of) ) { 
      my @suffixes = sort ( @$value );
      $suffix_set_count{join(";", @suffixes)}++;
    }
    print STDERR " Getting counts for suffix pairs.
  ";
    while ( my ($suffix_set, $count) = each (%suffix_set_count) ) {
      my @suffixes = split(";", $suffix_set);
      # Consider pairs to be ordered.  This is more convenient
      # later on.
      foreach $suffix_a (@suffixes) {
        foreach $suffix_b (@suffixes) {
          if ($suffix_a ne $suffix_b) {
            $suffix_pair = $suffix_a . "," . $suffix_b;
            $suffix_pair_count{$suffix_pair} += $count;
          }
        }
      }
    }
  
    # To save memory, only keep pairs above threshold in the hash.
    while ( my ($suffix_pair, $count) = each (%suffix_pair_count) ) {
      if ($count >= $min_suffix_count) {
        $new_hash{$suffix_pair} = $count;
      }
    }
    %suffix_pair_count = %new_hash;
    undef %new_hash;
  
    # Print out the suffix pairs so the user can see.
    foreach $suffix_pair ( 
        sort { $suffix_pair_count{$b} <=> $suffix_pair_count{$a} } keys %suffix_pair_count ) {
      print STDERR "$suffix_pair_count{$suffix_pair} $suffix_pair
  ";
    }
  }
  
  print STDERR "Getting common suffix/suffix/psuffix/psuffix quadruples
  ";
  
  {
    while ( my ($prefix, $suffixes_ref) = each(%suffixes_of) ) {
      # Note: suffixes_ref is a reference to an array.  We dereference with
      # @$suffixes_ref.
      # Consider each pair of suffixes (in each order).
      foreach my $suffix_a ( @$suffixes_ref ) {
        foreach my $suffix_b ( @$suffixes_ref ) {
          # could just used "defined" in next line, but this is for clarity.
          $suffix_pair = $suffix_a.",".$suffix_b;
          if ( $suffix_pair_count{$suffix_pair} >= $min_suffix_count ) {
            foreach $pron_a_str (split(";", $prons{$prefix.$suffix_a})) {
              @pron_a = split(" ", $pron_a_str);
              foreach $pron_b_str (split(";", $prons{$prefix.$suffix_b})) {
                @pron_b = split(" ", $pron_b_str);
                $len_a = @pron_a; # evaluating array as scalar automatically gives length.
                $len_b = @pron_b;
                for (my $pos = 0; $pos <= $len_a && $pos <= $len_b; $pos++) {
                  # $pos is starting-pos of psuffix-pair. 
                  $psuffix_a = join(" ", @pron_a[$pos...$#pron_a]);
                  $psuffix_b = join(" ", @pron_b[$pos...$#pron_b]);
                  $quadruple = $suffix_pair . "," . $psuffix_a . "," . $psuffix_b;
                  $quadruple_count{$quadruple}++;
                  
                  my $pron_a_pos = $pron_a[$pos], $pron_b_pos = $pron_b[$pos];
                  if ($ignore_prefix_stress) {
                    $pron_a_pos =~ s/\d//; # e.g convert IH0 to IH.  Only affects
                    $pron_b_pos =~ s/\d//; # whether we exit the loop below.
                  }
                  if ($pron_a_pos ne $pron_b_pos) {
                    # This is important: we don't consider a pron suffix-pair to be
                    # valid unless the pron prefix is the same.
                    last;
                  }
                }
              }
            }
          }
        }
      }
    }
    # To save memory, only keep pairs above threshold in the hash.
    while ( my ($quadruple, $count) = each (%quadruple_count) ) {
      if ($count >= $min_suffix_count) {
        $new_hash{$quadruple} = $count;
      }
    }
    %quadruple_count = %new_hash;
    undef %new_hash;
    
    # Print out the quadruples for diagnostics.
    foreach $quadruple ( 
      sort { $quadruple_count{$b} <=> $quadruple_count{$a} } keys %quadruple_count ) {
      print STDERR "$quadruple_count{$quadruple} $quadruple
  ";
    }
  }
  # Now print out the quadruples; these are the output of this program.
  foreach $quadruple (keys %quadruple_count) {
    print $quadruple."
  ";
  }