Blame view

lib/rir/ttagger.rb 2.69 KB
a79a22843   Romain Deveaud   new TreeTagger mo...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  #!/usr/bin/env ruby
  
  # This file is a part of an Information Retrieval oriented Ruby library
  #
  # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
  #
  # This program is free software: you can redistribute it and/or modify
  # it under the terms of the GNU General Public License as published by
  # the Free Software Foundation, either version 3 of the License, or
  # (at your option) any later version.
  #
  # This program is distributed in the hope that it will be useful,
  # but WITHOUT ANY WARRANTY; without even the implied warranty of
  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  # GNU General Public License for more details.
  #
  # You should have received a copy of the GNU General Public License
  # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  
  module RIR
  
    # Namespace for classes that work with TreeTagger output.
    #
    # See http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html
    module TreeTagger
      
      # This class handles generic parsing of tagger-chunker outputs.
      #
      # The expected input is line oriented: a chunk starts with a line holding
      # only an opening tag (e.g. <tt><NC></tt>), ends with a line holding only
      # the matching closing tag (<tt></NC></tt>), and every other line describes
      # one token whose first whitespace-separated field is the word itself.
      class TaggerChunker
        # +chunks+ is the Array of Chunk built by ::parse.
        # +file+ is the path that was handed to ::new.
        attr_reader :chunks, :file

        # A line made of a single opening tag, e.g. "<NC>".
        OPENING_TAG = /\A<\w+>\z/
        # A line made of a single closing tag, e.g. "</NC>".
        CLOSING_TAG = /\A<\/\w+>\z/

        # Parses a tagger-chunker output and returns an Array of Chunk.
        #
        # +chunk_lines+ is any object responding to +each+ and yielding Strings
        # (typically the result of +readlines+). The lines are not modified, so
        # frozen Strings are accepted.
        #
        # Parsing rules:
        # * an opening tag marks the start of a chunk and becomes the current tag
        #   (when chunks are nested, the innermost opening tag wins);
        # * a closing tag emits a Chunk only if it matches the current tag and at
        #   least one word was collected; any other closing tag is ignored;
        # * token lines are collected only while a chunk is open, so tokens that
        #   sit outside every chunk (punctuation, for instance) are not glued
        #   onto the following chunk;
        # * blank lines are skipped.
        def self.parse(chunk_lines)
          open   = false
          tag    = nil
          chunks = []
          words  = []

          chunk_lines.each do |raw|
            # Non-destructive chomp: never mutate the caller's strings.
            line = raw.chomp

            if line =~ OPENING_TAG
              open = true
              tag  = line
            elsif line =~ CLOSING_TAG
              # +tag+ is only dereferenced when +open+ is true, i.e. after an
              # opening tag has been seen, so it cannot be nil here.
              next unless open && !words.empty? && line == tag.sub('<', '</')

              open = false
              chunks.push Chunk.new(words.join(" "), tag)
              words.clear
            elsif open
              # Only the first field (the word form) is kept; a blank line
              # yields no field at all and is dropped.
              token = line.split.first
              words.push(token) if token
            end
          end

          chunks
        end

        # Initializes parsing. +chunk_file+ is the output of +tagger-chunker-+ and must
        # be a valid path to the file.
        #
        # The file is read with File.readlines, which closes the underlying
        # handle before returning, and the path is kept in +file+.
        #
        #   TaggerChunker.new("ttout/2010020") #=> #<RIR::TreeTagger::TaggerChunker:0x92fd088 @chunks=[#<RIR::TreeTagger::Chunk:0x8ec5a10 @words=["robert", "schumann"], @tag="NC">, ...] ...>
        def initialize(chunk_file)
          @file   = chunk_file
          @chunks = TaggerChunker.parse(File.readlines(chunk_file))
        end

      end
  
      # Subclass of TaggerChunker that adds no behavior of its own.
      # NOTE(review): presumably meant for English tagger-chunker output,
      # judging by the name only -- confirm.
      class TaggerChunkerEnglish < TaggerChunker; end
  
      # Subclass of TaggerChunker that adds no behavior of its own.
      # NOTE(review): presumably meant for French tagger-chunker output,
      # judging by the name only -- confirm.
      class TaggerChunkerFrench < TaggerChunker; end
  
      # Subclass of TaggerChunker that adds no behavior of its own.
      # NOTE(review): presumably meant for German tagger-chunker output,
      # judging by the name only -- confirm.
      class TaggerChunkerGerman < TaggerChunker; end
  
      # Represents a Chunk extracted when parsing a TaggerChunker file.
      #
      # +words+ is the Array of terms of the chunk and +tag+ is the chunk label
      # without its surrounding angle brackets.
      class Chunk
        attr_reader :words, :tag

        # +str+ are whitespace-separated terms.
        # +tag+ see : ftp://ftp.ims.uni-stuttgart.de/pub/corpora/chunker-tagset-english.txt
        #
        # +tag+ may be given either as it appears in the tagger-chunker output
        # ("<NC>") or already bare ("NC"); both yield "NC". The brackets are
        # stripped only when both are present, so a bare tag is no longer
        # truncated to its inner characters.
        #
        #   Chunk.new("robert schumann", "<NC>").words #=> ["robert", "schumann"]
        #   Chunk.new("robert schumann", "<NC>").tag   #=> "NC"
        def initialize(str, tag)
          @words = str.split
          # The capture is nil when +tag+ is not wrapped in <...>; fall back to a
          # copy of the tag so that @tag never aliases the caller's String.
          @tag   = tag[/\A<(.*)>\z/m, 1] || tag.dup
        end
      end
  
    end
  end