# tokenize_file.rb
#!/usr/bin/env ruby

require "rubygems"
require "tokenizer"

# Tokenizes the third tab-separated column of every line of +file+ and prints
# each rewritten line to standard output.
#
# Every line is split on tab characters, the field at index 2 is replaced by
# the result of +tokenize+ (defined outside this file), and the fields are
# joined back with tabs. A line whose processing raises a StandardError is
# skipped and reported on standard error; the remaining lines are still
# processed.
#
# @param file [String] path of the tab-separated input file
# @return [nil]
def launch(file)
    # Block form guarantees the handle is closed even when iteration is
    # aborted by an exception that the per-line rescue below does not catch.
    File.open(file) do |f|
        f.each_line do |raw|
            begin
                fields = raw.split("\t")
                # Disabled pre-processing (quote / \uXXXX unescaping), kept for reference:
                #l = line[2].gsub(/^"/, "").gsub(/"$/, "").gsub('\\""""', '"').gsub(/\\u[\da-f]{4}/i) { |m| [m[-4..-1].to_i(16)].pack('U')  }
                fields[2] = tokenize(fields[2])
                puts fields.join("\t")
            rescue => e
                # Best-effort: keep going on a bad line, but say which line
                # failed and why instead of an anonymous "ERROR".
                $stderr.puts "ERROR: #{file}:#{f.lineno}: #{e.class}: #{e.message}"
            end
        end
    end
    nil
end


# Prints the command-line usage followed by the author contact line to
# standard output.
#
# The usage line names the running script and the single file argument that
# the script requires.
#
# @return [nil]
def errarg
    puts "Usage : #{$PROGRAM_NAME} <file>"
    puts "Mickael Rouvier <mickael.rouvier@univ-avignon.fr>"
end


# Script entry point: with exactly one command-line argument, treat it as the
# input file path and process it; with any other argument count, print the
# usage text instead.
case ARGV.size
when 1
    launch(ARGV.first)
else
    errarg
end