tokenize_file.rb
667 Bytes
#!/usr/bin/env ruby
require "rubygems"
require "tokenizer"
# Reads +file+ line by line, tokenizes the third tab-separated field of each
# line with +tokenize+, and prints the rebuilt line to standard output.
#
# Processing is best-effort: a line with fewer than three fields, or one for
# which tokenization (or printing) raises a StandardError, is left out of the
# output and reported on standard error as "ERROR: <file>:<line>: <reason>".
# The remaining lines are still processed.
#
# @param file [String] path of the tab-separated input file
# @return [nil]
# @raise [SystemCallError] if +file+ cannot be opened
def launch(file)
  # Block form closes the handle even if an exception escapes the loop.
  File.open(file) do |f|
    f.each_line do |line|
      begin
        fields = line.split("\t")

        # Without a third field, tokenize would receive nil and assigning to
        # index 2 would pad the array, leaving the newline mid-record.
        if fields.size < 3
          $stderr.puts "ERROR: #{file}:#{f.lineno}: expected at least 3 tab-separated fields, got #{fields.size}"
          next
        end

        # Disabled pre-processing kept from the original for reference
        # (stripped surrounding quotes and decoded \uXXXX escapes):
        #l = line[2].gsub(/^"/, "").gsub(/"$/, "").gsub('\\""""', '"').gsub(/\\u[\da-f]{4}/i) { |m| [m[-4..-1].to_i(16)].pack('U') }
        fields[2] = tokenize(fields[2])

        # The last field normally still carries the line's newline, in which
        # case puts does not append a second one.
        puts fields.join("\t")
      rescue StandardError => e
        # Keep going, but say which line failed and why.
        $stderr.puts "ERROR: #{file}:#{f.lineno}: #{e.message}"
      end
    end
  end
  nil
end
# Prints the command-line usage followed by the author contact line to
# standard output.
#
# The usage line shows the name the script was invoked with and the single
# required argument: the path of the tab-separated file to process.
#
# @return [nil]
def errarg
  puts "Usage : #{$0} <tsv_file>"
  puts "Mickael Rouvier <mickael.rouvier@univ-avignon.fr>"
end
# Script entry point: with exactly one command-line argument, treat it as the
# input path and process it; with any other argument count, show the usage.
case ARGV.size
when 1 then launch(ARGV[0])
else errarg
end