remove_dialect.pl
579 Bytes
#!/usr/bin/env perl
# Removes the dialect parts from a utt2lang file.
# For example, "<utt> chinese.wu" is converted to "<utt> chinese".
use strict;
use warnings;

# Usage: remove_dialect.pl <utt2lang-file>      (pass "-" to read STDIN)
#
# Reads whitespace-separated "<utt> <lang>[.<dialect>...]" records from the
# given file and writes "<utt> <lang>" records to standard output, keeping
# only the part of the language label before the first dot.
#
# Dies if no file argument is given or the file cannot be opened/read.
# Warns on STDERR (and still emits the record, with an empty language) when
# a line has an utterance id but no language field.
my ($utt2lang_file) = @ARGV;
defined $utt2lang_file
    or die "no utt2lang file given; usage: $0 <utt2lang-file>\n";

# Three-arg open takes the name literally, so the "-" == STDIN convention
# that the two-arg form provided implicitly is kept here explicitly.
my $reading_stdin = $utt2lang_file eq '-';
my $in;
if ($reading_stdin) {
    $in = \*STDIN;
}
else {
    open($in, '<', $utt2lang_file)
        or die "no utt2lang file '$utt2lang_file': $!\n";
}

# Stream record by record instead of buffering the whole output in memory.
while (my $line = <$in>) {
    chomp $line;

    # split ' ' ignores leading whitespace and collapses runs of whitespace.
    my ($utt, $lang_long) = split ' ', $line;

    # A blank (or whitespace-only) line carries no record at all.
    next unless defined $utt;

    my $lang = '';
    if (defined $lang_long) {
        # The actual language is the first dot-separated part; the other
        # parts are dialects or subcategories. LIMIT 2 stops after the
        # first dot and guarantees a defined (possibly empty) first field.
        ($lang) = split /[.]/, $lang_long, 2;
    }
    else {
        warn "$utt2lang_file:$.: no language field for utterance '$utt'\n";
    }

    print "$utt $lang\n";
}

if (!$reading_stdin) {
    close($in) or die "error closing '$utt2lang_file': $!\n";
}