#!/bin/csh -f
#
# Cleaning a corpus already cut into sentences (delimited by <s> and </s>) for training an LM according to a lexicon
#
#
if ( $#argv < 1 ) then
echo "Syntax: lia_clean_corpus []"
exit
endif
# Building the ressouces for the lexicon
#
if ( ! -e $1.tab ) then
$LIA_TAGG/bin/lia_compile_lexitree $1 $1.tab
$LIA_TAGG/bin/trans_apos -deglue < $1 > $1.apos
$LIA_TAGG/bin/lia_compile_lexitree $1.apos $1.apos.tab
endif
if ( $#argv == 2 ) then
$LIA_TAGG/bin/trans_apos -deglue -space | \
$LIA_TAGG/bin/lia_tokenize $1.apos.tab -ben | \
$LIA_TAGG/bin/trans_apos -glue | \
$LIA_TAGG/bin/unephraseparligne -fmt |\
$LIA_TAGG/bin/lia_nett_capital $1.tab | \
$LIA_TAGG/bin/lia_nomb2alpha $LIA_TAGG/data/list_chif_virgule.fr.tab | \
$LIA_TAGG/bin/lia_unmotparligne | \
$LIA_TAGG/bin/detokenize -word | \
$LIA_TAGG/bin/rewrite_corpus -rule $2 | \
$LIA_TAGG/bin/last_capital_clean $1 | \
$LIA_TAGG/bin/lia_tokenize $1.tab
else
$LIA_TAGG/bin/trans_apos -deglue -space |\
$LIA_TAGG/bin/lia_tokenize $1.apos.tab -ben |\
$LIA_TAGG/bin/trans_apos -glue |\
$LIA_TAGG/bin/unephraseparligne -fmt |\
$LIA_TAGG/bin/lia_nett_capital $1.tab |\
$LIA_TAGG/bin/lia_nomb2alpha $LIA_TAGG/data/list_chif_virgule.fr.tab | \
$LIA_TAGG/bin/lia_unmotparligne | \
$LIA_TAGG/bin/detokenize -word | \
$LIA_TAGG/bin/last_capital_clean $1 | \
$LIA_TAGG/bin/lia_tokenize $1.tab
endif