lia_clean_corpus 1.37 KB
#!/bin/csh -f
#
#  Cleans a corpus already split into sentences (<s> and </s>) for training an LM according to a lexicon
#
#

# The lexicon argument is mandatory.  Without it, print the usage line and
# stop with a non-zero status so that calling scripts can detect the failure
# (a bare "exit" would return the status of the preceding echo, i.e. 0).
if ( $#argv < 1 ) then
 echo "Syntax: lia_clean_corpus <lexicon> [<correction rule file>]"
 exit 1
endif

#  Building the resources for the lexicon
#
# Derived lexicon files used by the cleaning pipeline:
#   $1.tab       compiled from the lexicon itself
#   $1.apos      output of "trans_apos -deglue" run on the lexicon
#   $1.apos.tab  compiled from $1.apos
# Both compiled tables are read later in this script, so everything is
# (re)built as soon as either of them is missing, not only when $1.tab is.
# The lexicon path is quoted so that it is always passed as a single word.
if ( ! -e "$1.tab" || ! -e "$1.apos.tab" ) then
 $LIA_TAGG/bin/lia_compile_lexitree "$1" "$1.tab"
 $LIA_TAGG/bin/trans_apos -deglue < "$1" > "$1.apos"
 $LIA_TAGG/bin/lia_compile_lexitree "$1.apos" "$1.apos.tab"
endif

# Main cleaning pipeline: the corpus is read from standard input and the
# result is written to standard output (neither end of the pipeline is
# redirected).
#
# The optional correction rule file ($2) only adds one stage (rewrite_corpus,
# between detokenize and last_capital_clean).  That stage is selected up front
# so a single copy of the pipeline is kept; "cat" is a neutral pass-through
# when no rule file applies.  The rule stage is used only when exactly two
# arguments are given.
if ( $#argv == 2 ) then
 set rewrite_stage = ( $LIA_TAGG/bin/rewrite_corpus -rule $2 )
else
 set rewrite_stage = ( cat )
endif

$LIA_TAGG/bin/trans_apos -deglue -space | \
	$LIA_TAGG/bin/lia_tokenize $1.apos.tab -ben | \
	$LIA_TAGG/bin/trans_apos -glue | \
	$LIA_TAGG/bin/unephraseparligne -fmt | \
	$LIA_TAGG/bin/lia_nett_capital $1.tab | \
	$LIA_TAGG/bin/lia_nomb2alpha $LIA_TAGG/data/list_chif_virgule.fr.tab | \
	$LIA_TAGG/bin/lia_unmotparligne | \
	$LIA_TAGG/bin/detokenize -word | \
	$rewrite_stage | \
	$LIA_TAGG/bin/last_capital_clean $1 | \
	$LIA_TAGG/bin/lia_tokenize $1.tab