Blame view
tools/lia_ltbox/lia_biglex/script/lia_clean_corpus
1.37 KB
e6be5137b reinitialized pro... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
#!/bin/csh -f
#
# lia_clean_corpus -- clean a corpus that is already cut into sentences
# (<s> and </s>) for language-model training, according to a lexicon.
#
# Usage:  lia_clean_corpus <lexicon> [<correction rule file>]
#
# Reads the raw corpus on stdin and writes the cleaned, re-tokenized
# corpus on stdout.  Requires the LIA_TAGG environment variable to point
# at the LIA_TAGG installation (its bin/ and data/ directories are used).

# Fail early with a clear message instead of csh's bare
# "LIA_TAGG: Undefined variable" at the first use below.
if ( ! $?LIA_TAGG ) then
  echo "lia_clean_corpus: the LIA_TAGG environment variable must be set"
  exit 1
endif

if ( $#argv < 1 ) then
  echo "Syntax: lia_clean_corpus <lexicon> [<correction rule file>]"
  # Bug fix: exit non-zero on bad usage (a bare 'exit' in csh reports
  # success, so calling scripts could not detect the error).
  exit 1
endif

# Build the resources for the lexicon: compiled lexicon trees, cached
# next to the lexicon file ($1.tab / $1.apos.tab) and reused on later runs.
if ( ! -e $1.tab ) then
  $LIA_TAGG/bin/lia_compile_lexitree $1 $1.tab
  # De-glue apostrophes in the lexicon, then compile that variant too;
  # the pipeline below tokenizes against the apostrophe-split tree first.
  $LIA_TAGG/bin/trans_apos -deglue < $1 > $1.apos
  $LIA_TAGG/bin/lia_compile_lexitree $1.apos $1.apos.tab
endif

# Main cleaning pipeline (stdin -> stdout).  The two branches are
# identical except that a second argument inserts a rewrite step driven
# by the given correction-rule file (rewrite_corpus -rule $2).
if ( $#argv == 2 ) then
  $LIA_TAGG/bin/trans_apos -deglue -space | \
  $LIA_TAGG/bin/lia_tokenize $1.apos.tab -ben | \
  $LIA_TAGG/bin/trans_apos -glue | \
  $LIA_TAGG/bin/unephraseparligne -fmt | \
  $LIA_TAGG/bin/lia_nett_capital $1.tab | \
  $LIA_TAGG/bin/lia_nomb2alpha $LIA_TAGG/data/list_chif_virgule.fr.tab | \
  $LIA_TAGG/bin/lia_unmotparligne | \
  $LIA_TAGG/bin/detokenize -word | \
  $LIA_TAGG/bin/rewrite_corpus -rule $2 | \
  $LIA_TAGG/bin/last_capital_clean $1 | \
  $LIA_TAGG/bin/lia_tokenize $1.tab
else
  $LIA_TAGG/bin/trans_apos -deglue -space | \
  $LIA_TAGG/bin/lia_tokenize $1.apos.tab -ben | \
  $LIA_TAGG/bin/trans_apos -glue | \
  $LIA_TAGG/bin/unephraseparligne -fmt | \
  $LIA_TAGG/bin/lia_nett_capital $1.tab | \
  $LIA_TAGG/bin/lia_nomb2alpha $LIA_TAGG/data/list_chif_virgule.fr.tab | \
  $LIA_TAGG/bin/lia_unmotparligne | \
  $LIA_TAGG/bin/detokenize -word | \
  $LIA_TAGG/bin/last_capital_clean $1 | \
  $LIA_TAGG/bin/lia_tokenize $1.tab
endif