lia_clean_corpus
1.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/bin/csh -f
#
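# Clean a corpus that is already split into sentences (marked with <s> and </s>)
# so that it can be used to train a language model (LM) according to a lexicon.
# The corpus is read from standard input; the result is written to standard output.
#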
#
# Argument check: the lexicon path is mandatory, the correction rule file is
# optional. On misuse, print the usage line and terminate with a non-zero
# status so that calling scripts can detect the failure (a bare "exit" would
# report the status of the preceding echo, i.e. success).
if ( $#argv < 1 ) then
    echo "Syntax: lia_clean_corpus <lexicon> [<correction rule file>]"
    exit 1
endif
# Build the resources for the lexicon if they are not all present yet.
#
# Both <lexicon>.tab and <lexicon>.apos.tab are used later in this script, so
# rebuild when either one is missing: testing only <lexicon>.tab would never
# repair a partial or interrupted build that left <lexicon>.apos.tab absent.
# The lexicon path is quoted so that a name containing spaces or glob
# characters is passed to the tools as a single word.
if ( ! -e "${1}.tab" || ! -e "${1}.apos.tab" ) then
    $LIA_TAGG/bin/lia_compile_lexitree "${1}" "${1}.tab"
    $LIA_TAGG/bin/trans_apos -deglue < "${1}" > "${1}.apos"
    $LIA_TAGG/bin/lia_compile_lexitree "${1}.apos" "${1}.apos.tab"
endif
# Cleaning pipeline: the corpus is read from standard input, passed through
# the LIA_TAGG tools below, and the result is written to standard output.
#
# The only optional stage is rewrite_corpus, which is applied when a
# correction rule file is given as second argument; otherwise "cat" is used
# as a pass-through so that a single pipeline definition serves both cases
# and the two variants can no longer drift apart.
# Testing ">= 2" (rather than "== 2") keeps the rule file active even when
# extra arguments are supplied by mistake.
if ( $#argv >= 2 ) then
    set rewrite_stage = ( $LIA_TAGG/bin/rewrite_corpus -rule "${2}" )
else
    set rewrite_stage = ( cat )
endif

# ":q" keeps every word of the stage command intact when it is expanded.
$LIA_TAGG/bin/trans_apos -deglue -space | \
    $LIA_TAGG/bin/lia_tokenize "${1}.apos.tab" -ben | \
    $LIA_TAGG/bin/trans_apos -glue | \
    $LIA_TAGG/bin/unephraseparligne -fmt | \
    $LIA_TAGG/bin/lia_nett_capital "${1}.tab" | \
    $LIA_TAGG/bin/lia_nomb2alpha $LIA_TAGG/data/list_chif_virgule.fr.tab | \
    $LIA_TAGG/bin/lia_unmotparligne | \
    $LIA_TAGG/bin/detokenize -word | \
    $rewrite_stage:q | \
    $LIA_TAGG/bin/last_capital_clean "${1}" | \
    $LIA_TAGG/bin/lia_tokenize "${1}.tab"