Blame view

tools/lia_ltbox/lia_biglex/script/lia_clean_corpus 1.37 KB
e6be5137b   Jean-François Rey   reinitialized pro...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
  #!/bin/csh -f
  #
  #  lia_clean_corpus -- clean a corpus already cut into sentences (<s> and </s>)
  #  for training a language model, according to a lexicon.
  #
  #  Usage:  lia_clean_corpus <lexicon> [<correction rule file>]
  #
  #  Reads the corpus on stdin, writes the cleaned corpus on stdout.
  #  Requires the LIA_TAGG environment variable to point at the LIA_TAGG
  #  installation (binaries under $LIA_TAGG/bin, data under $LIA_TAGG/data).
  
  if ( $#argv < 1 ) then
   echo "Syntax: lia_clean_corpus <lexicon> [<correction rule file>]"
   # Exit non-zero so callers can detect the usage error.
   # (A bare `exit` in csh exits with the current $status, usually 0.)
   exit 1
  endif
  
  #  Build the lexicon resources, only if the compiled lexicon ($1.tab)
  #  does not exist yet:
  #    $1.tab      - lexicon $1 compiled by lia_compile_lexitree
  #    $1.apos     - lexicon $1 passed through trans_apos -deglue
  #                  (presumably splits glued apostrophe forms -- TODO confirm)
  #    $1.apos.tab - the deglued lexicon, compiled by lia_compile_lexitree
  #  NOTE(review): only the existence of $1.tab is checked; if $1.apos.tab
  #  was deleted separately it will not be rebuilt.
  #
  if ( ! -e $1.tab ) then
   $LIA_TAGG/bin/lia_compile_lexitree $1 $1.tab
   $LIA_TAGG/bin/trans_apos -deglue < $1 > $1.apos
   $LIA_TAGG/bin/lia_compile_lexitree $1.apos $1.apos.tab
  endif 
  
  #  Cleaning pipeline, stdin -> stdout.  The two branches are identical
  #  except that with a second argument a `rewrite_corpus -rule $2` stage is
  #  inserted before the final capital cleanup and re-tokenization.
  #  Stage names suggest (name-based inference -- TODO confirm against the
  #  LIA_TAGG tools): deglue apostrophes, tokenize on the deglued lexicon,
  #  re-glue apostrophes, one sentence per line, capital cleanup against the
  #  lexicon, numbers rewritten as words (French comma-number table), one
  #  word per line, detokenize back to words, then tokenize on the lexicon.
  if ( $#argv == 2 ) then
   $LIA_TAGG/bin/trans_apos -deglue -space | \
  	$LIA_TAGG/bin/lia_tokenize $1.apos.tab -ben | \
  	$LIA_TAGG/bin/trans_apos -glue | \
  	$LIA_TAGG/bin/unephraseparligne -fmt |\
  	$LIA_TAGG/bin/lia_nett_capital $1.tab | \
  	$LIA_TAGG/bin/lia_nomb2alpha $LIA_TAGG/data/list_chif_virgule.fr.tab | \
  	$LIA_TAGG/bin/lia_unmotparligne | \
  	$LIA_TAGG/bin/detokenize -word | \
  	$LIA_TAGG/bin/rewrite_corpus -rule $2 | \
  	$LIA_TAGG/bin/last_capital_clean $1 | \
  	$LIA_TAGG/bin/lia_tokenize $1.tab
  else
   # Same pipeline without the correction-rule rewriting stage.
   $LIA_TAGG/bin/trans_apos -deglue -space |\
  	$LIA_TAGG/bin/lia_tokenize $1.apos.tab -ben |\
  	$LIA_TAGG/bin/trans_apos -glue |\
  	$LIA_TAGG/bin/unephraseparligne -fmt |\
  	$LIA_TAGG/bin/lia_nett_capital $1.tab |\
  	$LIA_TAGG/bin/lia_nomb2alpha $LIA_TAGG/data/list_chif_virgule.fr.tab | \
  	$LIA_TAGG/bin/lia_unmotparligne | \
  	$LIA_TAGG/bin/detokenize -word | \
  	$LIA_TAGG/bin/last_capital_clean $1 | \
  	$LIA_TAGG/bin/lia_tokenize $1.tab
  endif