Blame view
tools/scripts/CleanFilter.sh
1.35 KB
e6be5137b reinitialized pro... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
#!/bin/bash
#
# CleanFilter.sh - apply the clean filter to standard input and write the
# result to standard output.
#
# Usage: cat <raw_txt> | CleanFilter.sh [-d DATA_DIR] [-o OUT_DIR] [-h]
#
# Options:
#   -d DATA_DIR  directory holding the rule files (default
#                $OTMEDIA_HOME/data/rules). The pipeline below reads
#                preprocess.regex, random_regex.tab, numeric_rules,
#                muRules.tab, postprocess.regex and lastprocess.regex
#                from it.
#   -o OUT_DIR   directory passed to Number2txt.pl as "-o OUT_DIR/mu_oov.log"
#                (default: current directory).
#   -h           print the help text and exit.
#
# Environment:
#   OTMEDIA_HOME  installation root; helper scripts are run from
#                 $OTMEDIA_HOME/tools/scripts and both BdlexUC.pl stages are
#                 given $OTMEDIA_HOME/data/rules/basic.
#
# Diagnostics are written to stderr because stdout carries the filtered text.

SCRIPT="$OTMEDIA_HOME/tools/scripts"
DATA="$OTMEDIA_HOME/data/rules"
OUT=.

while getopts ":o:d:h" OPTION; do
  case "$OPTION" in
    h) # display help (explicitly requested, so it goes to stdout)
      printf 'CleanFilter.sh :\n'
      printf '\tbrief : apply a clean filter to the standard input and display the result on the standard output\n'
      printf '\tusage : cat <raw_txt> | CleanFilter.sh [OPTIONS]\n'
      printf '\toptions:\n'
      printf '\t\td) specify a data source (default %s)\n' "$DATA"
      printf '\t\tN.B: this data source must contain the files : preprocess.regex, random_regex.tab, numeric_rules, muRules.tab and postprocess.regex\n'
      printf '\t\to) specify an ouput (default %s)\n' "$OUT"
      exit 1
      ;;
    o) # specify another output directory
      OUT=$OPTARG
      ;;
    d) # specify another data (rules) directory
      DATA=$OPTARG
      ;;
    :)
      printf '%s\n' "BAD USAGE : OPTION $OPTARG need a value" >&2
      exit 1
      ;;
    \?)
      printf '%s\n' "BAD USAGE : unknow option '$OPTARG'" >&2
      exit 1
      ;;
  esac
done

# Every stage below lives under $OTMEDIA_HOME; fail early with a clear message
# instead of trying to run /tools/scripts/... when it is not defined.
# Checked after option parsing so that -h keeps working without it.
: "${OTMEDIA_HOME:?OTMEDIA_HOME must be set}"

# Filter pipeline: stdin -> stdout.
# NOTE: a Rom2Dec.pl stage used to sit between RandomRegex.pl and Date2txt.pl;
# it is disabled.
# The final grep drops every line containing the literal text "<s> </s>".
"$SCRIPT/BdlexUC.pl" "$OTMEDIA_HOME/data/rules/basic" -t \
  | "$SCRIPT/UrlConverter.pl" \
  | sed -f "$DATA/preprocess.regex" \
  | "$SCRIPT/RandomRegex.pl" "$DATA/random_regex.tab" \
  | "$SCRIPT/Date2txt.pl" \
  | "$SCRIPT/Number2txt.pl" -r "$DATA/numeric_rules" -m "$DATA/muRules.tab" -c -o "$OUT/mu_oov.log" \
  | sed -f "$DATA/postprocess.regex" \
  | "$SCRIPT/Sentencer.pl" \
  | sed -f "$DATA/lastprocess.regex" \
  | "$SCRIPT/BdlexUC.pl" "$OTMEDIA_HOME/data/rules/basic" -f \
  | grep -v "<s> </s>"