CleanFilter.sh 1.35 KB
#!/bin/bash

# Default locations. SCRIPT and DATA are derived from the OTMEDIA_HOME
# environment variable; OUT defaults to the current directory.
# DATA and OUT may be overridden on the command line (-d and -o).
OUT='.'
DATA="${OTMEDIA_HOME}/data/rules"
SCRIPT="${OTMEDIA_HOME}/tools/scripts"

#######################################
# Parse the command-line options of CleanFilter.sh.
# Globals:
#   DATA  read for the help text, overwritten by -d
#   OUT   read for the help text, overwritten by -o
#   OPTIND, OPTARG  maintained by getopts
# Arguments:
#   the command-line arguments of the script ("$@")
# Outputs:
#   -h prints the usage text on stdout.
#   Usage errors are written to stderr: stdout carries the filtered text,
#   so diagnostics must never be mixed into it.
# Exits:
#   1 after -h (historical exit status, kept for compatibility),
#   1 on a missing option value or an unknown option.
#######################################
parse_options() {
	local opt

	# Leading ':' selects silent error reporting: getopts then yields ':'
	# for a missing value and '?' for an unknown option, with the option
	# letter in OPTARG.
	while getopts ":o:d:h" opt; do
		case "$opt" in
		h)	# display help
			# printf with %s keeps backslashes in DATA / OUT literal,
			# which 'echo -e' would have interpreted.
			printf 'CleanFilter.sh :\n'
			printf '\tbrief : apply a clean filter to the standard input and display the result on the standard output\n'
			printf '\tusage : cat <raw_txt> | CleanFilter.sh [OPTIONS]\n'
			printf '\toptions:\n'
			printf '\t\td) specify a data source (default %s)\n' "$DATA"
			# lastprocess.regex is listed too: the pipeline reads it from DATA.
			printf '\t\tN.B: this data source must contain the files : preprocess.regex, random_regex.tab, numeric_rules, muRules.tab, postprocess.regex and lastprocess.regex\n'
			printf '\t\to) specify an ouput (default %s)\n' "$OUT"
			exit 1
			;;
		o)	# output location (the pipeline writes mu_oov.log under it)
			OUT=$OPTARG
			;;
		d)	# directory holding the rule / regex data files
			DATA=$OPTARG
			;;
		:)	# option given without its mandatory value
			printf '%s\n' "BAD USAGE : OPTION $OPTARG need a value" >&2
			exit 1
			;;
		\?)	# option letter not in the optstring
			printf '%s\n' "BAD USAGE : unknow option '$OPTARG'" >&2
			exit 1
			;;
		esac
	done
}

parse_options "$@"

#######################################
# Run the cleaning pipeline: text is read from stdin, pushed through the
# helper scripts and sed rule files in a fixed order, and written to stdout.
# Globals (all read-only here):
#   OTMEDIA_HOME  root used for the "basic" rules passed to BdlexUC.pl
#   SCRIPT        directory containing the *.pl helper scripts
#   DATA          directory containing the regex / rule files
#   OUT           prefix of the mu_oov.log path handed to Number2txt.pl -o
# Outputs:
#   the filtered text on stdout; lines containing "<s> </s>" are dropped.
# Returns:
#   the exit status of the final grep stage.
#######################################
clean_filter() {
	# All expansions are quoted so that paths containing blanks or glob
	# characters reach each stage as a single argument.
	# The first stage reads the script's stdin directly (no 'cat -' needed).
	#
	# Disabled stage, formerly between RandomRegex.pl and Date2txt.pl:
	#   "$SCRIPT/Rom2Dec.pl"
	"$SCRIPT/BdlexUC.pl" "$OTMEDIA_HOME/data/rules/basic" -t |
		"$SCRIPT/UrlConverter.pl" |
		sed -f "$DATA/preprocess.regex" |
		"$SCRIPT/RandomRegex.pl" "$DATA/random_regex.tab" |
		"$SCRIPT/Date2txt.pl" |
		"$SCRIPT/Number2txt.pl" -r "$DATA/numeric_rules" -m "$DATA/muRules.tab" -c -o "$OUT/mu_oov.log" |
		sed -f "$DATA/postprocess.regex" |
		"$SCRIPT/Sentencer.pl" |
		sed -f "$DATA/lastprocess.regex" |
		"$SCRIPT/BdlexUC.pl" "$OTMEDIA_HOME/data/rules/basic" -f |
		grep -v "<s> </s>"
}

clean_filter