ConfidenceMeasure.sh 11 KB
#!/bin/bash
#-----------------------------------------------------------------------------------------
# Author : Benjamin Lecouteux & Emmanuel FERREIRA (contact emmanuel.ferreira0194@gmail.com)
# Brief: Determine les mesures de confiance d'une transcription (res de speeral)
#-----------------------------------------------------------------------------------------

# Directory holding ConfidenceMeasure.sh; a value already present in the
# environment is kept as is.
if [ -z "$MAIN_SCRIPT_PATH" ]; then
    MAIN_SCRIPT_PATH=$(dirname "$(readlink -e "$0")")
fi

# Default configuration file, expected under $OTMEDIA_HOME/cfg.
CONFIDENCEMEASURE_CONFIG_FILE=$OTMEDIA_HOME"/cfg/ConfidenceMeasure.cfg"
if [ ! -e "$CONFIDENCEMEASURE_CONFIG_FILE" ]; then
    echo "ERROR : Can't find configuration file $CONFIDENCEMEASURE_CONFIG_FILE" >&2
    exit 1
fi
. "$CONFIDENCEMEASURE_CONFIG_FILE"

# Configuration file that is sourced again once the options are parsed;
# the -c option replaces this default.
PACKAGE_CONF_MEASURE=$CONFIDENCEMEASURE_CONFIG_FILE
#---------------
# Parse options
#---------------
# parse_options ARGS...
#   -h            print the help text and exit with status 1
#   -c FILE       use FILE as configuration file (sets PACKAGE_CONF_MEASURE)
#   -s PORT@HOST  use an SRILM server (sets SERVER)
# A missing option value or an unknown option is reported on stderr and
# ends the script with status 1.
# OPTIND is intentionally left global: the caller uses it to shift the
# parsed options away.
parse_options() {
    local OPTION
    while getopts ":c:s:h" OPTION; do
        case "$OPTION" in
        h)  # Display help
            echo -e "$0 :"
            echo -e "\tAuthor : Benjamin Lecouteux & Emmanuel FERREIRA (contact: emmanuel.ferreira0194@gmail.com)"
            echo -e "\tVersion : 2.0"
            echo -e "\tBrief : Determine confidence measure of a transcription"
            echo -e "\tUsage : $0 [OPTIONS] <(i) REP_IN> <REP_NAME>"
            echo -e "\tOptions:"
            echo -e "\t\tc) specify the path of the configuration file (default $PACKAGE_CONF_MEASURE)"
            echo -e "\t\ts) specify PORT@HOST of a SRILM server"
            exit 1
            ;;
        c)  # Change the configuration file
            PACKAGE_CONF_MEASURE=$OPTARG
            ;;
        s)  # Use an SRILM server (avoids loading the arpa model in memory)
            SERVER=$OPTARG
            ;;
        :)  # Option given without its mandatory value
            echo "BAD USAGE : OPTION $OPTARG need a value" >&2
            exit 1
            ;;
        \?) # Option not listed in the getopts specification
            echo "BAD USAGE : unknow option '$OPTARG'" >&2
            exit 1
            ;;
        esac
    done
}
parse_options "$@"

#-------------------------------------------------------
# Shift the parsed options away to reach the arguments
#-------------------------------------------------------
shift $((OPTIND-1))

# Both positional arguments (input directory and result directory name)
# are mandatory.
if [ -z "$1" ] || [ -z "$2" ]; then
    echo "BAD USAGE: $0 [OPTIONS] <(i) repertoire (ex:20041006_0800_0900_CULTURE)> <REP_NAME (ex:res_p2)>" >&2
    exit 1
fi

# Load the configuration selected by the options. Stop here when it is
# missing instead of silently going on with whatever was loaded before.
if [ ! -e "$PACKAGE_CONF_MEASURE" ]; then
    echo "ERROR : Can't find configuration file $PACKAGE_CONF_MEASURE" >&2
    exit 1
fi
. "$PACKAGE_CONF_MEASURE"

#------------------------------------
# INIT - workspace creation
#------------------------------------
# $1 is the input directory, $2 the name of the result directory inside it.
NAME=$(basename -- "$1")
CONF_DIR=$1/conf/$2
FICHIER_RES=$2
REF=$CONF_DIR/ref
POS=$CONF_DIR/pos
MLCLASS=$CONF_DIR/mlclass
GVALIGN=$CONF_DIR/gvalign
HTK_POST=$CONF_DIR/htk_post
HTK_LM=$CONF_DIR/htk_lm
WLAT=$CONF_DIR/wlat
LIKELIHOOD=$CONF_DIR/likelihood
GVCTM=$CONF_DIR/gvctm
SEGCTM=$CONF_DIR/segctm
SUPER_CTM=$CONF_DIR/super_ctm
SCORED_CTM=$CONF_DIR/scored_ctm

# HTK_LM is deliberately not created here: SRILM generates it when needed.
# Every later step writes into these directories, so a creation failure is
# reported and stops the script.
if ! mkdir -p -- "$CONF_DIR" "$REF" "$POS" "$MLCLASS" "$GVALIGN" "$HTK_POST" \
        "$WLAT" "$LIKELIHOOD" "$GVCTM" "$SEGCTM" "$SUPER_CTM" "$SCORED_CTM"; then
    echo "ERROR : Can't create workspace under $CONF_DIR" >&2
    exit 1
fi

# Fall back to the bundled icsiboost binary when the configuration does not
# provide BOOST_BIN.
if [ -z "$BOOST_BIN" ]; then
    BOOST_BIN=$ROOT/bin/icsiboost-64bit-static-r160
fi
#-----------------------------------------------------------------
# STEP 1 - Lattice expansion + posteriors (HTK format)
#-----------------------------------------------------------------
# Runs only when EXTEND is 1; an unset flag simply skips the step.
if [[ "${EXTEND:-0}" == 1 ]]; then
    echo "EXTEND step..."
    rm -rf -- "$HTK_LM" > /dev/null 2>&1
    rm -f -- "$HTK_POST"/* > /dev/null 2>&1
    #
    # --> Add the language-model scores to the HTK lattices
    #
    # One lattice path per line; the list is left empty when no .treil exists.
    treil_list=$CONF_DIR/Liste_treil_${NAME}.lst
    for file in "$1/$FICHIER_RES"/*.treil; do
        [ -e "$file" ] || continue
        printf '%s\n' "$file"
    done > "$treil_list"

    # Language model given either as a file or through an SRILM server (-s).
    lm_access=(-lm "$ML")
    if [ -n "$SERVER" ]; then
        lm_access=(-use-server "$SERVER" -cache-served-ngrams)
    fi

    # Built as an array so each value stays a single argument.
    lattice_cmd=("$SRILM_BIN/lattice-tool" -read-htk -in-lattice-list "$treil_list" "${lm_access[@]}" -order "$ORDER" -htk-logbase 10 -htk-lmscale "$FUDGE" -htk-wdpenalty "$PENALITE" -write-htk -out-lattice-dir "$HTK_LM")
    echo "${lattice_cmd[*]}"
    "${lattice_cmd[@]}"

    #
    # --> Compute the posteriors from the acoustic and linguistic scores
    #     present in the HTK lattices
    #
    for file in "$HTK_LM"/*.treil; do
        [ -e "$file" ] || continue
        base=$(basename -- "$file" .treil)
        "$SRILM_BIN/lattice-tool" -read-htk -in-lattice "$file" -compute-posteriors -write-htk -out-lattice "$HTK_POST/${base}.htk"
    done
fi

#---------------------------------------------------------------------------------------------------------------
# STEP 2 - align res and wlat to create a res with scores + info (uses a modified fastnc)
# Example :
#  ok amendement 0.814885 ( time=36 nodes=3 min=0.0016862 max=0.814885 mean=0.333896 var=0.363849 svar=0.603199 )
#----------------------------------------------------------------------------------------------------------------
# Runs only when FASTNC is 1; an unset flag simply skips the step.
if [[ "${FASTNC:-0}" == 1 ]]; then
    echo "FASTNC step..."
    rm -f -- "$POS"/* "$WLAT"/* > /dev/null 2>&1
    # One fastnc run per lattice found in HTK_LM; its stdout becomes the
    # matching .pos2 file.
    for file in "$HTK_LM"/*.treil; do
        [ -e "$file" ] || continue
        base=$(basename -- "$file" .treil)
        "$ROOT/bin/fastnc_v1.4" "$HTK_POST/${base}.htk" "$WLAT/${base}.wlat" "$1/$FICHIER_RES/${base}.res" rien -dtw2 > "$POS/$base.pos2"
    done
fi

#------------------------------------------------------------------------------------------------------------
# STEP 3 - retrieve the probability of each word + language-model related info (backoff, ...)
#------------------------------------------------------------------------------------------------------------

# build_allref REF_DIR ALLREF_TXT
# Appends every non-empty REF_DIR/*.ref file to ALLREF_TXT, one file per
# line, and stores the matching "<base>.mlclass" names, in the same order,
# in the global array ListeFichiers.
build_allref() {
    local ref_dir=$1 allref_txt=$2 ref
    ListeFichiers=()
    for ref in "$ref_dir"/*.ref; do
        # -s checks the file size itself; the disk usage printed by du is
        # not a reliable emptiness test.
        [ -s "$ref" ] || continue
        cat -- "$ref" >> "$allref_txt"
        echo "" >> "$allref_txt"
        ListeFichiers+=("$(basename -- "$ref" .ref).mlclass")
    done
}

# split_mlclass_per_ref ALLREF_MLCLASS OUT_DIR NAME...
# Reads ALLREF_MLCLASS line by line. Each blank line moves on to the next
# NAME; each line containing "p(" is appended to OUT_DIR/<current NAME>.
# Lines found once the NAME list is exhausted are ignored.
split_mlclass_per_ref() {
    local allref_mlclass=$1 out_dir=$2
    shift 2
    local -a names=("$@")
    local idx=0 line
    # Default IFS on purpose: leading/trailing blanks are trimmed from each
    # line before it is written out.
    while read -r line; do
        if [[ -z "$line" ]]; then
            idx=$((idx + 1))
        elif [[ "$line" == *"p("* && -n "${names[idx]:-}" ]]; then
            printf '%s\n' "$line" >> "$out_dir/${names[idx]}"
        fi
    done < "$allref_mlclass"
}

# Runs only when PPL is 1; an unset flag simply skips the step.
if [[ "${PPL:-0}" == 1 ]]; then
    echo "PPL step..."
    rm -f -- "$REF"/* "$CONF_DIR/${NAME}"_ALLREF.* "$MLCLASS"/* > /dev/null 2>&1
    #
    # --> Build the references from the .res files (only when the matching
    #     .treil exists): 5th space-separated field of every line, joined
    #     with spaces
    #
    for file in "$1/$FICHIER_RES"/*.res; do
        [ -e "$file" ] || continue
        base=$(basename -- "$file" .res)
        if [ -f "$1/$FICHIER_RES/$base.treil" ]; then
            cut -f5 -d' ' -- "$file" | tr "\n" " " > "$REF/${base}.ref"
        fi
    done

    #
    # --> Build one file holding all the transcriptions of the show
    #
    build_allref "$REF" "$CONF_DIR/${NAME}_ALLREF.txt"

    #
    # --> Retrieve the probability of each word coming from the ASR output
    #     + linguistic information (backoff used, ngram, ...)
    #
    "$SRILM_BIN/ngram" -lm "$ML" -order "$ORDER" -ppl "$CONF_DIR/${NAME}_ALLREF.txt" -debug 2 > "$CONF_DIR/${NAME}_ALLREF.mlclass"

    #
    # --> Write one .mlclass file per .ref file
    #
    split_mlclass_per_ref "$CONF_DIR/${NAME}_ALLREF.mlclass" "$MLCLASS" "${ListeFichiers[@]}"
fi

#----------------------------------------------------------
# STEP 4 - retrieve the acoustic score of each word
#----------------------------------------------------------
# Runs only when ACOUST is 1; an unset flag simply skips the step.
if [[ "${ACOUST:-0}" == 1 ]]; then
    echo "ACOUST step..."
    for dir in "$GVALIGN" "$GVCTM" "$SEGCTM" "$LIKELIHOOD"; do
        rm -f -- "$dir"/* > /dev/null 2>&1
    done

    # Only .res files that have a matching .treil are handed over to
    # MakeListForGVAlign.pl.
    for file in "$1/$FICHIER_RES"/*.res; do
        [ -e "$file" ] || continue
        base=$(basename -- "$file" .res)
        if [ -f "$1/$FICHIER_RES/$base.treil" ]; then
            "$ROOT/script/MakeListForGVAlign.pl" "$file" "$GVALIGN"
        fi
    done

    for file in "$GVALIGN"/*.gvalign; do
        [ -e "$file" ] || continue
        base=$(basename -- "$file" .gvalign)

        # Type tag taken from the file name: 2nd ':' field, everything after
        # its first '#', with the first run of digits removed.
        type=$(printf '%s\n' "$base" | cut -f2 -d: | cut -f2- -d'#' | sed -e 's/[0-9]\+//')

        case "$type" in
            "M#S") HMM=$mod_ms ;;
            "F#S") HMM=$mod_fs ;;
            "M#T") HMM=$mod_mt ;;
            "F#T") HMM=$mod_ft ;;
            # Unknown type: do not reuse the model of the previous file.
            *)     HMM="" ;;
        esac

        # The gvalign call is disabled; only an empty likelihood file is
        # created for each .gvalign file. Disabled command kept for reference:
        #$ROOT/bin/gvalign.old $HMM $PHON $file -e $1/${REP_PLP}/ -f .plp -r $GVALIGN -g .gv -C FAST -W $GVCTM  -O CTM  -s $SEGCTM | sed -e 's/Decoding/\nDecoding/g' > $LIKELIHOOD/${base}.likelihood
        touch -- "$LIKELIHOOD/${base}.likelihood"
    done
fi

#--------------------------------------------------------------------------------------------------------------------------------
# STEP 5 - Merge all the computed scores => res (ctm) with the scores/params used by the classification
# Format :
#   mot    NbNode MinNode MaxNode MeanNode VarNode SVarNode Posterior AcousticLogLikelihood AcousticLogLikelihood/Frame ...
#	   AcousticConfidenceLikelihood AcousticConstraintLikeLihood AcousticNoConstraint Likelihood ClasseRepliLinguistique ...
#          RepliLinguistique LogLinguistique LogUnigramme NbMotsFenetre NbNulNode NbTrame
#---------------------------------------------------------------------------------------------------------------------------------
# Runs only when EXTRACT is 1; an unset flag simply skips the step.
if [[ "${EXTRACT:-0}" == 1 ]]; then
    echo "EXTRACT step..."
    rm -f -- "$SUPER_CTM"/* > /dev/null 2>&1

    for file in "$1/$FICHIER_RES"/*.res; do
        [ -e "$file" ] || continue
        base=$(basename -- "$file" .res)
        # The likelihood file is named after the part of the base name that
        # precedes its first dot.
        like=${base%%.*}
        if [ -f "$1/$FICHIER_RES/$base.treil" ]; then
            # TYPE_ML is passed only when it is set, so an empty value does
            # not become an empty trailing argument.
            extract_cmd=("$ROOT/script/ExtractData.pl" "$pathML" "$nameML" "$POS/${base}.pos2" "$file" "$LIKELIHOOD/${like}.likelihood" "$MLCLASS/${base}.mlclass" ${TYPE_ML:+"$TYPE_ML"})
            # Log exactly the command that is run.
            echo "${extract_cmd[*]} >  $SUPER_CTM/${base}.ctm"
            "${extract_cmd[@]}" > "$SUPER_CTM/${base}.ctm"
        fi
    done
fi

#----------------------------------------------------------------
# STEP 6 - Actual computation of the confidence score of each word
#----------------------------------------------------------------
# Runs only when BOOST is 1; an unset flag simply skips the step.
if [[ "${BOOST:-0}" == 1 ]]; then
    echo "BOOST step..."
    # Common prefix of every intermediate file of this step.
    out_prefix=$CONF_DIR/$NAME
    rm -f -- "$SCORED_CTM"/* "$out_prefix.sctm" "$out_prefix".boost* "$out_prefix".resboost* "$out_prefix".corres* > /dev/null 2>&1

    # Used for the test without labels
    "$ROOT/script/DissociateErroneousFromDecoded.pl" "$SUPER_CTM" 2 equilibre > "$out_prefix.sctm"

    "$ROOT/script/ConvertSuperCTMtoDataSVM.pl" "$out_prefix.sctm" boost 2 0 0 > "$out_prefix.boost"
    "$ROOT/script/ConvertSuperCTMtoDataSVM.pl" "$out_prefix.sctm" boost 2 0 1 > "$out_prefix.boost_refs"

    "$BOOST_BIN" -S "$ROOT/TRAIN" -C --posteriors < "$out_prefix.boost" > "$out_prefix.resboost"

    # Keep the 4th space-separated field of the classifier output.
    cut -f4 -d" " -- "$out_prefix.resboost" > "$out_prefix.resboost2"

    # Keep what follows "ref=" on every line.
    sed -e 's/.*ref=//' -- "$out_prefix.boost_refs" > "$out_prefix.corres"

    # Put both side by side and point the first ".ctm" of each line to ".res".
    paste -- "$out_prefix.corres" "$out_prefix.resboost2" | sed -e 's/\.ctm/\.res/' > "$out_prefix.corres2"

    "$ROOT/script/AssociateScoreToCtm.pl" "$out_prefix.corres2" "$1/$FICHIER_RES/" "$SCORED_CTM/"
fi
echo "END"