Jean-François Rey / otmedia

Blame view

main_tools/ExploitConfidencePass.sh 16.8 KB
  #!/bin/bash
  
  #####################################################
  # File :    ExploitConfidencePass.sh                #
  # Brief :   Exploit the ASR confidence pass to :    #
  #           -> boost the confident zone             #
  #           -> find alternative in non confident zone
  #           -> dynamicly extend the lexicon         #
  # Author :  Jean-François Rey                       #
  #	        (base on Emmanuel Ferreira              #
  #	        and Hugo Mauchrétien works)             #
  # Version : 1.0                                     #
  # Date :    25/06/13                                #
  #####################################################
  
  # Check OTMEDIA_HOME env var
  if [ -z ${OTMEDIA_HOME} ]
  then 
      OTMEDIA_HOME=$(dirname $(dirname $(readlink -e $0)))
      export OTMEDIA_HOME=$OTMEDIA_HOME
  fi
  
  # where is ExploitConfidencePass.sh
  MAIN_SCRIPT_PATH=$(dirname $(readlink -e $0))
  
  if [ -z ${SCRIPT_PATH} ]
  then
      SCRIPT_PATH=$OTMEDIA_HOME/tools/scripts
  fi
  
  # Include scripts
  . $SCRIPT_PATH"/Tools.sh"
  . $SCRIPT_PATH"/CheckExploitConfPass.sh"
  
  # where is ExploitConfidencePass.cfg
  EXPLOITCONFIDENCEPASS_CONFIG_FILE=$OTMEDIA_HOME"/cfg/ExploitConfidencePass.cfg"
  if [ -e $EXPLOITCONFIDENCEPASS_CONFIG_FILE ]
  then
  	. $EXPLOITCONFIDENCEPASS_CONFIG_FILE
  else
  	echo "ERROR : Can't find configuration file $EXPLOITCONFIDENCEPASS_CONFIG_FILE" >&2
  	exit 1
  fi
  
  #---------------#
  # Parse Options #
  #---------------#
  while getopts ":hDv:cf:r" opt
  do
  	case $opt in
  		h)
  			echo -e "$0 [OPTIONS] <INPUT_DIRECTORY>
  "
              echo -e "\t Options:"
              echo -e "\t\t-h :\tprint this message"
              echo -e "\t\t-D :\tDEBUG mode on"
              echo -e "\t\t-v l :\tVerbose mode, l=(1|2|3) level mode"
              echo -e "\t\t-c :\tCheck process, stop if error detected"
              echo -e "\t\t-f n :\tspecify a speeral forks number (default 1)"
              echo -e "\t\t-r n :\tforce rerun without deleting files"
  			exit 1
  			;;
  		D)
  			DEBUG=1
  			;;
          v)
              VERBOSE=$OPTARG
              ;;
          c)
              CHECK=1
              ;;
          f)
              FORKS="--forks $OPTARG"
              ;;
          r)
              RERUN=1
              ;;
  		:)
  			echo "Option -$OPTARG requires an argument." >&2
  			exit 1
  			;;
  		\?)
  			echo "BAD USAGE : unknow opton -$OPTARG"
  			#exit 1
  			;;
  	esac
  done
  
  # mode debug enable
  if [ $DEBUG -eq 1 ]
  then
         set -x
         echo -e "## Mode DEBUG ON ##"
  fi
  
  # mode verbose enable
  if [ $VERBOSE -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ; fi
  
  # Check USAGE by arguments number
  if [ $(($#-($OPTIND-1))) -ne 1 ]
  then
  	echo "BAD USAGE : ExploitConfidencePass.sh [OPTIONS] <INPUT_DIRECTORY>"
  	echo "$0 -h for more info"
  	exit 1
  fi
  
  shift $((OPTIND-1))
  # check input directory - first argument
  if [ ! -e $1 ]
  then
      print_error "can't open $1"
      exit 1
  fi
  
  #-------------#
  # GLOBAL VARS #
  #-------------#
  INPUT_DIR=$(readlink -e $1)
  OUTPUT_DIR=$INPUT_DIR 
  BASENAME=$(basename $OUTPUT_DIR)
  SHOW_DIR="$OUTPUT_DIR/shows/"
  SOLR_RES="$OUTPUT_DIR/solr/"
  EXT_LEX="$OUTPUT_DIR/LEX/"
  TRIGGER_CONFZONE="$OUTPUT_DIR/trigg/"
  LOGFILE="$(dirname $OUTPUT_DIR)/info_exploitconf.log"
  ERRORFILE="$(dirname $OUTPUT_DIR)/error_exploitconf.log"
  
  CONFPASS_CONFIG_FILE="$(readlink -e $1)/ConfPass.cfg"
  if [ -e $CONFPASS_CONFIG_FILE ]
  then
  {
      RES_CONF_DIR=$(cat $CONFPASS_CONFIG_FILE | grep "^RES_CONF_DIR=" | cut -f2 -d"=")
      RES_CONF=$(cat $CONFPASS_CONFIG_FILE | grep "^CONF_DIR=" | cut -f2 -d"=")
      print_warn "Use confidence measure from : $RES_CONF" 1
  }
  else
  {
      print_error "Can't find $CONFPASS_CONFIG_FILE" 1
      RES_CONF_DIR="$INPUT_DIR/conf/res_p2/scored_ctm"
      RES_CONF="$INPUT_DIR/conf/res_p2"
  }
  fi
  
  mkdir -p $SHOW_DIR
  mkdir -p $SOLR_RES
  mkdir -p $EXT_LEX
  mkdir -p $TRIGGER_CONFZONE
  
  #------------------#
  # Create Workspace #
  #------------------#
  # Lock directory
  if [ -e "$OUTPUT_DIR_BASENAME/EXPLOITCONFPASS.lock" ] && [ $RERUN -eq 0 ]; then exit 1; fi
  rm "$OUTPUT_DIR/EXPLOITCONFPASS.unlock" > /dev/null 2>&1
  touch "$OUTPUT_DIR/EXPLOITCONFPASS.lock" > /dev/null 2>&1
  
  #------#
  # Save #
  #------#
  cp $EXPLOITCONFIDENCEPASS_CONFIG_FILE $OUTPUT_DIR/ExploitConfPass.cfg
  echo "TRIGGER_DIR=$TRIGGER_CONFZONE" >> $OUTPUT_DIR/ExploitConfPass.cfg
  echo "TRIGGER_SPEERAL=$TRIGGER_CONFZONE/speeral/" >> $OUTPUT_DIR/ExploitConfPass.cfg
  echo "LEX_SPEERAL=$EXT_LEX/speeral/${lexname}_ext" >> $OUTPUT_DIR/ExploitConfPass.cfg
  echo "LEX_BINODE_SPEERAL=$EXT_LEX/speeral/${lexname}_ext.bin" >> $OUTPUT_DIR/ExploitConfPass.cfg
  
  
  #-----------------------#
  # Segmentation by show  #
  #-----------------------#
  # create txt file from scored res
  # tag pos and lemmatization of the txt file
  # merge the scored res and taglem file
  # segment using the last generated file
  # and create a ctm file by show
  
  print_info "Segmentation by show" 1
  
  # -> to txt
  print_info "Create txt from scored res" 2
  cat ${RES_CONF_DIR}/*.res > $INPUT_DIR/$BASENAME.sctm
  cat $INPUT_DIR/$BASENAME.seg | $SIGMUND_BIN/myConvert.pl $INPUT_DIR/$BASENAME.sctm $INPUT_DIR/$BASENAME.tmp
  cat $INPUT_DIR/$BASENAME.tmp | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | sed -e "s/_/ /g" | sort -nt 'n' -k '2' > $INPUT_DIR/$BASENAME.txt
  
  # -> to tagger + lemme
  print_info "Tag pos and lem in txt file" 2
  iconv -t ISO_8859-1 $INPUT_DIR/$BASENAME.txt > $INPUT_DIR/$BASENAME.tmp
  $SIGMUND_BIN/txt2lem.sh $INPUT_DIR/$BASENAME.tmp $INPUT_DIR/$BASENAME.taglem
  
  # merge sctm and taglem
  print_info "Merge scored ctm with tag pos and lem file" 2
  cat $INPUT_DIR/$BASENAME.sctm | $SCRIPT_PATH/BdlexUC.pl ${RULES}/basic -f | iconv -t ISO_8859-1 | $SCRIPT_PATH/scoredCtmAndTaggedLem2All.pl $INPUT_DIR/$BASENAME.taglem > $INPUT_DIR/$BASENAME.ctl
  
  # -> new seg
  print_info "Create xml file and run Topic Seg" 2
  $SIGMUND_BIN/tagLem2xml.pl $INPUT_DIR/$BASENAME.taglem $INPUT_DIR/$BASENAME.doc.xml
  rm $INPUT_DIR/$BASENAME.tmp #$INPUT_DIR/$BASENAME.taglem 
  
  # Lia_topic_seg : bring together sentences into show
  cp $INPUT_DIR/$BASENAME.doc.xml 0.xml
  java -cp $LIATOPICSEG/bin Test > $INPUT_DIR/show.seg
  cat $INPUT_DIR/show.seg | $SIGMUND_BIN/toSegEmiss.pl $INPUT_DIR/$BASENAME.show.seg
  rm 0.xml $INPUT_DIR/show.seg
  
  if [ $CHECK -eq 1 ]
  then
      if [ ! -s $INPUT_DIR/$BASENAME.show.seg ];then echo -e "ERROR : no Topic segmentation" >> $ERRORFILE; fi
  fi
  
  # Segment ctm into several show files and create a seg list by show
  print_info "Segment ctm into show files and a seg list by show" 2
  $SCRIPT_PATH/ctm2show.pl $INPUT_DIR/$BASENAME.ctl $INPUT_DIR/$BASENAME.show.seg $SHOW_DIR
  
  #-----------------------------------------------------------#
  # SOLR QUERIES                                              #
  # -> Create Confidente Word                                 #
  #   Keep conf words and use Tags                            #
  # -> Query SOLR (document & multimedia)                     #
  #   concat word + add date 2 day before and after the show  #
  #   query document & multimedia                             #
  #-----------------------------------------------------------#
  print_info "Create SOLR queries and ASK SOLR" 1
  for show in $(ls $SHOW_DIR/*.ctm)
  do
      bn=$(basename $show .ctm)
      # Remove words with low confidence and keep useful tagger words
      cat $show | $SCRIPT_PATH/KeepConfZone.pl | grep -e "MOTINC\|NMS\|NMP\|NFS\|NFP\|X[A-Z]{3,5}" | cut -f3 -d' ' > "$SHOW_DIR/$bn.confzone"
      # Get date 2 day before and after the show
      datePattern=`$SCRIPT_PATH/daybefore2after.sh $(echo $BASENAME | cut -c1-6)`
      # Create SOLR queries
      cat $SHOW_DIR/$bn".confzone" | $SCRIPT_PATH/GenerateSOLRQueries.pl | iconv -f ISO_8859-1 -t UTF-8 > "$SHOW_DIR/$bn.queries"
      query=$(cat $SHOW_DIR/$bn.queries)"&fq=docDate:[$datePattern]"
      echo $query > $SHOW_DIR/$bn.queries
      # Ask SOLR DB
      if [ $(wc -w "$SHOW_DIR/$bn.queries" | cut -f1 -d' ') -gt 0 ]; then
          python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp
          cat $SOLR_RES/$bn.keywords.tmp | sort -u > $SOLR_RES/$bn.keywords
          cat $SOLR_RES/$bn.txt.tmp | sort -u > $SOLR_RES/$bn.txt
          rm $SOLR_RES/*.tmp
      fi
  
      if [ $CHECK -eq 1 ]
      then
          if [ ! -e $SOLR_RES/$bn.keywords ] || [ ! -e $SOLR_RES/$bn.txt ]
          then
              print_warn "$bn.keywords and $bn.txt are empty !
  Maybe SOLR server is down !" 1
          fi
      fi
  
  done
  
  #----------------------------------------------------------------------------------------------- 
  #  Build trigger file
  #       1) keywords are automatically boosted in the non confident zone of the current res
  #          confident zone are boosted
  #          previous words in sensible zone are penalized
  #       2) OOVs are extracted + phonetized
  #       3) Try to find OOVs acousticly in the current segment
  #       4) Generate the .trigg file
  #------------------------------------------------------------------------------------------------
  print_info "Build trigger files" 1
  for i in `ls $SOLR_RES/*.keywords`
  do
      basename=`basename $i .keywords`
  
      #
      # Tokenize & produce coverage report
      # Use filter you need
      #
      print_info "keywords filtering and produce coverage report" 2
      # Default filter
      cat $i | $SCRIPT_PATH/CleanFilter.sh | ${SCRIPT_PATH}/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t |\
          $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok
      # do less filter
      #cat $i | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok
      
  
      #
      # Extract "real" OOV and phonetize them 
      # -> petit filtrage persoo pour eviter d'avoir trop de bruits
      #
      print_info "Extract OOV and phonetize them" 2
      ${SCRIPT_PATH}/FindNormRules.pl $SOLR_RES/${basename}_tmp_report/report.oov $LEXICON.bdlex_tok | cut -f3 | grep -v "#" | grep -v "^[A-Z]\+$" | grep -v "^[0-9]" | grep --perl-regex -v "^([a-z']){1,3}$" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | iconv -t ISO_8859-1 -f UTF-8 | ${LIA_LTBOX}/lia_phon/script/lia_lex2phon_variante | grep -v "core dumped" | cut -d"[" -f1 | sort -u | ${SCRIPT_PATH}/PhonFormatter.pl | iconv -f ISO_8859-1 -t UTF-8 | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $SOLR_RES/${basename}.phon_oov
  
      #
      # Search INVOC & OOV in the current lattice
      #
      print_info "Search INVOC and OOV in the current lattice" 2
      cat $SOLR_RES/${basename}_tmp_report/report.invoc | grep -v "\b0" | cut -f1 | grep -v --perl-regex -v "^[a-zA-Z']{1,3}$" | grep -v --perl-regex "^[a-zA-Z0-9]{1,3}$" | grep -v "<s>" | grep -v "</s>" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $TRIGGER_CONFZONE/$basename.tosearch
      cat $SOLR_RES/${basename}.phon_oov | cut -f1 >>  $TRIGGER_CONFZONE/$basename.tosearch
      
      # For each treil
      for baseseg in $(cat "$SHOW_DIR/$basename.lst")
      do
          $OTMEDIA_HOME/tools/QUOTE_FINDER/bin/acousticFinder ${LEXICON}.speer_phon $RES_CONF/wlat/$baseseg.wlat $TRIGGER_CONFZONE/${basename}.tosearch $SOLR_RES/$basename.phon_oov > $TRIGGER_CONFZONE/$baseseg.acousticlyfound
          #
          # Produce the boost file for the next decoding pass
          #
          print_info "Produce trigg file : $baseseg " 3
          cat $RES_CONF_DIR/$baseseg.res | $SCRIPT_PATH/ScoreCtm2trigg.pl $TRIGGER_CONFZONE/$baseseg.acousticlyfound > $TRIGGER_CONFZONE/$baseseg.trigg
      done
  
  done
  
  #----------------------------------------------------------------------------------------------- 
  # Build the extended SPEERAL Lexicon
  #   1) Merge OOVs + LEXICON
  #   1) Related text are collected in order to find the invoc word with maximizing the ppl (LM proba)
  #   2) The current lexicon is extended with all the valid OOVs
  #----------------------------------------------------------------------------------------------- 
  print_info "Build extended Speeral Lexicon" 1
  mkdir -p $EXT_LEX/final
  mkdir -p $EXT_LEX/tmp
  mkdir -p $EXT_LEX/tmp/txt
  #
  # Collect the acousticly found oov and their phonetisation 
  #
  print_info "Get all OOV and retrieve all phonetisation" 2
  for i in `ls $SOLR_RES/*.phon_oov`
  do
      basename=`basename $i .phon_oov`
  
      rm $EXT_LEX/$basename.acousticlyfound 2> /dev/null
      # list acousticly found for the show
      for baseseg in $(cat "$SHOW_DIR/$basename.lst")
      do
          cat $TRIGGER_CONFZONE/$baseseg.acousticlyfound | cut -f1 | cut -f2 -d"=" >> $EXT_LEX/$basename.acousticlyfound
      done
      cat $EXT_LEX/$basename.acousticlyfound | sort -u > $EXT_LEX/.tmp
      mv $EXT_LEX/.tmp $EXT_LEX/$basename.acousticlyfound
  
      #
      # Extract OOV really added
      #
      cat $SOLR_RES/$basename.phon_oov | cut -f1 | sort -u > $EXT_LEX/$basename.oov
      $SCRIPT_PATH/intersec.pl $EXT_LEX/$basename.oov $EXT_LEX/$basename.acousticlyfound > $EXT_LEX/$basename.oov_acousticlyfound
      #
      # Retrieve all phonetisation
      #
      cat $SOLR_RES/${basename}.phon_oov | $SCRIPT_PATH/LexPhonFilter.pl $EXT_LEX/$basename.oov_acousticlyfound > $EXT_LEX/$basename.oov_acousticlyfound_phon
  done
  
  #
  # Merge OOVs and their phonetisation
  #
  print_info "Merge OOV and their phonetisation" 2
  lexname=$(basename $LEXICON)
  cat $EXT_LEX/*.oov_acousticlyfound_phon | sort -u > $EXT_LEX/final/all.oov_acousticlyfound_phon
  cat $EXT_LEX/*.oov_acousticlyfound | sort -u | grep --perl-regex -v "^([a-z']){3}$"  > $EXT_LEX/final/all.oov_acousticlyfound
  $SCRIPT_PATH/MergeLexicon.pl $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/final/${lexname}_ext.phon
  
  #
  # Collect + clean retrieved txt
  #
  print_info "Collect and clean SOLR txt answers" 2
  # choose filter
  # default
  cat $SOLR_RES/*.txt | $SCRIPT_PATH/CleanFilter.sh | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $EXT_LEX/final/all.bdlex_txt
  # low filter
  #cat $SOLR_RES/*.txt | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex > $EXT_LEX/final/all.bdlex_txt
  
  #
  # Construct the map file
  #
  # Notes:
  # - Expected format : 
  #   <WORD1_STRING> <CANDIDATE1_STRING> <PHON_1>
  #
  print_info "Construct map file" 2
  rm -f $EXT_LEX/final/${lexname}_ext.map 2>/dev/null
  rm -f $EXT_LEX/final/${lexname}.unvalid_oov 2>/dev/null
  
  while read oov
  do
      oov=`echo $oov | sed "s/
  //g"`
      #
      # Obtain the oov's tag
      #
      #oov_tag=`grep --perl-regex "^$oov\t" $DYNAMIC_TAGSTATS/all.tags | cut -f2`
      #
      # Try to collect text containing the oov word
      #
      cat $EXT_LEX/final/all.bdlex_txt | grep --perl-regex " $oov " | $SCRIPT_PATH/NbMaxWordsFilter.pl 40 |uniq > $EXT_LEX/tmp/txt/$oov.bdlex_txt
      if [ -f $EXT_LEX/tmp/txt/$oov.bdlex_txt ]; then
          nbWords=`wc -l $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "`
          if [ $nbWords -eq 0 ]; then
              echo "UNVALID OOV: $oov => $nbWords occurrences"
              echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
          else
              #
              # Find a candidate in a filtred invoc lexicon => a candidate which maximize the ppl in the overall txt collected
              #
              #echo "$/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $LEXICON.bdlex_tok $EXT_LEX/tmp/txt/$oov.bdlex_txt"
              candidate=`$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "`
              if [ ! x$candidate = "x" ]; then
                  grep --perl-regex "^$oov\t" $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/tmp/$oov.phon
                  while read phonLine
                  do
                      #<word> <phon> => <word> <candidate> <phon> 
                      echo "$phonLine" | sed "s|\t|\t$candidate\t|"  >> $EXT_LEX/final/${lexname}_ext.map
                  done < $EXT_LEX/tmp/$oov.phon
              else
                  echo "UNVALID OOV: $oov => no availaible Candidate word in LM"
                  echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
              fi
          fi
      else
          echo "UNVALID OOV: $oov"
          echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
      fi
  done < $EXT_LEX/final/all.oov_acousticlyfound
  
  #
  ### Speeral 
  #
  
  lexname=`basename $LEXICON`
  #
  # Build the final trigger file
  #
  print_info "Clean trigg files" 2
  mkdir -p $TRIGGER_CONFZONE/speeral/ 2> /dev/null
  mkdir -p $EXT_LEX/speeral/ 2> /dev/null
  for i in `ls $TRIGGER_CONFZONE/*.trigg`
  do
      basename=`basename $i .trigg`
      cat $i | $SCRIPT_PATH/RemoveLineContaining.pl $EXT_LEX/$lexname.unvalid_oov > $TRIGGER_CONFZONE/speeral/$basename.trigg
  done
  #
  # Compile the speeral extended lexicon
  #
  print_info "Compile Speeral extended lexicon" 2
  $SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext
  
  if [ $CHECK -eq 1 ]
  then
      check_exploitconfpass_lex_check "${EXT_LEX}/speeral/${lexname}_ext"
      if [ $? -eq 1 ]
      then
          echo -e "ERROR : Building Speeral Lexicon $INPUT_DIR " >> $ERRORFILE
          exit 1;
      fi
  fi
  
  
  #-------#
  # CLOSE #
  #-------#
  # Seem OK 
  print_info "<= End $BASENAME Solr | $(date +'%d/%m/%y %H:%M:%S')" 1
  echo -e "#Solr $BASENAME " >> $LOGFILE
   
  # unlock directory
  mv "$OUTPUT_DIR/EXPLOITCONFPASS.lock" "$OUTPUT_DIR/EXPLOITCONFPASS.unlock"