Blame view

main_tools/ExploitConfidencePass.sh 18.7 KB
e6be5137b   Jean-François Rey   reinitialized pro...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
  #!/bin/bash
  
  #####################################################
  # File :    ExploitConfidencePass.sh                #
  # Brief :   Exploit the ASR confidence pass to :    #
  #           -> boost the confident zone             #
  #           -> find alternative in non confident zone
  #           -> dynamically extend the lexicon       #
  # Author :  Jean-François Rey                       #
  #	        (based on Emmanuel Ferreira             #
  #	        and Hugo Mauchrétien works)             #
  # Version : 1.0                                     #
  # Date :    25/06/13                                #
  #####################################################
f37e72eaf   Jean-François Rey   up
15
  echo "### ExploitConfidencePass.sh ###"
e6be5137b   Jean-François Rey   reinitialized pro...
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
  # Resolve OTMEDIA_HOME: when the caller did not provide it, derive it from
  # this script's real location (parent of the directory containing the
  # script) and export it.
  # All expansions are quoted so an install path containing spaces or glob
  # characters is handled as a single word.
  if [ -z "${OTMEDIA_HOME}" ]
  then
      OTMEDIA_HOME=$(dirname "$(dirname "$(readlink -e "$0")")")
      export OTMEDIA_HOME
  fi

  # Directory holding ExploitConfidencePass.sh itself
  MAIN_SCRIPT_PATH=$(dirname "$(readlink -e "$0")")

  # Helper scripts directory; can be overridden through SCRIPT_PATH
  if [ -z "${SCRIPT_PATH}" ]
  then
      SCRIPT_PATH="$OTMEDIA_HOME/tools/scripts"
  fi

  # Include scripts (sourced into the current shell)
  . "$SCRIPT_PATH/Tools.sh"
  . "$SCRIPT_PATH/CheckExploitConfPass.sh"

  # Load the pass configuration; it is mandatory, so abort when it is missing
  EXPLOITCONFIDENCEPASS_CONFIG_FILE="$OTMEDIA_HOME/cfg/ExploitConfidencePass.cfg"
  if [ -e "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" ]
  then
      . "$EXPLOITCONFIDENCEPASS_CONFIG_FILE"
  else
      echo "ERROR : Can't find configuration file $EXPLOITCONFIDENCEPASS_CONFIG_FILE" >&2
      exit 1
  fi
  
  #---------------#
  # Parse Options #
  #---------------#
665a8dac3   Jean-François Rey   ! follow the whit...
48
  while getopts ":hDv:cr" opt
e6be5137b   Jean-François Rey   reinitialized pro...
49
50
51
52
53
54
55
56
57
58
  do
  	case $opt in
  		h)
  			echo -e "$0 [OPTIONS] <INPUT_DIRECTORY>
  "
              echo -e "\t Options:"
              echo -e "\t\t-h :\tprint this message"
              echo -e "\t\t-D :\tDEBUG mode on"
              echo -e "\t\t-v l :\tVerbose mode, l=(1|2|3) level mode"
              echo -e "\t\t-c :\tCheck process, stop if error detected"
e6be5137b   Jean-François Rey   reinitialized pro...
59
60
61
62
63
64
65
66
67
68
69
70
              # -r is a plain flag: the getopts spec declares it without an argument
              echo -e "\t\t-r :\tforce rerun without deleting files"
  			exit 1
  			;;
  		D)
  			DEBUG=1
  			;;
          v)
              VERBOSE=$OPTARG
              ;;
          c)
              CHECK=1
              ;;
e6be5137b   Jean-François Rey   reinitialized pro...
71
72
73
74
          r)
              RERUN=1
              ;;
  		:)
d7e9e4b9d   Jean-François Rey   update bugfix stderr
75
  			echo "Option -$OPTARG requires an argument." >&2
e6be5137b   Jean-François Rey   reinitialized pro...
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
  			exit 1
  			;;
  		\?)
  			# Unknown option: report it on stderr like the missing-argument case.
  			# Processing deliberately continues (the exit stays disabled).
  			echo "BAD USAGE : unknown option -$OPTARG" >&2
  			#exit 1
  			;;
  	esac
  done
  
  # Debug mode: trace every command when -D was given.
  # DEBUG defaults to 0 so the numeric test does not fail with
  # "unary operator expected" when nothing has set the variable.
  if [ "${DEBUG:-0}" -eq 1 ]
  then
      set -x
      echo -e "## Mode DEBUG ON ##"
  fi
  
  # mode verbose enable
1fd315c89   Jean-François Rey   add Extract audio...
93
  # VERBOSE defaults to 0 so the comparison stays valid when the variable is unset
  if [ "${VERBOSE:-0}" -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ;fi
e6be5137b   Jean-François Rey   reinitialized pro...
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
  
  # Exactly one positional argument (the input directory) must remain once
  # the options consumed by getopts are discounted.
  if [ $(($# - (OPTIND - 1))) -ne 1 ]
  then
      # Usage errors are diagnostics, so they go to stderr
      echo "BAD USAGE : ExploitConfidencePass.sh [OPTIONS] <INPUT_DIRECTORY>" >&2
      echo "$0 -h for more info" >&2
      exit 1
  fi

  # Drop the parsed options so that $1 is the input directory
  shift $((OPTIND-1))
  # The path is quoted: unquoted, an empty or space-containing argument made
  # the test malformed and the existence check was skipped.
  if [ ! -e "$1" ]
  then
      print_error "can't open $1"
      exit 1
  fi
561670acc   Jean-François Rey   remove output red...
110
  # BASENAME is only assigned in the GLOBAL VARS section further down, so the
  # log tag is computed directly from the input directory here.
  print_info "[$(basename "$(readlink -e "$1")")] => ExploitConfPass start | $(date +'%d/%m/%y %H:%M:%S')" 1
e6be5137b   Jean-François Rey   reinitialized pro...
111
112
113
114
115
116
117
118
119
120
  #-------------#
  # GLOBAL VARS #
  #-------------#
  # Input and output share the same directory: everything this pass produces
  # is written below the canonicalised input directory.
  # "$1" is quoted so a directory name containing spaces stays one argument.
  INPUT_DIR=$(readlink -e "$1")
  OUTPUT_DIR=$INPUT_DIR
  # Name of the processed directory; used as tag in log messages and as the
  # stem of the intermediate files
  BASENAME=$(basename "$OUTPUT_DIR")
  SHOW_DIR="$OUTPUT_DIR/shows/"
  SOLR_RES="$OUTPUT_DIR/solr/"
  EXT_LEX="$OUTPUT_DIR/LEX/"
  TRIGGER_CONFZONE="$OUTPUT_DIR/trigg/"
b427f103e   Jean-François Rey   update log info p...
121
122
  LOGFILE="$OUTPUT_DIR/info_exploitconf.log"
  ERRORFILE="$OUTPUT_DIR/error_exploitconf.log"
e6be5137b   Jean-François Rey   reinitialized pro...
123
124
125
126
127
128
129
  
  CONFPASS_CONFIG_FILE="$(readlink -e $1)/ConfPass.cfg"
  if [ -e $CONFPASS_CONFIG_FILE ]
  then
  {
      RES_CONF_DIR=$(cat $CONFPASS_CONFIG_FILE | grep "^RES_CONF_DIR=" | cut -f2 -d"=")
      RES_CONF=$(cat $CONFPASS_CONFIG_FILE | grep "^CONF_DIR=" | cut -f2 -d"=")
7e99f0793   Jean-François Rey   up
130
      print_info "[${BASENAME}] Use confidence measure from : $RES_CONF" 2
e6be5137b   Jean-François Rey   reinitialized pro...
131
132
133
  }
  else
  {
7e99f0793   Jean-François Rey   up
134
135
      print_error "[${BASENAME}] Can't find $CONFPASS_CONFIG_FILE"
      print_error "[${BASENAME}] -> use res_p2"
e6be5137b   Jean-François Rey   reinitialized pro...
136
137
138
139
      RES_CONF_DIR="$INPUT_DIR/conf/res_p2/scored_ctm"
      RES_CONF="$INPUT_DIR/conf/res_p2"
  }
  fi
1fd315c89   Jean-François Rey   add Extract audio...
140
141
142
143
  # Create the four working sub-directories; any mkdir output is discarded
  for workdir in $SHOW_DIR $SOLR_RES $EXT_LEX $TRIGGER_CONFZONE
  do
      mkdir -p $workdir > /dev/null 2>&1
  done
e6be5137b   Jean-François Rey   reinitialized pro...
144
145
146
147
148
  
  #------------------#
  # Create Workspace #
  #------------------#
  # Lock directory
7e99f0793   Jean-François Rey   up
149
150
151
152
153
  # Refuse to run when a previous run left its lock, unless -r was given.
  # The lock is created under $OUTPUT_DIR (see the touch just below), so the
  # same path is tested here; the variable used before was never assigned in
  # this script and the check could not trigger.
  # RERUN defaults to 0 so the comparison stays valid when it is unset.
  if [ -e "$OUTPUT_DIR/EXPLOITCONFPASS.lock" ] && [ "${RERUN:-0}" -eq 0 ]
  then
      print_warn "[${BASENAME}] ExploitConfidencePass is locked -> exit" 2
      exit 1
  fi
e6be5137b   Jean-François Rey   reinitialized pro...
154
155
  rm "$OUTPUT_DIR/EXPLOITCONFPASS.unlock" > /dev/null 2>&1
  touch "$OUTPUT_DIR/EXPLOITCONFPASS.lock" > /dev/null 2>&1
7c5273953   Jean-François Rey   up
156
  rm $LOGFILE $ERRORFILE 2>/dev/null
e6be5137b   Jean-François Rey   reinitialized pro...
157
158
159
160
161
162
163
164
  #------#
  # Save #
  #------#
  # Snapshot the configuration used for this run next to the results and
  # append the locations produced by this pass.
  # The lexicon name is derived here because these entries are written before
  # the later lexname assignment; without it the saved lexicon paths lacked
  # the lexicon name.
  lexname=$(basename "$LEXICON")
  cp "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" "$OUTPUT_DIR/ExploitConfPass.cfg"
  echo "TRIGGER_DIR=$TRIGGER_CONFZONE" >> "$OUTPUT_DIR/ExploitConfPass.cfg"
  echo "TRIGGER_SPEERAL=$TRIGGER_CONFZONE/speeral/" >> "$OUTPUT_DIR/ExploitConfPass.cfg"
  echo "LEX_SPEERAL=$EXT_LEX/speeral/${lexname}_ext" >> "$OUTPUT_DIR/ExploitConfPass.cfg"
  echo "LEX_BINODE_SPEERAL=$EXT_LEX/speeral/${lexname}_ext.bin" >> "$OUTPUT_DIR/ExploitConfPass.cfg"
7e99f0793   Jean-François Rey   up
165
  # Report the real location of the saved configuration (it is written under $OUTPUT_DIR)
  print_info "[${BASENAME}] Save config in $OUTPUT_DIR/ExploitConfPass.cfg" 1
e6be5137b   Jean-François Rey   reinitialized pro...
166

561670acc   Jean-François Rey   remove output red...
167
168
169
  #---------------#
  # Check Pass    #
  #---------------#
c388b40c7   Jean-François Rey   :D:D
170
  if [ $( ls ${RES_CONF_DIR}/*.res 2> /dev/null | wc -l) -eq 0 ]
b427f103e   Jean-François Rey   update log info p...
171
172
  then
      print_error "[${BASENAME}] No Conf Pass res -> exit ExploitConfPass"
4e81bd46e   Jean-François Rey   up :D
173
      if [ $CHECK -eq 1 ]; then print_log_file $ERRORFILE "No ConfPass res in ${RES_CONF_DIR}" ;fi
b427f103e   Jean-François Rey   update log info p...
174
175
      exit 1
  fi
561670acc   Jean-François Rey   remove output red...
176

e6be5137b   Jean-François Rey   reinitialized pro...
177
178
179
180
181
182
183
184
  #-----------------------#
  # Segmentation by show  #
  #-----------------------#
  # create txt file from scored res
  # tag pos and lemmatization of the txt file
  # merge the scored res and taglem file
  # segment using the last generated file
  # and create a ctm file by show
561670acc   Jean-François Rey   remove output red...
185
  print_info "[${BASENAME}] Segmentation by show" 1
e6be5137b   Jean-François Rey   reinitialized pro...
186
187
  
  # -> to txt
b427f103e   Jean-François Rey   update log info p...
188
  print_info "[${BASENAME}] Create txt from scored res" 3
e6be5137b   Jean-François Rey   reinitialized pro...
189
190
191
192
193
  cat ${RES_CONF_DIR}/*.res > $INPUT_DIR/$BASENAME.sctm
  cat $INPUT_DIR/$BASENAME.seg | $SIGMUND_BIN/myConvert.pl $INPUT_DIR/$BASENAME.sctm $INPUT_DIR/$BASENAME.tmp
  cat $INPUT_DIR/$BASENAME.tmp | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | sed -e "s/_/ /g" | sort -nt 'n' -k '2' > $INPUT_DIR/$BASENAME.txt
  
  # -> to tagger + lemme
b427f103e   Jean-François Rey   update log info p...
194
  print_info "[${BASENAME}] Tag pos and lem in txt file" 3
e6be5137b   Jean-François Rey   reinitialized pro...
195
196
197
198
  iconv -t ISO_8859-1 $INPUT_DIR/$BASENAME.txt > $INPUT_DIR/$BASENAME.tmp
  $SIGMUND_BIN/txt2lem.sh $INPUT_DIR/$BASENAME.tmp $INPUT_DIR/$BASENAME.taglem
  
  # merge sctm and taglem
b427f103e   Jean-François Rey   update log info p...
199
  print_info "[${BASENAME}] Merge scored ctm with tag pos and lem file" 3
e6be5137b   Jean-François Rey   reinitialized pro...
200
201
202
  cat $INPUT_DIR/$BASENAME.sctm | $SCRIPT_PATH/BdlexUC.pl ${RULES}/basic -f | iconv -t ISO_8859-1 | $SCRIPT_PATH/scoredCtmAndTaggedLem2All.pl $INPUT_DIR/$BASENAME.taglem > $INPUT_DIR/$BASENAME.ctl
  
  # -> new seg
b427f103e   Jean-François Rey   update log info p...
203
  print_info "[${BASENAME}] Create xml file and run Topic Seg" 3
e6be5137b   Jean-François Rey   reinitialized pro...
204
205
206
207
208
209
210
211
212
213
214
  $SIGMUND_BIN/tagLem2xml.pl $INPUT_DIR/$BASENAME.taglem $INPUT_DIR/$BASENAME.doc.xml
  rm $INPUT_DIR/$BASENAME.tmp #$INPUT_DIR/$BASENAME.taglem 
  
  # Lia_topic_seg : bring together sentences into show
  # The XML document is copied to ./0.xml, i.e. into the current working
  # directory rather than $INPUT_DIR; presumably the Java `Test` class reads
  # that fixed file name — TODO confirm.
  # NOTE(review): two runs started from the same working directory would
  # overwrite each other's 0.xml.
  cp $INPUT_DIR/$BASENAME.doc.xml 0.xml
  # Raw segmenter output is captured from stdout
  java -cp $LIATOPICSEG/bin Test > $INPUT_DIR/show.seg
  # toSegEmiss.pl receives it on stdin along with the target .show.seg path
  cat $INPUT_DIR/show.seg | $SIGMUND_BIN/toSegEmiss.pl $INPUT_DIR/$BASENAME.show.seg
  # Remove the intermediate files
  rm 0.xml $INPUT_DIR/show.seg
  
  if [ $CHECK -eq 1 ]
  then
b427f103e   Jean-François Rey   update log info p...
215
216
217
218
219
220
      if [ ! -s $INPUT_DIR/$BASENAME.show.seg ]
      then
          print_error "[${BASENAME}] No Topic segmentation ! "
          print_error "[${BASENAME}] Check $ERRORFILE "
          print_log_file "$ERRORFILE" "No Topic segmentation in ${BASENAME}.show.seg"
      fi
e6be5137b   Jean-François Rey   reinitialized pro...
221
222
223
  fi
  
  # Segment ctm into several show files and create a seg list by show
b427f103e   Jean-François Rey   update log info p...
224
  print_info "[${BASENAME}] Segment ctm into show files and a seg list by show" 1
561670acc   Jean-François Rey   remove output red...
225
  $SCRIPT_PATH/ctm2show.pl $INPUT_DIR/$BASENAME.ctl $INPUT_DIR/$BASENAME.show.seg $SHOW_DIR
e6be5137b   Jean-François Rey   reinitialized pro...
226
227
228
229
230
231
232
233
234
  
  #-----------------------------------------------------------#
  # SOLR QUERIES                                              #
  # -> Create Confidente Word                                 #
  #   Keep conf words and use Tags                            #
  # -> Query SOLR (document & multimedia)                     #
  #   concat word + add date 2 day before and after the show  #
  #   query document & multimedia                             #
  #-----------------------------------------------------------#
b427f103e   Jean-François Rey   update log info p...
235
  print_info "[${BASENAME}] Create SOLR queries and ask SOLR" 1
e6be5137b   Jean-François Rey   reinitialized pro...
236
237
238
239
240
241
242
243
244
  for show in $(ls $SHOW_DIR/*.ctm)
  do
      bn=$(basename $show .ctm)
      # Remove words with low confidence and keep useful tagger words.
      # Extended regex (-E) is required: in a basic regex "{3,5}" is literal
      # text, so the X-prefixed tag alternative could never match.
      cat "$show" | "$SCRIPT_PATH/KeepConfZone.pl" | grep -E "MOTINC|NMS|NMP|NFS|NFP|X[A-Z]{3,5}" | cut -f3 -d' ' > "$SHOW_DIR/$bn.confzone"
      # Get date 2 day before and after the show
      datePattern=`$SCRIPT_PATH/daybefore2after.sh $(echo $BASENAME | cut -c1-6)`
      # Create SOLR queries
      cat $SHOW_DIR/$bn".confzone" | $SCRIPT_PATH/GenerateSOLRQueries.pl | iconv -f ISO_8859-1 -t UTF-8 > "$SHOW_DIR/$bn.queries"
e6be5137b   Jean-François Rey   reinitialized pro...
245
246
      # Ask SOLR DB
      if [ $(wc -w "$SHOW_DIR/$bn.queries" | cut -f1 -d' ') -gt 0 ]; then
668cac4d1   Jean-François Rey   check if solr qui...
247
248
          query=$(cat $SHOW_DIR/$bn.queries)"&fq=docDate:[$datePattern]"
          echo $query > $SHOW_DIR/$bn.queries
4188f35cd   Jean-François Rey   update
249
          print_info "python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp" 3
561670acc   Jean-François Rey   remove output red...
250
          python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp
e6be5137b   Jean-François Rey   reinitialized pro...
251
252
          cat $SOLR_RES/$bn.keywords.tmp | sort -u > $SOLR_RES/$bn.keywords
          cat $SOLR_RES/$bn.txt.tmp | sort -u > $SOLR_RES/$bn.txt
1fd315c89   Jean-François Rey   add Extract audio...
253
          rm $SOLR_RES/*.tmp > /dev/null 2>&1
e6be5137b   Jean-François Rey   reinitialized pro...
254
255
256
257
258
259
      fi
  
      if [ $CHECK -eq 1 ]
      then
          if [ ! -e $SOLR_RES/$bn.keywords ] || [ ! -e $SOLR_RES/$bn.txt ]
          then
b427f103e   Jean-François Rey   update log info p...
260
261
262
263
              print_warn "$bn.keywords and $bn.txt are empty !
  Maybe SOLR server is down !" 2
              print_log_file "$LOGFILE" "$bn.keywords and $bn.txt are empty !
  Maybe SOLR server is down !"
e6be5137b   Jean-François Rey   reinitialized pro...
264
265
266
267
268
269
270
271
272
273
274
275
276
277
          fi
      fi
  
  done
  
  #----------------------------------------------------------------------------------------------- 
  #  Build trigger file
  #       1) keywords are automatically boosted in the non confident zone of the current res
  #          confident zone are boosted
  #          previous words in sensible zone are penalized
  #       2) OOVs are extracted + phonetized
  #       3) Try to find OOVs acoustically in the current segment
  #       4) Generate the .trigg file
  #------------------------------------------------------------------------------------------------
561670acc   Jean-François Rey   remove output red...
278
  print_info "[${BASENAME}] Build trigger files" 1
e6be5137b   Jean-François Rey   reinitialized pro...
279
280
281
282
283
284
285
286
  for i in `ls $SOLR_RES/*.keywords`
  do
      basename=`basename $i .keywords`
  
      #
      # Tokenize & produce coverage report
      # Use filter you need
      #
b427f103e   Jean-François Rey   update log info p...
287
      print_info "[${BASENAME}] keywords filtering and produce coverage report" 3 
e6be5137b   Jean-François Rey   reinitialized pro...
288
289
290
291
292
293
294
295
296
297
298
      # Default filter
      cat $i | $SCRIPT_PATH/CleanFilter.sh | ${SCRIPT_PATH}/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t |\
          $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok
      # do less filter
      #cat $i | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok
      
  
      #
      # Extract "real" OOV and phonetize them 
      # -> small custom filter to avoid too much noise
      #
b427f103e   Jean-François Rey   update log info p...
299
      print_info "[${BASENAME}] Extract OOV and phonetize them" 3
e6be5137b   Jean-François Rey   reinitialized pro...
300
301
302
303
304
      ${SCRIPT_PATH}/FindNormRules.pl $SOLR_RES/${basename}_tmp_report/report.oov $LEXICON.bdlex_tok | cut -f3 | grep -v "#" | grep -v "^[A-Z]\+$" | grep -v "^[0-9]" | grep --perl-regex -v "^([a-z']){1,3}$" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | iconv -t ISO_8859-1 -f UTF-8 | ${LIA_LTBOX}/lia_phon/script/lia_lex2phon_variante | grep -v "core dumped" | cut -d"[" -f1 | sort -u | ${SCRIPT_PATH}/PhonFormatter.pl | iconv -f ISO_8859-1 -t UTF-8 | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $SOLR_RES/${basename}.phon_oov
  
      #
      # Search INVOC & OOV in the current lattice
      #
b427f103e   Jean-François Rey   update log info p...
305
      print_info "[${BASENAME}] Search INVOC and OOV in the current lattice" 3
e6be5137b   Jean-François Rey   reinitialized pro...
306
307
308
309
310
311
      cat $SOLR_RES/${basename}_tmp_report/report.invoc | grep -v "\b0" | cut -f1 | grep -v --perl-regex -v "^[a-zA-Z']{1,3}$" | grep -v --perl-regex "^[a-zA-Z0-9]{1,3}$" | grep -v "<s>" | grep -v "</s>" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $TRIGGER_CONFZONE/$basename.tosearch
      cat $SOLR_RES/${basename}.phon_oov | cut -f1 >>  $TRIGGER_CONFZONE/$basename.tosearch
      
      # For each word lattice ("treillis") of the show
      for baseseg in $(cat "$SHOW_DIR/$basename.lst")
      do
1fd315c89   Jean-François Rey   add Extract audio...
312
          # Run acousticFinder on this segment's lattice; its stdout becomes
          # <segment>.acousticlyfound. The former trailing $OUTPUT_REDIRECTION
          # was dropped: a redirection stored in a variable is not re-parsed
          # by the shell, so its words would have been handed to
          # acousticFinder as extra arguments.
          "$OTMEDIA_HOME/tools/QUOTE_FINDER/bin/acousticFinder" "${LEXICON}.speer_phon" "$RES_CONF/wlat/$baseseg.wlat" "$TRIGGER_CONFZONE/${basename}.tosearch" "$SOLR_RES/$basename.phon_oov" > "$TRIGGER_CONFZONE/$baseseg.acousticlyfound"
e6be5137b   Jean-François Rey   reinitialized pro...
313
314
315
          #
          # Produce the boost file for the next decoding pass
          #
561670acc   Jean-François Rey   remove output red...
316
          print_info "[${BASENAME}] Produce trigg file : $baseseg " 3
e6be5137b   Jean-François Rey   reinitialized pro...
317
318
319
320
321
322
323
324
325
326
327
          cat $RES_CONF_DIR/$baseseg.res | $SCRIPT_PATH/ScoreCtm2trigg.pl $TRIGGER_CONFZONE/$baseseg.acousticlyfound > $TRIGGER_CONFZONE/$baseseg.trigg
      done
  
  done
  
  #----------------------------------------------------------------------------------------------- 
  # Build the extended SPEERAL Lexicon
  #   1) Merge OOVs + LEXICON
  #   1) Related text are collected in order to find the invoc word with maximizing the ppl (LM proba)
  #   2) The current lexicon is extended with all the valid OOVs
  #----------------------------------------------------------------------------------------------- 
561670acc   Jean-François Rey   remove output red...
328
  print_info "[${BASENAME}] Build extended Speeral Lexicon" 1
e6be5137b   Jean-François Rey   reinitialized pro...
329
330
331
332
333
334
  mkdir -p $EXT_LEX/final
  mkdir -p $EXT_LEX/tmp
  mkdir -p $EXT_LEX/tmp/txt
  #
  # Collect the acousticly found oov and their phonetisation 
  #
b427f103e   Jean-François Rey   update log info p...
335
  print_info "[${BASENAME}] Get all OOV and retrieve all phonetisation" 3
e6be5137b   Jean-François Rey   reinitialized pro...
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
  for i in `ls $SOLR_RES/*.phon_oov`
  do
      basename=`basename $i .phon_oov`
  
      rm $EXT_LEX/$basename.acousticlyfound 2> /dev/null
      # list acousticly found for the show
      for baseseg in $(cat "$SHOW_DIR/$basename.lst")
      do
          cat $TRIGGER_CONFZONE/$baseseg.acousticlyfound | cut -f1 | cut -f2 -d"=" >> $EXT_LEX/$basename.acousticlyfound
      done
      cat $EXT_LEX/$basename.acousticlyfound | sort -u > $EXT_LEX/.tmp
      mv $EXT_LEX/.tmp $EXT_LEX/$basename.acousticlyfound
  
      #
      # Extract OOV really added
      #
      cat $SOLR_RES/$basename.phon_oov | cut -f1 | sort -u > $EXT_LEX/$basename.oov
561670acc   Jean-François Rey   remove output red...
353
      $SCRIPT_PATH/intersec.pl $EXT_LEX/$basename.oov $EXT_LEX/$basename.acousticlyfound > $EXT_LEX/$basename.oov_acousticlyfound
e6be5137b   Jean-François Rey   reinitialized pro...
354
355
356
357
358
359
360
361
362
      #
      # Retrieve all phonetisation
      #
      cat $SOLR_RES/${basename}.phon_oov | $SCRIPT_PATH/LexPhonFilter.pl $EXT_LEX/$basename.oov_acousticlyfound > $EXT_LEX/$basename.oov_acousticlyfound_phon
  done
  
  #
  # Merge OOVs and their phonetisation
  #
b427f103e   Jean-François Rey   update log info p...
363
  print_info "[${BASENAME}] Merge OOV and their phonetisation" 3
e6be5137b   Jean-François Rey   reinitialized pro...
364
365
366
  lexname=$(basename $LEXICON)
  cat $EXT_LEX/*.oov_acousticlyfound_phon | sort -u > $EXT_LEX/final/all.oov_acousticlyfound_phon
  cat $EXT_LEX/*.oov_acousticlyfound | sort -u | grep --perl-regex -v "^([a-z']){3}$"  > $EXT_LEX/final/all.oov_acousticlyfound
561670acc   Jean-François Rey   remove output red...
367
  $SCRIPT_PATH/MergeLexicon.pl $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/final/${lexname}_ext.phon
e6be5137b   Jean-François Rey   reinitialized pro...
368
369
370
371
  
  #
  # Collect + clean retrieved txt
  #
561670acc   Jean-François Rey   remove output red...
372
  print_info "[${BASENAME}] Collect and clean SOLR txt answers" 2
e6be5137b   Jean-François Rey   reinitialized pro...
373
374
375
376
377
378
379
380
381
382
383
384
385
  # choose filter
  # default
  cat $SOLR_RES/*.txt | $SCRIPT_PATH/CleanFilter.sh | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $EXT_LEX/final/all.bdlex_txt
  # low filter
  #cat $SOLR_RES/*.txt | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex > $EXT_LEX/final/all.bdlex_txt
  
  #
  # Construct the map file
  #
  # Notes:
  # - Expected format : 
  #   <WORD1_STRING> <CANDIDATE1_STRING> <PHON_1>
  #
b427f103e   Jean-François Rey   update log info p...
386
  print_info "[${BASENAME}] Construct map file" 3
e6be5137b   Jean-François Rey   reinitialized pro...
387
388
389
390
391
392
393
394
395
396
397
398
399
400
  rm -f $EXT_LEX/final/${lexname}_ext.map 2>/dev/null
  rm -f $EXT_LEX/final/${lexname}.unvalid_oov 2>/dev/null
  
  while read oov
  do
      # Strip carriage returns from the word. The sed expression previously
      # here was split across two lines, which sed rejects as an unterminated
      # `s' command (leaving oov empty); presumably it held a literal CR
      # character — TODO confirm against the repository copy.
      # $oov stays unquoted on purpose to keep the original whitespace folding.
      oov=$(echo $oov | tr -d '\r')
      #
      # Obtain the oov's tag
      #
      #oov_tag=`grep --perl-regex "^$oov\t" $DYNAMIC_TAGSTATS/all.tags | cut -f2`
      #
      # Try to collect text containing the oov word
      #
b427f103e   Jean-François Rey   update log info p...
401
      print_info "[${BASENAME}] Collect text containing the oov" 3
e6be5137b   Jean-François Rey   reinitialized pro...
402
403
404
405
      cat $EXT_LEX/final/all.bdlex_txt | grep --perl-regex " $oov " | $SCRIPT_PATH/NbMaxWordsFilter.pl 40 |uniq > $EXT_LEX/tmp/txt/$oov.bdlex_txt
      if [ -f $EXT_LEX/tmp/txt/$oov.bdlex_txt ]; then
          nbWords=`wc -l $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "`
          if [ $nbWords -eq 0 ]; then
b427f103e   Jean-François Rey   update log info p...
406
              print_warn "[${BASENAME}] UNVALID OOV: $oov => $nbWords occurrences" 2 
e6be5137b   Jean-François Rey   reinitialized pro...
407
408
409
410
411
412
              echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
          else
              #
              # Find a candidate in a filtred invoc lexicon => a candidate which maximize the ppl in the overall txt collected
              #
              #echo "$/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $LEXICON.bdlex_tok $EXT_LEX/tmp/txt/$oov.bdlex_txt"
b427f103e   Jean-François Rey   update log info p...
413
              print_info `$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "` 3
e6be5137b   Jean-François Rey   reinitialized pro...
414
              candidate=`$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "`
b427f103e   Jean-François Rey   update log info p...
415
              if [ ! "$candidate" == "" ]; then
e6be5137b   Jean-François Rey   reinitialized pro...
416
417
418
419
420
421
422
                  grep --perl-regex "^$oov\t" $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/tmp/$oov.phon
                  while read phonLine
                  do
                      #<word> <phon> => <word> <candidate> <phon> 
                      echo "$phonLine" | sed "s|\t|\t$candidate\t|"  >> $EXT_LEX/final/${lexname}_ext.map
                  done < $EXT_LEX/tmp/$oov.phon
              else
b427f103e   Jean-François Rey   update log info p...
423
                  print_warn "[${BASENAME}] UNVALID OOV: $oov => no availaible Candidate word in LM" 2
e6be5137b   Jean-François Rey   reinitialized pro...
424
425
426
427
                  echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
              fi
          fi
      else
b427f103e   Jean-François Rey   update log info p...
428
          print_warn "[${BASENAME}] UNVALID OOV: $oov" 2
e6be5137b   Jean-François Rey   reinitialized pro...
429
430
431
432
433
434
435
436
437
438
439
440
          echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
      fi
  done < $EXT_LEX/final/all.oov_acousticlyfound
  
  #
  ### Speeral 
  #
  
  lexname=`basename $LEXICON`
  #
  # Build the final trigger file
  #
b427f103e   Jean-François Rey   update log info p...
441
  print_info "[${BASENAME}] Clean trigg files" 3
e6be5137b   Jean-François Rey   reinitialized pro...
442
443
444
445
446
447
448
449
450
451
  mkdir -p "$TRIGGER_CONFZONE/speeral/" 2> /dev/null
  mkdir -p "$EXT_LEX/speeral/" 2> /dev/null
  # Filter every trigger file against the list of rejected OOVs.
  # The list is written to $EXT_LEX/final/ by the map-file construction loop,
  # so it is read from there (the path used before pointed one level up,
  # where nothing creates it).
  # NOTE(review): the list does not exist when no OOV was rejected; confirm
  # how RemoveLineContaining.pl behaves in that case.
  for i in "$TRIGGER_CONFZONE"/*.trigg
  do
      # An unmatched glob stays literal; skip it instead of parsing ls output
      [ -e "$i" ] || continue
      basename=$(basename "$i" .trigg)
      cat "$i" | "$SCRIPT_PATH/RemoveLineContaining.pl" "$EXT_LEX/final/$lexname.unvalid_oov" > "$TRIGGER_CONFZONE/speeral/$basename.trigg"
  done
  #
  # Compile the speeral extended lexicon
  #
b427f103e   Jean-François Rey   update log info p...
452
453
  print_info "[${BASENAME}] Compile Speeral extended lexicon" 3
  print_info "$SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext" 3
561670acc   Jean-François Rey   remove output red...
454
  $SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext
e6be5137b   Jean-François Rey   reinitialized pro...
455
456
457
458
459
460
  
  if [ $CHECK -eq 1 ]
  then
      check_exploitconfpass_lex_check "${EXT_LEX}/speeral/${lexname}_ext"
      if [ $? -eq 1 ]
      then
b427f103e   Jean-François Rey   update log info p...
461
462
463
464
          print_error "[${BASENAME}] Building Speeral Lexicon $INPUT_DIR -> exit"
          print_error "[${BASENAME}] Check $ERRORFILE"
          print_log_file $ERRORFILE "ERROR : Building Speeral Lexicon $INPUT_DIR"
          print_log_file $ERRORFILE "ERROR : ${EXT_LEX}/speeral/${lexname}_ext Empty after buildmappedbinode ?"
e6be5137b   Jean-François Rey   reinitialized pro...
465
466
467
468
469
470
471
472
473
          exit 1;
      fi
  fi
  
  
  #-------#
  # CLOSE #
  #-------#
  # Seem OK 
b427f103e   Jean-François Rey   update log info p...
474
  print_info "[${BASENAME}] <= ExploitConfidencePass End | $(date +'%d/%m/%y %H:%M:%S')" 1
e6be5137b   Jean-François Rey   reinitialized pro...
475
   
b427f103e   Jean-François Rey   update log info p...
476
  # unlock directory
e6be5137b   Jean-François Rey   reinitialized pro...
477
  mv "$OUTPUT_DIR/EXPLOITCONFPASS.lock" "$OUTPUT_DIR/EXPLOITCONFPASS.unlock"