Blame view

main_tools/ExploitConfidencePass.sh 17.1 KB
e6be5137b   Jean-François Rey   reinitialized pro...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  #!/bin/bash
  
  #####################################################
  # File :    ExploitConfidencePass.sh                #
  # Brief :   Exploit the ASR confidence pass to :    #
  #           -> boost the confident zone             #
  #           -> find alternative in non confident zone
  #           -> dynamically extend the lexicon       #
  # Author :  Jean-François Rey                       #
  #	        (based on Emmanuel Ferreira             #
  #	        and Hugo Mauchrétien works)             #
  # Version : 1.0                                     #
  # Date :    25/06/13                                #
  #####################################################
  
  # Resolve the installation layout.
  # OTMEDIA_HOME defaults to the grand-parent directory of this script
  # (symlinks resolved by readlink -e); a non-empty value already present in
  # the environment is kept. All expansions are quoted so that an install path
  # containing whitespace is handled correctly.
  if [ -z "${OTMEDIA_HOME}" ]
  then
      OTMEDIA_HOME=$(dirname "$(dirname "$(readlink -e "$0")")")
      export OTMEDIA_HOME
  fi

  # Directory that contains ExploitConfidencePass.sh itself
  MAIN_SCRIPT_PATH=$(dirname "$(readlink -e "$0")")

  # Directory of the helper scripts; a non-empty SCRIPT_PATH overrides it
  if [ -z "${SCRIPT_PATH}" ]
  then
      SCRIPT_PATH="$OTMEDIA_HOME/tools/scripts"
  fi
  
  # Include the helper libraries.
  # NOTE(review): print_info / print_warn / print_error and
  # check_exploitconfpass_lex_check, used later in this script, are presumably
  # defined by these two files - confirm.
  . "$SCRIPT_PATH/Tools.sh"
  . "$SCRIPT_PATH/CheckExploitConfPass.sh"

  # Load the tool configuration; the script cannot run without it.
  EXPLOITCONFIDENCEPASS_CONFIG_FILE="$OTMEDIA_HOME/cfg/ExploitConfidencePass.cfg"
  if [ -e "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" ]
  then
  	. "$EXPLOITCONFIDENCEPASS_CONFIG_FILE"
  else
  	echo "ERROR : Can't find configuration file $EXPLOITCONFIDENCEPASS_CONFIG_FILE" >&2
  	exit 1
  fi
  
  #---------------#
  # Parse Options #
  #---------------#
  # Defaults for the run-time switches. ":=" only assigns when the variable is
  # unset or empty, so a value already provided (e.g. by the environment or a
  # sourced configuration file) is kept. The numeric tests performed later on
  # these variables fail with "unary operator expected" when they are unset.
  : "${DEBUG:=0}" "${VERBOSE:=0}" "${CHECK:=0}" "${RERUN:=0}"

  # Option string: leading ":" selects silent error reporting, so missing
  # arguments arrive as ":" and unknown options as "?" with the offending
  # letter in OPTARG. Only -v and -f take an argument.
  while getopts ":hDv:cf:r" opt
  do
  	case "$opt" in
  		h)
  			echo -e "$0 [OPTIONS] <INPUT_DIRECTORY>\n"
  			echo -e "\t Options:"
  			echo -e "\t\t-h :\tprint this message"
  			echo -e "\t\t-D :\tDEBUG mode on"
  			echo -e "\t\t-v l :\tVerbose mode, l=(1|2|3) level mode"
  			echo -e "\t\t-c :\tCheck process, stop if error detected"
  			echo -e "\t\t-f n :\tspecify a speeral forks number (default 1)"
  			echo -e "\t\t-r :\tforce rerun without deleting files"
  			exit 1
  			;;
  		D)
  			DEBUG=1
  			;;
  		v)
  			VERBOSE=$OPTARG
  			;;
  		c)
  			CHECK=1
  			;;
  		f)
  			# stored as a ready-to-use command-line fragment
  			FORKS="--forks $OPTARG"
  			;;
  		r)
  			RERUN=1
  			;;
  		:)
  			echo "Option -$OPTARG requires an argument." >&2
  			exit 1
  			;;
  		\?)
  			# Unknown options are reported but deliberately not fatal
  			echo "BAD USAGE : unknown option -$OPTARG" >&2
  			;;
  	esac
  done
  
  # mode debug enable
  if [ $DEBUG -eq 1 ]
  then
         set -x
         echo -e "## Mode DEBUG ON ##"
1fd315c89   Jean-François Rey   add Extract audio...
94
95
96
         REDIRECTION_OUTPUT=""
     else
         REDIRECTION_OUTPUT=" 2> /dev/null"
e6be5137b   Jean-François Rey   reinitialized pro...
97
98
99
  fi
  
  # mode verbose enable
1fd315c89   Jean-François Rey   add Extract audio...
100
  if [ $VERBOSE -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ;fi
e6be5137b   Jean-François Rey   reinitialized pro...
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
  
  # Check USAGE by arguments number: exactly one positional argument must
  # remain once the options consumed by getopts (OPTIND-1 words) are removed.
  if [ $(( $# - (OPTIND - 1) )) -ne 1 ]
  then
  	echo "BAD USAGE : ExploitConfidencePass.sh [OPTIONS] <INPUT_DIRECTORY>"
  	echo "$0 -h for more info"
  	exit 1
  fi

  # Drop the parsed options so that $1 is the input directory
  shift $((OPTIND-1))
  # check input directory - first argument.
  # "$1" is quoted: unquoted, a path containing whitespace makes the test
  # itself fail ("too many arguments") and the script would carry on.
  if [ ! -e "$1" ]
  then
      print_error "can't open $1"
      exit 1
  fi
  
  #-------------#
  # GLOBAL VARS #
  #-------------#
  # Everything is produced inside the (symlink-resolved) input directory;
  # the two log files live in its parent directory.
  INPUT_DIR=$(readlink -e "$1")
  OUTPUT_DIR=$INPUT_DIR
  BASENAME=$(basename "$OUTPUT_DIR")
  SHOW_DIR="$OUTPUT_DIR/shows/"
  SOLR_RES="$OUTPUT_DIR/solr/"
  EXT_LEX="$OUTPUT_DIR/LEX/"
  TRIGGER_CONFZONE="$OUTPUT_DIR/trigg/"
  LOGFILE="$(dirname "$OUTPUT_DIR")/info_exploitconf.log"
  ERRORFILE="$(dirname "$OUTPUT_DIR")/error_exploitconf.log"

  # Locate the confidence-pass results: read them from ConfPass.cfg in the
  # input directory when present, otherwise fall back to the conf/res_p2 layout.
  CONFPASS_CONFIG_FILE="$INPUT_DIR/ConfPass.cfg"
  if [ -e "$CONFPASS_CONFIG_FILE" ]
  then
      # Value = 2nd "="-separated field of the matching KEY=value line
      RES_CONF_DIR=$(grep "^RES_CONF_DIR=" "$CONFPASS_CONFIG_FILE" | cut -f2 -d"=")
      RES_CONF=$(grep "^CONF_DIR=" "$CONFPASS_CONFIG_FILE" | cut -f2 -d"=")
      print_warn "Use confidence measure from : $RES_CONF" 1
  else
      # Reported as an error, but the run continues with the default paths
      print_error "Can't find $CONFPASS_CONFIG_FILE" 1
      RES_CONF_DIR="$INPUT_DIR/conf/res_p2/scored_ctm"
      RES_CONF="$INPUT_DIR/conf/res_p2"
  fi
1fd315c89   Jean-François Rey   add Extract audio...
146
147
148
149
  # Create the working directories (no-op when they already exist). Output and
  # errors are discarded, as before; paths are quoted to survive whitespace.
  mkdir -p "$SHOW_DIR" "$SOLR_RES" "$EXT_LEX" "$TRIGGER_CONFZONE" > /dev/null 2>&1
e6be5137b   Jean-François Rey   reinitialized pro...
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
  
  #------------------#
  # Create Workspace #
  #------------------#
  # Lock directory: refuse to start when a lock file is already present in the
  # output directory, unless a rerun was requested (-r sets RERUN=1).
  # The lock is looked up in $OUTPUT_DIR, i.e. where it is created below;
  # the previously tested $OUTPUT_DIR_BASENAME is not defined in this script.
  if [ -e "$OUTPUT_DIR/EXPLOITCONFPASS.lock" ] && [ "${RERUN:-0}" -eq 0 ]
  then
      echo "ERROR : $OUTPUT_DIR is locked (EXPLOITCONFPASS.lock), use -r to force a rerun" >&2
      exit 1
  fi
  rm "$OUTPUT_DIR/EXPLOITCONFPASS.unlock" > /dev/null 2>&1
  touch "$OUTPUT_DIR/EXPLOITCONFPASS.lock" > /dev/null 2>&1
  
  #------#
  # Save #
  #------#
  # Keep a copy of the configuration used for this run in the output directory
  # and append the locations of the generated triggers and extended lexicon.
  # "lexname" is only assigned further down in this script (from
  # basename $LEXICON), so derive it here when it is not set yet; otherwise
  # the two LEX_* entries would end in "/_ext" and "/_ext.bin".
  saved_cfg="$OUTPUT_DIR/ExploitConfPass.cfg"
  saved_lexname=${lexname:-$(basename "$LEXICON")}
  cp "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" "$saved_cfg"
  {
      echo "TRIGGER_DIR=$TRIGGER_CONFZONE"
      echo "TRIGGER_SPEERAL=$TRIGGER_CONFZONE/speeral/"
      echo "LEX_SPEERAL=$EXT_LEX/speeral/${saved_lexname}_ext"
      echo "LEX_BINODE_SPEERAL=$EXT_LEX/speeral/${saved_lexname}_ext.bin"
  } >> "$saved_cfg"
  
  
  #-----------------------#
  # Segmentation by show  #
  #-----------------------#
  # Steps performed below, in order:
  #   1. create a txt file from the scored res files
  #   2. tag pos and lemmatize the txt file
  #   3. merge the scored res and the taglem file
  #   4. run the topic segmentation on the taglem-derived xml
  #   5. (later in the script) cut the merged ctm into one file per show
  # Every intermediate file is written to $INPUT_DIR and named after $BASENAME.
  
  print_info "Segmentation by show" 1
  
  # -> to txt
  print_info "Create txt from scored res" 2
  # Concatenate every scored .res file into a single .sctm file
  cat ${RES_CONF_DIR}/*.res > $INPUT_DIR/$BASENAME.sctm
  # myConvert.pl receives the .seg file on stdin plus the .sctm and .tmp paths;
  # the next command reads .tmp, so it is presumably written here - confirm
  cat $INPUT_DIR/$BASENAME.seg | $SIGMUND_BIN/myConvert.pl $INPUT_DIR/$BASENAME.sctm $INPUT_DIR/$BASENAME.tmp
  # Normalise with BdlexUC.pl, turn "_" into spaces, then sort numerically on
  # the 2nd field using the letter 'n' as field separator
  cat $INPUT_DIR/$BASENAME.tmp | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | sed -e "s/_/ /g" | sort -nt 'n' -k '2' > $INPUT_DIR/$BASENAME.txt
  
  # -> to tagger + lemme
  print_info "Tag pos and lem in txt file" 2
  # Convert to ISO_8859-1 (no -f given: iconv assumes the current locale's
  # encoding as source); .tmp is reused as scratch file
  iconv -t ISO_8859-1 $INPUT_DIR/$BASENAME.txt > $INPUT_DIR/$BASENAME.tmp
  $SIGMUND_BIN/txt2lem.sh $INPUT_DIR/$BASENAME.tmp $INPUT_DIR/$BASENAME.taglem
  
  # merge sctm and taglem
  print_info "Merge scored ctm with tag pos and lem file" 2
  cat $INPUT_DIR/$BASENAME.sctm | $SCRIPT_PATH/BdlexUC.pl ${RULES}/basic -f | iconv -t ISO_8859-1 | $SCRIPT_PATH/scoredCtmAndTaggedLem2All.pl $INPUT_DIR/$BASENAME.taglem > $INPUT_DIR/$BASENAME.ctl
  
  # -> new seg
  print_info "Create xml file and run Topic Seg" 2
  $SIGMUND_BIN/tagLem2xml.pl $INPUT_DIR/$BASENAME.taglem $INPUT_DIR/$BASENAME.doc.xml
  # Only the scratch file is removed; the .taglem file is kept
  rm $INPUT_DIR/$BASENAME.tmp #$INPUT_DIR/$BASENAME.taglem 
  
  # Lia_topic_seg : bring together sentences into show
  # The xml is copied as "0.xml" into the current working directory.
  # NOTE(review): the Java "Test" class presumably reads 0.xml from the
  # current directory - confirm; concurrent runs sharing a CWD would collide.
  cp $INPUT_DIR/$BASENAME.doc.xml 0.xml
  java -cp $LIATOPICSEG/bin Test > $INPUT_DIR/show.seg
  cat $INPUT_DIR/show.seg | $SIGMUND_BIN/toSegEmiss.pl $INPUT_DIR/$BASENAME.show.seg
  rm 0.xml $INPUT_DIR/show.seg
  
  # With CHECK enabled, log an error when the show segmentation file is
  # missing or empty (-s); the script continues either way
  if [ $CHECK -eq 1 ]
  then
      if [ ! -s $INPUT_DIR/$BASENAME.show.seg ];then echo -e "ERROR : no Topic segmentation" >> $ERRORFILE; fi
  fi
  
  # Segment ctm into several show files and create a seg list by show
  print_info "Segment ctm into show files and a seg list by show" 2
1fd315c89   Jean-François Rey   add Extract audio...
213
  $SCRIPT_PATH/ctm2show.pl $INPUT_DIR/$BASENAME.ctl $INPUT_DIR/$BASENAME.show.seg $SHOW_DIR $REDIRECTION_OUTPUT
e6be5137b   Jean-François Rey   reinitialized pro...
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
  
  #-----------------------------------------------------------#
  # SOLR QUERIES                                              #
  # -> Create Confidente Word                                 #
  #   Keep conf words and use Tags                            #
  # -> Query SOLR (document & multimedia)                     #
  #   concat word + add date 2 day before and after the show  #
  #   query document & multimedia                             #
  #-----------------------------------------------------------#
  print_info "Create SOLR queries and ASK SOLR" 1
  for show in $(ls $SHOW_DIR/*.ctm)
  do
      bn=$(basename "$show" .ctm)
      # Remove words with low confidence and keep useful tagger words.
      # The filter is an extended regex (-E): in a basic regex "{3,5}" is
      # matched literally, so the X-prefixed tags followed by 3 to 5 capital
      # letters were never selected by the interval.
      cat "$show" | "$SCRIPT_PATH/KeepConfZone.pl" | grep -E "MOTINC|NMS|NMP|NFS|NFP|X[A-Z]{3,5}" | cut -f3 -d' ' > "$SHOW_DIR/$bn.confzone"
      # Get date 2 day before and after the show; the helper receives the
      # first 6 characters of the directory base name
      datePattern=$("$SCRIPT_PATH/daybefore2after.sh" "$(echo "$BASENAME" | cut -c1-6)")
      # Create SOLR queries (converted from ISO_8859-1 back to UTF-8)
      cat "$SHOW_DIR/$bn.confzone" | "$SCRIPT_PATH/GenerateSOLRQueries.pl" | iconv -f ISO_8859-1 -t UTF-8 > "$SHOW_DIR/$bn.queries"
e6be5137b   Jean-François Rey   reinitialized pro...
233
234
      # Ask SOLR DB
      if [ $(wc -w "$SHOW_DIR/$bn.queries" | cut -f1 -d' ') -gt 0 ]; then
668cac4d1   Jean-François Rey   check if solr qui...
235
236
          query=$(cat $SHOW_DIR/$bn.queries)"&fq=docDate:[$datePattern]"
          echo $query > $SHOW_DIR/$bn.queries
1fd315c89   Jean-François Rey   add Extract audio...
237
          python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp $REDIRECTION_OUTPUT
e6be5137b   Jean-François Rey   reinitialized pro...
238
239
          cat $SOLR_RES/$bn.keywords.tmp | sort -u > $SOLR_RES/$bn.keywords
          cat $SOLR_RES/$bn.txt.tmp | sort -u > $SOLR_RES/$bn.txt
1fd315c89   Jean-François Rey   add Extract audio...
240
          rm $SOLR_RES/*.tmp > /dev/null 2>&1
e6be5137b   Jean-François Rey   reinitialized pro...
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
      fi
  
      if [ $CHECK -eq 1 ]
      then
          if [ ! -e $SOLR_RES/$bn.keywords ] || [ ! -e $SOLR_RES/$bn.txt ]
          then
              print_warn "$bn.keywords and $bn.txt are empty !
  Maybe SOLR server is down !" 1
          fi
      fi
  
  done
  
  #----------------------------------------------------------------------------------------------- 
  #  Build trigger file
  #       1) keywords are automatically boosted in the non confident zone of the current res
  #          confident zone are boosted
  #          previous words in sensible zone are penalized
  #       2) OOVs are extracted + phonetized
  #       3) Try to find OOVs acousticly in the current segment
  #       4) Generate the .trigg file
  #------------------------------------------------------------------------------------------------
  print_info "Build trigger files" 1
  for i in `ls $SOLR_RES/*.keywords`
  do
      basename=`basename $i .keywords`
  
      #
      # Tokenize & produce coverage report
      # Use filter you need
      #
      print_info "keywords filtering and produce coverage report" 2
      # Default filter
      cat $i | $SCRIPT_PATH/CleanFilter.sh | ${SCRIPT_PATH}/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t |\
          $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok
      # do less filter
      #cat $i | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok
      
  
      #
      # Extract "real" OOV and phonetize them 
      # -> petit filtrage persoo pour eviter d'avoir trop de bruits
      #
      print_info "Extract OOV and phonetize them" 2
      ${SCRIPT_PATH}/FindNormRules.pl $SOLR_RES/${basename}_tmp_report/report.oov $LEXICON.bdlex_tok | cut -f3 | grep -v "#" | grep -v "^[A-Z]\+$" | grep -v "^[0-9]" | grep --perl-regex -v "^([a-z']){1,3}$" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | iconv -t ISO_8859-1 -f UTF-8 | ${LIA_LTBOX}/lia_phon/script/lia_lex2phon_variante | grep -v "core dumped" | cut -d"[" -f1 | sort -u | ${SCRIPT_PATH}/PhonFormatter.pl | iconv -f ISO_8859-1 -t UTF-8 | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $SOLR_RES/${basename}.phon_oov
  
      #
      # Search INVOC & OOV in the current lattice
      #
      print_info "Search INVOC and OOV in the current lattice" 2
      cat $SOLR_RES/${basename}_tmp_report/report.invoc | grep -v "\b0" | cut -f1 | grep -v --perl-regex -v "^[a-zA-Z']{1,3}$" | grep -v --perl-regex "^[a-zA-Z0-9]{1,3}$" | grep -v "<s>" | grep -v "</s>" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $TRIGGER_CONFZONE/$basename.tosearch
      cat $SOLR_RES/${basename}.phon_oov | cut -f1 >>  $TRIGGER_CONFZONE/$basename.tosearch
      
      # For each treil
      for baseseg in $(cat "$SHOW_DIR/$basename.lst")
      do
1fd315c89   Jean-François Rey   add Extract audio...
297
          $OTMEDIA_HOME/tools/QUOTE_FINDER/bin/acousticFinder ${LEXICON}.speer_phon $RES_CONF/wlat/$baseseg.wlat $TRIGGER_CONFZONE/${basename}.tosearch $SOLR_RES/$basename.phon_oov > $TRIGGER_CONFZONE/$baseseg.acousticlyfound $OUTPUT_REDIRECTION
e6be5137b   Jean-François Rey   reinitialized pro...
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
          #
          # Produce the boost file for the next decoding pass
          #
          print_info "Produce trigg file : $baseseg " 3
          cat $RES_CONF_DIR/$baseseg.res | $SCRIPT_PATH/ScoreCtm2trigg.pl $TRIGGER_CONFZONE/$baseseg.acousticlyfound > $TRIGGER_CONFZONE/$baseseg.trigg
      done
  
  done
  
  #----------------------------------------------------------------------------------------------- 
  # Build the extended SPEERAL Lexicon
  #   1) Merge OOVs + LEXICON
  #   1) Related text are collected in order to find the invoc word with maximizing the ppl (LM proba)
  #   2) The current lexicon is extended with all the valid OOVs
  #----------------------------------------------------------------------------------------------- 
  print_info "Build extended Speeral Lexicon" 1
  mkdir -p $EXT_LEX/final
  mkdir -p $EXT_LEX/tmp
  mkdir -p $EXT_LEX/tmp/txt
  #
  # Collect the acousticly found oov and their phonetisation 
  #
  print_info "Get all OOV and retrieve all phonetisation" 2
  for i in `ls $SOLR_RES/*.phon_oov`
  do
      basename=`basename $i .phon_oov`
  
      rm $EXT_LEX/$basename.acousticlyfound 2> /dev/null
      # list acousticly found for the show
      for baseseg in $(cat "$SHOW_DIR/$basename.lst")
      do
          cat $TRIGGER_CONFZONE/$baseseg.acousticlyfound | cut -f1 | cut -f2 -d"=" >> $EXT_LEX/$basename.acousticlyfound
      done
      cat $EXT_LEX/$basename.acousticlyfound | sort -u > $EXT_LEX/.tmp
      mv $EXT_LEX/.tmp $EXT_LEX/$basename.acousticlyfound
  
      #
      # Extract OOV really added
      #
      cat $SOLR_RES/$basename.phon_oov | cut -f1 | sort -u > $EXT_LEX/$basename.oov
1fd315c89   Jean-François Rey   add Extract audio...
338
      $SCRIPT_PATH/intersec.pl $EXT_LEX/$basename.oov $EXT_LEX/$basename.acousticlyfound > $EXT_LEX/$basename.oov_acousticlyfound $REDIRECTION_OUTPUT
e6be5137b   Jean-François Rey   reinitialized pro...
339
340
341
342
343
344
345
346
347
348
349
350
351
      #
      # Retrieve all phonetisation
      #
      cat $SOLR_RES/${basename}.phon_oov | $SCRIPT_PATH/LexPhonFilter.pl $EXT_LEX/$basename.oov_acousticlyfound > $EXT_LEX/$basename.oov_acousticlyfound_phon
  done
  
  #
  # Merge OOVs and their phonetisation
  #
  print_info "Merge OOV and their phonetisation" 2
  lexname=$(basename $LEXICON)
  cat $EXT_LEX/*.oov_acousticlyfound_phon | sort -u > $EXT_LEX/final/all.oov_acousticlyfound_phon
  cat $EXT_LEX/*.oov_acousticlyfound | sort -u | grep --perl-regex -v "^([a-z']){3}$"  > $EXT_LEX/final/all.oov_acousticlyfound
1fd315c89   Jean-François Rey   add Extract audio...
352
  $SCRIPT_PATH/MergeLexicon.pl $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/final/${lexname}_ext.phon $REDIRECTION_OUTPUT
e6be5137b   Jean-François Rey   reinitialized pro...
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
  
  #
  # Collect + clean retrieved txt
  #
  print_info "Collect and clean SOLR txt answers" 2
  # choose filter
  # default
  cat $SOLR_RES/*.txt | $SCRIPT_PATH/CleanFilter.sh | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $EXT_LEX/final/all.bdlex_txt
  # low filter
  #cat $SOLR_RES/*.txt | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex > $EXT_LEX/final/all.bdlex_txt
  
  #
  # Construct the map file
  #
  # Notes:
  # - Expected format :
  #   <WORD1_STRING> <CANDIDATE1_STRING> <PHON_1>
  #
  # For every acoustically found OOV:
  #   - collect the retrieved text lines containing it;
  #   - ask getCandidate for an in-vocabulary candidate word;
  #   - on success, append "<oov> TAB <candidate> TAB <phon>" to the map file
  #     for each phonetisation of the OOV;
  #   - otherwise record the OOV in the .unvalid_oov list.
  # The OOV is always handled as literal data (fixed-string grep, awk string
  # comparison), never interpolated into a regular expression.
  #
  print_info "Construct map file" 2
  rm -f "$EXT_LEX/final/${lexname}_ext.map" 2>/dev/null
  rm -f "$EXT_LEX/final/${lexname}.unvalid_oov" 2>/dev/null

  while read -r oov
  do
      # Drop stray carriage-return / newline characters and skip blank entries
      oov=${oov//[$'\r\n']/}
      [ -n "$oov" ] || continue
      oov_txt="$EXT_LEX/tmp/txt/$oov.bdlex_txt"
      #
      # Obtain the oov's tag
      #
      #oov_tag=`grep --perl-regex "^$oov\t" $DYNAMIC_TAGSTATS/all.tags | cut -f2`
      #
      # Try to collect text containing the oov word
      #
      grep -F -- " $oov " "$EXT_LEX/final/all.bdlex_txt" | "$SCRIPT_PATH/NbMaxWordsFilter.pl" 40 | uniq > "$oov_txt"
      if [ -f "$oov_txt" ]; then
          # Number of collected lines
          nbWords=$(wc -l < "$oov_txt")
          if [ "$nbWords" -eq 0 ]; then
              echo "UNVALID OOV: $oov => $nbWords occurrences"
              echo "$oov" >> "$EXT_LEX/final/${lexname}.unvalid_oov"
          else
              #
              # Find a candidate in a filtred invoc lexicon => a candidate which maximize the ppl in the overall txt collected
              #
              #echo "$/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $LEXICON.bdlex_tok $EXT_LEX/tmp/txt/$oov.bdlex_txt"
              candidate=$("$SPEERAL_PATH/bin/getCandidate" "$SPEER_LM_PATH" "$SPEER_LM_BASENAME" "$oov" "$CANDIDATE_LEXICON" "$oov_txt" | cut -f1 -d" ")
              if [ -n "$candidate" ]; then
                  # Phonetisations of this OOV: lines whose first tab-separated
                  # field is exactly the OOV
                  awk -F'\t' -v w="$oov" 'NF > 1 && $1 == w' "$EXT_LEX/final/all.oov_acousticlyfound_phon" > "$EXT_LEX/tmp/$oov.phon"
                  #<word> <phon> => <word> <candidate> <phon>
                  # insert the candidate right after the first tab of each line
                  awk -v c="$candidate" '{ i = index($0, "\t"); if (i) print substr($0, 1, i) c substr($0, i); else print }' "$EXT_LEX/tmp/$oov.phon" >> "$EXT_LEX/final/${lexname}_ext.map"
              else
                  echo "UNVALID OOV: $oov => no availaible Candidate word in LM"
                  echo "$oov" >> "$EXT_LEX/final/${lexname}.unvalid_oov"
              fi
          fi
      else
          echo "UNVALID OOV: $oov"
          echo "$oov" >> "$EXT_LEX/final/${lexname}.unvalid_oov"
      fi
  done < "$EXT_LEX/final/all.oov_acousticlyfound"
  
  #
  ### Speeral
  #

  lexname=$(basename "$LEXICON")
  #
  # Build the final trigger file
  #
  print_info "Clean trigg files" 2
  mkdir -p "$TRIGGER_CONFZONE/speeral/" 2> /dev/null
  mkdir -p "$EXT_LEX/speeral/" 2> /dev/null
  # The rejected-OOV list is written under $EXT_LEX/final/ by the map-file
  # step, so it must be read from there. It only exists when at least one OOV
  # was rejected; create it empty otherwise so the filter always gets a file.
  unvalid_oov="$EXT_LEX/final/$lexname.unvalid_oov"
  touch "$unvalid_oov"
  # Filter every .trigg file through RemoveLineContaining.pl into speeral/
  for i in "$TRIGGER_CONFZONE"/*.trigg
  do
      # the pattern stays literal when nothing matches
      [ -e "$i" ] || continue
      basename=$(basename "$i" .trigg)
      cat "$i" | "$SCRIPT_PATH/RemoveLineContaining.pl" "$unvalid_oov" > "$TRIGGER_CONFZONE/speeral/$basename.trigg"
  done
  #
  # Compile the speeral extended lexicon
  #
  print_info "Compile Speeral extended lexicon" 2
1fd315c89   Jean-François Rey   add Extract audio...
436
  $SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext $REDIRECTION_OUTPUT
e6be5137b   Jean-François Rey   reinitialized pro...
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
  
  # With CHECK enabled, validate the compiled extended lexicon; a return code
  # of 1 from the checker is logged to the error file and aborts the run
  # (the lock file is then left in place).
  if [ "${CHECK:-0}" -eq 1 ]
  then
      check_exploitconfpass_lex_check "${EXT_LEX}/speeral/${lexname}_ext"
      if [ $? -eq 1 ]
      then
          echo -e "ERROR : Building Speeral Lexicon $INPUT_DIR " >> "$ERRORFILE"
          exit 1;
      fi
  fi


  #-------#
  # CLOSE #
  #-------#
  # Seem OK
  print_info "<= End $BASENAME Solr | $(date +'%d/%m/%y %H:%M:%S')" 1
  # Log file paths are quoted: an unquoted redirection target containing
  # whitespace fails with "ambiguous redirect"
  echo -e "#Solr $BASENAME " >> "$LOGFILE"

  # unlock directory
  mv "$OUTPUT_DIR/EXPLOITCONFPASS.lock" "$OUTPUT_DIR/EXPLOITCONFPASS.unlock"