Blame view

main_tools/ExploitConfidencePass.sh 18.8 KB
e6be5137b   Jean-François Rey   reinitialized pro...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
  #!/bin/bash
  
  #####################################################
  # File :    ExploitConfidencePass.sh                #
  # Brief :   Exploit the ASR confidence pass to :    #
  #           -> boost the confident zone             #
  #           -> find alternative in non confident zone
  #           -> dynamically extend the lexicon       #
  # Author :  Jean-François Rey                       #
  #	        (base on Emmanuel Ferreira              #
  #	        and Hugo Mauchrétien works)             #
  # Version : 1.0                                     #
  # Date :    25/06/13                                #
  #####################################################
f37e72eaf   Jean-François Rey   up
15
  echo "### ExploitConfidencePass.sh ###"
e6be5137b   Jean-François Rey   reinitialized pro...
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
  # Make sure OTMEDIA_HOME is set and exported.
  # When the caller did not provide it, fall back to the parent of the
  # directory holding this script (symlinks resolved by readlink -e).
  # Expansions are quoted so install paths containing spaces still work.
  if [ -z "${OTMEDIA_HOME:-}" ]
  then
      OTMEDIA_HOME=$(dirname "$(dirname "$(readlink -e "$0")")")
      export OTMEDIA_HOME
  fi
  
  # Directory holding ExploitConfidencePass.sh itself (symlinks resolved).
  MAIN_SCRIPT_PATH=$(dirname "$(readlink -e "$0")")
  
  # Helper scripts are taken from $OTMEDIA_HOME/tools/scripts unless the
  # caller already supplied SCRIPT_PATH.
  if [ -z "${SCRIPT_PATH:-}" ]
  then
      SCRIPT_PATH="$OTMEDIA_HOME/tools/scripts"
  fi
  
  # Source the shared helper libraries.
  # NOTE(review): print_info / print_warn / print_error / print_log_file and
  # check_exploitconfpass_lex_check are presumably defined by these two
  # files -- confirm against Tools.sh and CheckExploitConfPass.sh.
  . "$SCRIPT_PATH/Tools.sh"
  . "$SCRIPT_PATH/CheckExploitConfPass.sh"
  
  # Source the ExploitConfidencePass configuration, or abort when it is
  # missing. Paths are quoted so that directories with spaces are handled.
  EXPLOITCONFIDENCEPASS_CONFIG_FILE="$OTMEDIA_HOME/cfg/ExploitConfidencePass.cfg"
  if [ -e "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" ]
  then
  	. "$EXPLOITCONFIDENCEPASS_CONFIG_FILE"
  else
  	echo "ERROR : Can't find configuration file $EXPLOITCONFIDENCEPASS_CONFIG_FILE" >&2
  	exit 1
  fi
  
  #---------------#
  # Parse Options #
  #---------------#
  while getopts ":hDv:cf:r" opt
  do
  	case $opt in
  		h)
  			# Print the usage text and stop. The -r entry takes no value:
  			# the getopts string declares it as a plain flag.
  			echo -e "$0 [OPTIONS] <INPUT_DIRECTORY>
  "
              echo -e "\t Options:"
              echo -e "\t\t-h :\tprint this message"
              echo -e "\t\t-D :\tDEBUG mode on"
              echo -e "\t\t-v l :\tVerbose mode, l=(1|2|3) level mode"
              echo -e "\t\t-c :\tCheck process, stop if error detected"
              echo -e "\t\t-f n :\tspecify a speeral forks number (default 1)"
              echo -e "\t\t-r :\tforce rerun without deleting files"
  			exit 1
  			;;
  		D)
  			DEBUG=1
  			;;
          v)
              VERBOSE=$OPTARG
              ;;
          c)
              CHECK=1
              ;;
          f)
              FORKS="--forks $OPTARG"
              ;;
          r)
              RERUN=1
              ;;
  		:)
d7e9e4b9d   Jean-François Rey   update bugfix stderr
79
  			echo "Option -$OPTARG requires an argument." >&2
e6be5137b   Jean-François Rey   reinitialized pro...
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
  			exit 1
  			;;
  		\?)
  			# Unknown option: report it on stderr and keep going
  			# (the exit below is deliberately left disabled).
  			echo "BAD USAGE : unknown option -$OPTARG" >&2
  			#exit 1
  			;;
  	esac
  done
  
  # Enable shell tracing when -D was given (DEBUG=1).
  # An unset DEBUG is treated as 0 so the test does not fail with
  # "unary operator expected" when no default was configured.
  if [ "${DEBUG:-0}" -eq 1 ]
  then
         set -x
         echo -e "## Mode DEBUG ON ##"
  fi
  
  # mode verbose enable
1fd315c89   Jean-François Rey   add Extract audio...
97
  # Announce the verbosity level; an unset VERBOSE counts as 0 (quiet).
  if [ "${VERBOSE:-0}" -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ;fi
e6be5137b   Jean-François Rey   reinitialized pro...
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  
  # Exactly one positional argument (the input directory) must remain
  # once the parsed options are accounted for.
  if [ $(($#-(OPTIND-1))) -ne 1 ]
  then
  	echo "BAD USAGE : ExploitConfidencePass.sh [OPTIONS] <INPUT_DIRECTORY>" >&2
  	echo "$0 -h for more info" >&2
  	exit 1
  fi
  
  # Drop the parsed options so that $1 is the input directory.
  shift $((OPTIND-1))
  # The input directory must exist. "$1" is quoted: unquoted, an empty
  # argument slipped through the test and a path with spaces broke it.
  if [ ! -e "$1" ]
  then
      print_error "can't open $1"
      exit 1
  fi
561670acc   Jean-François Rey   remove output red...
114
  # BASENAME is only assigned further down (GLOBAL VARS section), so derive
  # it from the input directory here; otherwise the start banner is logged
  # with an empty "[]" prefix unless a sourced file already defined it.
  BASENAME=$(basename "$(readlink -e "$1")")
  print_info "[${BASENAME}] => ExploitConfPass start | $(date +'%d/%m/%y %H:%M:%S')" 1
e6be5137b   Jean-François Rey   reinitialized pro...
115
116
117
118
119
120
121
122
123
124
  #-------------#
  # GLOBAL VARS #
  #-------------#
  # Everything is produced inside the (canonicalised) input directory.
  # Arguments are quoted so a directory name containing spaces is kept whole.
  INPUT_DIR=$(readlink -e "$1")
  OUTPUT_DIR=$INPUT_DIR
  BASENAME=$(basename "$OUTPUT_DIR")
  SHOW_DIR="$OUTPUT_DIR/shows/"
  SOLR_RES="$OUTPUT_DIR/solr/"
  EXT_LEX="$OUTPUT_DIR/LEX/"
  TRIGGER_CONFZONE="$OUTPUT_DIR/trigg/"
b427f103e   Jean-François Rey   update log info p...
125
126
  # Info and error log files, both kept inside the output directory.
  LOGFILE="${OUTPUT_DIR}/info_exploitconf.log"
  ERRORFILE="${OUTPUT_DIR}/error_exploitconf.log"
e6be5137b   Jean-François Rey   reinitialized pro...
127
128
129
130
131
132
133
  
  CONFPASS_CONFIG_FILE="$(readlink -e $1)/ConfPass.cfg"
  if [ -e $CONFPASS_CONFIG_FILE ]
  then
  {
      RES_CONF_DIR=$(cat $CONFPASS_CONFIG_FILE | grep "^RES_CONF_DIR=" | cut -f2 -d"=")
      RES_CONF=$(cat $CONFPASS_CONFIG_FILE | grep "^CONF_DIR=" | cut -f2 -d"=")
7e99f0793   Jean-François Rey   up
134
      print_info "[${BASENAME}] Use confidence measure from : $RES_CONF" 2
e6be5137b   Jean-François Rey   reinitialized pro...
135
136
137
  }
  else
  {
7e99f0793   Jean-François Rey   up
138
139
      print_error "[${BASENAME}] Can't find $CONFPASS_CONFIG_FILE"
      print_error "[${BASENAME}] -> use res_p2"
e6be5137b   Jean-François Rey   reinitialized pro...
140
141
142
143
      RES_CONF_DIR="$INPUT_DIR/conf/res_p2/scored_ctm"
      RES_CONF="$INPUT_DIR/conf/res_p2"
  }
  fi
1fd315c89   Jean-François Rey   add Extract audio...
144
145
146
147
  # Create the working sub-directories (shows, SOLR answers, extended
  # lexicon, trigger files). Paths are quoted so a directory containing
  # spaces is created as one path; all output is discarded as before.
  mkdir -p "$SHOW_DIR" > /dev/null 2>&1
  mkdir -p "$SOLR_RES" > /dev/null 2>&1
  mkdir -p "$EXT_LEX" > /dev/null 2>&1
  mkdir -p "$TRIGGER_CONFZONE" > /dev/null 2>&1
e6be5137b   Jean-François Rey   reinitialized pro...
148
149
150
151
152
  
  #------------------#
  # Create Workspace #
  #------------------#
  # Lock directory
7e99f0793   Jean-François Rey   up
153
154
155
156
157
  # Refuse to run when a previous run left its lock file, unless -r (RERUN)
  # was given. The lock file is created under $OUTPUT_DIR, so that is where
  # it has to be looked for; an unset RERUN counts as 0.
  if [ -e "$OUTPUT_DIR/EXPLOITCONFPASS.lock" ] && [ "${RERUN:-0}" -eq 0 ]
  then
      print_warn "[${BASENAME}] ExploitConfidencePass is locked -> exit" 2
      exit 1
  fi
e6be5137b   Jean-François Rey   reinitialized pro...
158
159
160
161
162
163
164
165
166
167
168
  rm "$OUTPUT_DIR/EXPLOITCONFPASS.unlock" > /dev/null 2>&1
  touch "$OUTPUT_DIR/EXPLOITCONFPASS.lock" > /dev/null 2>&1
  
  #------#
  # Save #
  #------#
  # Keep a copy of the configuration used for this run and append the
  # locations of the trigger files and of the extended lexicon.
  # ${lexname} is only assigned much later in the script (from $LEXICON),
  # so compute it here; otherwise the LEX_* entries are saved as ".../_ext".
  lexname=$(basename "$LEXICON")
  cp "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" "$OUTPUT_DIR/ExploitConfPass.cfg"
  {
      echo "TRIGGER_DIR=$TRIGGER_CONFZONE"
      echo "TRIGGER_SPEERAL=$TRIGGER_CONFZONE/speeral/"
      echo "LEX_SPEERAL=$EXT_LEX/speeral/${lexname}_ext"
      echo "LEX_BINODE_SPEERAL=$EXT_LEX/speeral/${lexname}_ext.bin"
  } >> "$OUTPUT_DIR/ExploitConfPass.cfg"
7e99f0793   Jean-François Rey   up
169
  # Report where the configuration copy was written ($OUTPUT_DIR is the
  # directory the copy is actually saved to).
  print_info "[${BASENAME}] Save config in $OUTPUT_DIR/ExploitConfPass.cfg" 1
e6be5137b   Jean-François Rey   reinitialized pro...
170

561670acc   Jean-François Rey   remove output red...
171
172
173
  #---------------#
  # Check Pass    #
  #---------------#
b427f103e   Jean-François Rey   update log info p...
174
175
176
177
178
179
  # Stop when the confidence pass left no .res file in $RES_CONF_DIR.
  # The file count comes from `ls` (errors silenced when nothing matches);
  # with -c the failure is also written to the error log.
  if [ "$(ls "${RES_CONF_DIR}"/*.res 2> /dev/null | wc -l)" -eq 0 ]
  then
      print_error "[${BASENAME}] No Conf Pass res -> exit ExploitConfPass"
      if [ "${CHECK:-0}" -eq 1 ]; then print_log_file "$ERRORFILE" "No ConfPass res in ${RES_CONF_DIR}" ;fi
      exit 1
  fi
561670acc   Jean-François Rey   remove output red...
180

e6be5137b   Jean-François Rey   reinitialized pro...
181
182
183
184
185
186
187
188
  #-----------------------#
  # Segmentation by show  #
  #-----------------------#
  # create txt file from scored res
  # tag pos and lemmatization of the txt file
  # merge the scored res and taglem file
  # segment using the last generated file
  # and create a ctm file by show
561670acc   Jean-François Rey   remove output red...
189
  print_info "[${BASENAME}] Segmentation by show" 1
e6be5137b   Jean-François Rey   reinitialized pro...
190
191
  
  # -> to txt
b427f103e   Jean-François Rey   update log info p...
192
  print_info "[${BASENAME}] Create txt from scored res" 3
e6be5137b   Jean-François Rey   reinitialized pro...
193
194
195
196
197
  cat ${RES_CONF_DIR}/*.res > $INPUT_DIR/$BASENAME.sctm
  cat $INPUT_DIR/$BASENAME.seg | $SIGMUND_BIN/myConvert.pl $INPUT_DIR/$BASENAME.sctm $INPUT_DIR/$BASENAME.tmp
  cat $INPUT_DIR/$BASENAME.tmp | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | sed -e "s/_/ /g" | sort -nt 'n' -k '2' > $INPUT_DIR/$BASENAME.txt
  
  # -> to tagger + lemme
b427f103e   Jean-François Rey   update log info p...
198
  print_info "[${BASENAME}] Tag pos and lem in txt file" 3
e6be5137b   Jean-François Rey   reinitialized pro...
199
200
201
202
  iconv -t ISO_8859-1 $INPUT_DIR/$BASENAME.txt > $INPUT_DIR/$BASENAME.tmp
  $SIGMUND_BIN/txt2lem.sh $INPUT_DIR/$BASENAME.tmp $INPUT_DIR/$BASENAME.taglem
  
  # merge sctm and taglem
b427f103e   Jean-François Rey   update log info p...
203
  print_info "[${BASENAME}] Merge scored ctm with tag pos and lem file" 3
e6be5137b   Jean-François Rey   reinitialized pro...
204
205
206
  cat $INPUT_DIR/$BASENAME.sctm | $SCRIPT_PATH/BdlexUC.pl ${RULES}/basic -f | iconv -t ISO_8859-1 | $SCRIPT_PATH/scoredCtmAndTaggedLem2All.pl $INPUT_DIR/$BASENAME.taglem > $INPUT_DIR/$BASENAME.ctl
  
  # -> new seg
b427f103e   Jean-François Rey   update log info p...
207
  print_info "[${BASENAME}] Create xml file and run Topic Seg" 3
e6be5137b   Jean-François Rey   reinitialized pro...
208
209
210
211
212
213
214
215
216
217
218
  $SIGMUND_BIN/tagLem2xml.pl $INPUT_DIR/$BASENAME.taglem $INPUT_DIR/$BASENAME.doc.xml
  rm $INPUT_DIR/$BASENAME.tmp #$INPUT_DIR/$BASENAME.taglem 
  
  # Lia_topic_seg : bring together sentences into show
  cp $INPUT_DIR/$BASENAME.doc.xml 0.xml
  java -cp $LIATOPICSEG/bin Test > $INPUT_DIR/show.seg
  cat $INPUT_DIR/show.seg | $SIGMUND_BIN/toSegEmiss.pl $INPUT_DIR/$BASENAME.show.seg
  rm 0.xml $INPUT_DIR/show.seg
  
  if [ $CHECK -eq 1 ]
  then
b427f103e   Jean-François Rey   update log info p...
219
220
221
222
223
224
      if [ ! -s $INPUT_DIR/$BASENAME.show.seg ]
      then
          print_error "[${BASENAME}] No Topic segmentation ! "
          print_error "[${BASENAME}] Check $ERRORFILE "
          print_log_file "$ERRORFILE" "No Topic segmentation in ${BASENAME}.show.seg"
      fi
e6be5137b   Jean-François Rey   reinitialized pro...
225
226
227
  fi
  
  # Segment ctm into several show files and create a seg list by show
b427f103e   Jean-François Rey   update log info p...
228
  print_info "[${BASENAME}] Segment ctm into show files and a seg list by show" 1
561670acc   Jean-François Rey   remove output red...
229
  $SCRIPT_PATH/ctm2show.pl $INPUT_DIR/$BASENAME.ctl $INPUT_DIR/$BASENAME.show.seg $SHOW_DIR
e6be5137b   Jean-François Rey   reinitialized pro...
230
231
232
233
234
235
236
237
238
  
  #-----------------------------------------------------------#
  # SOLR QUERIES                                              #
  # -> Create Confidente Word                                 #
  #   Keep conf words and use Tags                            #
  # -> Query SOLR (document & multimedia)                     #
  #   concat word + add date 2 day before and after the show  #
  #   query document & multimedia                             #
  #-----------------------------------------------------------#
b427f103e   Jean-François Rey   update log info p...
239
  print_info "[${BASENAME}] Create SOLR queries and ask SOLR" 1
e6be5137b   Jean-François Rey   reinitialized pro...
240
241
242
243
244
245
246
247
248
  for show in $(ls $SHOW_DIR/*.ctm)
  do
      bn=$(basename $show .ctm)
      # Remove words with low confidence and keep useful tagger words.
      # The pattern is a basic regex (it relies on \| alternation), where an
      # interval must be written \{3,5\}; a bare {3,5} is matched literally.
      # NOTE(review): presumably "X followed by 3 to 5 capitals" was intended.
      cat "$show" | $SCRIPT_PATH/KeepConfZone.pl | grep -e "MOTINC\|NMS\|NMP\|NFS\|NFP\|X[A-Z]\{3,5\}" | cut -f3 -d' ' > "$SHOW_DIR/$bn.confzone"
      # Get date 2 day before and after the show
      datePattern=`$SCRIPT_PATH/daybefore2after.sh $(echo $BASENAME | cut -c1-6)`
      # Create SOLR queries
      cat $SHOW_DIR/$bn".confzone" | $SCRIPT_PATH/GenerateSOLRQueries.pl | iconv -f ISO_8859-1 -t UTF-8 > "$SHOW_DIR/$bn.queries"
e6be5137b   Jean-François Rey   reinitialized pro...
249
250
      # Ask SOLR DB
      if [ $(wc -w "$SHOW_DIR/$bn.queries" | cut -f1 -d' ') -gt 0 ]; then
668cac4d1   Jean-François Rey   check if solr qui...
251
252
          query=$(cat $SHOW_DIR/$bn.queries)"&fq=docDate:[$datePattern]"
          echo $query > $SHOW_DIR/$bn.queries
b427f103e   Jean-François Rey   update log info p...
253
          # Trace the exact SOLR command line before running it.
          print_info "python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp" 3
561670acc   Jean-François Rey   remove output red...
254
          python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp
e6be5137b   Jean-François Rey   reinitialized pro...
255
256
          cat $SOLR_RES/$bn.keywords.tmp | sort -u > $SOLR_RES/$bn.keywords
          cat $SOLR_RES/$bn.txt.tmp | sort -u > $SOLR_RES/$bn.txt
1fd315c89   Jean-François Rey   add Extract audio...
257
          rm $SOLR_RES/*.tmp > /dev/null 2>&1
e6be5137b   Jean-François Rey   reinitialized pro...
258
259
260
261
262
263
      fi
  
      if [ $CHECK -eq 1 ]
      then
          if [ ! -e $SOLR_RES/$bn.keywords ] || [ ! -e $SOLR_RES/$bn.txt ]
          then
b427f103e   Jean-François Rey   update log info p...
264
265
266
267
              print_warn "$bn.keywords and $bn.txt are empty !
  Maybe SOLR server is down !" 2
              print_log_file "$LOGFILE" "$bn.keywords and $bn.txt are empty !
  Maybe SOLR server is down !"
e6be5137b   Jean-François Rey   reinitialized pro...
268
269
270
271
272
273
274
275
276
277
278
279
280
281
          fi
      fi
  
  done
  
  #----------------------------------------------------------------------------------------------- 
  #  Build trigger file
  #       1) keywords are automatically boosted in the non confident zone of the current res
  #          confident zone are boosted
  #          previous words in sensible zone are penalized
  #       2) OOVs are extracted + phonetized
  #       3) Try to find OOVs acousticly in the current segment
  #       4) Generate the .trigg file
  #------------------------------------------------------------------------------------------------
561670acc   Jean-François Rey   remove output red...
282
  print_info "[${BASENAME}] Build trigger files" 1
e6be5137b   Jean-François Rey   reinitialized pro...
283
284
285
286
287
288
289
290
  for i in `ls $SOLR_RES/*.keywords`
  do
      basename=`basename $i .keywords`
  
      #
      # Tokenize & produce coverage report
      # Use filter you need
      #
b427f103e   Jean-François Rey   update log info p...
291
      print_info "[${BASENAME}] keywords filtering and produce coverage report" 3 
e6be5137b   Jean-François Rey   reinitialized pro...
292
293
294
295
296
297
298
299
300
301
302
      # Default filter
      cat $i | $SCRIPT_PATH/CleanFilter.sh | ${SCRIPT_PATH}/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t |\
          $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok
      # do less filter
      #cat $i | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok
      
  
      #
      # Extract "real" OOV and phonetize them 
      # -> small custom filter to avoid picking up too much noise
      #
b427f103e   Jean-François Rey   update log info p...
303
      print_info "[${BASENAME}] Extract OOV and phonetize them" 3
e6be5137b   Jean-François Rey   reinitialized pro...
304
305
306
307
308
      ${SCRIPT_PATH}/FindNormRules.pl $SOLR_RES/${basename}_tmp_report/report.oov $LEXICON.bdlex_tok | cut -f3 | grep -v "#" | grep -v "^[A-Z]\+$" | grep -v "^[0-9]" | grep --perl-regex -v "^([a-z']){1,3}$" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | iconv -t ISO_8859-1 -f UTF-8 | ${LIA_LTBOX}/lia_phon/script/lia_lex2phon_variante | grep -v "core dumped" | cut -d"[" -f1 | sort -u | ${SCRIPT_PATH}/PhonFormatter.pl | iconv -f ISO_8859-1 -t UTF-8 | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $SOLR_RES/${basename}.phon_oov
  
      #
      # Search INVOC & OOV in the current lattice
      #
b427f103e   Jean-François Rey   update log info p...
309
      print_info "[${BASENAME}] Search INVOC and OOV in the current lattice" 3
e6be5137b   Jean-François Rey   reinitialized pro...
310
311
312
313
314
315
      cat $SOLR_RES/${basename}_tmp_report/report.invoc | grep -v "\b0" | cut -f1 | grep -v --perl-regex -v "^[a-zA-Z']{1,3}$" | grep -v --perl-regex "^[a-zA-Z0-9]{1,3}$" | grep -v "<s>" | grep -v "</s>" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $TRIGGER_CONFZONE/$basename.tosearch
      cat $SOLR_RES/${basename}.phon_oov | cut -f1 >>  $TRIGGER_CONFZONE/$basename.tosearch
      
      # For each treil
      for baseseg in $(cat "$SHOW_DIR/$basename.lst")
      do
1fd315c89   Jean-François Rey   add Extract audio...
316
          $OTMEDIA_HOME/tools/QUOTE_FINDER/bin/acousticFinder ${LEXICON}.speer_phon $RES_CONF/wlat/$baseseg.wlat $TRIGGER_CONFZONE/${basename}.tosearch $SOLR_RES/$basename.phon_oov > $TRIGGER_CONFZONE/$baseseg.acousticlyfound $OUTPUT_REDIRECTION
e6be5137b   Jean-François Rey   reinitialized pro...
317
318
319
          #
          # Produce the boost file for the next decoding pass
          #
561670acc   Jean-François Rey   remove output red...
320
          print_info "[${BASENAME}] Produce trigg file : $baseseg " 3
e6be5137b   Jean-François Rey   reinitialized pro...
321
322
323
324
325
326
327
328
329
330
331
          cat $RES_CONF_DIR/$baseseg.res | $SCRIPT_PATH/ScoreCtm2trigg.pl $TRIGGER_CONFZONE/$baseseg.acousticlyfound > $TRIGGER_CONFZONE/$baseseg.trigg
      done
  
  done
  
  #----------------------------------------------------------------------------------------------- 
  # Build the extended SPEERAL Lexicon
  #   1) Merge OOVs + LEXICON
  #   1) Related text are collected in order to find the invoc word with maximizing the ppl (LM proba)
  #   2) The current lexicon is extended with all the valid OOVs
  #----------------------------------------------------------------------------------------------- 
561670acc   Jean-François Rey   remove output red...
332
  print_info "[${BASENAME}] Build extended Speeral Lexicon" 1
e6be5137b   Jean-François Rey   reinitialized pro...
333
334
335
336
337
338
  mkdir -p $EXT_LEX/final
  mkdir -p $EXT_LEX/tmp
  mkdir -p $EXT_LEX/tmp/txt
  #
  # Collect the acousticly found oov and their phonetisation 
  #
b427f103e   Jean-François Rey   update log info p...
339
  print_info "[${BASENAME}] Get all OOV and retrieve all phonetisation" 3
e6be5137b   Jean-François Rey   reinitialized pro...
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
  for i in `ls $SOLR_RES/*.phon_oov`
  do
      basename=`basename $i .phon_oov`
  
      rm $EXT_LEX/$basename.acousticlyfound 2> /dev/null
      # list acousticly found for the show
      for baseseg in $(cat "$SHOW_DIR/$basename.lst")
      do
          cat $TRIGGER_CONFZONE/$baseseg.acousticlyfound | cut -f1 | cut -f2 -d"=" >> $EXT_LEX/$basename.acousticlyfound
      done
      cat $EXT_LEX/$basename.acousticlyfound | sort -u > $EXT_LEX/.tmp
      mv $EXT_LEX/.tmp $EXT_LEX/$basename.acousticlyfound
  
      #
      # Extract OOV really added
      #
      cat $SOLR_RES/$basename.phon_oov | cut -f1 | sort -u > $EXT_LEX/$basename.oov
561670acc   Jean-François Rey   remove output red...
357
      $SCRIPT_PATH/intersec.pl $EXT_LEX/$basename.oov $EXT_LEX/$basename.acousticlyfound > $EXT_LEX/$basename.oov_acousticlyfound
e6be5137b   Jean-François Rey   reinitialized pro...
358
359
360
361
362
363
364
365
366
      #
      # Retrieve all phonetisation
      #
      cat $SOLR_RES/${basename}.phon_oov | $SCRIPT_PATH/LexPhonFilter.pl $EXT_LEX/$basename.oov_acousticlyfound > $EXT_LEX/$basename.oov_acousticlyfound_phon
  done
  
  #
  # Merge OOVs and their phonetisation
  #
b427f103e   Jean-François Rey   update log info p...
367
  print_info "[${BASENAME}] Merge OOV and their phonetisation" 3
e6be5137b   Jean-François Rey   reinitialized pro...
368
369
370
  lexname=$(basename $LEXICON)
  cat $EXT_LEX/*.oov_acousticlyfound_phon | sort -u > $EXT_LEX/final/all.oov_acousticlyfound_phon
  cat $EXT_LEX/*.oov_acousticlyfound | sort -u | grep --perl-regex -v "^([a-z']){3}$"  > $EXT_LEX/final/all.oov_acousticlyfound
561670acc   Jean-François Rey   remove output red...
371
  $SCRIPT_PATH/MergeLexicon.pl $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/final/${lexname}_ext.phon
e6be5137b   Jean-François Rey   reinitialized pro...
372
373
374
375
  
  #
  # Collect + clean retrieved txt
  #
561670acc   Jean-François Rey   remove output red...
376
  print_info "[${BASENAME}] Collect and clean SOLR txt answers" 2
e6be5137b   Jean-François Rey   reinitialized pro...
377
378
379
380
381
382
383
384
385
386
387
388
389
  # choose filter
  # default
  cat $SOLR_RES/*.txt | $SCRIPT_PATH/CleanFilter.sh | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $EXT_LEX/final/all.bdlex_txt
  # low filter
  #cat $SOLR_RES/*.txt | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex > $EXT_LEX/final/all.bdlex_txt
  
  #
  # Construct the map file
  #
  # Notes:
  # - Expected format : 
  #   <WORD1_STRING> <CANDIDATE1_STRING> <PHON_1>
  #
b427f103e   Jean-François Rey   update log info p...
390
  print_info "[${BASENAME}] Construct map file" 3
e6be5137b   Jean-François Rey   reinitialized pro...
391
392
393
394
395
396
397
398
399
400
401
402
403
404
  rm -f $EXT_LEX/final/${lexname}_ext.map 2>/dev/null
  rm -f $EXT_LEX/final/${lexname}.unvalid_oov 2>/dev/null
  
  while read oov
  do
      # Strip carriage returns from the word (e.g. input with CRLF endings).
      # NOTE(review): the sed expression here contained a raw line break,
      # which sed rejects as an unterminated s command; a carriage return
      # was presumably intended -- confirm against the upstream file.
      oov=${oov//$'\r'/}
      #
      # Obtain the oov's tag
      #
      #oov_tag=`grep --perl-regex "^$oov\t" $DYNAMIC_TAGSTATS/all.tags | cut -f2`
      #
      # Try to collect text containing the oov word
      #
b427f103e   Jean-François Rey   update log info p...
405
      print_info "[${BASENAME}] Collect text containing the oov" 3
e6be5137b   Jean-François Rey   reinitialized pro...
406
407
408
409
      cat $EXT_LEX/final/all.bdlex_txt | grep --perl-regex " $oov " | $SCRIPT_PATH/NbMaxWordsFilter.pl 40 |uniq > $EXT_LEX/tmp/txt/$oov.bdlex_txt
      if [ -f $EXT_LEX/tmp/txt/$oov.bdlex_txt ]; then
          nbWords=`wc -l $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "`
          if [ $nbWords -eq 0 ]; then
b427f103e   Jean-François Rey   update log info p...
410
              print_warn "[${BASENAME}] UNVALID OOV: $oov => $nbWords occurrences" 2 
e6be5137b   Jean-François Rey   reinitialized pro...
411
412
413
414
415
416
              echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
          else
              #
              # Find a candidate in a filtred invoc lexicon => a candidate which maximize the ppl in the overall txt collected
              #
              #echo "$/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $LEXICON.bdlex_tok $EXT_LEX/tmp/txt/$oov.bdlex_txt"
b427f103e   Jean-François Rey   update log info p...
417
              print_info `$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "` 3
e6be5137b   Jean-François Rey   reinitialized pro...
418
              candidate=`$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "`
b427f103e   Jean-François Rey   update log info p...
419
              if [ ! "$candidate" == "" ]; then
e6be5137b   Jean-François Rey   reinitialized pro...
420
421
422
423
424
425
426
                  grep --perl-regex "^$oov\t" $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/tmp/$oov.phon
                  while read phonLine
                  do
                      #<word> <phon> => <word> <candidate> <phon> 
                      echo "$phonLine" | sed "s|\t|\t$candidate\t|"  >> $EXT_LEX/final/${lexname}_ext.map
                  done < $EXT_LEX/tmp/$oov.phon
              else
b427f103e   Jean-François Rey   update log info p...
427
                  print_warn "[${BASENAME}] UNVALID OOV: $oov => no availaible Candidate word in LM" 2
e6be5137b   Jean-François Rey   reinitialized pro...
428
429
430
431
                  echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
              fi
          fi
      else
b427f103e   Jean-François Rey   update log info p...
432
          print_warn "[${BASENAME}] UNVALID OOV: $oov" 2
e6be5137b   Jean-François Rey   reinitialized pro...
433
434
435
436
437
438
439
440
441
442
443
444
          echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
      fi
  done < $EXT_LEX/final/all.oov_acousticlyfound
  
  #
  ### Speeral 
  #
  
  lexname=`basename $LEXICON`
  #
  # Build the final trigger file
  #
b427f103e   Jean-François Rey   update log info p...
445
  print_info "[${BASENAME}] Clean trigg files" 3
e6be5137b   Jean-François Rey   reinitialized pro...
446
447
448
449
450
451
452
453
454
455
  mkdir -p "$TRIGGER_CONFZONE/speeral/" 2> /dev/null
  mkdir -p "$EXT_LEX/speeral/" 2> /dev/null
  # Filter every trigger file through RemoveLineContaining.pl using the
  # list of rejected OOVs. That list is written to
  # $EXT_LEX/final/${lexname}.unvalid_oov by the map-file loop, so it is
  # read from there. The glob replaces the parsing of `ls` output.
  for i in "$TRIGGER_CONFZONE"/*.trigg
  do
      # An unmatched glob stays literal: skip it.
      [ -e "$i" ] || continue
      basename=$(basename "$i" .trigg)
      cat "$i" | $SCRIPT_PATH/RemoveLineContaining.pl "$EXT_LEX/final/$lexname.unvalid_oov" > "$TRIGGER_CONFZONE/speeral/$basename.trigg"
  done
  #
  # Compile the speeral extended lexicon
  #
b427f103e   Jean-François Rey   update log info p...
456
457
  print_info "[${BASENAME}] Compile Speeral extended lexicon" 3
  print_info "$SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext" 3
561670acc   Jean-François Rey   remove output red...
458
  $SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext
e6be5137b   Jean-François Rey   reinitialized pro...
459
460
461
462
463
464
  
  if [ $CHECK -eq 1 ]
  then
      check_exploitconfpass_lex_check "${EXT_LEX}/speeral/${lexname}_ext"
      if [ $? -eq 1 ]
      then
b427f103e   Jean-François Rey   update log info p...
465
466
467
468
          print_error "[${BASENAME}] Building Speeral Lexicon $INPUT_DIR -> exit"
          print_error "[${BASENAME}] Check $ERRORFILE"
          print_log_file $ERRORFILE "ERROR : Building Speeral Lexicon $INPUT_DIR"
          print_log_file $ERRORFILE "ERROR : ${EXT_LEX}/speeral/${lexname}_ext Empty after buildmappedbinode ?"
e6be5137b   Jean-François Rey   reinitialized pro...
469
470
471
472
473
474
475
476
477
          exit 1;
      fi
  fi
  
  
  #-------#
  # CLOSE #
  #-------#
  # Seem OK 
b427f103e   Jean-François Rey   update log info p...
478
  print_info "[${BASENAME}] <= ExploitConfidencePass End | $(date +'%d/%m/%y %H:%M:%S')" 1
e6be5137b   Jean-François Rey   reinitialized pro...
479
   
b427f103e   Jean-François Rey   update log info p...
480
  # unlock directory
e6be5137b   Jean-François Rey   reinitialized pro...
481
  mv "$OUTPUT_DIR/EXPLOITCONFPASS.lock" "$OUTPUT_DIR/EXPLOITCONFPASS.unlock"