Blame view
main_tools/ExploitConfidencePass.sh
18.7 KB
e6be5137b reinitialized pro... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 |
#!/bin/bash ##################################################### # File : ExploitConfidencePass.sh # # Brief : Exploit the ASR confidence pass to : # # -> boost the confident zone # # -> find alternative in non confident zone # -> dynamicly extend the lexicon # # Author : Jean-François Rey # # (base on Emmanuel Ferreira # # and Hugo Mauchrétien works) # # Version : 1.0 # # Date : 25/06/13 # ##################################################### |
f37e72eaf up |
15 |
echo "### ExploitConfidencePass.sh ###"

# Check OTMEDIA_HOME env var.
# When it is not set, derive it from this script's location
# (two directory levels above the resolved script path).
if [ -z "${OTMEDIA_HOME}" ]
then
    OTMEDIA_HOME=$(dirname "$(dirname "$(readlink -e "$0")")")
    export OTMEDIA_HOME
fi

# where is ExploitConfidencePass.sh
MAIN_SCRIPT_PATH=$(dirname "$(readlink -e "$0")")

# Helper scripts directory, unless the caller already provides SCRIPT_PATH.
if [ -z "${SCRIPT_PATH}" ]
then
    SCRIPT_PATH=$OTMEDIA_HOME/tools/scripts
fi

# Include scripts.
# Stop early when a helper library is missing: the rest of the script
# calls functions they are expected to define (print_info, print_error, ...).
for lib in "$SCRIPT_PATH/Tools.sh" "$SCRIPT_PATH/CheckExploitConfPass.sh"
do
    if [ ! -r "$lib" ]
    then
        echo "ERROR : Can't find script $lib" >&2
        exit 1
    fi
    . "$lib"
done

# where is ExploitConfidencePass.cfg
EXPLOITCONFIDENCEPASS_CONFIG_FILE=$OTMEDIA_HOME"/cfg/ExploitConfidencePass.cfg"
if [ -e "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" ]
then
    . "$EXPLOITCONFIDENCEPASS_CONFIG_FILE"
else
    echo "ERROR : Can't find configuration file $EXPLOITCONFIDENCEPASS_CONFIG_FILE" >&2
    exit 1
fi

#---------------#
# Parse Options #
#---------------#
665a8dac3 ! follow the whit... |
48 |
# Command line options (leading ':' selects getopts silent error mode):
#   -h    print usage and exit
#   -D    enable debug tracing (set -x)
#   -v l  verbosity level
#   -c    enable result checks
#   -r    force rerun (takes no argument, see the optstring)
while getopts ":hDv:cr" opt
do
    case $opt in
        h)
            echo -e "$0 [OPTIONS] <INPUT_DIRECTORY> "
            echo -e "\t Options:"
            echo -e "\t\t-h :\tprint this message"
            echo -e "\t\t-D :\tDEBUG mode on"
            echo -e "\t\t-v l :\tVerbose mode, l=(1|2|3) level mode"
            echo -e "\t\t-c :\tCheck process, stop if error detected"
            echo -e "\t\t-r :\tforce rerun without deleting files"
            exit 1
            ;;
        D)
            DEBUG=1
            ;;
        v)
            VERBOSE=$OPTARG
            ;;
        c)
            CHECK=1
            ;;
        r)
            RERUN=1
            ;;
        :)
            echo "Option -$OPTARG requires an argument." >&2
            exit 1
            ;;
        \?)
            # Unknown options are reported but deliberately not fatal.
            echo "BAD USAGE : unknown option -$OPTARG" >&2
            #exit 1
            ;;
    esac
done

# mode debug enable
# (defaults to off when DEBUG was neither configured nor requested)
if [ "${DEBUG:-0}" -eq 1 ]
then
    set -x
    echo -e "## Mode DEBUG ON ##"
fi

# mode verbose enable
if [ "${VERBOSE:-0}" -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ;fi
e6be5137b reinitialized pro... |
94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# Check USAGE by arguments number:
# exactly one positional argument must remain after the options.
if [ $(($#-($OPTIND-1))) -ne 1 ]
then
    echo "BAD USAGE : ExploitConfidencePass.sh [OPTIONS] <INPUT_DIRECTORY>"
    echo "$0 -h for more info"
    exit 1
fi

# Drop the parsed options so that $1 is the input directory.
shift $((OPTIND-1))

# check input directory - first argument
# (quoted so that a path containing spaces is tested as one word)
if [ ! -e "$1" ]
then
    print_error "can't open $1"
    exit 1
fi
561670acc remove output red... |
110 |
#-------------#
# GLOBAL VARS #
#-------------#
# Every output of this pass is written inside the input directory itself.
INPUT_DIR=$(readlink -e "$1")
OUTPUT_DIR=$INPUT_DIR
BASENAME=$(basename "$OUTPUT_DIR")
SHOW_DIR="$OUTPUT_DIR/shows/"
SOLR_RES="$OUTPUT_DIR/solr/"
EXT_LEX="$OUTPUT_DIR/LEX/"
TRIGGER_CONFZONE="$OUTPUT_DIR/trigg/"
LOGFILE="$OUTPUT_DIR/info_exploitconf.log"
ERRORFILE="$OUTPUT_DIR/error_exploitconf.log"

# Announce the start only once BASENAME is known, so that the message
# carries the name of the processed directory.
print_info "[${BASENAME}] => ExploitConfPass start | $(date +'%d/%m/%y %H:%M:%S')" 1
e6be5137b reinitialized pro... |
123 124 125 126 127 128 129 |
# Locate the confidence pass results.
# ConfPass.cfg, when present in the input directory, provides:
#   RES_CONF_DIR=...  -> RES_CONF_DIR
#   CONF_DIR=...      -> RES_CONF
# Without it, fall back on the res_p2 layout under the input directory.
CONFPASS_CONFIG_FILE="$(readlink -e $1)/ConfPass.cfg"
if [ -e $CONFPASS_CONFIG_FILE ]
then
    RES_CONF_DIR=$(grep "^RES_CONF_DIR=" $CONFPASS_CONFIG_FILE | cut -f2 -d"=")
    RES_CONF=$(grep "^CONF_DIR=" $CONFPASS_CONFIG_FILE | cut -f2 -d"=")
    print_info "[${BASENAME}] Use confidence measure from : $RES_CONF" 2
else
    print_error "[${BASENAME}] Can't find $CONFPASS_CONFIG_FILE"
    print_error "[${BASENAME}] -> use res_p2"
    RES_CONF_DIR="$INPUT_DIR/conf/res_p2/scored_ctm"
    RES_CONF="$INPUT_DIR/conf/res_p2"
fi
1fd315c89 add Extract audio... |
140 141 142 143 |
# Create the output directories of this pass; failures are not reported.
for out_dir in $SHOW_DIR $SOLR_RES $EXT_LEX $TRIGGER_CONFZONE
do
    mkdir -p $out_dir > /dev/null 2>&1
done
e6be5137b reinitialized pro... |
144 145 146 147 148 |
#------------------#
# Create Workspace #
#------------------#
# Lock directory.
# The lock is tested at the same place it is created below ($OUTPUT_DIR);
# a run is refused when the lock exists, unless -r (RERUN) was given.
if [ -e "$OUTPUT_DIR/EXPLOITCONFPASS.lock" ] && [ "${RERUN:-0}" -eq 0 ]
then
    print_warn "[${BASENAME}] ExploitConfidencePass is locked -> exit" 2
    exit 1
fi

# Replace the "unlock" marker of a previous run with a fresh lock.
rm "$OUTPUT_DIR/EXPLOITCONFPASS.unlock" > /dev/null 2>&1
touch "$OUTPUT_DIR/EXPLOITCONFPASS.lock" > /dev/null 2>&1

# Start from empty log files.
rm "$LOGFILE" "$ERRORFILE" 2>/dev/null
e6be5137b reinitialized pro... |
157 158 159 160 161 162 163 164 |
#------#
# Save #
#------#
# Copy the configuration used for this run next to the data, then append
# the run-specific trigger and lexicon locations to that copy.
# lexname has to be set here: the LEX_* entries embed it, and it is
# otherwise only assigned much later in the script.
lexname=$(basename "$LEXICON")
cp "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" "$OUTPUT_DIR/ExploitConfPass.cfg"
{
    echo "TRIGGER_DIR=$TRIGGER_CONFZONE"
    echo "TRIGGER_SPEERAL=$TRIGGER_CONFZONE/speeral/"
    echo "LEX_SPEERAL=$EXT_LEX/speeral/${lexname}_ext"
    echo "LEX_BINODE_SPEERAL=$EXT_LEX/speeral/${lexname}_ext.bin"
} >> "$OUTPUT_DIR/ExploitConfPass.cfg"
print_info "[${BASENAME}] Save config in $OUTPUT_DIR/ExploitConfPass.cfg" 1
e6be5137b reinitialized pro... |
166 |
|
561670acc remove output red... |
167 168 169 |
#---------------#
# Check Pass    #
#---------------#
# The confidence pass must have produced at least one .res file;
# otherwise there is nothing to exploit and the script stops.
res_file_count=$( ls ${RES_CONF_DIR}/*.res 2> /dev/null | wc -l)
if [ $res_file_count -eq 0 ]
then
    print_error "[${BASENAME}] No Conf Pass res -> exit ExploitConfPass"
    if [ $CHECK -eq 1 ]
    then
        print_log_file $ERRORFILE "No ConfPass res in ${RES_CONF_DIR}"
    fi
    exit 1
fi
561670acc remove output red... |
176 |
|
e6be5137b reinitialized pro... |
177 178 179 180 181 182 183 184 |
#-----------------------#
# Segmentation by show  #
#-----------------------#
# Steps:
#   1. create a txt file from the scored res
#   2. tag pos and lemmatize the txt file
#   3. merge the scored res and the taglem file
#   4. segment using the last generated file
#      and create a ctm file by show
print_info "[${BASENAME}] Segmentation by show" 1

# Common prefix of the intermediate files written by this section.
seg_prefix=$INPUT_DIR/$BASENAME

# -> to txt
print_info "[${BASENAME}] Create txt from scored res" 3
cat ${RES_CONF_DIR}/*.res > $seg_prefix.sctm
cat $seg_prefix.seg \
    | $SIGMUND_BIN/myConvert.pl $seg_prefix.sctm $seg_prefix.tmp
cat $seg_prefix.tmp \
    | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f \
    | sed -e "s/_/ /g" \
    | sort -nt 'n' -k '2' > $seg_prefix.txt

# -> to tagger + lemme
print_info "[${BASENAME}] Tag pos and lem in txt file" 3
iconv -t ISO_8859-1 $seg_prefix.txt > $seg_prefix.tmp
$SIGMUND_BIN/txt2lem.sh $seg_prefix.tmp $seg_prefix.taglem

# merge sctm and taglem
print_info "[${BASENAME}] Merge scored ctm with tag pos and lem file" 3
cat $seg_prefix.sctm \
    | $SCRIPT_PATH/BdlexUC.pl ${RULES}/basic -f \
    | iconv -t ISO_8859-1 \
    | $SCRIPT_PATH/scoredCtmAndTaggedLem2All.pl $seg_prefix.taglem > $seg_prefix.ctl

# -> new seg
print_info "[${BASENAME}] Create xml file and run Topic Seg" 3
$SIGMUND_BIN/tagLem2xml.pl $seg_prefix.taglem $seg_prefix.doc.xml
rm $seg_prefix.tmp #$INPUT_DIR/$BASENAME.taglem

# Lia_topic_seg : bring together sentences into show
# NOTE(review): the xml is copied to ./0.xml before the java call, so the
# Test class presumably reads that file from the current directory - confirm.
cp $seg_prefix.doc.xml 0.xml
java -cp $LIATOPICSEG/bin Test > $INPUT_DIR/show.seg
cat $INPUT_DIR/show.seg | $SIGMUND_BIN/toSegEmiss.pl $seg_prefix.show.seg
rm 0.xml $INPUT_DIR/show.seg

# In check mode, report (without stopping) an empty or missing segmentation.
if [ $CHECK -eq 1 ] && [ ! -s $seg_prefix.show.seg ]
then
    print_error "[${BASENAME}] No Topic segmentation ! "
    print_error "[${BASENAME}] Check $ERRORFILE "
    print_log_file "$ERRORFILE" "No Topic segmentation in ${BASENAME}.show.seg"
fi

# Segment ctm into several show files and create a seg list by show
print_info "[${BASENAME}] Segment ctm into show files and a seg list by show" 1
$SCRIPT_PATH/ctm2show.pl $seg_prefix.ctl $seg_prefix.show.seg $SHOW_DIR
e6be5137b reinitialized pro... |
226 227 228 229 230 231 232 233 234 |
#-----------------------------------------------------------#
# SOLR QUERIES                                              #
# -> Create Confident Words                                 #
#    Keep conf words and use Tags                           #
# -> Query SOLR (document & multimedia)                     #
#    concat word + add date 2 day before and after the show #
#    query document & multimedia                            #
#-----------------------------------------------------------#
print_info "[${BASENAME}] Create SOLR queries and ask SOLR" 1
for show in $(ls $SHOW_DIR/*.ctm)
do
    bn=$(basename $show .ctm)

    # Remove words with low confidence and keep useful tagger words.
    # The pattern is a basic regular expression: the interval needs the
    # \{3,5\} form, a bare {3,5} would be matched literally.
    cat $show \
        | $SCRIPT_PATH/KeepConfZone.pl \
        | grep -e "MOTINC\|NMS\|NMP\|NFS\|NFP\|X[A-Z]\{3,5\}" \
        | cut -f3 -d' ' > "$SHOW_DIR/$bn.confzone"

    # Get date 2 day before and after the show
    # (computed from the first 6 characters of BASENAME)
    datePattern=$($SCRIPT_PATH/daybefore2after.sh $(echo $BASENAME | cut -c1-6))

    # Create SOLR queries
    cat $SHOW_DIR/$bn".confzone" \
        | $SCRIPT_PATH/GenerateSOLRQueries.pl \
        | iconv -f ISO_8859-1 -t UTF-8 > "$SHOW_DIR/$bn.queries"

    # Ask SOLR DB, only when the query file is not empty
    if [ $(wc -w "$SHOW_DIR/$bn.queries" | cut -f1 -d' ') -gt 0 ]; then
        # Append the date filter to the query and write it back.
        query=$(cat $SHOW_DIR/$bn.queries)"&fq=docDate:[$datePattern]"
        echo $query > $SHOW_DIR/$bn.queries
        print_info "python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp" 3
        python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp
        # Deduplicate the answers, then drop the temporary files.
        cat $SOLR_RES/$bn.keywords.tmp | sort -u > $SOLR_RES/$bn.keywords
        cat $SOLR_RES/$bn.txt.tmp | sort -u > $SOLR_RES/$bn.txt
        rm $SOLR_RES/*.tmp > /dev/null 2>&1
    fi

    # In check mode, warn when SOLR produced no keywords or txt file.
    if [ $CHECK -eq 1 ]
    then
        if [ ! -e $SOLR_RES/$bn.keywords ] || [ ! -e $SOLR_RES/$bn.txt ]
        then
            print_warn "$bn.keywords and $bn.txt are empty ! Maybe SOLR server is down !" 2
            print_log_file "$LOGFILE" "$bn.keywords and $bn.txt are empty ! Maybe SOLR server is down !"
        fi
    fi
done

#-----------------------------------------------------------------------------------------------
# Build trigger file
# 1) keywords are automatically boosted in the non confident zone of the current res
#    confident zone are boosted
#    previous words in sensible zone are penalized
# 2) OOVs are extracted + phonetized
# 3) Try to find OOVs acoustically in the current segment
# 4) Generate the .trigg file
#------------------------------------------------------------------------------------------------
561670acc remove output red... |
278 |
print_info "[${BASENAME}] Build trigger files" 1
for kw_file in $(ls $SOLR_RES/*.keywords)
do
    show_name=$(basename $kw_file .keywords)

    #
    # Tokenize & produce coverage report
    # Use filter you need
    #
    print_info "[${BASENAME}] keywords filtering and produce coverage report" 3
    # Default filter
    cat $kw_file \
        | $SCRIPT_PATH/CleanFilter.sh \
        | ${SCRIPT_PATH}/ApplyCorrectionRules.pl ${LEXICON}.regex \
        | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t \
        | $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${show_name}_tmp_report $LEXICON.bdlex_tok
    # do less filter
    #cat $kw_file | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${show_name}_tmp_report $LEXICON.bdlex_tok

    #
    # Extract "real" OOV and phonetize them
    # -> small custom filtering to avoid too much noise
    #
    print_info "[${BASENAME}] Extract OOV and phonetize them" 3
    ${SCRIPT_PATH}/FindNormRules.pl $SOLR_RES/${show_name}_tmp_report/report.oov $LEXICON.bdlex_tok \
        | cut -f3 \
        | grep -v "#" \
        | grep -v "^[A-Z]\+$" \
        | grep -v "^[0-9]" \
        | grep --perl-regex -v "^([a-z']){1,3}$" \
        | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f \
        | iconv -t ISO_8859-1 -f UTF-8 \
        | ${LIA_LTBOX}/lia_phon/script/lia_lex2phon_variante \
        | grep -v "core dumped" \
        | cut -d"[" -f1 \
        | sort -u \
        | ${SCRIPT_PATH}/PhonFormatter.pl \
        | iconv -f ISO_8859-1 -t UTF-8 \
        | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $SOLR_RES/${show_name}.phon_oov

    #
    # Search INVOC & OOV in the current lattice
    #
    print_info "[${BASENAME}] Search INVOC and OOV in the current lattice" 3
    # In-vocabulary words first ...
    cat $SOLR_RES/${show_name}_tmp_report/report.invoc \
        | grep -v "\b0" \
        | cut -f1 \
        | grep -v --perl-regex "^[a-zA-Z']{1,3}$" \
        | grep -v --perl-regex "^[a-zA-Z0-9]{1,3}$" \
        | grep -v "<s>" \
        | grep -v "</s>" \
        | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $TRIGGER_CONFZONE/$show_name.tosearch
    # ... then the phonetized OOV words (first field only).
    cat $SOLR_RES/${show_name}.phon_oov | cut -f1 >> $TRIGGER_CONFZONE/$show_name.tosearch

    # For each lattice (one per segment listed for the show)
    for baseseg in $(cat "$SHOW_DIR/$show_name.lst")
    do
        $OTMEDIA_HOME/tools/QUOTE_FINDER/bin/acousticFinder \
            ${LEXICON}.speer_phon \
            $RES_CONF/wlat/$baseseg.wlat \
            $TRIGGER_CONFZONE/${show_name}.tosearch \
            $SOLR_RES/$show_name.phon_oov > $TRIGGER_CONFZONE/$baseseg.acousticlyfound $OUTPUT_REDIRECTION

        #
        # Produce the boost file for the next decoding pass
        #
        print_info "[${BASENAME}] Produce trigg file : $baseseg " 3
        cat $RES_CONF_DIR/$baseseg.res \
            | $SCRIPT_PATH/ScoreCtm2trigg.pl $TRIGGER_CONFZONE/$baseseg.acousticlyfound > $TRIGGER_CONFZONE/$baseseg.trigg
    done
done

#-----------------------------------------------------------------------------------------------
# Build the extended SPEERAL Lexicon
# 1) Merge OOVs + LEXICON
# 1) Related text are collected in order to find the invoc word with maximizing the ppl (LM proba)
# 2) The current lexicon is extended with all the valid OOVs
#-----------------------------------------------------------------------------------------------
561670acc remove output red... |
328 |
print_info "[${BASENAME}] Build extended Speeral Lexicon" 1
mkdir -p $EXT_LEX/final $EXT_LEX/tmp $EXT_LEX/tmp/txt

#
# Collect the acoustically found oov and their phonetisation
#
print_info "[${BASENAME}] Get all OOV and retrieve all phonetisation" 3
for phon_file in $(ls $SOLR_RES/*.phon_oov)
do
    show_name=$(basename $phon_file .phon_oov)
    found_list=$EXT_LEX/$show_name.acousticlyfound
    rm $found_list 2> /dev/null

    # list acoustically found for the show:
    # gather the per-segment results, then keep unique entries
    for baseseg in $(cat "$SHOW_DIR/$show_name.lst")
    do
        cat $TRIGGER_CONFZONE/$baseseg.acousticlyfound \
            | cut -f1 \
            | cut -f2 -d"=" >> $found_list
    done
    cat $found_list | sort -u > $EXT_LEX/.tmp
    mv $EXT_LEX/.tmp $found_list

    #
    # Extract OOV really added
    #
    cat $SOLR_RES/$show_name.phon_oov | cut -f1 | sort -u > $EXT_LEX/$show_name.oov
    $SCRIPT_PATH/intersec.pl $EXT_LEX/$show_name.oov $found_list > $EXT_LEX/$show_name.oov_acousticlyfound

    #
    # Retrieve all phonetisation
    #
    cat $SOLR_RES/${show_name}.phon_oov \
        | $SCRIPT_PATH/LexPhonFilter.pl $EXT_LEX/$show_name.oov_acousticlyfound > $EXT_LEX/$show_name.oov_acousticlyfound_phon
done

#
# Merge OOVs and their phonetisation
#
b427f103e update log info p... |
363 |
print_info "[${BASENAME}] Merge OOV and their phonetisation" 3
lexname=$(basename $LEXICON)
final_dir=$EXT_LEX/final

# Merge the per-show lists into global, deduplicated lists.
cat $EXT_LEX/*.oov_acousticlyfound_phon \
    | sort -u > $final_dir/all.oov_acousticlyfound_phon
cat $EXT_LEX/*.oov_acousticlyfound \
    | sort -u \
    | grep --perl-regex -v "^([a-z']){3}$" > $final_dir/all.oov_acousticlyfound
$SCRIPT_PATH/MergeLexicon.pl $final_dir/all.oov_acousticlyfound_phon > $final_dir/${lexname}_ext.phon

#
# Collect + clean retrieved txt
#
print_info "[${BASENAME}] Collect and clean SOLR txt answers" 2
# choose filter
# default
cat $SOLR_RES/*.txt \
    | $SCRIPT_PATH/CleanFilter.sh \
    | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex \
    | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $final_dir/all.bdlex_txt
# low filter
#cat $SOLR_RES/*.txt | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex > $EXT_LEX/final/all.bdlex_txt

#
# Construct the map file
#
# Notes:
# - Expected format :
#   <WORD1_STRING> <CANDIDATE1_STRING> <PHON_1>
#
b427f103e update log info p... |
386 |
print_info "[${BASENAME}] Construct map file" 3
rm -f $EXT_LEX/final/${lexname}_ext.map 2>/dev/null
rm -f $EXT_LEX/final/${lexname}.unvalid_oov 2>/dev/null

# For each acoustically found OOV: collect text containing it, ask
# getCandidate for a candidate word, and write one map line per
# phonetisation. OOVs without text or candidate go to the unvalid_oov list.
while read -r oov
do
    oov=$(echo $oov | sed "s/ //g")

    #
    # Obtain the oov's tag
    #
    #oov_tag=`grep --perl-regex "^$oov\t" $DYNAMIC_TAGSTATS/all.tags | cut -f2`

    #
    # Try to collect text containing the oov word
    #
    print_info "[${BASENAME}] Collect text containing the oov" 3
    cat $EXT_LEX/final/all.bdlex_txt \
        | grep --perl-regex " $oov " \
        | $SCRIPT_PATH/NbMaxWordsFilter.pl 40 \
        | uniq > $EXT_LEX/tmp/txt/$oov.bdlex_txt

    if [ -f $EXT_LEX/tmp/txt/$oov.bdlex_txt ]; then
        nbWords=$(wc -l $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" ")
        if [ $nbWords -eq 0 ]; then
            print_warn "[${BASENAME}] UNVALID OOV: $oov => $nbWords occurrences" 2
            echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
        else
            #
            # Find a candidate in a filtered invoc lexicon => a candidate which maximize the ppl in the overall txt collected
            #
            #echo "$/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $LEXICON.bdlex_tok $EXT_LEX/tmp/txt/$oov.bdlex_txt"
            # getCandidate is run once; its result is both logged and used.
            candidate=$($SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" ")
            print_info "$candidate" 3
            if [ ! "$candidate" == "" ]; then
                grep --perl-regex "^$oov\t" $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/tmp/$oov.phon
                while read -r phonLine
                do
                    #<word> <phon> => <word> <candidate> <phon>
                    echo "$phonLine" | sed "s|\t|\t$candidate\t|" >> $EXT_LEX/final/${lexname}_ext.map
                done < $EXT_LEX/tmp/$oov.phon
            else
                print_warn "[${BASENAME}] UNVALID OOV: $oov => no availaible Candidate word in LM" 2
                echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
            fi
        fi
    else
        print_warn "[${BASENAME}] UNVALID OOV: $oov" 2
        echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
    fi
done < $EXT_LEX/final/all.oov_acousticlyfound

#
### Speeral
#

lexname=$(basename $LEXICON)

#
# Build the final trigger file
#
b427f103e update log info p... |
441 |
print_info "[${BASENAME}] Clean trigg files" 3
mkdir -p $TRIGGER_CONFZONE/speeral/ 2> /dev/null
mkdir -p $EXT_LEX/speeral/ 2> /dev/null

# Filter every trigg file against the invalid OOV list.
# That list is written under $EXT_LEX/final/ by the map file construction,
# so it is read from there.
# NOTE(review): the list does not exist when no OOV was rejected - confirm
# how RemoveLineContaining.pl handles a missing file.
for trigg_file in $(ls $TRIGGER_CONFZONE/*.trigg)
do
    trigg_name=$(basename $trigg_file .trigg)
    cat $trigg_file \
        | $SCRIPT_PATH/RemoveLineContaining.pl $EXT_LEX/final/$lexname.unvalid_oov > $TRIGGER_CONFZONE/speeral/$trigg_name.trigg
done

#
# Compile the speeral extended lexicon
#
b427f103e update log info p... |
452 453 |
print_info "[${BASENAME}] Compile Speeral extended lexicon" 3

# Input map file and output lexicon prefix of buildmappedbinode.
ext_map=$EXT_LEX/final/${lexname}_ext.map
ext_lexicon=$EXT_LEX/speeral/${lexname}_ext

# Log the exact command line, then run it.
print_info "$SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $ext_map $AM_SKL $ext_lexicon" 3
$SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $ext_map $AM_SKL $ext_lexicon

# In check mode, stop when the lexicon check returns 1.
if [ $CHECK -eq 1 ]
then
    check_exploitconfpass_lex_check "${EXT_LEX}/speeral/${lexname}_ext"
    lex_status=$?
    if [ $lex_status -eq 1 ]
    then
        print_error "[${BASENAME}] Building Speeral Lexicon $INPUT_DIR -> exit"
        print_error "[${BASENAME}] Check $ERRORFILE"
        print_log_file $ERRORFILE "ERROR : Building Speeral Lexicon $INPUT_DIR"
        print_log_file $ERRORFILE "ERROR : ${EXT_LEX}/speeral/${lexname}_ext Empty after buildmappedbinode ?"
        exit 1
    fi
fi

#-------#
# CLOSE #
#-------#
# Seem OK
b427f103e update log info p... |
474 |
end_stamp=$(date +'%d/%m/%y %H:%M:%S')
print_info "[${BASENAME}] <= ExploitConfidencePass End | $end_stamp" 1

# unlock directory: the lock marker becomes the "unlock" marker
lock_marker="$OUTPUT_DIR/EXPLOITCONFPASS.lock"
mv "$lock_marker" "$OUTPUT_DIR/EXPLOITCONFPASS.unlock"