Commit 4188f35cd874216184dc7cd30737873852b25808
1 parent
c388b40c7c
Exists in
master
update
Showing 2 changed files with 2 additions and 1 deletion Inline Diff
1 | - Check and add Verbose messages | 1 | - Check and add Verbose messages |
2 | - Modify option -r behaviour | ||
2 | - Modify SOLR request | 3 | - Modify SOLR request |
3 | 4 |
main_tools/ExploitConfidencePass.sh
#!/bin/bash

#####################################################
# File : ExploitConfidencePass.sh                   #
# Brief : Exploit the ASR confidence pass to :      #
#         -> boost the confident zone               #
#         -> find alternative in non confident zone #
#         -> dynamicly extend the lexicon           #
# Author : Jean-François Rey                        #
#          (base on Emmanuel Ferreira               #
#          and Hugo Mauchrétien works)              #
# Version : 1.0                                     #
# Date : 25/06/13                                   #
#####################################################

echo "### ExploitConfidencePass.sh ###"

# Check OTMEDIA_HOME env var: when unset, derive it as the directory two
# levels above this script (readlink -e canonicalizes symlinks first).
if [ -z "${OTMEDIA_HOME}" ]
then
    OTMEDIA_HOME=$(dirname "$(dirname "$(readlink -e "$0")")")
    export OTMEDIA_HOME
fi
24 | 24 | ||
# Resolve the directory that contains ExploitConfidencePass.sh itself.
MAIN_SCRIPT_PATH=$(dirname "$(readlink -e "$0")")

# Default location of the shared tool scripts, unless the caller already
# provided SCRIPT_PATH in the environment.
if [ -z "${SCRIPT_PATH}" ]
then
    SCRIPT_PATH="$OTMEDIA_HOME/tools/scripts"
fi

# Include helper scripts (logging helpers and pass sanity checks).
. "$SCRIPT_PATH/Tools.sh"
. "$SCRIPT_PATH/CheckExploitConfPass.sh"

# Load the pass configuration; abort immediately when it is missing,
# since every later step depends on variables it defines.
EXPLOITCONFIDENCEPASS_CONFIG_FILE="$OTMEDIA_HOME/cfg/ExploitConfidencePass.cfg"
if [ -e "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" ]
then
    . "$EXPLOITCONFIDENCEPASS_CONFIG_FILE"
else
    echo "ERROR : Can't find configuration file $EXPLOITCONFIDENCEPASS_CONFIG_FILE" >&2
    exit 1
fi
46 | 46 | ||
#---------------#
# Parse Options #
#---------------#
# Initialize option flags. Keep any value already supplied by the
# environment or the sourced configuration file; otherwise default to
# "off" so later numeric tests ([ $DEBUG -eq 1 ], …) never see an
# unset/empty variable.
DEBUG=${DEBUG:-0}
VERBOSE=${VERBOSE:-0}
CHECK=${CHECK:-0}
RERUN=${RERUN:-0}
FORKS=${FORKS:-}

while getopts ":hDv:cf:r" opt
do
    case $opt in
        h)
            echo -e "$0 [OPTIONS] <INPUT_DIRECTORY>\n"
            echo -e "\t Options:"
            echo -e "\t\t-h :\tprint this message"
            echo -e "\t\t-D :\tDEBUG mode on"
            echo -e "\t\t-v l :\tVerbose mode, l=(1|2|3) level mode"
            echo -e "\t\t-c :\tCheck process, stop if error detected"
            echo -e "\t\t-f n :\tspecify a speeral forks number (default 1)"
            # -r takes no argument (see the getopts spec above)
            echo -e "\t\t-r :\tforce rerun without deleting files"
            exit 1
            ;;
        D)
            DEBUG=1
            ;;
        v)
            VERBOSE=$OPTARG
            ;;
        c)
            CHECK=1
            ;;
        f)
            FORKS="--forks $OPTARG"
            ;;
        r)
            RERUN=1
            ;;
        :)
            echo "Option -$OPTARG requires an argument." >&2
            exit 1
            ;;
        \?)
            echo "BAD USAGE : unknown option -$OPTARG"
            #exit 1
            ;;
    esac
done
89 | 89 | ||
# mode debug enable: trace every command (set -x) for troubleshooting
if [ "${DEBUG:-0}" -eq 1 ]
then
    set -x
    echo -e "## Mode DEBUG ON ##"
fi

# mode verbose enable
if [ "${VERBOSE:-0}" -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ;fi

# Check USAGE by arguments number: exactly one positional argument
# (the input directory) must remain after the parsed options.
if [ $(($# - (OPTIND - 1))) -ne 1 ]
then
    echo "BAD USAGE : ExploitConfidencePass.sh [OPTIONS] <INPUT_DIRECTORY>"
    echo "$0 -h for more info"
    exit 1
fi

shift $((OPTIND - 1))
# check input directory - first argument
if [ ! -e "$1" ]
then
    print_error "can't open $1"
    exit 1
fi
115 | 115 | ||
#-------------#
# GLOBAL VARS #
#-------------#
INPUT_DIR=$(readlink -e "$1")           # canonical input directory
OUTPUT_DIR=$INPUT_DIR                   # results are written in place
BASENAME=$(basename "$OUTPUT_DIR")
SHOW_DIR="$OUTPUT_DIR/shows/"           # per-show ctm segments
SOLR_RES="$OUTPUT_DIR/solr/"            # SOLR answers (keywords / txt)
EXT_LEX="$OUTPUT_DIR/LEX/"              # extended lexicon material
TRIGGER_CONFZONE="$OUTPUT_DIR/trigg/"   # trigger files for the next pass
LOGFILE="$OUTPUT_DIR/info_exploitconf.log"
ERRORFILE="$OUTPUT_DIR/error_exploitconf.log"

# Announce the pass start. (Moved below the assignments: the original
# printed this before BASENAME was set, so the log tag was empty.)
print_info "[${BASENAME}] => ExploitConfPass start | $(date +'%d/%m/%y %H:%M:%S')" 1

# Read the confidence-pass result locations from its config file;
# fall back to the res_p2 defaults when the file is absent.
CONFPASS_CONFIG_FILE="$(readlink -e "$1")/ConfPass.cfg"
if [ -e "$CONFPASS_CONFIG_FILE" ]
then
    RES_CONF_DIR=$(grep "^RES_CONF_DIR=" "$CONFPASS_CONFIG_FILE" | cut -f2 -d"=")
    RES_CONF=$(grep "^CONF_DIR=" "$CONFPASS_CONFIG_FILE" | cut -f2 -d"=")
    print_info "[${BASENAME}] Use confidence measure from : $RES_CONF" 2
else
    print_error "[${BASENAME}] Can't find $CONFPASS_CONFIG_FILE"
    print_error "[${BASENAME}] -> use res_p2"
    RES_CONF_DIR="$INPUT_DIR/conf/res_p2/scored_ctm"
    RES_CONF="$INPUT_DIR/conf/res_p2"
fi

mkdir -p "$SHOW_DIR" > /dev/null 2>&1
mkdir -p "$SOLR_RES" > /dev/null 2>&1
mkdir -p "$EXT_LEX" > /dev/null 2>&1
mkdir -p "$TRIGGER_CONFZONE" > /dev/null 2>&1
152 | 152 | ||
#------------------#
# Create Workspace #
#------------------#
# Lock directory: skip the pass when a lock is present, unless -r (rerun)
# was given. NOTE: the original tested "$OUTPUT_DIR_BASENAME/…", an
# undefined variable, so the lock created below was never honored; test
# the real lock path instead.
if [ -e "$OUTPUT_DIR/EXPLOITCONFPASS.lock" ] && [ "${RERUN:-0}" -eq 0 ]
then
    print_warn "[${BASENAME}] ExploitConfidencePass is locked -> exit" 2
    exit 1
fi
rm "$OUTPUT_DIR/EXPLOITCONFPASS.unlock" > /dev/null 2>&1
touch "$OUTPUT_DIR/EXPLOITCONFPASS.lock" > /dev/null 2>&1

#------#
# Save #
#------#
# Snapshot the effective configuration next to the results so the next
# decoding pass can source it.
cp "$EXPLOITCONFIDENCEPASS_CONFIG_FILE" "$OUTPUT_DIR/ExploitConfPass.cfg"
{
    echo "TRIGGER_DIR=$TRIGGER_CONFZONE"
    echo "TRIGGER_SPEERAL=$TRIGGER_CONFZONE/speeral/"
    # NOTE(review): ${lexname} is only assigned much later (lexicon build
    # section), so these two entries expand with an empty name here —
    # confirm whether lexname=$(basename "$LEXICON") was meant to run first.
    echo "LEX_SPEERAL=$EXT_LEX/speeral/${lexname}_ext"
    echo "LEX_BINODE_SPEERAL=$EXT_LEX/speeral/${lexname}_ext.bin"
} >> "$OUTPUT_DIR/ExploitConfPass.cfg"
print_info "[${BASENAME}] Save config in $OUTPUT_DIR/ExploitConfPass.cfg" 1
174 | 174 | ||
#---------------#
# Check Pass    #
#---------------#
# Abort when the confidence pass produced no scored result (.res) files.
if [ "$(ls "${RES_CONF_DIR}"/*.res 2> /dev/null | wc -l)" -eq 0 ]
then
    print_error "[${BASENAME}] No Conf Pass res -> exit ExploitConfPass"
    if [ "$CHECK" -eq 1 ]; then print_log_file "$ERRORFILE" "No ConfPass res in ${RES_CONF_DIR}" ;fi
    exit 1
fi

#-----------------------#
# Segmentation by show  #
#-----------------------#
# create txt file from scored res
# tag pos and lemmatization of the txt file
# merge the scored res and taglem file
# segment using the last generated file
# and create a ctm file by show

print_info "[${BASENAME}] Segmentation by show" 1

# -> to txt
print_info "[${BASENAME}] Create txt from scored res" 3
cat "${RES_CONF_DIR}"/*.res > "$INPUT_DIR/$BASENAME.sctm"
"$SIGMUND_BIN/myConvert.pl" "$INPUT_DIR/$BASENAME.sctm" "$INPUT_DIR/$BASENAME.tmp" < "$INPUT_DIR/$BASENAME.seg"
"$SCRIPT_PATH/BdlexUC.pl" "$RULES/basic" -f < "$INPUT_DIR/$BASENAME.tmp" | sed -e "s/_/ /g" | sort -nt 'n' -k '2' > "$INPUT_DIR/$BASENAME.txt"

# -> to tagger + lemme
print_info "[${BASENAME}] Tag pos and lem in txt file" 3
iconv -t ISO_8859-1 "$INPUT_DIR/$BASENAME.txt" > "$INPUT_DIR/$BASENAME.tmp"
"$SIGMUND_BIN/txt2lem.sh" "$INPUT_DIR/$BASENAME.tmp" "$INPUT_DIR/$BASENAME.taglem"

# merge sctm and taglem
print_info "[${BASENAME}] Merge scored ctm with tag pos and lem file" 3
"$SCRIPT_PATH/BdlexUC.pl" "${RULES}/basic" -f < "$INPUT_DIR/$BASENAME.sctm" | iconv -t ISO_8859-1 | "$SCRIPT_PATH/scoredCtmAndTaggedLem2All.pl" "$INPUT_DIR/$BASENAME.taglem" > "$INPUT_DIR/$BASENAME.ctl"

# -> new seg
print_info "[${BASENAME}] Create xml file and run Topic Seg" 3
"$SIGMUND_BIN/tagLem2xml.pl" "$INPUT_DIR/$BASENAME.taglem" "$INPUT_DIR/$BASENAME.doc.xml"
rm "$INPUT_DIR/$BASENAME.tmp" #$INPUT_DIR/$BASENAME.taglem

# Lia_topic_seg : bring together sentences into show
# NOTE(review): the segmenter reads a hard-coded ./0.xml, so the script
# must run from a writable working directory — confirm this assumption.
cp "$INPUT_DIR/$BASENAME.doc.xml" 0.xml
java -cp "$LIATOPICSEG/bin" Test > "$INPUT_DIR/show.seg"
"$SIGMUND_BIN/toSegEmiss.pl" "$INPUT_DIR/$BASENAME.show.seg" < "$INPUT_DIR/show.seg"
rm 0.xml "$INPUT_DIR/show.seg"

if [ "$CHECK" -eq 1 ]
then
    if [ ! -s "$INPUT_DIR/$BASENAME.show.seg" ]
    then
        print_error "[${BASENAME}] No Topic segmentation ! "
        print_error "[${BASENAME}] Check $ERRORFILE "
        print_log_file "$ERRORFILE" "No Topic segmentation in ${BASENAME}.show.seg"
    fi
fi

# Segment ctm into several show files and create a seg list by show
print_info "[${BASENAME}] Segment ctm into show files and a seg list by show" 1
"$SCRIPT_PATH/ctm2show.pl" "$INPUT_DIR/$BASENAME.ctl" "$INPUT_DIR/$BASENAME.show.seg" "$SHOW_DIR"
235 | 235 | ||
#-----------------------------------------------------------#
# SOLR QUERIES                                              #
#  -> Create Confidente Word                                #
#     Keep conf words and use Tags                          #
#  -> Query SOLR (document & multimedia)                    #
#     concat word + add date 2 day before and after the show#
#     query document & multimedia                           #
#-----------------------------------------------------------#
print_info "[${BASENAME}] Create SOLR queries and ask SOLR" 1
# Glob instead of parsing ls output; paths with spaces stay intact.
for show in "$SHOW_DIR"/*.ctm
do
    bn=$(basename "$show" .ctm)
    # Remove words with low confidence and keep useful tagger words
    "$SCRIPT_PATH/KeepConfZone.pl" < "$show" | grep -e "MOTINC\|NMS\|NMP\|NFS\|NFP\|X[A-Z]{3,5}" | cut -f3 -d' ' > "$SHOW_DIR/$bn.confzone"
    # Get date 2 day before and after the show (first 6 chars of the
    # basename are assumed to encode the date — TODO confirm format)
    datePattern=$("$SCRIPT_PATH/daybefore2after.sh" "$(echo "$BASENAME" | cut -c1-6)")
    # Create SOLR queries
    "$SCRIPT_PATH/GenerateSOLRQueries.pl" < "$SHOW_DIR/$bn.confzone" | iconv -f ISO_8859-1 -t UTF-8 > "$SHOW_DIR/$bn.queries"
    # Ask SOLR DB only when the query file actually contains words
    if [ "$(wc -w < "$SHOW_DIR/$bn.queries")" -gt 0 ]; then
        query="$(cat "$SHOW_DIR/$bn.queries")&fq=docDate:[$datePattern]"
        echo "$query" > "$SHOW_DIR/$bn.queries"
        print_info "python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp" 3
        python "$SCRIPT_PATH/ProcessSOLRQueries.py" "$SHOW_DIR/$bn.queries" "$SOLR_RES/$bn.keywords.tmp" "$SOLR_RES/$bn.txt.tmp"
        sort -u "$SOLR_RES/$bn.keywords.tmp" > "$SOLR_RES/$bn.keywords"
        sort -u "$SOLR_RES/$bn.txt.tmp" > "$SOLR_RES/$bn.txt"
        rm "$SOLR_RES"/*.tmp > /dev/null 2>&1
    fi

    if [ "$CHECK" -eq 1 ]
    then
        if [ ! -e "$SOLR_RES/$bn.keywords" ] || [ ! -e "$SOLR_RES/$bn.txt" ]
        then
            print_warn "$bn.keywords and $bn.txt are empty !\nMaybe SOLR server is down !" 2
            print_log_file "$LOGFILE" "$bn.keywords and $bn.txt are empty !\nMaybe SOLR server is down !"
        fi
    fi

done
275 | 275 | ||
#-----------------------------------------------------------------------------------------------
# Build trigger file
# 1) keywords are automatically boosted in the non confident zone of the current res
#    confident zone are boosted
#    previous words in sensible zone are penalized
# 2) OOVs are extracted + phonetized
# 3) Try to find OOVs acousticly in the current segment
# 4) Generate the .trigg file
#------------------------------------------------------------------------------------------------
print_info "[${BASENAME}] Build trigger files" 1
for i in "$SOLR_RES"/*.keywords
do
    basename=$(basename "$i" .keywords)

    #
    # Tokenize & produce coverage report
    # Use filter you need
    #
    print_info "[${BASENAME}] keywords filtering and produce coverage report" 3
    # Default filter
    "$SCRIPT_PATH/CleanFilter.sh" < "$i" | "${SCRIPT_PATH}/ApplyCorrectionRules.pl" "${LEXICON}.regex" | "$SCRIPT_PATH/BdlexUC.pl" "$RULES/basic" -t |\
        "$SCRIPT_PATH/CoverageReportMaker.pl" --out "$SOLR_RES/${basename}_tmp_report" "$LEXICON.bdlex_tok"
    # do less filter
    #cat $i | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok

    #
    # Extract "real" OOV and phonetize them
    # -> small ad-hoc filtering to avoid too much noise
    #
    print_info "[${BASENAME}] Extract OOV and phonetize them" 3
    "${SCRIPT_PATH}/FindNormRules.pl" "$SOLR_RES/${basename}_tmp_report/report.oov" "$LEXICON.bdlex_tok" | cut -f3 | grep -v "#" | grep -v "^[A-Z]\+$" | grep -v "^[0-9]" | grep --perl-regex -v "^([a-z']){1,3}$" | "$SCRIPT_PATH/BdlexUC.pl" "$RULES/basic" -f | iconv -t ISO_8859-1 -f UTF-8 | "${LIA_LTBOX}/lia_phon/script/lia_lex2phon_variante" | grep -v "core dumped" | cut -d"[" -f1 | sort -u | "${SCRIPT_PATH}/PhonFormatter.pl" | iconv -f ISO_8859-1 -t UTF-8 | "$SCRIPT_PATH/BdlexUC.pl" "$RULES/basic" -t > "$SOLR_RES/${basename}.phon_oov"

    #
    # Search INVOC & OOV in the current lattice
    #
    print_info "[${BASENAME}] Search INVOC and OOV in the current lattice" 3
    # (a duplicated "-v" flag in the original first grep was dropped;
    # behavior is identical since the repeated flag was redundant)
    grep -v "\b0" "$SOLR_RES/${basename}_tmp_report/report.invoc" | cut -f1 | grep --perl-regex -v "^[a-zA-Z']{1,3}$" | grep -v --perl-regex "^[a-zA-Z0-9]{1,3}$" | grep -v "<s>" | grep -v "</s>" | "$SCRIPT_PATH/BdlexUC.pl" "$RULES/basic" -t > "$TRIGGER_CONFZONE/$basename.tosearch"
    cut -f1 "$SOLR_RES/${basename}.phon_oov" >> "$TRIGGER_CONFZONE/$basename.tosearch"

    # For each treil (lattice segment listed for this show)
    for baseseg in $(cat "$SHOW_DIR/$basename.lst")
    do
        # NOTE(review): $OUTPUT_REDIRECTION is expanded unquoted as in the
        # original — confirm it is meant to carry extra arguments here.
        "$OTMEDIA_HOME/tools/QUOTE_FINDER/bin/acousticFinder" "${LEXICON}.speer_phon" "$RES_CONF/wlat/$baseseg.wlat" "$TRIGGER_CONFZONE/${basename}.tosearch" "$SOLR_RES/$basename.phon_oov" > "$TRIGGER_CONFZONE/$baseseg.acousticlyfound" $OUTPUT_REDIRECTION
        #
        # Produce the boost file for the next decoding pass
        #
        print_info "[${BASENAME}] Produce trigg file : $baseseg " 3
        "$SCRIPT_PATH/ScoreCtm2trigg.pl" "$TRIGGER_CONFZONE/$baseseg.acousticlyfound" < "$RES_CONF_DIR/$baseseg.res" > "$TRIGGER_CONFZONE/$baseseg.trigg"
    done

done
328 | 328 | ||
329 | #----------------------------------------------------------------------------------------------- | 329 | #----------------------------------------------------------------------------------------------- |
330 | # Build the extended SPEERAL Lexicon | 330 | # Build the extended SPEERAL Lexicon |
331 | # 1) Merge OOVs + LEXICON | 331 | # 1) Merge OOVs + LEXICON |
332 | # 1) Related text are collected in order to find the invoc word with maximizing the ppl (LM proba) | 332 | # 1) Related text are collected in order to find the invoc word with maximizing the ppl (LM proba) |
333 | # 2) The current lexicon is extended with all the valid OOVs | 333 | # 2) The current lexicon is extended with all the valid OOVs |
334 | #----------------------------------------------------------------------------------------------- | 334 | #----------------------------------------------------------------------------------------------- |
335 | print_info "[${BASENAME}] Build extended Speeral Lexicon" 1 | 335 | print_info "[${BASENAME}] Build extended Speeral Lexicon" 1 |
336 | mkdir -p $EXT_LEX/final | 336 | mkdir -p $EXT_LEX/final |
337 | mkdir -p $EXT_LEX/tmp | 337 | mkdir -p $EXT_LEX/tmp |
338 | mkdir -p $EXT_LEX/tmp/txt | 338 | mkdir -p $EXT_LEX/tmp/txt |
339 | # | 339 | # |
340 | # Collect the acousticly found oov and their phonetisation | 340 | # Collect the acousticly found oov and their phonetisation |
341 | # | 341 | # |
342 | print_info "[${BASENAME}] Get all OOV and retrieve all phonetisation" 3 | 342 | print_info "[${BASENAME}] Get all OOV and retrieve all phonetisation" 3 |
#
# For every show, gather the OOVs that were acoustically found and keep
# only the SOLR phonetisations of those OOVs.
#
for phonoov in "$SOLR_RES"/*.phon_oov
do
	# With no matching file the glob stays literal; skip it (the original
	# `for i in \`ls ...\`` silently looped zero times in that case).
	[ -e "$phonoov" ] || continue
	basename=$(basename "$phonoov" .phon_oov)

	rm "$EXT_LEX/$basename.acousticlyfound" 2> /dev/null
	# list acousticly found for the show: one .acousticlyfound file per
	# segment, field 1, value after '=' only
	for baseseg in $(cat "$SHOW_DIR/$basename.lst")
	do
		cut -f1 "$TRIGGER_CONFZONE/$baseseg.acousticlyfound" | cut -f2 -d"=" >> "$EXT_LEX/$basename.acousticlyfound"
	done
	# deduplicate the per-show list
	sort -u "$EXT_LEX/$basename.acousticlyfound" > "$EXT_LEX/.tmp"
	mv "$EXT_LEX/.tmp" "$EXT_LEX/$basename.acousticlyfound"

	#
	# Extract OOV really added (intersection of SOLR OOVs and the
	# acoustically-found words)
	#
	cut -f1 "$SOLR_RES/$basename.phon_oov" | sort -u > "$EXT_LEX/$basename.oov"
	"$SCRIPT_PATH/intersec.pl" "$EXT_LEX/$basename.oov" "$EXT_LEX/$basename.acousticlyfound" > "$EXT_LEX/$basename.oov_acousticlyfound"
	#
	# Retrieve all phonetisation for the retained OOVs
	#
	"$SCRIPT_PATH/LexPhonFilter.pl" "$EXT_LEX/$basename.oov_acousticlyfound" < "$SOLR_RES/$basename.phon_oov" > "$EXT_LEX/$basename.oov_acousticlyfound_phon"
done
366 | 366 | ||
#
# Merge OOVs and their phonetisation (all shows together)
#
print_info "[${BASENAME}] Merge OOV and their phonetisation" 3
lexname=$(basename "$LEXICON")
# all candidate phonetisations, deduplicated
cat "$EXT_LEX"/*.oov_acousticlyfound_phon | sort -u > "$EXT_LEX/final/all.oov_acousticlyfound_phon"
# all validated OOVs; drop entries that are exactly 3 chars drawn from [a-z']
# (too short to be reliable lexicon additions)
cat "$EXT_LEX"/*.oov_acousticlyfound | sort -u | grep --perl-regex -v "^([a-z']){3}$" > "$EXT_LEX/final/all.oov_acousticlyfound"
"$SCRIPT_PATH/MergeLexicon.pl" "$EXT_LEX/final/all.oov_acousticlyfound_phon" > "$EXT_LEX/final/${lexname}_ext.phon"
375 | 375 | ||
#
# Collect + clean retrieved txt
#
print_info "[${BASENAME}] Collect and clean SOLR txt answers" 2
# choose filter
# default: full chain — clean-up, correction rules, BDLEX case tagging
cat "$SOLR_RES"/*.txt | "$SCRIPT_PATH/CleanFilter.sh" | "$SCRIPT_PATH/ApplyCorrectionRules.pl" "${LEXICON}.regex" | "$SCRIPT_PATH/BdlexUC.pl" "$RULES/basic" -t > "$EXT_LEX/final/all.bdlex_txt"
# low filter (alternative, kept for reference)
#cat $SOLR_RES/*.txt | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex > $EXT_LEX/final/all.bdlex_txt
385 | 385 | ||
#
# Construct the map file
#
# Notes:
# - Expected format :
# <WORD1_STRING> <CANDIDATE1_STRING> <PHON_1>
#
print_info "[${BASENAME}] Construct map file" 3
rm -f "$EXT_LEX/final/${lexname}_ext.map" 2>/dev/null
rm -f "$EXT_LEX/final/${lexname}.unvalid_oov" 2>/dev/null

while IFS= read -r oov
do
	# defensive: strip stray newline characters (read already removed the
	# trailing one; the original piped through `sed "s/\n//g"`, a no-op)
	oov=${oov//$'\n'/}
	#
	# Obtain the oov's tag
	#
	#oov_tag=`grep --perl-regex "^$oov\t" $DYNAMIC_TAGSTATS/all.tags | cut -f2`
	#
	# Try to collect text containing the oov word (bounded to 40-word lines)
	#
	print_info "[${BASENAME}] Collect text containing the oov" 3
	grep --perl-regex " $oov " "$EXT_LEX/final/all.bdlex_txt" | "$SCRIPT_PATH/NbMaxWordsFilter.pl" 40 | uniq > "$EXT_LEX/tmp/txt/$oov.bdlex_txt"
	if [ -f "$EXT_LEX/tmp/txt/$oov.bdlex_txt" ]; then
		# `wc -l < file` prints the bare count (no filename to strip)
		nbWords=$(wc -l < "$EXT_LEX/tmp/txt/$oov.bdlex_txt")
		if [ "$nbWords" -eq 0 ]; then
			print_warn "[${BASENAME}] UNVALID OOV: $oov => $nbWords occurrences" 2
			echo "$oov" >> "$EXT_LEX/final/${lexname}.unvalid_oov"
		else
			#
			# Find a candidate in a filtred invoc lexicon => a candidate which maximize the ppl in the overall txt collected
			#
			# Run getCandidate once and reuse the result (the original
			# invoked the binary twice: once for logging, once for capture)
			candidate=$("$SPEERAL_PATH/bin/getCandidate" "$SPEER_LM_PATH" "$SPEER_LM_BASENAME" "$oov" "$CANDIDATE_LEXICON" "$EXT_LEX/tmp/txt/$oov.bdlex_txt" | cut -f1 -d" ")
			print_info "$candidate" 3
			if [ -n "$candidate" ]; then
				grep --perl-regex "^$oov\t" "$EXT_LEX/final/all.oov_acousticlyfound_phon" > "$EXT_LEX/tmp/$oov.phon"
				while IFS= read -r phonLine
				do
					#<word> <phon> => <word> <candidate> <phon>
					printf '%s\n' "$phonLine" | sed "s|\t|\t$candidate\t|" >> "$EXT_LEX/final/${lexname}_ext.map"
				done < "$EXT_LEX/tmp/$oov.phon"
			else
				print_warn "[${BASENAME}] UNVALID OOV: $oov => no availaible Candidate word in LM" 2
				echo "$oov" >> "$EXT_LEX/final/${lexname}.unvalid_oov"
			fi
		fi
	else
		print_warn "[${BASENAME}] UNVALID OOV: $oov" 2
		echo "$oov" >> "$EXT_LEX/final/${lexname}.unvalid_oov"
	fi
done < "$EXT_LEX/final/all.oov_acousticlyfound"
438 | 438 | ||
#
### Speeral
#

lexname=$(basename "$LEXICON")
#
# Build the final trigger file: drop triggers containing rejected OOVs
#
print_info "[${BASENAME}] Clean trigg files" 3
mkdir -p "$TRIGGER_CONFZONE/speeral/" 2> /dev/null
mkdir -p "$EXT_LEX/speeral/" 2> /dev/null
for trigg in "$TRIGGER_CONFZONE"/*.trigg
do
	[ -e "$trigg" ] || continue
	basename=$(basename "$trigg" .trigg)
	# NOTE(review): the unvalid_oov list is written under $EXT_LEX/final/
	# by the map-file step, but was read here from $EXT_LEX/ (a path that
	# is never created) — fixed to point at the file actually produced.
	"$SCRIPT_PATH/RemoveLineContaining.pl" "$EXT_LEX/final/$lexname.unvalid_oov" < "$trigg" > "$TRIGGER_CONFZONE/speeral/$basename.trigg"
done
#
# Compile the speeral extended lexicon from the base lexicon + map file
#
print_info "[${BASENAME}] Compile Speeral extended lexicon" 3
print_info "$SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext" 3
"$SPEERAL_PATH/bin/buildmappedbinode" "$LEXICON.bdlex_phon" "$EXT_LEX/final/${lexname}_ext.map" "$AM_SKL" "$EXT_LEX/speeral/${lexname}_ext"

if [ "$CHECK" -eq 1 ]
then
	# abort when the compiled lexicon is missing/empty (check helper
	# returns 1 on failure)
	check_exploitconfpass_lex_check "${EXT_LEX}/speeral/${lexname}_ext"
	if [ $? -eq 1 ]
	then
		print_error "[${BASENAME}] Building Speeral Lexicon $INPUT_DIR -> exit"
		print_error "[${BASENAME}] Check $ERRORFILE"
		print_log_file "$ERRORFILE" "ERROR : Building Speeral Lexicon $INPUT_DIR"
		print_log_file "$ERRORFILE" "ERROR : ${EXT_LEX}/speeral/${lexname}_ext Empty after buildmappedbinode ?"
		exit 1;
	fi
fi
474 | 474 | ||
475 | 475 | ||
#-------#
# CLOSE #
#-------#
# Processing finished without fatal errors.
print_info "[${BASENAME}] <= ExploitConfidencePass End | $(date +'%d/%m/%y %H:%M:%S')" 1

# Release the directory lock: rename the lock marker so later passes may run.
lockbase="$OUTPUT_DIR/EXPLOITCONFPASS"
mv "${lockbase}.lock" "${lockbase}.unlock"
484 | 484 | ||
485 | 485 | ||
486 | 486 |