Commit 7c52739538a4527d4e71336e7d451c9f58c11eab
1 parent 1fa99e8a2b
Exists in master
Showing 1 changed file with 2 additions and 0 deletions (inline diff; added lines are marked with +)

main_tools/ExploitConfidencePass.sh
#!/bin/bash

#####################################################
# File    : ExploitConfidencePass.sh                #
# Brief   : Exploit the ASR confidence pass to :    #
#           -> boost the confident zones            #
#           -> find alternatives in the             #
#              non-confident zones                  #
#           -> dynamically extend the lexicon       #
# Author  : Jean-François Rey                       #
#           (based on the work of Emmanuel          #
#           Ferreira and Hugo Mauchrétien)          #
# Version : 1.0                                     #
# Date    : 25/06/13                                #
#####################################################

echo "### ExploitConfidencePass.sh ###"

# Check the OTMEDIA_HOME env var
if [ -z "${OTMEDIA_HOME}" ]
then
    OTMEDIA_HOME=$(dirname $(dirname $(readlink -e $0)))
    export OTMEDIA_HOME
fi
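# Note (illustration; the paths are hypothetical): if the script lives at
# /opt/otmedia/main_tools/ExploitConfidencePass.sh, then
#   readlink -e $0             -> /opt/otmedia/main_tools/ExploitConfidencePass.sh
#   dirname $(dirname ...)     -> /opt/otmedia
# so OTMEDIA_HOME defaults to the directory above main_tools/.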

# where is ExploitConfidencePass.sh
MAIN_SCRIPT_PATH=$(dirname $(readlink -e $0))

if [ -z "${SCRIPT_PATH}" ]
then
    SCRIPT_PATH=$OTMEDIA_HOME/tools/scripts
fi

# Include scripts
. $SCRIPT_PATH"/Tools.sh"
. $SCRIPT_PATH"/CheckExploitConfPass.sh"

# where is ExploitConfidencePass.cfg
EXPLOITCONFIDENCEPASS_CONFIG_FILE=$OTMEDIA_HOME"/cfg/ExploitConfidencePass.cfg"
if [ -e $EXPLOITCONFIDENCEPASS_CONFIG_FILE ]
then
    . $EXPLOITCONFIDENCEPASS_CONFIG_FILE
else
    echo "ERROR : Can't find configuration file $EXPLOITCONFIDENCEPASS_CONFIG_FILE" >&2
    exit 1
fi

#---------------#
# Parse Options #
#---------------#
while getopts ":hDv:cr" opt
do
    case $opt in
        h)
            echo -e "$0 [OPTIONS] <INPUT_DIRECTORY>\n"
            echo -e "\t Options:"
            echo -e "\t\t-h :\tprint this message"
            echo -e "\t\t-D :\tDEBUG mode on"
            echo -e "\t\t-v l :\tverbose mode, l=(1|2|3) level"
            echo -e "\t\t-c :\tcheck process, stop if an error is detected"
            echo -e "\t\t-r :\tforce rerun without deleting files"
            exit 1
            ;;
        D)
            DEBUG=1
            ;;
        v)
            VERBOSE=$OPTARG
            ;;
        c)
            CHECK=1
            ;;
        r)
            RERUN=1
            ;;
        :)
            echo "Option -$OPTARG requires an argument." >&2
            exit 1
            ;;
        \?)
            echo "BAD USAGE : unknown option -$OPTARG"
            #exit 1
            ;;
    esac
done
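# Note: the leading ':' in ":hDv:cr" enables getopts' silent error handling,
# which is what routes a missing option argument to the ':' case and an
# unknown option to the '\?' case above.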

# debug mode enabled
if [ "${DEBUG:-0}" -eq 1 ]
then
    set -x
    echo -e "## Mode DEBUG ON ##"
fi

# verbose mode enabled
if [ "${VERBOSE:-0}" -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ;fi

# Check USAGE by argument count
if [ $(($#-($OPTIND-1))) -ne 1 ]
then
    echo "BAD USAGE : ExploitConfidencePass.sh [OPTIONS] <INPUT_DIRECTORY>"
    echo "$0 -h for more info"
    exit 1
fi
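# Note: after getopts, OPTIND points one past the last parsed option, so
# $(($# - (OPTIND - 1))) is the number of remaining positional arguments.
# For example (hypothetical invocation), `ExploitConfidencePass.sh -D -v 2 /data/run1`
# gives $# = 4 and OPTIND = 4, hence 4 - 3 = 1: exactly one <INPUT_DIRECTORY>.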

shift $((OPTIND-1))
# check the input directory - first argument
if [ ! -e "$1" ]
then
    print_error "can't open $1"
    exit 1
fi

#-------------#
# GLOBAL VARS #
#-------------#
INPUT_DIR=$(readlink -e $1)
OUTPUT_DIR=$INPUT_DIR
BASENAME=$(basename $OUTPUT_DIR)
SHOW_DIR="$OUTPUT_DIR/shows/"
SOLR_RES="$OUTPUT_DIR/solr/"
EXT_LEX="$OUTPUT_DIR/LEX/"
TRIGGER_CONFZONE="$OUTPUT_DIR/trigg/"
LOGFILE="$OUTPUT_DIR/info_exploitconf.log"
ERRORFILE="$OUTPUT_DIR/error_exploitconf.log"

print_info "[${BASENAME}] => ExploitConfPass start | $(date +'%d/%m/%y %H:%M:%S')" 1

CONFPASS_CONFIG_FILE="$(readlink -e $1)/ConfPass.cfg"
if [ -e $CONFPASS_CONFIG_FILE ]
then
    RES_CONF_DIR=$(cat $CONFPASS_CONFIG_FILE | grep "^RES_CONF_DIR=" | cut -f2 -d"=")
    RES_CONF=$(cat $CONFPASS_CONFIG_FILE | grep "^CONF_DIR=" | cut -f2 -d"=")
    print_info "[${BASENAME}] Use confidence measure from : $RES_CONF" 2
else
    print_error "[${BASENAME}] Can't find $CONFPASS_CONFIG_FILE"
    print_error "[${BASENAME}] -> use res_p2"
    RES_CONF_DIR="$INPUT_DIR/conf/res_p2/scored_ctm"
    RES_CONF="$INPUT_DIR/conf/res_p2"
fi

mkdir -p $SHOW_DIR > /dev/null 2>&1
mkdir -p $SOLR_RES > /dev/null 2>&1
mkdir -p $EXT_LEX > /dev/null 2>&1
mkdir -p $TRIGGER_CONFZONE > /dev/null 2>&1

#------------------#
# Create Workspace #
#------------------#
# Lock the directory
if [ -e "$OUTPUT_DIR/EXPLOITCONFPASS.lock" ] && [ "${RERUN:-0}" -eq 0 ]
then
    print_warn "[${BASENAME}] ExploitConfidencePass is locked -> exit" 2
    exit 1
fi
rm "$OUTPUT_DIR/EXPLOITCONFPASS.unlock" > /dev/null 2>&1
touch "$OUTPUT_DIR/EXPLOITCONFPASS.lock" > /dev/null 2>&1
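# Note: the check-then-touch sequence above is not atomic; two concurrent runs
# could both pass the test before either creates the lock. A race-free sketch
# (same file names, relying on mkdir's atomic create) would be:
#
#   if ! mkdir "$OUTPUT_DIR/EXPLOITCONFPASS.lock" 2>/dev/null && [ "${RERUN:-0}" -eq 0 ]
#   then
#       print_warn "[${BASENAME}] ExploitConfidencePass is locked -> exit" 2
#       exit 1
#   fi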

+rm $LOGFILE $ERRORFILE 2>/dev/null
+
#------#
# Save #
#------#
cp $EXPLOITCONFIDENCEPASS_CONFIG_FILE $OUTPUT_DIR/ExploitConfPass.cfg
echo "TRIGGER_DIR=$TRIGGER_CONFZONE" >> $OUTPUT_DIR/ExploitConfPass.cfg
echo "TRIGGER_SPEERAL=$TRIGGER_CONFZONE/speeral/" >> $OUTPUT_DIR/ExploitConfPass.cfg
echo "LEX_SPEERAL=$EXT_LEX/speeral/${lexname}_ext" >> $OUTPUT_DIR/ExploitConfPass.cfg
echo "LEX_BINODE_SPEERAL=$EXT_LEX/speeral/${lexname}_ext.bin" >> $OUTPUT_DIR/ExploitConfPass.cfg
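# Note: ${lexname} is only assigned further down (see the "Merge" step); unless
# the sourced ExploitConfidencePass.cfg defines it, the two LEX_* entries above
# are written with an empty basename (i.e. ".../speeral/_ext").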
print_info "[${BASENAME}] Save config in $OUTPUT_DIR/ExploitConfPass.cfg" 1

#------------#
# Check Pass #
#------------#
if [ $( ls ${RES_CONF_DIR}/*.res 2> /dev/null | wc -l) -eq 0 ]
then
    print_error "[${BASENAME}] No Conf Pass res -> exit ExploitConfPass"
    if [ "${CHECK:-0}" -eq 1 ]; then print_log_file $ERRORFILE "No ConfPass res in ${RES_CONF_DIR}" ;fi
    exit 1
fi

#----------------------#
# Segmentation by show #
#----------------------#
# create a txt file from the scored res
# POS-tag and lemmatize the txt file
# merge the scored res and the taglem file
# segment using the last generated file
# and create one ctm file per show
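# Note: these steps map onto the commands below roughly as follows (the
# helpers are assumed to belong to the OTMEDIA/SIGMUND tool chain):
#   myConvert.pl + BdlexUC.pl      -> scored res to plain txt
#   txt2lem.sh                     -> POS tags and lemmas (.taglem)
#   scoredCtmAndTaggedLem2All.pl   -> merge scored ctm with taglem (.ctl)
#   tagLem2xml.pl + LIATOPICSEG    -> topic segmentation (.show.seg)
#   ctm2show.pl                    -> one ctm file and one seg list per show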

print_info "[${BASENAME}] Segmentation by show" 1

# -> to txt
print_info "[${BASENAME}] Create txt from scored res" 3
cat ${RES_CONF_DIR}/*.res > $INPUT_DIR/$BASENAME.sctm
cat $INPUT_DIR/$BASENAME.seg | $SIGMUND_BIN/myConvert.pl $INPUT_DIR/$BASENAME.sctm $INPUT_DIR/$BASENAME.tmp
cat $INPUT_DIR/$BASENAME.tmp | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | sed -e "s/_/ /g" | sort -nt 'n' -k '2' > $INPUT_DIR/$BASENAME.txt

# -> tag POS + lemmatize
print_info "[${BASENAME}] Tag POS and lem in txt file" 3
iconv -t ISO_8859-1 $INPUT_DIR/$BASENAME.txt > $INPUT_DIR/$BASENAME.tmp
$SIGMUND_BIN/txt2lem.sh $INPUT_DIR/$BASENAME.tmp $INPUT_DIR/$BASENAME.taglem

# merge sctm and taglem
print_info "[${BASENAME}] Merge scored ctm with POS tag and lem file" 3
cat $INPUT_DIR/$BASENAME.sctm | $SCRIPT_PATH/BdlexUC.pl ${RULES}/basic -f | iconv -t ISO_8859-1 | $SCRIPT_PATH/scoredCtmAndTaggedLem2All.pl $INPUT_DIR/$BASENAME.taglem > $INPUT_DIR/$BASENAME.ctl

# -> new seg
print_info "[${BASENAME}] Create xml file and run Topic Seg" 3
$SIGMUND_BIN/tagLem2xml.pl $INPUT_DIR/$BASENAME.taglem $INPUT_DIR/$BASENAME.doc.xml
rm $INPUT_DIR/$BASENAME.tmp #$INPUT_DIR/$BASENAME.taglem

# Lia_topic_seg : group sentences into shows (Test reads 0.xml from the current directory)
cp $INPUT_DIR/$BASENAME.doc.xml 0.xml
java -cp $LIATOPICSEG/bin Test > $INPUT_DIR/show.seg
cat $INPUT_DIR/show.seg | $SIGMUND_BIN/toSegEmiss.pl $INPUT_DIR/$BASENAME.show.seg
rm 0.xml $INPUT_DIR/show.seg

if [ "${CHECK:-0}" -eq 1 ]
then
    if [ ! -s $INPUT_DIR/$BASENAME.show.seg ]
    then
        print_error "[${BASENAME}] No Topic segmentation !"
        print_error "[${BASENAME}] Check $ERRORFILE"
        print_log_file "$ERRORFILE" "No Topic segmentation in ${BASENAME}.show.seg"
    fi
fi

# Split the ctm into one file per show and create a seg list per show
print_info "[${BASENAME}] Segment ctm into show files and a seg list by show" 1
$SCRIPT_PATH/ctm2show.pl $INPUT_DIR/$BASENAME.ctl $INPUT_DIR/$BASENAME.show.seg $SHOW_DIR

#------------------------------------------------------------#
# SOLR QUERIES                                               #
# -> Create confident word lists                             #
#    keep confident words, using their POS tags              #
# -> Query SOLR (document & multimedia)                      #
#    concat the words + add dates 2 days before and          #
#    after the show, query document & multimedia             #
#------------------------------------------------------------#
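# Note: the exact query syntax is produced by GenerateSOLRQueries.pl; assuming
# standard Solr range-filter syntax, the final string written to $bn.queries
# would look something like (hypothetical values):
#   chirac+sarkozy+bruxelles&fq=docDate:[20130623 TO 20130627]
# i.e. the confident words as the query, restricted to documents dated from
# 2 days before to 2 days after the show.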
print_info "[${BASENAME}] Create SOLR queries and ask SOLR" 1
for show in $(ls $SHOW_DIR/*.ctm)
do
    bn=$(basename $show .ctm)
    # Remove low-confidence words and keep the useful tagged words
    cat $show | $SCRIPT_PATH/KeepConfZone.pl | grep -e "MOTINC\|NMS\|NMP\|NFS\|NFP\|X[A-Z]{3,5}" | cut -f3 -d' ' > "$SHOW_DIR/$bn.confzone"
    # Get the dates 2 days before and after the show
    datePattern=`$SCRIPT_PATH/daybefore2after.sh $(echo $BASENAME | cut -c1-6)`
    # Create SOLR queries
    cat $SHOW_DIR/$bn".confzone" | $SCRIPT_PATH/GenerateSOLRQueries.pl | iconv -f ISO_8859-1 -t UTF-8 > "$SHOW_DIR/$bn.queries"
    # Query the SOLR DB
    if [ $(wc -w "$SHOW_DIR/$bn.queries" | cut -f1 -d' ') -gt 0 ]; then
        query=$(cat $SHOW_DIR/$bn.queries)"&fq=docDate:[$datePattern]"
        echo $query > $SHOW_DIR/$bn.queries
        print_info "python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp" 3
        python $SCRIPT_PATH/ProcessSOLRQueries.py $SHOW_DIR/$bn.queries $SOLR_RES/$bn.keywords.tmp $SOLR_RES/$bn.txt.tmp
        cat $SOLR_RES/$bn.keywords.tmp | sort -u > $SOLR_RES/$bn.keywords
        cat $SOLR_RES/$bn.txt.tmp | sort -u > $SOLR_RES/$bn.txt
        rm $SOLR_RES/*.tmp > /dev/null 2>&1
    fi

    if [ "${CHECK:-0}" -eq 1 ]
    then
        if [ ! -e $SOLR_RES/$bn.keywords ] || [ ! -e $SOLR_RES/$bn.txt ]
        then
            print_warn "$bn.keywords and $bn.txt are empty!\nMaybe the SOLR server is down!" 2
            print_log_file "$LOGFILE" "$bn.keywords and $bn.txt are empty!\nMaybe the SOLR server is down!"
        fi
    fi

done

#-----------------------------------------------------------------------------------------------
# Build the trigger files
#  1) keywords are automatically boosted in the non-confident zones of the current res :
#     confident zones are boosted,
#     previous words in sensitive zones are penalized
#  2) OOVs are extracted and phonetized
#  3) the OOVs are searched for acoustically in the current segment
#  4) the .trigg files are generated
#------------------------------------------------------------------------------------------------
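# Note: in the loop below these four steps correspond to
#   CleanFilter.sh + CoverageReportMaker.pl  -> keyword filtering, INVOC/OOV split
#   lia_lex2phon_variante + PhonFormatter.pl -> OOV phonetization (.phon_oov)
#   acousticFinder                           -> acoustic search in the word lattices (.wlat)
#   ScoreCtm2trigg.pl                        -> .trigg boost files for the next decoding pass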
print_info "[${BASENAME}] Build trigger files" 1
for i in `ls $SOLR_RES/*.keywords`
do
    basename=`basename $i .keywords`

    #
    # Tokenize & produce a coverage report
    # Use the filter you need
    #
    print_info "[${BASENAME}] Filter keywords and produce coverage report" 3
    # Default filter
    cat $i | $SCRIPT_PATH/CleanFilter.sh | ${SCRIPT_PATH}/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t |\
    $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok
    # lighter filter
    #cat $i | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | $SCRIPT_PATH/CoverageReportMaker.pl --out $SOLR_RES/${basename}_tmp_report $LEXICON.bdlex_tok


    #
    # Extract the "real" OOVs and phonetize them
    # -> a small custom filter to avoid picking up too much noise
    #
    print_info "[${BASENAME}] Extract OOV and phonetize them" 3
    ${SCRIPT_PATH}/FindNormRules.pl $SOLR_RES/${basename}_tmp_report/report.oov $LEXICON.bdlex_tok | cut -f3 | grep -v "#" | grep -v "^[A-Z]\+$" | grep -v "^[0-9]" | grep --perl-regex -v "^([a-z']){1,3}$" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f | iconv -t ISO_8859-1 -f UTF-8 | ${LIA_LTBOX}/lia_phon/script/lia_lex2phon_variante | grep -v "core dumped" | cut -d"[" -f1 | sort -u | ${SCRIPT_PATH}/PhonFormatter.pl | iconv -f ISO_8859-1 -t UTF-8 | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $SOLR_RES/${basename}.phon_oov

    #
    # Search INVOC & OOV words in the current lattice
    #
    print_info "[${BASENAME}] Search INVOC and OOV in the current lattice" 3
    cat $SOLR_RES/${basename}_tmp_report/report.invoc | grep -v "\b0" | cut -f1 | grep --perl-regex -v "^[a-zA-Z']{1,3}$" | grep --perl-regex -v "^[a-zA-Z0-9]{1,3}$" | grep -v "<s>" | grep -v "</s>" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $TRIGGER_CONFZONE/$basename.tosearch
    cat $SOLR_RES/${basename}.phon_oov | cut -f1 >> $TRIGGER_CONFZONE/$basename.tosearch

    # For each lattice
    for baseseg in $(cat "$SHOW_DIR/$basename.lst")
    do
        $OTMEDIA_HOME/tools/QUOTE_FINDER/bin/acousticFinder ${LEXICON}.speer_phon $RES_CONF/wlat/$baseseg.wlat $TRIGGER_CONFZONE/${basename}.tosearch $SOLR_RES/$basename.phon_oov > $TRIGGER_CONFZONE/$baseseg.acousticlyfound $OUTPUT_REDIRECTION
        #
        # Produce the boost file for the next decoding pass
        #
        print_info "[${BASENAME}] Produce trigg file : $baseseg" 3
        cat $RES_CONF_DIR/$baseseg.res | $SCRIPT_PATH/ScoreCtm2trigg.pl $TRIGGER_CONFZONE/$baseseg.acousticlyfound > $TRIGGER_CONFZONE/$baseseg.trigg
    done

done

#-----------------------------------------------------------------------------------------------
# Build the extended SPEERAL lexicon
#  1) merge the OOVs with the LEXICON
#  2) collect the related text to find the in-vocabulary word maximizing the ppl (LM probability)
#  3) extend the current lexicon with all the valid OOVs
#-----------------------------------------------------------------------------------------------
print_info "[${BASENAME}] Build extended Speeral Lexicon" 1
mkdir -p $EXT_LEX/final
mkdir -p $EXT_LEX/tmp
mkdir -p $EXT_LEX/tmp/txt
#
# Collect the acoustically found OOVs and their phonetizations
#
print_info "[${BASENAME}] Get all OOVs and retrieve all phonetizations" 3
for i in `ls $SOLR_RES/*.phon_oov`
do
    basename=`basename $i .phon_oov`

    rm $EXT_LEX/$basename.acousticlyfound 2> /dev/null
    # list what was acoustically found for the show
    for baseseg in $(cat "$SHOW_DIR/$basename.lst")
    do
        cat $TRIGGER_CONFZONE/$baseseg.acousticlyfound | cut -f1 | cut -f2 -d"=" >> $EXT_LEX/$basename.acousticlyfound
    done
    cat $EXT_LEX/$basename.acousticlyfound | sort -u > $EXT_LEX/.tmp
    mv $EXT_LEX/.tmp $EXT_LEX/$basename.acousticlyfound

    #
    # Extract the OOVs actually added
    #
    cat $SOLR_RES/$basename.phon_oov | cut -f1 | sort -u > $EXT_LEX/$basename.oov
    $SCRIPT_PATH/intersec.pl $EXT_LEX/$basename.oov $EXT_LEX/$basename.acousticlyfound > $EXT_LEX/$basename.oov_acousticlyfound
    #
    # Retrieve all their phonetizations
    #
    cat $SOLR_RES/${basename}.phon_oov | $SCRIPT_PATH/LexPhonFilter.pl $EXT_LEX/$basename.oov_acousticlyfound > $EXT_LEX/$basename.oov_acousticlyfound_phon
done

#
# Merge the OOVs and their phonetizations
#
print_info "[${BASENAME}] Merge OOVs and their phonetizations" 3
lexname=$(basename $LEXICON)
cat $EXT_LEX/*.oov_acousticlyfound_phon | sort -u > $EXT_LEX/final/all.oov_acousticlyfound_phon
cat $EXT_LEX/*.oov_acousticlyfound | sort -u | grep --perl-regex -v "^([a-z']){3}$" > $EXT_LEX/final/all.oov_acousticlyfound
$SCRIPT_PATH/MergeLexicon.pl $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/final/${lexname}_ext.phon

#
# Collect + clean the retrieved text
#
print_info "[${BASENAME}] Collect and clean SOLR txt answers" 2
# choose a filter
# default
cat $SOLR_RES/*.txt | $SCRIPT_PATH/CleanFilter.sh | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t > $EXT_LEX/final/all.bdlex_txt
# light filter
#cat $SOLR_RES/*.txt | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex > $EXT_LEX/final/all.bdlex_txt

#
# Construct the map file
#
# Notes:
# - Expected format:
#     <WORD1_STRING> <CANDIDATE1_STRING> <PHON_1>
#
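# Note: a map entry pairs an OOV with an in-vocabulary candidate and the OOV's
# phonetization; the entry below is made up for illustration (tab-separated):
#   copahue    copa    k o p a u
# The candidate in column 2 is chosen by getCandidate below, so that the OOV
# inherits the language-model behaviour of a known word.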
print_info "[${BASENAME}] Construct map file" 3
rm -f $EXT_LEX/final/${lexname}_ext.map 2>/dev/null
rm -f $EXT_LEX/final/${lexname}.unvalid_oov 2>/dev/null

while read oov
do
    oov=`echo $oov | sed "s/\n//g"`
    #
    # Obtain the oov's tag
    #
    #oov_tag=`grep --perl-regex "^$oov\t" $DYNAMIC_TAGSTATS/all.tags | cut -f2`
    #
    # Try to collect text containing the oov word
    #
    print_info "[${BASENAME}] Collect text containing the oov" 3
    cat $EXT_LEX/final/all.bdlex_txt | grep --perl-regex " $oov " | $SCRIPT_PATH/NbMaxWordsFilter.pl 40 | uniq > $EXT_LEX/tmp/txt/$oov.bdlex_txt
    if [ -f $EXT_LEX/tmp/txt/$oov.bdlex_txt ]; then
        nbWords=`wc -l $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "`
        if [ $nbWords -eq 0 ]; then
            print_warn "[${BASENAME}] INVALID OOV: $oov => $nbWords occurrences" 2
            echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
        else
            #
            # Find a candidate in a filtered in-vocabulary lexicon => the candidate maximizing the ppl over the collected text
            #
            #echo "$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt"
            candidate=`$SPEERAL_PATH/bin/getCandidate $SPEER_LM_PATH $SPEER_LM_BASENAME $oov $CANDIDATE_LEXICON $EXT_LEX/tmp/txt/$oov.bdlex_txt | cut -f1 -d" "`
            print_info "$candidate" 3
            if [ ! "$candidate" == "" ]; then
                grep --perl-regex "^$oov\t" $EXT_LEX/final/all.oov_acousticlyfound_phon > $EXT_LEX/tmp/$oov.phon
                while read phonLine
                do
                    # <word> <phon> => <word> <candidate> <phon>
                    echo "$phonLine" | sed "s|\t|\t$candidate\t|" >> $EXT_LEX/final/${lexname}_ext.map
                done < $EXT_LEX/tmp/$oov.phon
            else
                print_warn "[${BASENAME}] INVALID OOV: $oov => no available candidate word in the LM" 2
                echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
            fi
        fi
    else
        print_warn "[${BASENAME}] INVALID OOV: $oov" 2
        echo "$oov" >> $EXT_LEX/final/${lexname}.unvalid_oov
    fi
done < $EXT_LEX/final/all.oov_acousticlyfound

#
### Speeral
#

lexname=`basename $LEXICON`
#
# Build the final trigger files
#
print_info "[${BASENAME}] Clean trigg files" 3
mkdir -p $TRIGGER_CONFZONE/speeral/ 2> /dev/null
mkdir -p $EXT_LEX/speeral/ 2> /dev/null
for i in `ls $TRIGGER_CONFZONE/*.trigg`
do
    basename=`basename $i .trigg`
    cat $i | $SCRIPT_PATH/RemoveLineContaining.pl $EXT_LEX/final/$lexname.unvalid_oov > $TRIGGER_CONFZONE/speeral/$basename.trigg
done
#
# Compile the speeral extended lexicon
#
print_info "[${BASENAME}] Compile Speeral extended lexicon" 3
print_info "$SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext" 3
$SPEERAL_PATH/bin/buildmappedbinode $LEXICON.bdlex_phon $EXT_LEX/final/${lexname}_ext.map $AM_SKL $EXT_LEX/speeral/${lexname}_ext

if [ "${CHECK:-0}" -eq 1 ]
then
    check_exploitconfpass_lex_check "${EXT_LEX}/speeral/${lexname}_ext"
    if [ $? -eq 1 ]
    then
        print_error "[${BASENAME}] Building Speeral Lexicon $INPUT_DIR failed -> exit"
        print_error "[${BASENAME}] Check $ERRORFILE"
        print_log_file $ERRORFILE "ERROR : Building Speeral Lexicon $INPUT_DIR"
        print_log_file $ERRORFILE "ERROR : ${EXT_LEX}/speeral/${lexname}_ext empty after buildmappedbinode ?"
        exit 1;
    fi
fi


#-------#
# CLOSE #
#-------#
# Seems OK
print_info "[${BASENAME}] <= ExploitConfidencePass End | $(date +'%d/%m/%y %H:%M:%S')" 1

# unlock the directory
mv "$OUTPUT_DIR/EXPLOITCONFPASS.lock" "$OUTPUT_DIR/EXPLOITCONFPASS.unlock"