Commit 0bf609bcceb3af008651888fa40b72c381245e37

Authored by Jean-François Rey
1 parent 87013ba29c
Exists in master

update and add script to extract TV corpus

Showing 6 changed files with 24 additions and 17 deletions Side-by-side Diff

... ... @@ -3,6 +3,7 @@
3 3 # version 1.0 #
4 4 #-------------------#
5 5  
  6 +"Observatoire Transmedia pour l'étude des évolutions et transformations du monde médiatique"
6 7  
7 8 OTMEDIA_HOME
8 9 install.sh
1   --Check and add Verbose messages
  1 +- Check and add Verbose messages
  2 +- Modify SOLR request
main_tools/ConfPass.sh
... ... @@ -204,9 +204,11 @@
204 204 # create USF configuration file
205 205 echo -e "name $AUTHOR\nfileName $BASENAME\nfileExt wav\nsegFile $OUTPUT_DIR/$BASENAME.seg" > $OUTPUT_DIR/$BASENAME.usf_cfg
206 206 # create USF file
207   -$SCRIPT_PATH/res2out.pl --dir $RES_CONF_DIR --format USF --ignore $RULES/asupp --out $USF_FILE --usf_config $OUTPUT_DIR/$BASENAME.usf_cfg
  207 +$SCRIPT_PATH/res2out.pl --dir $RES_CONF_DIR --format USF --ignore $RULES/asupp --out $USF_FILE.tmp --usf_config $OUTPUT_DIR/$BASENAME.usf_cfg
208 208 rm $OUTPUT_DIR/$BASENAME.usf_cfg
  209 +cat $USF_FILE.tmp | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f > $USF_FILE
209 210 cp $USF_FILE ${OUTPUT_DIR}/${BASENAME}.usf
  211 +rm $USF_FILE.tmp
210 212  
211 213 #----------------#
212 214 # Check USF file #
main_tools/FirstPass.sh
... ... @@ -85,10 +85,13 @@
85 85 then
86 86 set -x
87 87 echo -e "## Mode DEBUG ON ##"
  88 + REDIRECTION_OUTPUT=""
  89 +else
  90 + REDIRECTION_OUTPUT=" > /dev/null 2>&1"
88 91 fi
89 92  
90 93 # mode verbose enable
91   -if [ $VERBOSE -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ; fi
  94 +if [ $VERBOSE -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ; REDIRECTION_OUTPUT=" 2> /dev/null"; fi
92 95  
93 96 # Check USAGE by arguments number
94 97 if [ $(($#-($OPTIND-1))) -ne 2 ]
... ... @@ -160,7 +163,7 @@
160 163 else
161 164 rm $RES_DIR/*.lock > /dev/null 2>&1
162 165 fi
163   -mkdir -p $RES_DIR
  166 +mkdir -p $RES_DIR $REDIRECTION_OUTPUT
164 167 print_info "Make directory $RES_DIR" 1
165 168  
166 169 #--------------------#
... ... @@ -194,7 +197,7 @@
194 197 then
195 198 print_message $WARNING 2 "$WAV_FILE is not a wav file at 16000 Hz, 1 channel, 16bits\nhave to convert"
196 199 print_message $INFO 3 "avconv -i $WAV_FILE -threads 4 -vn -f wav -ac 1 -ar 16000 -ab 256000 $OUTPUT_DIR_BASENAME/$BASENAME.wav"
197   - avconv -i $WAV_FILE -threads 4 -vn -f wav -ac 1 -ar 16000 -ab 256000 $OUTPUT_DIR_BASENAME/$BASENAME.wav
  200 + avconv -i $WAV_FILE -threads 4 -vn -f wav -ac 1 -ar 16000 -ab 256000 $OUTPUT_DIR_BASENAME/$BASENAME.wav $REDIRECTION_OUTPUT
198 201 WAV_FILE=$OUTPUT_DIR_BASENAME/$BASENAME.wav
199 202 FILENAME=$BASENAME.wav
200 203 print_message $INFO 1 "new wav file : $WAV_FILE"
... ... @@ -217,7 +220,7 @@
217 220 print_info "$BIN_PATH/lia_plp_mt.32 --lst $OUTPUT_DIR_BASENAME/list.tmp --input_dir $(dirname $WAV_FILE) --output_dir $OUTPUT_DIR_BASENAME --input_type WAV --output_type HTK --nb_coef 12 --cms
218 221 " 2
219 222  
220   -$BIN_PATH/lia_plp_mt$ARCH --lst $OUTPUT_DIR_BASENAME/list.tmp --input_dir $(dirname $WAV_FILE) --output_dir $OUTPUT_DIR_BASENAME --input_type WAV --output_type HTK --nb_coef 12 --cms
  223 +$BIN_PATH/lia_plp_mt$ARCH --lst $OUTPUT_DIR_BASENAME/list.tmp --input_dir $(dirname $WAV_FILE) --output_dir $OUTPUT_DIR_BASENAME --input_type WAV --output_type HTK --nb_coef 12 --cms $REDIRECTION_OUTPUT
221 224  
222 225 if [ $CHECK -eq 1 ]
223 226 then
... ... @@ -238,7 +241,7 @@
238 241 # Calcul seg file
239 242 print_info "java -Xmx4096m -jar $BIN_PATH/LIUM_SpkDiarization-4.2.jar --fInputMask=${WAV_FILE} --sOutputMask=${SEG_FILE} $BASENAME" 2
240 243 #java -Xmx8000m -Xms2048 -jar $BIN_PATH/LIUM_SpkDiarization-4.2.jar --fInputMask=${WAV_FILE} --sOutputMask=${SEG_FILE} $BASENAME
241   -java -Xmx4096m -jar $BIN_PATH/LIUM_SpkDiarization-4.2.jar --fInputMask=${WAV_FILE} --sOutputMask=${SEG_FILE} $BASENAME #–doCEClustering
  244 +java -Xmx4096m -jar $BIN_PATH/LIUM_SpkDiarization-4.2.jar --fInputMask=${WAV_FILE} --sOutputMask=${SEG_FILE} $BASENAME $REDIRECTION_OUTPUT #–doCEClustering
242 245  
243 246 if [ $CHECK -eq 1 ] && ( [ ! -e $SEG_FILE ] || [ -z $SEG_FILE ] )
244 247 then
... ... @@ -265,7 +268,7 @@
265 268 print_info "Cut PLP file depending to LBL segmentations" 1
266 269 print_info "$BIN_PATH/gcep $PLP_FILE $LBL_FILE 500 $PLP_DIR -FSEG" 2
267 270  
268   -$SPEERAL_TOOLS/gcep $PLP_FILE $LBL_FILE 500 $PLP_DIR -FSEG
  271 +$SPEERAL_TOOLS/gcep $PLP_FILE $LBL_FILE 500 $PLP_DIR -FSEG $REDIRECTION_OUTPUT
269 272  
270 273 if [ $CHECK -eq 1 ]
271 274 then
272 275  
... ... @@ -316,9 +319,9 @@
316 319 todo=$OUTPUT_DIR_BASENAME/plp_${MODS[$i]}.lst
317 320 while [ $redo -gt 0 ]; do
318 321 rm $RES_DIR/*.lock > /dev/null 2>&1
319   - print_info "$SPEERAL_BIN $todo $RES_DIR ${SPEERAL_CFG[$i]} -r $PLP_DIR -m $SPEERAL_AM/${MODS[$i]}.hmm -c $SPEERAL_AM/${MODS[$i]}.cls $FORKS --lock" 2
  322 + print_info "$SPEERAL_BIN $todo $RES_DIR ${SPEERAL_CFG[$i]} -r $PLP_DIR -m $SPEERAL_AM/${MODS[$i]}.hmm -c $SPEERAL_AM/${MODS[$i]}.cls $FORKS --lock $REDIRECTION_OUTPUT" 2
320 323 # Run speeral
321   - $SPEERAL_BIN ${todo} $RES_DIR ${SPEERAL_CFG[$i]} -r $PLP_DIR -m $SPEERAL_AM/${MODS[$i]}.hmm -c $SPEERAL_AM/${MODS[$i]}.cls $FORKS --lock
  324 + $SPEERAL_BIN ${todo} $RES_DIR ${SPEERAL_CFG[$i]} -r $PLP_DIR -m $SPEERAL_AM/${MODS[$i]}.hmm -c $SPEERAL_AM/${MODS[$i]}.cls $FORKS --lock $REDIRECTION_OUTPUT
322 325  
323 326 # Check if error
324 327 if [ $CHECK -eq 1 ]
325 328  
326 329  
... ... @@ -374,13 +377,13 @@
374 377 #---------------#
375 378  
376 379 # .res => .ctm
377   -$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format CTM --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.ctm
  380 +$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format CTM --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.ctm $REDIRECTION_OUTPUT
378 381 # .res => .trs
379 382 echo -e "name $AUTHOR\nfileName $BASENAME\nfileExt wav\nsegFile $OUTPUT_DIR_BASENAME/$BASENAME.seg" > $OUTPUT_DIR_BASENAME/$BASENAME.trs_cfg
380   -$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format TRS --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.trs --trs_config $OUTPUT_DIR_BASENAME/$BASENAME.trs_cfg
381   -rm $OUTPUT_DIR_BASENAME/$BASENAME.trs_cfg
  383 +$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format TRS --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.trs --trs_config $OUTPUT_DIR_BASENAME/$BASENAME.trs_cfg $REDIRECTION_OUTPUT
  384 +rm $OUTPUT_DIR_BASENAME/$BASENAME.trs_cfg 2> /dev/null
382 385 # .res => .txt
383   -$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format TXT --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.txt
  386 +$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format TXT --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.txt $REDIRECTION_OUTPUT
384 387  
385 388 # unlock directory
386 389 mv "$OUTPUT_DIR_BASENAME/FIRSTPASS.lock" "$OUTPUT_DIR_BASENAME/FIRSTPASS.unlock"
main_tools/ScoringRes.sh
... ... @@ -138,8 +138,8 @@
138 138 fi
139 139 done < $SRT_FILE > $SCORING_DIR/$BASENAME.tmp.txt
140 140  
141   -#cat $SCORING_DIR/$BASENAME.tmp.txt | sed -e "s|\n| |g" | $SCRIPT_PATH/CleanFilter.sh | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -e "s|<s>||g" | sed -e "s|</s>||g" | sed -e "s|\n+| |g" > $SCORING_DIR/$BASENAME.tmp2.txt
142   -cat $SCORING_DIR/$BASENAME.tmp.txt | sed -e "s|\n| |g" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | sed -e "s|<s>||g" | sed -e "s|</s>||g" | sed -e "s|\n+| |g" > $SCORING_DIR/$BASENAME.tmp2.txt
  141 +cat $SCORING_DIR/$BASENAME.tmp.txt | sed -e "s|\n| |g" | $SCRIPT_PATH/CleanFilter.sh | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -e "s|<s>||g" | sed -e "s|</s>||g" | sed -e "s|\n+| |g" > $SCORING_DIR/$BASENAME.tmp2.txt
  142 +#cat $SCORING_DIR/$BASENAME.tmp.txt | sed -e "s|\n| |g" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | sed -e "s|<s>||g" | sed -e "s|</s>||g" | sed -e "s|\n+| |g" > $SCORING_DIR/$BASENAME.tmp2.txt
143 143  
144 144 $SCRIPT_PATH/srt2stm.pl $SCORING_DIR/$BASENAME.tmp2.txt > "$SCORING_DIR/$BASENAME.stm"
145 145 rm $SCORING_DIR/$BASENAME.tmp.txt $SCORING_DIR/$BASENAME.tmp2.txt
main_tools/ThirdPass.sh
... ... @@ -23,7 +23,7 @@
23 23  
24 24 # Include scripts
25 25 . $SCRIPT_PATH"/Tools.sh"
26   -. $SCRIPT_PATH"/CheckThirdPassPass.sh"
  26 +. $SCRIPT_PATH"/CheckThirdPass.sh"
27 27  
28 28 # where is ThirdPass.cfg
29 29 THIRDPASS_CONFIG_FILE=$OTMEDIA_HOME"/cfg/ThirdPass.cfg"