From 0bf609bcceb3af008651888fa40b72c381245e37 Mon Sep 17 00:00:00 2001 From: rey jean-Francois Date: Tue, 30 Jul 2013 14:28:34 +0200 Subject: [PATCH] update and add script to extract TV corpus --- README | 1 + TODO | 3 ++- main_tools/ConfPass.sh | 4 +++- main_tools/FirstPass.sh | 27 +++++++++++++++------------ main_tools/ScoringRes.sh | 4 ++-- main_tools/ThirdPass.sh | 2 +- 6 files changed, 24 insertions(+), 17 deletions(-) diff --git a/README b/README index 782cb0d..eaf4870 100644 --- a/README +++ b/README @@ -3,6 +3,7 @@ # version 1.0 # #-------------------# +"Observatoire Transmedia pour l'étude des évolutions et transformations du monde médiatique" OTMEDIA_HOME install.sh diff --git a/TODO b/TODO index 01b16ae..3ce5471 100644 --- a/TODO +++ b/TODO @@ -1 +1,2 @@ --Check and add Verbose messages +- Check and add Verbose messages +- Modify SOLR request diff --git a/main_tools/ConfPass.sh b/main_tools/ConfPass.sh index 447dd6b..9081324 100755 --- a/main_tools/ConfPass.sh +++ b/main_tools/ConfPass.sh @@ -204,9 +204,11 @@ for f in `ls ${RES_CONF_DIR}`; do $SCRIPT_PATH/formatRES.pl $RES_CONF_DIR/$f; do # create USF configuration file echo -e "name $AUTHOR\nfileName $BASENAME\nfileExt wav\nsegFile $OUTPUT_DIR/$BASENAME.seg" > $OUTPUT_DIR/$BASENAME.usf_cfg # create USF file -$SCRIPT_PATH/res2out.pl --dir $RES_CONF_DIR --format USF --ignore $RULES/asupp --out $USF_FILE --usf_config $OUTPUT_DIR/$BASENAME.usf_cfg +$SCRIPT_PATH/res2out.pl --dir $RES_CONF_DIR --format USF --ignore $RULES/asupp --out $USF_FILE.tmp --usf_config $OUTPUT_DIR/$BASENAME.usf_cfg rm $OUTPUT_DIR/$BASENAME.usf_cfg +cat $USF_FILE.tmp | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -f > $USF_FILE cp $USF_FILE ${OUTPUT_DIR}/${BASENAME}.usf +rm $USF_FILE.tmp #----------------# # Check USF file # diff --git a/main_tools/FirstPass.sh b/main_tools/FirstPass.sh index 13fe7db..9d93d7c 100755 --- a/main_tools/FirstPass.sh +++ b/main_tools/FirstPass.sh @@ -85,10 +85,13 @@ if [ $DEBUG -eq 1 ] then set -x echo -e "## Mode 
DEBUG ON ##" + REDIRECTION_OUTPUT="" +else + REDIRECTION_OUTPUT=" > /dev/null 2>&1" fi # mode verbose enable -if [ $VERBOSE -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ; fi +if [ $VERBOSE -gt 0 ]; then echo -e "## Verbose level : $VERBOSE ##" ; REDIRECTION_OUTPUT=" 2> /dev/null"; fi # Check USAGE by arguments number if [ $(($#-($OPTIND-1))) -ne 2 ] @@ -160,7 +163,7 @@ then else rm $RES_DIR/*.lock > /dev/null 2>&1 fi -mkdir -p $RES_DIR +mkdir -p $RES_DIR $REDIRECTION_OUTPUT print_info "Make directory $RES_DIR" 1 #--------------------# @@ -194,7 +197,7 @@ if [ $error -eq 1 ] then print_message $WARNING 2 "$WAV_FILE is not a wav file at 16000 Hz, 1 channel, 16bits\nhave to convert" print_message $INFO 3 "avconv -i $WAV_FILE -threads 4 -vn -f wav -ac 1 -ar 16000 -ab 256000 $OUTPUT_DIR_BASENAME/$BASENAME.wav" - avconv -i $WAV_FILE -threads 4 -vn -f wav -ac 1 -ar 16000 -ab 256000 $OUTPUT_DIR_BASENAME/$BASENAME.wav + avconv -i $WAV_FILE -threads 4 -vn -f wav -ac 1 -ar 16000 -ab 256000 $OUTPUT_DIR_BASENAME/$BASENAME.wav $REDIRECTION_OUTPUT WAV_FILE=$OUTPUT_DIR_BASENAME/$BASENAME.wav FILENAME=$BASENAME.wav print_message $INFO 1 "new wav file : $WAV_FILE" @@ -217,7 +220,7 @@ echo $FILENAME > $OUTPUT_DIR_BASENAME/list.tmp print_info "$BIN_PATH/lia_plp_mt.32 --lst $OUTPUT_DIR_BASENAME/list.tmp --input_dir $(dirname $WAV_FILE) --output_dir $OUTPUT_DIR_BASENAME --input_type WAV --output_type HTK --nb_coef 12 --cms " 2 -$BIN_PATH/lia_plp_mt$ARCH --lst $OUTPUT_DIR_BASENAME/list.tmp --input_dir $(dirname $WAV_FILE) --output_dir $OUTPUT_DIR_BASENAME --input_type WAV --output_type HTK --nb_coef 12 --cms +$BIN_PATH/lia_plp_mt$ARCH --lst $OUTPUT_DIR_BASENAME/list.tmp --input_dir $(dirname $WAV_FILE) --output_dir $OUTPUT_DIR_BASENAME --input_type WAV --output_type HTK --nb_coef 12 --cms $REDIRECTION_OUTPUT if [ $CHECK -eq 1 ] then @@ -238,7 +241,7 @@ print_info "Launch speakers diarization" 1 # Calcul seg file print_info "java -Xmx4096m -jar 
$BIN_PATH/LIUM_SpkDiarization-4.2.jar --fInputMask=${WAV_FILE} --sOutputMask=${SEG_FILE} $BASENAME" 2 #java -Xmx8000m -Xms2048 -jar $BIN_PATH/LIUM_SpkDiarization-4.2.jar --fInputMask=${WAV_FILE} --sOutputMask=${SEG_FILE} $BASENAME -java -Xmx4096m -jar $BIN_PATH/LIUM_SpkDiarization-4.2.jar --fInputMask=${WAV_FILE} --sOutputMask=${SEG_FILE} $BASENAME #–doCEClustering +java -Xmx4096m -jar $BIN_PATH/LIUM_SpkDiarization-4.2.jar --fInputMask=${WAV_FILE} --sOutputMask=${SEG_FILE} $BASENAME $REDIRECTION_OUTPUT #–doCEClustering if [ $CHECK -eq 1 ] && ( [ ! -e $SEG_FILE ] || [ -z $SEG_FILE ] ) then @@ -265,7 +268,7 @@ fi print_info "Cut PLP file depending to LBL segmentations" 1 print_info "$BIN_PATH/gcep $PLP_FILE $LBL_FILE 500 $PLP_DIR -FSEG" 2 -$SPEERAL_TOOLS/gcep $PLP_FILE $LBL_FILE 500 $PLP_DIR -FSEG +$SPEERAL_TOOLS/gcep $PLP_FILE $LBL_FILE 500 $PLP_DIR -FSEG $REDIRECTION_OUTPUT if [ $CHECK -eq 1 ] then @@ -316,9 +319,9 @@ do todo=$OUTPUT_DIR_BASENAME/plp_${MODS[$i]}.lst while [ $redo -gt 0 ]; do rm $RES_DIR/*.lock > /dev/null 2>&1 - print_info "$SPEERAL_BIN $todo $RES_DIR ${SPEERAL_CFG[$i]} -r $PLP_DIR -m $SPEERAL_AM/${MODS[$i]}.hmm -c $SPEERAL_AM/${MODS[$i]}.cls $FORKS --lock" 2 + print_info "$SPEERAL_BIN $todo $RES_DIR ${SPEERAL_CFG[$i]} -r $PLP_DIR -m $SPEERAL_AM/${MODS[$i]}.hmm -c $SPEERAL_AM/${MODS[$i]}.cls $FORKS --lock $REDIRECTION_OUTPUT" 2 # Run speeral - $SPEERAL_BIN ${todo} $RES_DIR ${SPEERAL_CFG[$i]} -r $PLP_DIR -m $SPEERAL_AM/${MODS[$i]}.hmm -c $SPEERAL_AM/${MODS[$i]}.cls $FORKS --lock + $SPEERAL_BIN ${todo} $RES_DIR ${SPEERAL_CFG[$i]} -r $PLP_DIR -m $SPEERAL_AM/${MODS[$i]}.hmm -c $SPEERAL_AM/${MODS[$i]}.cls $FORKS --lock $REDIRECTION_OUTPUT # Check if error if [ $CHECK -eq 1 ] @@ -374,13 +377,13 @@ rm ${OUTPUT_DIR_BASENAME}/.tmp #---------------# # .res => .ctm -$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format CTM --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.ctm +$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format CTM --ignore $RULES/asupp 
--out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.ctm $REDIRECTION_OUTPUT # .res => .trs echo -e "name $AUTHOR\nfileName $BASENAME\nfileExt wav\nsegFile $OUTPUT_DIR_BASENAME/$BASENAME.seg" > $OUTPUT_DIR_BASENAME/$BASENAME.trs_cfg -$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format TRS --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.trs --trs_config $OUTPUT_DIR_BASENAME/$BASENAME.trs_cfg -rm $OUTPUT_DIR_BASENAME/$BASENAME.trs_cfg +$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format TRS --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.trs --trs_config $OUTPUT_DIR_BASENAME/$BASENAME.trs_cfg $REDIRECTION_OUTPUT +rm $OUTPUT_DIR_BASENAME/$BASENAME.trs_cfg 2> /dev/null # .res => .txt -$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format TXT --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.txt +$SCRIPT_PATH/res2out.pl --dir $RES_DIR --format TXT --ignore $RULES/asupp --out $OUTPUT_DIR_BASENAME/$BASENAME.1pass.txt $REDIRECTION_OUTPUT # unlock directory mv "$OUTPUT_DIR_BASENAME/FIRSTPASS.lock" "$OUTPUT_DIR_BASENAME/FIRSTPASS.unlock" diff --git a/main_tools/ScoringRes.sh b/main_tools/ScoringRes.sh index 29bac02..d350948 100755 --- a/main_tools/ScoringRes.sh +++ b/main_tools/ScoringRes.sh @@ -138,8 +138,8 @@ do fi done < $SRT_FILE > $SCORING_DIR/$BASENAME.tmp.txt -#cat $SCORING_DIR/$BASENAME.tmp.txt | sed -e "s|\n| |g" | $SCRIPT_PATH/CleanFilter.sh | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -e "s|||g" | sed -e "s|||g" | sed -e "s|\n+| |g" > $SCORING_DIR/$BASENAME.tmp2.txt -cat $SCORING_DIR/$BASENAME.tmp.txt | sed -e "s|\n| |g" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | sed -e "s|||g" | sed -e "s|||g" | sed -e "s|\n+| |g" > $SCORING_DIR/$BASENAME.tmp2.txt +cat $SCORING_DIR/$BASENAME.tmp.txt | sed -e "s|\n| |g" | $SCRIPT_PATH/CleanFilter.sh | $SCRIPT_PATH/ApplyCorrectionRules.pl ${LEXICON}.regex | $SCRIPT_PATH/BdlexUC.pl 
$RULES/basic -t | sed -e "s|||g" | sed -e "s|||g" | sed -e "s|\n+| |g" > $SCORING_DIR/$BASENAME.tmp2.txt +#cat $SCORING_DIR/$BASENAME.tmp.txt | sed -e "s|\n| |g" | $SCRIPT_PATH/BdlexUC.pl $RULES/basic -t | sed -f $RULES/preprocess.regex | sed -f $RULES/lastprocess.regex | sed -e "s|||g" | sed -e "s|||g" | sed -e "s|\n+| |g" > $SCORING_DIR/$BASENAME.tmp2.txt $SCRIPT_PATH/srt2stm.pl $SCORING_DIR/$BASENAME.tmp2.txt > "$SCORING_DIR/$BASENAME.stm" rm $SCORING_DIR/$BASENAME.tmp.txt $SCORING_DIR/$BASENAME.tmp2.txt diff --git a/main_tools/ThirdPass.sh b/main_tools/ThirdPass.sh index d546e91..effd950 100755 --- a/main_tools/ThirdPass.sh +++ b/main_tools/ThirdPass.sh @@ -23,7 +23,7 @@ SCRIPT_PATH=$OTMEDIA_HOME/tools/scripts # Include scripts . $SCRIPT_PATH"/Tools.sh" -. $SCRIPT_PATH"/CheckThirdPassPass.sh" +. $SCRIPT_PATH"/CheckThirdPass.sh" # where is ThirdPass.cfg THIRDPASS_CONFIG_FILE=$OTMEDIA_HOME"/cfg/ThirdPass.cfg" -- 1.8.2.3