From 665a8dac322f0a4232d39c379136a945f4d76081 Mon Sep 17 00:00:00 2001 From: rey jean-Francois Date: Mon, 5 Aug 2013 16:54:37 +0200 Subject: [PATCH] ! follow the white rabbit ! --- COPYING | 0 HOWTO | 128 ++++++++++++++++++++++++++++++++++++ INSTALL | 63 ++++++++++++++++++ README | 25 ++++--- SOLR.INSTALL | 3 +- install.sh | 35 ++++++---- main_tools/ExploitConfidencePass.sh | 6 +- 7 files changed, 232 insertions(+), 28 deletions(-) create mode 100644 COPYING create mode 100644 INSTALL diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..e69de29 diff --git a/HOWTO b/HOWTO index e69de29..4e01261 100644 --- a/HOWTO +++ b/HOWTO @@ -0,0 +1,128 @@ +#---------------# +# OTMEDIA LIA # +# HOWTO # +# version 1.0 # +#---------------# + +1\ Main options +--------------- + +There are five main options for otmedia scripts. +-h : for help +-D : Debug mode +-v n : Verbose mode 1 low to 3 high +-c : Check results +-r : force a script to rerun, without deleting work already done + +2\ Main scripts +--------------- + 2.1\ FirstPass.sh + ----------------- + + FirstPass.sh does speaker diarization and transcription of an audio file. It converts the file into wav format if not already done (16000Hz, 16 bits, mono). + If a .SRT file is present in the same directory as the audio file, it will be copied. + + $> FirstPass.sh [options] 110624FR2_20002100.wav result_directory + + Options: + -f n : number of forks for speeral + + Output : result_directory/110624FR2_20002100/res_p1/ + + 2.2\ SecondPass.sh + ------------------ + + SecondPass.sh does speaker adaptation and transcription based on the first pass. + + $> SecondPass.sh [options] result_directory/110624FR2_20002100/ + + Options: + -f n : number of forks for speeral + + Output : result_directory/110624FR2_20002100/res_p2/ + + 2.3\ ConfPass.sh + ---------------- + + ConfPass.sh computes confidence measures using the second or third pass. 
+ + $> ConfPass.sh [options] result_directory/110624FR2_20002100/ + + Output : result_directory/110624FR2_20002100/conf/res_p2/scored_ctm/ + and result_directory/110624FR2_20002100.usf file + + 2.4\ ExploitConfidencePass.sh + ----------------------------- + + It exploits the confidence pass measures to : + - boost confident zones + - find alternatives in non-confident zones (using SOLR DB) + - extend the lexicon + + $> ExploitConfidencePass.sh [options] result_directory/110624FR2_20002100 + + Output : result_directory/110624FR2_20002100/trigg/speeral + result_directory/110624FR2_20002100/LEX/speeral/_ext + + 2.5\ ThirdPass.sh + ----------------- + + ThirdPass.sh does transcription using the SecondPass speaker adaptation and the ExploitConfidencePass trigg files and new lexicon. + + $> ThirdPass.sh [options] result_directory/110624FR2_20002100/ + + Options : + -f n : number of forks for speeral + + Output : result_directory/110624FR2_20002100/conf/res_p3 + + 2.6\ RecomposePass.sh + -------------------- + + RecomposePass.sh copies the results that are missing in ThirdPass from the Second and First Pass. + + $> RecomposePass.sh [options] result_directory/110624FR2_20002100/ + + Output : result_directory/110624FR2_20002100/res_all + + 2.7\ ScoringRes.sh + ------------------ + + ScoringRes.sh runs different scoring tools to score the results, using the SRT file if it exists. + + $> ScoringRes.sh [options] result_directory/110624FR2_20002100/ + + Output : result_directory/110624FR2_20002100/scoring + + 2.8\ CheckResults.sh + -------------------- + + CheckResults.sh parses the result directories to summarize the work already done. 
+ + $> CheckResults.sh [options] result_directory + + Output : "Directory name #plp #res_p1 #treil_p2 #treil_p3 usf_p2 usf_p3" + #plp number of plp files + #res_p1 number of .res files at first pass + #treil_p2 number of .treil files at second pass + #treil_p3 number of .treil files at third pass + usf_p2 usf file from confidence pass result on second pass (OK|ERR|NAN) + usf_p3 usf file from confidence pass result on third pass (OK|ERR|NAN) + +3\ OneScriptToRuleThemAll.sh +---------------------------- + + The script to run all OTMEDIA LIA passes in one call. + + $> OneScriptToRuleThemAll.sh [options] 110624FR2_20002100.wav result_directory + + Options : (default options are available) + -a Do every pass + -1 Do First pass + -2 Do Second pass + -3 Do Third pass + -C Do Confidence pass + -e Do Exploit Confidence pass + -R Do Recompose pass + -s Do Scoring pass + diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..e048dc2 --- /dev/null +++ b/INSTALL @@ -0,0 +1,63 @@ +#---------------# +# OTMEDIA LIA # +# INSTALL # +# version : 1.0 # +#---------------# + +OTMEDIA LIA ready to use ? Really ? +No ! You have to do some manual configuration for some features. +Let's see... + +SUMMARY +------- + +1\ Before installation +2\ install.sh script +3\ SOLR install + + +1\ Before installation +---------------------- + +- Check and install dependencies. +- On a 64-bit architecture, be sure you can run 32-bit programs. +- Have 300 GB of free space. +- Have access to the network and the nyx server. + +2\ install.sh script +-------------------- + +The install.sh script will do most of the work. +It will check dependencies and configure pass tools. +By default it will do a complete install (300 GB). + +You can modify its behavior by editing install.sh : + +To disable lexicon adaptation using SOLR DB set EXPLOITCONFPASS to 0 (it accounts for most of the space: 290 GB). +To disable confidence measure set CONFPASS to 0. +To disable second and third pass set PASS2 to 0. + +Run install.sh and follow the white rabbit. 
+ +3\ SOLR install +--------------- + +The install.sh script downloads otmedia-2013-04.tar.gz and untars it in OTMEDIA_HOME/tools/SOLR/ . +See the SOLR.INSTALL file to install the OTMEDIA SOLR DB. + + + + + + + + + + + + + + + + + diff --git a/README b/README index 49716fd..4e2377c 100644 --- a/README +++ b/README @@ -5,11 +5,11 @@ \___/ |_| |_| |_|_____|____/___/_/ \_\ |_____|___/_/ \_\ -#-------------------# -# OTMEDIA LIA # -# README # -# version 1.0 # -#-------------------# +#---------------# +# OTMEDIA LIA # +# README # +# version 1.0 # +#---------------# DESCRIPTION ----------- @@ -22,6 +22,13 @@ DESCRIPTION Web Site : http://www.otmedia.fr OTMEDIA LIA project is a set of tools to transcribe radio and TV shows. + It does multiple things : + - First pass : default transcription with speeral and speaker diarization. + - Second pass : speaker adaptation and a second transcription pass with speeral. + - Confidence pass : computes confidence measures from the transcription output. + - Exploit Confidence Measure : uses SOLR DB data to extend the lexicon where confidence is low, and creates trigg files. + - Third pass : second pass using the new lexicon and trigg files. + DEPENDENCIES ------------ @@ -57,7 +64,7 @@ Perl ( >= 5.0.0) Perl is a programming language. -iconvi ( >= 2.0.0) +iconv ( >= 2.0.0) Available from : http://www.gnu.org and debian package @@ -89,15 +96,16 @@ INSTALLATION Quick install below. - Before launch installation : + Before launching the installation : Be certain that all dependencies are satisfied. + Have 300 GB of free space for a complete install. Issue the following commands to the shell : $> ./install.sh $> export OTMEDIA_HOME=path/to/OTMEDIA/directory - Read SOLR.INSTALL part 3/ to install SOLRDB. + Read SOLR.INSTALL part 3 to install SOLRDB. RUNNING ------- @@ -113,6 +121,7 @@ KNOWN BUGS ---------- Many. 
+ For bug reports, please contact Pascal Nocera at pascal.nocera@univ-avignon.fr COPYRIGHT --------- diff --git a/SOLR.INSTALL b/SOLR.INSTALL index e81230a..a4642fd 100644 --- a/SOLR.INSTALL +++ b/SOLR.INSTALL @@ -13,7 +13,8 @@ 3/ Configure Tomcat and SOLR -SOLR_OTMEDIA_PATH=OTMEDIA_PATH/tools/SOLR/otemdia-2013-04 +The otmedia-2013-04 SOLR DB is untarred in : +SOLR_OTMEDIA_PATH=OTMEDIA_HOME/tools/SOLR/otemdia-2013-04 3.1/ Set context file ---------------- diff --git a/install.sh b/install.sh index ea3d1d8..7b61206 100755 --- a/install.sh +++ b/install.sh @@ -1,8 +1,9 @@ #!/bin/bash #-------------------# +# OTMEDIA LIA # # Install script # -# OTMEDIA # +# version : 1.0.0 # #-------------------# # Color variables @@ -30,7 +31,7 @@ if [ "$test" == "x86_64" ]; then ARCH=".64"; else ARCH=""; fi # and 1 to enable # PASS1=1 # First Pass -PASS2=1 # Second Pass +PASS2=1 # Second and Third Pass CONFPASS=1 # Confidence Pass EXPLOITCONFPASS=1 # SOLR query and trigg @@ -98,6 +99,15 @@ then exit 1; fi echo -e "python : \t ${txtgrn}OK${txtrst}" + + ## csh shell + test=$(whereis csh) + if [ "$test" == "csh:" ] + then + echo -e "${txtpur}ERROR${txtrst} csh shell not found\n You have to install csh shell\n sudo apt-get install csh" + exit 1; + fi + echo -e "csh shell : \t ${txtgrn}OK${txtrst}" fi ## Perl @@ -118,15 +128,6 @@ then fi echo -e "iconv : \t ${txtgrn}OK${txtrst}" -## csh shell -test=$(whereis csh) -if [ "$test" == "csh:" ] -then - echo -e "${txtpur}ERROR${txtrst} csh shell not found\n You have to install csh shell\n sudo apt-get install csh" - exit 1; -fi -echo -e "csh shell : \t ${txtgrn}OK${txtrst}" - ## SRI LM if [ -z "$SRILM" ] && [ -z "$MACHINE_TYPE" ] then @@ -136,8 +137,6 @@ fi export SRILM_BIN=$SRILM/bin/$MACHINE_TYPE echo -e "SRILM toolkit : \t ${txtgrn}OK${txtrst}" - - ### Speeral Configuration ### echo -e "\n\t${txtblu}Speeral configuration${txtrst}\n" @@ -190,7 +189,7 @@ then if [ "$test" == "" ] then echo -e "${txtpur}ERROR${txtrst} TOMCAT seems to not be 
installed)\n You have to install TOMCAT\n" - exit 1; + #exit 1; fi echo -e "\nTOMCAT : \t ${txtgrn}OK${txtrst}\n" # SOLR secondly @@ -234,6 +233,7 @@ cat ~/.bashrc | grep -v "OTMEDIA_HOME" | grep -v "SRILM_BIN" > ~/.bashrc.org cp ~/.bashrc.org ~/.bashrc export OTMEDIA_HOME=$PWD echo "export OTMEDIA_HOME=$PWD" >> ~/.bashrc +echo "export PATH=\$PATH:\$OTMEDIA_HOME/main_tools" >> ~/.bashrc echo "export SRILM_BIN=$SRILM/bin/$MACHINE_TYPE" >> ~/.bashrc #echo "export LIA_TAGG_LANG=french" >> ~/.bashrc #echo "export LIA_TAGG=$OTMEDIA_HOME/tools/lia_ltbox/lia_tagg/" >> ~/.bashrc @@ -252,6 +252,13 @@ echo -e "\n\t${txtgrn}### Install completed ###${txtrst}\n" echo -e "do : source ~/.bashrc" echo -e "or set variable :\n" echo "export OTMEDIA_HOME=$PWD" +echo "export PATH=\$PATH:$OTMEDIA_HOME/main_tools" echo "export SRILM_BIN=$SRILM/bin/$MACHINE_TYPE" + +echo " \\\\ " +echo " ,-~~~-\\\\_" +echo " ( .\ " +echo " @\___(__--'" + echo "${txtgrn}Yes${txtylw}I${txtpur}Rastafari{$txtrst}" diff --git a/main_tools/ExploitConfidencePass.sh b/main_tools/ExploitConfidencePass.sh index 06a0eb1..0c4366b 100755 --- a/main_tools/ExploitConfidencePass.sh +++ b/main_tools/ExploitConfidencePass.sh @@ -47,7 +47,7 @@ fi #---------------# # Parse Options # #---------------# -while getopts ":hDv:cf:r" opt +while getopts ":hDv:cr" opt do case $opt in h) @@ -57,7 +57,6 @@ do echo -e "\t\t-D :\tDEBUG mode on" echo -e "\t\t-v l :\tVerbose mode, l=(1|2|3) level mode" echo -e "\t\t-c :\tCheck process, stop if error detected" - echo -e "\t\t-f n :\tspecify a speeral forks number (default 1)" echo -e "\t\t-r n :\tforce rerun without deleting files" exit 1 ;; @@ -70,9 +69,6 @@ do c) CHECK=1 ;; - f) - FORKS="--forks $OPTARG" - ;; r) RERUN=1 ;; -- 1.8.2.3