Commit 665a8dac322f0a4232d39c379136a945f4d76081
1 parent
b9a54507e8
Exists in
master
! follow the white rabbit !
Showing 6 changed files with 232 additions and 28 deletions Side-by-side Diff
| 1 | +#---------------# | |
| 2 | +# OTMEDIA LIA # | |
| 3 | +# HOWTO # | |
| 4 | +# version 1.0 # | |
| 5 | +#---------------# | |
| 6 | + | |
| 7 | +1\ Main options | |
| 8 | +--------------- | |
| 9 | + | |
| 10 | +There are five main options for otmedia scripts. | |
| 11 | +-h : for help | |
| 12 | +-D : Debug mode | |
| 13 | +-v n : Verbose mode 1 low to 3 high | |
| 14 | +-c : Check results | |
| 15 | +-r : force to rerun a script, without deleting work already done | |
| 16 | + | |
| 17 | +2\ Main scripts | |
| 18 | +--------------- | |
| 19 | + 2.1\ FirstPass.sh | |
| 20 | + ----------------- | |
| 21 | + | |
| 22 | + FirstPass.sh does speaker diarization and transcription of an audio file. It converts the file to wav format if not already done (16000Hz, 16 bits, mono). | |
| 23 | + If a .SRT file is present in the same directory as the audio file, it will copy it. | |
| 24 | + | |
| 25 | + $> FirstPass.sh [options] 110624FR2_20002100.wav result_directory | |
| 26 | + | |
| 27 | + Options: | |
| 28 | + -f n : number of forks for speeral | |
| 29 | + | |
| 30 | + Output : result_directory/110624FR2_20002100/res_p1/ | |
| 31 | + | |
| 32 | + 2.2\ SecondPass.sh | |
| 33 | + ------------------ | |
| 34 | + | |
| 35 | + SecondPass.sh does speaker adaptation and transcription based on the first pass. | |
| 36 | + | |
| 37 | + $> SecondPass.sh [options] result_directory/110624FR2_20002100/ | |
| 38 | + | |
| 39 | + Options: | |
| 40 | + -f n : number of forks for speeral | |
| 41 | + | |
| 42 | + Output : result_directory/110624FR2_20002100/res_p2/ | |
| 43 | + | |
| 44 | + 2.3\ ConfPass.sh | |
| 45 | + ---------------- | |
| 46 | + | |
| 47 | + ConfPass.sh computes confidence measures using the second or third pass. | |
| 48 | + | |
| 49 | + $> ConfPass.sh [options] result_directory/110624FR2_20002100/ <res_p2|res_p3> | |
| 50 | + | |
| 51 | + Output : result_directory/110624FR2_20002100/conf/res_p2/scored_ctm/ | |
| 52 | + and result_directory/110624FR2_20002100.usf file | |
| 53 | + | |
| 54 | + 2.4\ ExploitConfidencePass.sh | |
| 55 | + ----------------------------- | |
| 56 | + | |
| 57 | + It exploits the confidence pass measures to : | |
| 58 | + - boost confident zones | |
| 59 | + - find alternatives in non-confident zones (using SOLR DB) | |
| 60 | + - extend the lexicon | |
| 61 | + | |
| 62 | + $> ExploitConfidencePass.sh [options] result_directory/110624FR2_20002100 | |
| 63 | + | |
| 64 | + Output : result_directory/110624FR2_20002100/trigg/speeral | |
| 65 | + result_directory/110624FR2_20002100/LEX/speeral/_ext | |
| 66 | + | |
| 67 | + 2.5\ ThirdPass.sh | |
| 68 | + ------------------ | |
| 69 | + | |
| 70 | + ThirdPass.sh does transcriptions using the SecondPass speaker adaptation and the ExploitConfidencePass trigg files and new lexicon. | |
| 71 | + | |
| 72 | + $> ThirdPass.sh [options] result_directory/110624FR2_20002100/ | |
| 73 | + | |
| 74 | + Options : | |
| 75 | + -f n : number of forks for speeral | |
| 76 | + | |
| 77 | + Output : result_directory/110624FR2_20002100/conf/res_p3 | |
| 78 | + | |
| 79 | + 2.6\ RecomposePass.sh | |
| 80 | + -------------------- | |
| 81 | + | |
| 82 | + RecomposePass.sh copies results that are missing in ThirdPass from the Second and First Pass. | |
| 83 | + | |
| 84 | + $> RecomposePass.sh [options] result_directory/110624FR2_20002100/ | |
| 85 | + | |
| 86 | + Output : result_directory/110624FR2_20002100/res_all | |
| 87 | + | |
| 88 | + 2.7\ ScoringRes.sh | |
| 89 | + ------------------ | |
| 90 | + | |
| 91 | + ScoringRes.sh runs different scoring tools to score the results using the SRT file if it exists. | |
| 92 | + | |
| 93 | + $> ScoringRes.sh [options] result_directory/110624FR2_20002100/ | |
| 94 | + | |
| 95 | + Output : result_directory/110624FR2_20002100/scoring | |
| 96 | + | |
| 97 | + 2.8\ CheckResults.sh | |
| 98 | + -------------------- | |
| 99 | + | |
| 100 | + CheckResults.sh parses result directories to summarize the work already done. | |
| 101 | + | |
| 102 | + $> CheckResults.sh [options] result_directory | |
| 103 | + | |
| 104 | + Output : "Directory name #plp #res_p1 #treil_p2 #treil_p3 usf_p2 usf_p3" | |
| 105 | + #plp number of plp files | |
| 106 | + #res_p1 number of .res files at first pass | |
| 107 | + #treil_p2 number of .treil files at second pass | |
| 108 | + #treil_p3 number of .treil files at third pass | |
| 109 | + usf_p2 usf file from confidence pass result on second pass (OK|ERR|NAN) | |
| 110 | + usf_p3 usf file from confidence pass result on third pass (OK|ERR|NAN) | |
| 111 | + | |
| 112 | +3\ OneScriptToRuleThemAll.sh | |
| 113 | +---------------------------- | |
| 114 | + | |
| 115 | + The script to do all OTMEDIA LIA passes in one call. | |
| 116 | + | |
| 117 | + $> OneScriptToRuleThemAll.sh [options] 110624FR2_20002100.wav result_directory | |
| 118 | + | |
| 119 | + Options : (default options are available) | |
| 120 | + -a Do every pass | |
| 121 | + -1 Do First pass | |
| 122 | + -2 Do Second pass | |
| 123 | + -3 Do Third pass | |
| 124 | + -C Do Confidence pass | |
| 125 | + -e Do Exploit Confidence pass | |
| 126 | + -R Do Recompose pass | |
| 127 | + -s Do Scoring pass |
INSTALL
| 1 | +#---------------# | |
| 2 | +# OTMEDIA LIA # | |
| 3 | +# INSTALL # | |
| 4 | +# version : 1.0 # | |
| 5 | +#---------------# | |
| 6 | + | |
| 7 | +OTMEDIA LIA ready to use ? Really ? | |
| 8 | +No ! You have to manually configure some features. | |
| 9 | +Let's see... | |
| 10 | + | |
| 11 | +SUMMARY | |
| 12 | +------- | |
| 13 | + | |
| 14 | +1\ Before installation | |
| 15 | +2\ install.sh script | |
| 16 | +3\ SOLR install | |
| 17 | + | |
| 18 | + | |
| 19 | +1\ Before installation | |
| 20 | +---------------------- | |
| 21 | + | |
| 22 | +- Check and install dependencies. | |
| 23 | +- On a 64-bit architecture, be sure you can run 32-bit programs. | |
| 24 | +- Have 300 GB of free space. | |
| 25 | +- Have access to the network and the nyx server. | |
| 26 | + | |
| 27 | +2\ install.sh script | |
| 28 | +-------------------- | |
| 29 | + | |
| 30 | +install.sh script will do most of the work. | |
| 31 | +It will check dependencies and configure pass tools. | |
| 32 | +By default it will do a complete install (300 GB). | |
| 33 | + | |
| 34 | +You can modify this behavior by editing install.sh : | |
| 35 | + | |
| 36 | +To disable lexicon adaptation using SOLR DB, set EXPLOITCONFPASS to 0 (this accounts for most of the space, about 290 GB). | |
| 37 | +To disable confidence measure, set CONFPASS to 0. | |
| 38 | +To disable second and third pass, set PASS2 to 0. | |
| 39 | + | |
| 40 | +run install.sh and follow the white rabbit. | |
| 41 | + | |
| 42 | +3\ SOLR install | |
| 43 | +--------------- | |
| 44 | + | |
| 45 | +The install.sh script downloads otmedia-2013-04.tar.gz and untars it in OTMEDIA_HOME/tools/SOLR/ . | |
| 46 | +See SOLR.INSTALL file to install OTMEDIA SOLR DB. |
README
| ... | ... | @@ -5,11 +5,11 @@ |
| 5 | 5 | \___/ |_| |_| |_|_____|____/___/_/ \_\ |_____|___/_/ \_\ |
| 6 | 6 | |
| 7 | 7 | |
| 8 | -#-------------------# | |
| 9 | -# OTMEDIA LIA # | |
| 10 | -# README # | |
| 11 | -# version 1.0 # | |
| 12 | -#-------------------# | |
| 8 | +#---------------# | |
| 9 | +# OTMEDIA LIA # | |
| 10 | +# README # | |
| 11 | +# version 1.0 # | |
| 12 | +#---------------# | |
| 13 | 13 | |
| 14 | 14 | DESCRIPTION |
| 15 | 15 | ----------- |
| ... | ... | @@ -22,6 +22,13 @@ |
| 22 | 22 | Web Site : http://www.otmedia.fr |
| 23 | 23 | |
| 24 | 24 | OTMEDIA LIA project is a set of tools to transcribe radio and TV shows. |
| 25 | + It does multiple things : | |
| 26 | + - First pass : default transcription with speeral and speaker diarization. | |
| 27 | + - Second pass : speaker adaptation and a second transcription pass with speeral. | |
| 28 | + - Confidence pass : computes confidence measures from the transcription output. | |
| 29 | + - Exploit Confidence Measure : use SOLR DB data to extend the lexicon on low confidence measure and create trigg files. | |
| 30 | + - Third pass : second pass using the new lexicon and trigg files. | |
| 31 | + | |
| 25 | 32 | |
| 26 | 33 | DEPENDENCIES |
| 27 | 34 | ------------ |
| ... | ... | @@ -57,7 +64,7 @@ |
| 57 | 64 | |
| 58 | 65 | Perl is a programming language. |
| 59 | 66 | |
| 60 | -iconvi ( >= 2.0.0) | |
| 67 | +iconv ( >= 2.0.0) | |
| 61 | 68 | Available from : http://www.gnu.org |
| 62 | 69 | and debian package |
| 63 | 70 | |
| 64 | 71 | |
| 65 | 72 | |
| ... | ... | @@ -89,15 +96,16 @@ |
| 89 | 96 | |
| 90 | 97 | Quick install below. |
| 91 | 98 | |
| 92 | - Before launch installation : | |
| 99 | + Before launching installation : | |
| 93 | 100 | |
| 94 | 101 | Be certain that all dependencies are satisfied. |
| 102 | + Have 300 GB of free space for a complete install. | |
| 95 | 103 | |
| 96 | 104 | Issue the following commands to the shell : |
| 97 | 105 | $> ./install.sh |
| 98 | 106 | $> export OTMEDIA_HOME=path/to/OTMEDIA/directory |
| 99 | 107 | |
| 100 | - Read SOLR.INSTALL part 3/ to install SOLRDB. | |
| 108 | + Read SOLR.INSTALL part 3 to install SOLRDB. | |
| 101 | 109 | |
| 102 | 110 | RUNNING |
| 103 | 111 | ------- |
| ... | ... | @@ -113,6 +121,7 @@ |
| 113 | 121 | ---------- |
| 114 | 122 | |
| 115 | 123 | Many. |
| 124 | + For bug reports, please contact Pascal Nocera at pascal.nocera@univ-avignon.fr | |
| 116 | 125 | |
| 117 | 126 | COPYRIGHT |
| 118 | 127 | --------- |
SOLR.INSTALL
install.sh
| 1 | 1 | #!/bin/bash |
| 2 | 2 | |
| 3 | 3 | #-------------------# |
| 4 | +# OTMEDIA LIA # | |
| 4 | 5 | # Install script # |
| 5 | -# OTMEDIA # | |
| 6 | +# version : 1.0.0 # | |
| 6 | 7 | #-------------------# |
| 7 | 8 | |
| 8 | 9 | # Color variables |
| ... | ... | @@ -30,7 +31,7 @@ |
| 30 | 31 | # and 1 to enable |
| 31 | 32 | # |
| 32 | 33 | PASS1=1 # First Pass |
| 33 | -PASS2=1 # Second Pass | |
| 34 | +PASS2=1 # Second and Third Pass | |
| 34 | 35 | CONFPASS=1 # Confidence Pass |
| 35 | 36 | EXPLOITCONFPASS=1 # SOLR query and trigg |
| 36 | 37 | |
| ... | ... | @@ -98,6 +99,15 @@ |
| 98 | 99 | exit 1; |
| 99 | 100 | fi |
| 100 | 101 | echo -e "python : \t ${txtgrn}OK${txtrst}" |
| 102 | + | |
| 103 | + ## csh shell | |
| 104 | + test=$(whereis csh) | |
| 105 | + if [ "$test" == "csh:" ] | |
| 106 | + then | |
| 107 | + echo -e "${txtpur}ERROR${txtrst} csh shell not found\n You have to install csh shell\n sudo apt-get install csh" | |
| 108 | + exit 1; | |
| 109 | + fi | |
| 110 | + echo -e "csh shell : \t ${txtgrn}OK${txtrst}" | |
| 101 | 111 | fi |
| 102 | 112 | |
| 103 | 113 | ## Perl |
| ... | ... | @@ -118,15 +128,6 @@ |
| 118 | 128 | fi |
| 119 | 129 | echo -e "iconv : \t ${txtgrn}OK${txtrst}" |
| 120 | 130 | |
| 121 | -## csh shell | |
| 122 | -test=$(whereis csh) | |
| 123 | -if [ "$test" == "csh:" ] | |
| 124 | -then | |
| 125 | - echo -e "${txtpur}ERROR${txtrst} csh shell not found\n You have to install csh shell\n sudo apt-get install csh" | |
| 126 | - exit 1; | |
| 127 | -fi | |
| 128 | -echo -e "csh shell : \t ${txtgrn}OK${txtrst}" | |
| 129 | - | |
| 130 | 131 | ## SRI LM |
| 131 | 132 | if [ -z "$SRILM" ] && [ -z "$MACHINE_TYPE" ] |
| 132 | 133 | then |
| ... | ... | @@ -136,8 +137,6 @@ |
| 136 | 137 | export SRILM_BIN=$SRILM/bin/$MACHINE_TYPE |
| 137 | 138 | echo -e "SRILM toolkit : \t ${txtgrn}OK${txtrst}" |
| 138 | 139 | |
| 139 | - | |
| 140 | - | |
| 141 | 140 | ### Speeral Configuration ### |
| 142 | 141 | |
| 143 | 142 | echo -e "\n\t${txtblu}Speeral configuration${txtrst}\n" |
| ... | ... | @@ -190,7 +189,7 @@ |
| 190 | 189 | if [ "$test" == "" ] |
| 191 | 190 | then |
| 192 | 191 | echo -e "${txtpur}ERROR${txtrst} TOMCAT seems to not be installed)\n You have to install TOMCAT\n" |
| 193 | - exit 1; | |
| 192 | + #exit 1; | |
| 194 | 193 | fi |
| 195 | 194 | echo -e "\nTOMCAT : \t ${txtgrn}OK${txtrst}\n" |
| 196 | 195 | # SOLR secondly |
| ... | ... | @@ -234,6 +233,7 @@ |
| 234 | 233 | cp ~/.bashrc.org ~/.bashrc |
| 235 | 234 | export OTMEDIA_HOME=$PWD |
| 236 | 235 | echo "export OTMEDIA_HOME=$PWD" >> ~/.bashrc |
| 236 | +echo "export $PATH=$PATH:$PWD/main_tools" >> ~/.bashrc | |
| 237 | 237 | echo "export SRILM_BIN=$SRILM/bin/$MACHINE_TYPE" >> ~/.bashrc |
| 238 | 238 | #echo "export LIA_TAGG_LANG=french" >> ~/.bashrc |
| 239 | 239 | #echo "export LIA_TAGG=$OTMEDIA_HOME/tools/lia_ltbox/lia_tagg/" >> ~/.bashrc |
| 240 | 240 | |
| ... | ... | @@ -252,7 +252,14 @@ |
| 252 | 252 | echo -e "do : source ~/.bashrc" |
| 253 | 253 | echo -e "or set variable :\n" |
| 254 | 254 | echo "export OTMEDIA_HOME=$PWD" |
| 255 | +echo "export PATH=$PATH:$OTMEDIA_HOME/main_tools" | |
| 255 | 256 | echo "export SRILM_BIN=$SRILM/bin/$MACHINE_TYPE" |
| 257 | + | |
| 258 | + | |
| 259 | +echo " \\\\ " | |
| 260 | +echo " ,-~~~-\\\\_" | |
| 261 | +echo " ( .\ " | |
| 262 | +echo " @\___(__--'" | |
| 256 | 263 | |
| 257 | 264 | echo "${txtgrn}Yes${txtylw}I${txtpur}Rastafari{$txtrst}" |
main_tools/ExploitConfidencePass.sh
| ... | ... | @@ -47,7 +47,7 @@ |
| 47 | 47 | #---------------# |
| 48 | 48 | # Parse Options # |
| 49 | 49 | #---------------# |
| 50 | -while getopts ":hDv:cf:r" opt | |
| 50 | +while getopts ":hDv:cr" opt | |
| 51 | 51 | do |
| 52 | 52 | case $opt in |
| 53 | 53 | h) |
| ... | ... | @@ -57,7 +57,6 @@ |
| 57 | 57 | echo -e "\t\t-D :\tDEBUG mode on" |
| 58 | 58 | echo -e "\t\t-v l :\tVerbose mode, l=(1|2|3) level mode" |
| 59 | 59 | echo -e "\t\t-c :\tCheck process, stop if error detected" |
| 60 | - echo -e "\t\t-f n :\tspecify a speeral forks number (default 1)" | |
| 61 | 60 | echo -e "\t\t-r n :\tforce rerun without deleting files" |
| 62 | 61 | exit 1 |
| 63 | 62 | ;; |
| ... | ... | @@ -69,9 +68,6 @@ |
| 69 | 68 | ;; |
| 70 | 69 | c) |
| 71 | 70 | CHECK=1 |
| 72 | - ;; | |
| 73 | - f) | |
| 74 | - FORKS="--forks $OPTARG" | |
| 75 | 71 | ;; |
| 76 | 72 | r) |
| 77 | 73 | RERUN=1 |