Blame view
egs/csj/s5/local/csj_make_trans/csj_autorun.sh
6.84 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 |
#! /bin/bash

# Copyright  2015 Tokyo Institute of Technology (Authors: Takafumi Moriya and Takahiro Shinozaki)
#            2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
# Apache 2.0
# Acknowledgement  This work was supported by JSPS KAKENHI Grant Number 26280055.

# Converts the original CSJ distribution into per-lecture transcription,
# wav-list and lexicon-source files, then builds the evaluation sets and the
# lexicon.
#
# Arguments:
#   $1  <speech-dir>         root directory of the CSJ distribution
#   $2  <transcription-dir>  output directory (created if missing)
#   $3  <csj-version>        directory layout of the distribution:
#                            "usb", "dvd" or "merl"
#
# The work is split into three stages. Each stage writes a ".done_*" marker
# file into <transcription-dir> and is skipped when its marker already exists.
# Must be run from the directory that contains local/csj_make_trans/.

if [ $# -ne 3 ]; then
  echo "Usage: $(basename "$0") <speech-dir> <transcription-dir> <csj-version>"
  echo "e.g., $(basename "$0") /database/NINJAL/CSJ data/csj-data usb (or dvd)"
  echo "See comments in the script for more details"
  exit 1
fi

resource=$1
outd=$2
csjv=$3

set -e # exit on error

# Per-layout settings:
#   SDB  - sub-directory holding the .sdb transcription files
#   WAV  - sub-directory holding the wav files
#   disc - space-separated list of volumes to process
case "$csjv" in
  "merl" ) SDB=sdb/ ; WAV=WAV/ ; disc=CSJ2004 ;;
  "usb" ) SDB=MORPH/SDB/ ; WAV=WAV/ ; disc="core noncore" ;;
  # DVD volumes are named dvd3 ... dvd17.
  "dvd" ) num=dvd ; SDB= ; WAV= ; disc=$(echo "$num"{3..17}) ;;
  *) echo "Input variable is usb or dvd only. $csjv is UNAVAILABLE VERSION." && exit 1 ;;
esac

if [ ! -e "$resource" ]; then
  echo "Not exist CSJ or incorrect PATH."
  exit 1
fi

# Moves the generated lecture directories $outd/<vol>/<id> into a destination.
# Globals:   outd, disc (read)
# Arguments: $1 - destination directory; remaining - lecture IDs
# Lecture IDs that were not generated are skipped silently; the callers
# verify the resulting directory counts afterwards.
move_lectures() {
  local dest=$1 id vol
  shift
  for id in "$@"; do
    for vol in $disc; do
      if [ -d "$outd/$vol/$id" ]; then
        mv "$outd/$vol/$id" "$dest"
      fi
    done
  done
}

## Make transcription, wav list and lexicon-source file for each lecture.
if [ ! -e "$outd/.done_make_trans" ]; then
(
  echo "Make Transcription and PATH of WAV file."
  mkdir -p "$outd"
  rm -f "$outd/README.txt"
  # NOTE(review): the line layout of this README was reconstructed; the wording
  # is unchanged.
  cat > "$outd/README.txt" <<'EOF'
Contents about generated directory and file
## About each directory
{dvd3(dvd) or core(usb)} :Contain training data
eval/                    :Official evaluation data set ( *** Extract from dvd *** )
excluded/                :Same speaker data including evaluation data (e.g. A01M0056) ( *** Extract from dvd *** )
## About each file
{dvd3(dvd) or core(usb)}/A01F0055
A01F0055-trans.text      :Transcriptions (utterances with R-tags are removed)
A01F0055-wav.list        :Path about existing wav file
A01F0055.4lex            :File for making lexicon
EOF

  # nkf is not called here directly; presumably the Perl helpers need it.
  if ! command -v nkf >/dev/null 2>&1; then
    echo "This processing is need to prepare \"nkf\" command. Please retry after installing command \"nkf\"."
    exit 1
  fi

  # One background job per volume.
  for vol in $disc; do
    mkdir -p "$outd/$vol"
    # Drop a stale marker so an earlier run cannot mask a failure of this one.
    rm -f "$outd/$vol/.done_$vol"
    (
      # Lecture IDs are the directory entries with ".sdb" stripped; the
      # 00README.txt entry is not a lecture.
      if [ "$csjv" = "merl" ]; then
        ids=$(ls "$resource/$vol/$SDB" | sed -e 's:\.sdb$::' -e '/^00README\.txt$/d')
      else
        ids=$(ls "$resource/${SDB}$vol" | sed -e 's:\.sdb$::' -e '/^00README\.txt$/d')
      fi

      for id in $ids; do
        mkdir -p "$outd/$vol/$id"

        case "$csjv" in
          "usb" ) TPATH="$resource/${SDB}$vol" ; WPATH="$resource/${WAV}$vol" ;;
          "dvd" ) TPATH="$resource/$vol/$id" ; WPATH="$resource/$vol/$id" ;;
          "merl" ) TPATH="$resource/$vol/$SDB" ; WPATH="$resource/$vol/$WAV" ;;
        esac

        local/csj_make_trans/csj2kaldi4m.pl "$TPATH/${id}.sdb" "$outd/$vol/$id/${id}.4lex" "$outd/$vol/$id/${id}.4trn.t" || exit 1
        local/csj_make_trans/csjconnect.pl 0.5 10 "$outd/$vol/$id/${id}.4trn.t" "$id" > "$outd/$vol/$id/${id}-trans.text" || exit 1
        rm "$outd/$vol/$id/${id}.4trn.t"

        # A lecture has either a pair of <id>-L.wav / <id>-R.wav files or a
        # single <id>.wav file.
        if [ -e "$WPATH/${id}-L.wav" ]; then
          find "$WPATH" -iname "${id}-[LR].wav" > "$outd/$vol/$id/${id}-wav.list"
        else
          find "$WPATH" -iname "${id}.wav" > "$outd/$vol/$id/${id}-wav.list" || exit 1
        fi
      done

      # Sanity check on the last lecture processed in this volume.
      if [ -s "$outd/$vol/$id/${id}-trans.text" ]; then
        echo -n > "$outd/$vol/.done_$vol"
        echo "Complete processing transcription data in $vol"
      else
        echo "Bad processing of making transcriptions part"
        exit 1
      fi
    ) &
  done
  # A plain "wait" does not report the status of the jobs, so success is
  # judged from the per-volume markers: every volume must have written one.
  wait

  for vol in $disc; do
    if [ ! -e "$outd/$vol/.done_$vol" ]; then
      echo "Bad processing of making transcriptions part"
      exit 1
    fi
  done
  echo -n > "$outd/.done_make_trans"
  echo "Done!"
)
fi

## Exclude speech data given by test set speakers.
if [ ! -e "$outd/.done_mv_eval_dup" ]; then
(
  echo "Make evaluation set 1, 2, 3. And exclude speech data given by test set speakers."
  mkdir -p "$outd/eval" "$outd/excluded"
  mkdir -p "$outd"/eval/eval{1,2,3}
  rm -f "$outd"/eval/.done_eval{1,2,3}

  # Exclude speaker ID
  A01M0056="S05M0613 R00M0187 D01M0019 D04M0056 D02M0028 D03M0017"

  # Evaluation set ID
  eval1="A01M0110 A01M0137 A01M0097 A04M0123 A04M0121 A04M0051 A03M0156 A03M0112 A03M0106 A05M0011"
  eval2="A01M0056 A03F0072 A02M0012 A03M0016 A06M0064 A06F0135 A01F0034 A01F0063 A01F0001 A01M0141"
  eval3="S00M0112 S00F0066 S00M0213 S00F0019 S00M0079 S01F0105 S00F0152 S00M0070 S00M0008 S00F0148"

  # Speech data given by test set speakers (e.g. eval2 : A01M0056)
  move_lectures "$outd/excluded" $A01M0056

  # Evaluation data
  move_lectures "$outd/eval/eval1" $eval1
  move_lectures "$outd/eval/eval2" $eval2
  move_lectures "$outd/eval/eval3" $eval3

  # Each evaluation set must contain exactly 10 lectures.
  for n in 1 2 3; do
    if [ 10 -eq $(ls "$outd/eval/eval$n" | wc -l) ]; then
      echo -n > "$outd/eval/.done_eval$n"
    fi
  done

  if [ 3 -eq $(ls -a "$outd/eval" | grep -c done_eval) ]; then
    echo -n > "$outd/.done_mv_eval_dup"
    echo "Done!"
  else
    echo "Bad processing of making evaluation set part"
    exit 1
  fi
)
fi

## make lexicon.txt
if [ ! -e "$outd/.done_make_lexicon" ]; then
  echo "Make lexicon file."
  (
    lexicon=$outd/lexicon
    rm -f "$outd/lexicon/lexicon.txt"
    mkdir -p "$lexicon"
    # The glob is two levels deep, so the lectures under eval/evalN/ are not
    # matched and do not contribute to the lexicon.
    cat "$outd"/*/*/*.4lex | grep -v "+ー" | grep -v "++" | grep -v "×" > "$lexicon/lexicon.txt"
    sort -u "$lexicon/lexicon.txt" > "$lexicon/lexicon_htk.txt"
    local/csj_make_trans/vocab2dic.pl -p local/csj_make_trans/kana2phone -e "$lexicon/ERROR_v2d" -o "$lexicon/lexicon.txt" "$lexicon/lexicon_htk.txt"
    cut -d'+' -f1,3- "$lexicon/lexicon.txt" > "$lexicon/lexicon_htk.txt"
    cut -f1,3- "$lexicon/lexicon_htk.txt" | perl -ape 's:\t: :g' > "$lexicon/lexicon.txt"

    if [ -s "$lexicon/lexicon.txt" ]; then
      echo -n > "$outd/.done_make_lexicon"
      echo "Done!"
    else
      echo "Bad processing of making lexicon file"
      exit 1
    fi
  )
fi

# All three stage markers must be present before the run is declared complete.
if [ ! 3 -le $(ls -a "$outd" | grep -c done) ]; then
  echo "ERROR : Processing is incorrect."
  exit 1
fi

echo "Finish processing original CSJ data" && echo -n > "$outd/.done_make_all"