Blame view
egs/csj/s5/local/csj_data_prep.sh
4.22 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
#!/bin/bash

# Copyright  2015 Tokyo Institute of Technology (Authors: Takafumi Moriya and Takahiro Shinozaki)
#            2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
# Apache 2.0
# Acknowledgement  This work was supported by JSPS KAKENHI Grant Number 26280055.

# CSJ 291 hours training data preparation.
# "Academic lecture" 270 hours (Exclude the speakers in the evaluation set from all speech data)
#  + "Other" 21 hours
# Actually, the amount of training data is 240 hours; this excludes R-tag
# utterances and silence sections.
#
# Usage: csj_data_prep.sh <csj-data dir> [<mode_number>]
#   <csj-data dir>  directory holding lexicon/lexicon.txt plus, two levels
#                   down, the *-wav.list and *-trans.text files.
#   <mode_number>   0 (default) = "Academic lecture" + "Other" data,
#                   1 = "Academic lecture" data only,
#                   2 = all data except "dialog" data,
#                   3 = all data.
#
# Files written to data/local/train: lexicon.txt, wav.flist, transcripts1.txt,
# text, segments, wavflist.scp, wav.scp, utt2spk, spk2utt.
# spk2utt, utt2spk, wav.scp, text and segments are then copied to data/train.
#
# To be run from one directory above this script.

## The input is a directory that contains the CSJ corpus.
## Note: If necessary, rewrite the "cat" command used in the following
## to locate the .wav file path.

. ./path.sh
set -e # exit on error

# Check the number of arguments.
if [ $# -ne 1 ] && [ $# -ne 2 ]; then
  echo "Usage: csj_data_prep.sh <csj-data dir> [<mode_number>]"
  echo " mode_number can be 0, 1, 2, 3, (0=default using academic lecture and other data, 1=using academic lecture data,"
  echo " 2=using all data except for dialog data, 3=using all data)"
  exit 1;
fi

CSJ=$1
mode=0

if [ $# -eq 2 ]; then
  mode=$2
fi

dir=data/local/train
mkdir -p "$dir"

# Audio data directory check
if [ ! -d "$CSJ" ]; then
  echo "Error: run.sh requires a directory argument"
  exit 1;
fi

# CSJ dictionary file check: copy the lexicon only if it is not there yet.
if [ ! -f "$dir/lexicon.txt" ]; then
  cp "$CSJ/lexicon/lexicon.txt" "$dir" || exit 1;
fi

### Config of using wav data that relates with acoustic model training ###
# "2>/dev/null" is intentional: a glob alternative that matches nothing is
# handed to cat literally, and the resulting error is deliberately ignored.
# expected_n stays empty unless the selected subset has a known file count.
expected_n=
if [ "$mode" -eq 3 ]; then
  # Using all data
  cat "$CSJ"/*/*/*-wav.list 2>/dev/null | sort > "$dir/wav.flist"
elif [ "$mode" -eq 2 ]; then
  # Using all data except for "dialog" data
  cat "$CSJ"/*/{A*,M*,R*,S*}/*-wav.list 2>/dev/null | sort > "$dir/wav.flist"
elif [ "$mode" -eq 1 ]; then
  # Using "Academic lecture" data
  cat "$CSJ"/*/A*/*-wav.list 2>/dev/null | sort > "$dir/wav.flist"
else
  # Using "Academic lecture" and "other" data (default)
  cat "$CSJ"/*/{A,M}*/*-wav.list 2>/dev/null | sort > "$dir/wav.flist"
  expected_n=986
fi

# Arithmetic expansion strips the padding some wc implementations emit.
n=$(( $(wc -l < "$dir/wav.flist") ))

# The 986-file figure only describes the default selection, so the sanity
# check is limited to that case instead of warning spuriously in modes 1-3.
if [ -n "$expected_n" ] && [ "$n" -ne "$expected_n" ]; then
  echo "Warning: expected 986 data files (Case : Using 'Academic lecture' and 'Other' data), found $n."
fi

# (1a) Transcriptions preparation
# make basic transcription file (add segments info)

## e.g. A01F0055_0172 00380.213 00385.951 => A01F0055_0380213_0385951
## for CSJ
# One output line per input line: the new utterance id followed by the
# lower-cased words (fields 4..NF).
# The sort runs under LC_ALL=C so that the "sort -c" check below, which is
# executed after "export LC_ALL=C", verifies the same collation order.
awk '{
       spkutt_id=$1;
       split(spkutt_id,T,"[_ ]");
       name=T[1]; stime=$2; etime=$3;
       printf("%s_%07.0f_%07.0f",name, int(1000*stime), int(1000*etime));
       for(i=4;i<=NF;i++) printf(" %s", tolower($i));
       printf "\n"
     }' "$CSJ"/*/*/*-trans.text | LC_ALL=C sort > "$dir/transcripts1.txt"
# This data is for training language models
# Except evaluation set (30 speakers)

# test if trans. file is sorted
export LC_ALL=C;
sort -c "$dir/transcripts1.txt" || exit 1; # check it's sorted.

# Remove Option.
# **NOTE: modified the pattern matches to make them case insensitive
# Strip <s> and </s> markers, then drop lines left with only the utterance id.
perl -ne 's:\<s\>::gi;
          s:\<\/s\>::gi;
          print;' "$dir/transcripts1.txt" \
  | awk 'NF > 1' | sort > "$dir/text"

# (1c) Make segments files from transcript
# segments file format is: utt-id rec-id start-time end-time, e.g.:
# A01F0055_0380213_0385951 => A01F0055_0380213_0385951 A01F0055 380.213 385.951
# Times are printed with an explicit %.3f: concatenating the bare quotient
# would go through awk's default CONVFMT (%.6g) and lose millisecond
# precision once a time reaches 1000 seconds.
awk '{
       segment=$1;
       split(segment,S,"[_]");
       spkid=S[1]; startf=S[2]; endf=S[3];
       printf("%s %s %.3f %.3f\n", segment, spkid, startf/1000, endf/1000);
     }' < "$dir/text" > "$dir/segments"

# Derive the recording id from each wav path: drop the directory part, the
# ".wav" extension and a "-R"/"-L" tag, then pair the id with the full path.
sed -e 's?.*/??' -e 's?\.wav??' -e 's?-[RL]??' "$dir/wav.flist" \
  | paste - "$dir/wav.flist" > "$dir/wavflist.scp"

# wav.scp entries are "<rec-id> cat <path> |", one per line.
awk '{
       printf("%s cat %s |\n", $1, $2);
     }' < "$dir/wavflist.scp" | sort > "$dir/wav.scp" || exit 1;

# The part of the utterance id before the first "_" is used as the speaker id.
awk '{segment=$1; split(segment,S,"[_]"); spkid=S[1]; print $1 " " spkid}' \
  "$dir/segments" > "$dir/utt2spk" || exit 1;

sort -k 2 "$dir/utt2spk" | utils/utt2spk_to_spk2utt.pl > "$dir/spk2utt" || exit 1;

# Copy stuff into its final locations [this has been moved from the format_data script]
mkdir -p data/train
for f in spk2utt utt2spk wav.scp text segments; do
  cp "$dir/$f" data/train/ || exit 1;
done

echo "CSJ data preparation succeeded."

utils/fix_data_dir.sh data/train