Blame view
egs/voxforge/s5/local/voxforge_data_prep.sh
5.42 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
#!/bin/bash

# Copyright 2012  Vassil Panayotov
#           2014  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# Makes train/test splits
#
# Usage: voxforge_data_prep.sh <data-directory>
#
# <data-directory> is expected to contain one sub-directory (or symlink) per
# recording session, each holding etc/README, etc/PROMPTS and a wav/ or flac/
# directory.  The script writes, under data/local:
#   {test,train}_wav.scp   {test,train}.utt2spk   {test,train}_trans.txt
#   {test,train}.spk2utt   spk2gender
# Intermediate files go to data/local/tmp, transcript problems are logged to
# exp/data_prep/make_trans.log.
#
# nspk_test is set before utils/parse_options.sh is sourced, so it can
# presumably be overridden from the command line (see that script for the
# exact option syntax).

. ./path.sh

echo "=== Starting initial VoxForge data preparation ..."

echo "--- Making test/train data split ..."

# The number of speakers in the test set
nspk_test=30

. utils/parse_options.sh

if [ $# -ne 1 ]; then
  echo "Usage: $0 <data-directory>"
  exit 1
fi

command -v flac >/dev/null 2>&1 || {
  echo "FLAC decompressor needed but not found"'!'
  exit 1
}

DATA=$1

locdata=data/local
loctmp=$locdata/tmp
rm -rf -- "${loctmp:?}" >/dev/null 2>&1
mkdir -p "$locdata"
mkdir -p "$loctmp"

# Prints the base name of every directory or symlink directly under $DATA,
# one per line.
list_data_dirs() {
  find "${DATA}/" -mindepth 1 -maxdepth 1 \( -type l -or -type d \) |
    while IFS= read -r entry; do
      basename -- "$entry"
    done
}

# The Perl substitution below is quite messy because some of the directory
# names don't follow the "speaker-YYYYMMDD-<random_3letter_suffix>" convention.
# A line is printed only when the substitution succeeds ("&& print"), which
# filters out the directories not matched by the expression.
find "$DATA/" -mindepth 1 -maxdepth 1 |
  perl -ane ' s:.*/((.+)\-[0-9]{8,10}[a-z]*([_\-].*)?):$2: && print; ' |
  sort -u > "$loctmp/speakers_all.txt"

nspk_all=$(wc -l < "$loctmp/speakers_all.txt")
if [ "$nspk_test" -ge "$nspk_all" ]; then
  echo "${nspk_test} test speakers requested, but there are only ${nspk_all} speakers in total!"
  exit 1
fi

# Random selection of the test speakers; everybody else goes to train.
utils/shuffle_list.pl < "$loctmp/speakers_all.txt" |
  head -n "$nspk_test" |
  sort -u > "$loctmp/speakers_test.txt"

awk 'NR==FNR{spk[$0]; next} !($0 in spk)' \
  "$loctmp/speakers_test.txt" "$loctmp/speakers_all.txt" |
  sort -u > "$loctmp/speakers_train.txt"

wc -l "$loctmp/speakers_all.txt"
wc -l "$loctmp"/speakers_{train,test}.txt

# expand speaker names to their respective directories
# (a directory belongs to a speaker when the text before its first "-"
# equals the speaker name)
list_data_dirs |
  awk 'BEGIN {FS="-"} NR==FNR{arr[$1]; next;} ($1 in arr)' \
    "$loctmp/speakers_test.txt" - |
  sort > "$loctmp/dir_test.txt"
if [ ! -s "$loctmp/dir_test.txt" ]; then
  echo "$0: file $loctmp/dir_test.txt is empty"
  exit 1
fi

list_data_dirs |
  awk 'BEGIN {FS="-"} NR==FNR{arr[$1]; next;} ($1 in arr)' \
    "$loctmp/speakers_train.txt" - |
  sort > "$loctmp/dir_train.txt"
# Check the train list here (the test list was already checked above).
if [ ! -s "$loctmp/dir_train.txt" ]; then
  echo "$0: file $loctmp/dir_train.txt is empty"
  exit 1
fi

logdir=exp/data_prep
mkdir -p "$logdir"
: > "$logdir/make_trans.log"

# Start from a clean slate: spk2gender.tmp is appended to below, so a copy
# left over from a previous run would leak stale entries into spk2gender.
rm -f -- "${locdata}/spk2gender"
: > "${locdata}/spk2gender.tmp"

for s in test train; do
  echo "--- Preparing ${s}_wav.scp, ${s}_trans.txt and ${s}.utt2spk ..."

  # The directory list is read on fd 3 so that commands inside the loop
  # cannot accidentally consume it through stdin.
  while IFS= read -r d <&3; do
    spkname=${d%%-*}   # text before the first "-"
    spksfx=${d#*-}     # text after the first "-"
    idpfx="${spkname}-${spksfx}"
    dir=${DATA}/$d

    # Case-insensitive README lookup; if several files match, use the first.
    rdm=$(find "$dir/etc/" -iname 'readme' | head -n 1)
    if [[ -z "$rdm" ]]; then
      echo "No README file for $d - skipping this directory ..."
      continue
    fi
    # First character following "gender:" (after any non-word characters),
    # lower-cased.
    spkgender=$(perl -ane ' s/.*gender\:\W*(.).*/lc($1)/ei && print; ' < "$rdm")
    if [[ "$spkgender" != "f" && "$spkgender" != "m" ]]; then
      echo "Illegal or empty gender ($spkgender) for \"$d\" - assuming m(ale) ..."
      spkgender="m"
    fi
    echo "$spkname $spkgender" >> "$locdata/spk2gender.tmp"

    if [[ ! -f "${dir}/etc/PROMPTS" ]]; then
      echo "No etc/PROMPTS file exists in $dir - skipping the dir ..." \
        >> "$logdir/make_trans.log"
      continue
    fi

    if [[ -d "${dir}/wav" ]]; then
      wavtype=wav
    elif [[ -d "${dir}/flac" ]]; then
      wavtype=flac
    else
      echo "No 'wav' or 'flac' dir in $dir - skipping ..."
      continue
    fi

    all_wavs=()
    all_utt2spk_entries=()
    # Everything echoed to stdout inside this loop is a wav.scp entry.
    for w in "${dir}/${wavtype}"/*"${wavtype}"; do
      bw=$(basename -- "$w")
      wavname=${bw%.$wavtype}
      id="${idpfx}-${wavname}"
      # Also catches the case where the glob matched nothing and $w is the
      # literal pattern.
      if [[ ! -s "$w" ]]; then
        echo "$w is zero-size - skipping ..." 1>&2
        continue
      fi
      # Only usable audio is recorded, so that transcripts (and spk2utt,
      # which is derived from them) stay consistent with wav.scp/utt2spk.
      all_wavs+=("$wavname")
      if [[ "$wavtype" == "wav" ]]; then
        echo "$id $w"
      else
        echo "$id flac -c -d --silent $w |"
      fi
      all_utt2spk_entries+=("$id $spkname")
    done >> "${loctmp}/${s}_wav.scp.unsorted"

    for a in "${all_utt2spk_entries[@]}"; do
      printf '%s\n' "$a"
    done >> "$loctmp/${s}.utt2spk.unsorted"

    if [[ ! -f "${loctmp}/${s}_wav.scp.unsorted" ]]; then
      echo "$0: processed no data: error: pattern ${dir}/${wavtype}/*${wavtype} might match nothing"
      exit 1
    fi

    # Nothing usable in this directory: there is nothing to transcribe.
    if [[ ${#all_wavs[@]} -eq 0 ]]; then
      echo "No usable audio in $dir - skipping ..." 1>&2
      continue
    fi

    # local/make_trans.py receives the PROMPTS file, the utterance-id prefix
    # and the wav base names; its stdout is collected as transcripts and its
    # stderr goes to the transcript log.
    local/make_trans.py "$dir/etc/PROMPTS" "${idpfx}" "${all_wavs[@]}" \
      2>> "${logdir}/make_trans.log" >> "${loctmp}/${s}_trans.txt.unsorted"
  done 3< "$loctmp/dir_${s}.txt"

  # filter out the audio for which there is no proper transcript
  awk 'NR==FNR{trans[$1]; next} ($1 in trans)' FS=" " \
    "${loctmp}/${s}_trans.txt.unsorted" "${loctmp}/${s}_wav.scp.unsorted" |
    sort -k1 > "${locdata}/${s}_wav.scp"

  awk 'NR==FNR{trans[$1]; next} ($1 in trans)' FS=" " \
    "${loctmp}/${s}_trans.txt.unsorted" "$loctmp/${s}.utt2spk.unsorted" |
    sort -k1 > "${locdata}/${s}.utt2spk"

  sort -k1 < "${loctmp}/${s}_trans.txt.unsorted" > "${locdata}/${s}_trans.txt"

  echo "--- Preparing ${s}.spk2utt ..."
  # Group utterance ids by speaker, i.e. by the text before the first "-".
  cut -f1 -d' ' "$locdata/${s}_trans.txt" |
    awk 'BEGIN {FS="-"} {names[$1]=names[$1] " " $0;} END {for (k in names) {print k, names[k];}}' |
    sort -k1 > "$locdata/${s}.spk2utt"
done

# Every line in the log counts as one problem (this includes the
# "No etc/PROMPTS" notices written above).
trans_err=$(wc -l < "${logdir}/make_trans.log")
if [ "${trans_err}" -ge 1 ]; then
  echo -n "$trans_err errors detected in the transcripts."
  echo " Check ${logdir}/make_trans.log for details!"
fi

# One line per speaker; if a speaker was seen several times the last gender
# recorded wins.
awk '{spk[$1]=$2;} END{for (s in spk) print s " " spk[s]}' \
  "$locdata/spk2gender.tmp" | sort -k1 > "$locdata/spk2gender"

echo "*** Initial VoxForge data preparation finished!"