Blame view
egs/zeroth_korean/s5/local/data_prep.sh
3.03 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
#!/bin/bash

# Copyright 2018 Atlas Guide (Author : Lucas Jo)
#           2018 Gridspace Inc. (Author: Wonkyum Lee)
# Apache 2.0

# Modified by Lucas Jo 2017 (Atlas Guide)

# Builds the data directory data/<part> for one part of the corpus.
#
# Usage: local/data_prep.sh <db-dir> <part>
#
# Reads:
#   <db-dir>/AUDIO_INFO
#       '|'-separated speaker table; a row is selected when it starts with
#       "<reader>|", and its 3rd column (spaces removed, lower-cased) must be
#       'm' or 'f'.
#   <db-dir>/<part>/<scriptid>/<reader>/*.flac
#   <db-dir>/<part>/<scriptid>/<reader>/<reader>_<scriptid>.trans.txt
#       Both <scriptid> and <reader> must be all-digit directory names.
#
# Writes into data/<part>:
#   wav.scp     "<utt> flac -c -d -s <path> |"  (one line per .flac file)
#   text        concatenation of all *.trans.txt files
#   utt2spk     "<utt> <reader>_<scriptid>"     (speaker = reader per script)
#   spk2gender  "<reader>_<scriptid> <m|f>"
#   spk2utt     derived from utt2spk by utils/utt2spk_to_spk2utt.pl
# and then runs utils/data/get_utt2dur.sh and utils/validate_data_dir.sh on it.
#
# The utils/ scripts are called by relative path, so the script must be run
# from a directory that contains utils/.
#
# Exit status: 0 on success, 1 on any error (message on stderr).

# -u: catch typos in variable names; pipefail: make the "|| exit 1" guards
# after pipelines see a failure in any stage, not only the last one.
set -uo pipefail

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <db-dir> <part>" >&2
  echo "e.g.: $0 ./db train_data_01" >&2
  exit 1
fi

db_dir=$1
data_part=$2

src=${db_dir}/${data_part}
dst=data/${data_part}
spk_file=${db_dir}/AUDIO_INFO

# all utterances are FLAC compressed
if ! command -v flac >/dev/null 2>&1; then
  die "Please install 'flac' on ALL worker nodes!"
fi

# Validate the inputs before creating anything on disk.
[ -d "$src" ] || die "$0: no such directory $src"
[ -f "$spk_file" ] || die "$0: expected file $spk_file to exist"

mkdir -p "$dst" || exit 1

wav_scp=$dst/wav.scp
trans=$dst/text
utt2spk=$dst/utt2spk
spk2gender=$dst/spk2gender
utt2dur=$dst/utt2dur
spk2utt=$dst/spk2utt

# The loops below only append, so drop results of any previous run and start
# from empty files (which also guarantees the files exist for the sort step).
rm -f -- "$wav_scp" "$trans" "$utt2spk" "$spk2gender" "$utt2dur"
for f in "$wav_scp" "$trans" "$utt2spk" "$spk2gender"; do
  : >"$f" || exit 1
done

# Directory names are used as ids and must consist of digits only.
int_re='^[0-9]+$'

# The directory lists are read on fds 3 and 4 so that nothing inside the loop
# bodies can accidentally consume them through stdin.
while IFS= read -r scriptid_dir <&3; do
  scriptid=${scriptid_dir##*/}
  [[ $scriptid =~ $int_re ]] ||
    die "$0: unexpected subdirectory name $scriptid"

  while IFS= read -r reader_dir <&4; do
    reader=${reader_dir##*/}
    [[ $reader =~ $int_re ]] ||
      die "$0: unexpected reader-subdirectory name $reader"

    # Look up the row that starts with "<reader>|" and take its 3rd column.
    reader_gender=$(awk -F'|' -v "id=$reader" \
      'index($0, id "|") == 1 {gsub(/[ ]+/, ""); print tolower($3)}' \
      "$spk_file")
    if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then
      die "Unexpected gender: '$reader_gender'"
    fi

    echo " $scriptid $reader $reader_gender"

    # One wav.scp entry per FLAC file. The utterance id is the file name
    # without directory and extension; the command uses the path exactly as
    # find reported it, so it is valid for any file -iname matched.
    find -L "$reader_dir" -iname "*.flac" | sort |
      awk '{
        utt = $0
        sub(/^.*\//, "", utt)
        sub(/\.[Ff][Ll][Aa][Cc]$/, "", utt)
        printf "%s flac -c -d -s %s |\n", utt, $0
      }' >>"$wav_scp" || exit 1

    reader_trans=$reader_dir/${reader}_${scriptid}.trans.txt
    [ -f "$reader_trans" ] ||
      die "$0: expected file $reader_trans to exist"
    cat -- "$reader_trans" >>"$trans" || exit 1

    # NOTE: Each chapter is dedicated to each speaker.
    awk -v "reader=$reader" -v "scriptid=$scriptid" \
      '{printf "%s %s_%s\n", $1, reader, scriptid}' \
      <"$reader_trans" >>"$utt2spk" || exit 1

    # reader -> gender map (again using per-chapter granularity)
    echo "${reader}_${scriptid} $reader_gender" >>"$spk2gender"
  done 4< <(find -L "$scriptid_dir" -mindepth 1 -maxdepth 1 -type d | sort)
done 3< <(find -L "$src" -mindepth 1 -maxdepth 1 -type d | sort)

[ -s "$wav_scp" ] || die "$0: no .flac files found under $src"

# Sort every table in place; "sort -o" may safely name its own input, so no
# scratch file in the current directory is needed.
for f in "$wav_scp" "$trans" "$utt2spk" "$spk2gender"; do
  sort -o "$f" "$f" || exit 1
done

utils/utt2spk_to_spk2utt.pl <"$utt2spk" >"$spk2utt" || exit 1

# Every transcript line must have produced exactly one utt2spk line.
ntrans=$(wc -l <"$trans")
nutt2spk=$(wc -l <"$utt2spk")
if [ "$ntrans" -ne "$nutt2spk" ]; then
  die "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)"
fi

utils/data/get_utt2dur.sh "$dst" 1>&2 || exit 1

utils/validate_data_dir.sh --no-feats "$dst" || exit 1

echo "$0: successfully prepared data in $dst"

exit 0