Blame view
egs/formosa/s5/local/prepare_data.sh
2.37 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
#!/bin/bash
# Copyright 2015-2016  Sarah Flora Juan
# Copyright 2016  Johns Hopkins University (Author: Yenda Trmal)
# Copyright 2018  Yuan-Fu Liao, National Taipei University of Technology
#                 AsusTek Computer Inc. (Author: Alex Hung)
# Apache 2.0
#
# Builds the data directories used by the rest of the recipe:
#   data/all               every utterance found under $train_dir
#   data/train, data/test  split of data/all; the test side holds the
#                          entries whose ID matches JZ_, GJ_, KX_ or YX_
#   data/eval              audio from $eval_dir, transcripts from $eval_key_dir
#   data/local/train/text  training transcripts with the utterance ID removed
#
# Every utterance is written as its own speaker (utt2spk maps ID -> ID).
# Must be run from the recipe directory: ./path.sh and utils/ are referenced
# by relative path.

set -e -o pipefail

train_dir=NER-Trs-Vol1/Train
eval_dir=NER-Trs-Vol1-Eval
eval_key_dir=NER-Trs-Vol1-Eval-Key

. ./path.sh
# NOTE(review): parse_options.sh presumably lets --train-dir, --eval-dir and
# --eval-key-dir override the defaults above; confirm against that script.
. parse_options.sh

# Prints one "<utt-id> <utt-id>" line per *.wav file below directory $1.
# The utterance ID is the file name without its .wav suffix.
print_utt2spk() {
  # The path is handed to the inner shell as $1 instead of being spliced
  # into the program text, so unusual file names cannot be run as code.
  find "$1" -name '*.wav' -exec sh -c '
    id=$(basename -s .wav "$1")
    printf "%s %s\n" "$id" "$id"' sh {} \;
}

# Prints one "<utt-id> <path-to-wav>" line per *.wav file below directory $1.
print_wav_scp() {
  find "$1" -name '*.wav' -exec sh -c '
    id=$(basename -s .wav "$1")
    printf "%s %s\n" "$id" "$1"' sh {} \;
}

# Prints "<utt-id> <contents of file>" for every *.txt file below directory $1.
# NOTE(review): assumes each transcript file holds a single line that ends
# with a line terminator; the file contents are copied through unchanged.
print_text() {
  find "$1" -name '*.txt' -exec sh -c '
    printf "%s " "$(basename -s .txt "$1")"
    cat "$1"' sh {} \;
}

# Stop before anything is deleted if one of the corpus directories is missing.
for x in "$train_dir" "$eval_dir" "$eval_key_dir"; do
  if [ ! -d "$x" ]; then
    echo >&2 "The directory $x does not exist"
    exit 1
  fi
done

if ! command -v dos2unix >/dev/null 2>&1; then
  echo >&2 "dos2unix not found on PATH. Please install it manually."
  exit 1
fi

# have to remove previous files to avoid filtering speakers according to
# cmvn.scp and feats.scp
rm -rf data/all data/train data/test data/eval data/local/train
mkdir -p data/all data/train data/test data/eval data/local/train

# make utt2spk, wav.scp and text
print_utt2spk "$train_dir" | dos2unix > data/all/utt2spk
print_wav_scp "$train_dir" | dos2unix > data/all/wav.scp
print_text "$train_dir" | dos2unix > data/all/text

# fix_data_dir.sh fixes common mistakes (unsorted entries in wav.scp,
# duplicate entries and so on). Also, it regenerates the spk2utt from
# utt2spk
utils/fix_data_dir.sh data/all

echo "Preparing train and test data"
# test set: JZ, GJ, KX, YX
grep -E "(JZ|GJ|KX|YX)_" data/all/utt2spk | awk '{print $1}' > data/all/cv.spk
utils/subset_data_dir_tr_cv.sh --cv-spk-list data/all/cv.spk \
  data/all data/train data/test

# for LM training
echo "cp data/train/text data/local/train/text for language model training"
# First pass blanks the utterance ID; second pass rebuilds the record so the
# leading separator left behind by the blank field disappears.
awk '{$1=""}1;' data/train/text | awk '{$1=$1}1;' > data/local/train/text

# preparing EVAL set.
print_utt2spk "$eval_dir" | dos2unix > data/eval/utt2spk
print_wav_scp "$eval_dir" | dos2unix > data/eval/wav.scp
print_text "$eval_key_dir" | dos2unix > data/eval/text
utils/fix_data_dir.sh data/eval

echo "Data preparation completed."
exit 0