Blame view
egs/aspire/s5/local/generate_uniformly_segmented_data_dir.sh
2.66 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
#!/bin/bash

# Copyright Vijayaditya Peddinti, 2016.
# Apache 2.0.

# This script generates a uniformly segmented data dir. If the directory
# already has a segments file (e.g. in data/dev_aspire), we first create a
# directory of whole recordings without segments and then uniformly segment it.
# It also extracts hires MFCC features.
#
# Arguments (both are names relative to data/, not paths):
#   <data-set>      source data set, read from data/<data-set>
#   <out-data-set>  output data set, written to data/<out-data-set>_hires
#
# Stages:
#   1  prepare data/<data-set>[_whole]_hires (whole-recording conversion for
#      dev_aspire, plain copy otherwise)
#   2  extract hires MFCCs and CMVN stats
#   3  create uniform sub-segments and the segmented data dir

set -e
set -x

# Defaults; these are defined before utils/parse_options.sh is sourced so that
# it can override them from the command line.
stage=1
num_jobs=30
overlap=5  # size of the overlap
window=10  # size of the uniform segment

. ./cmd.sh

[ -f ./path.sh ] && . ./path.sh
. utils/parse_options.sh || exit 1;

if [ $# -ne 2 ]; then
  echo "Usage: $0 [options] <data-set> <out-data-set>"
  echo " Options:"
  echo " --stage (1|2|3) # start the script from part-way through."
  echo "e.g.:"
  echo "$0 dev_aspire dev_aspire_uniformly_segmented"
  exit 1;
fi

data_set=$1
segmented_data_set=$2

if [ "$data_set" == "dev_aspire" ]; then
  if [ "$stage" -le 1 ]; then
    echo "$0: Creating the data dir with whole recordings without segmentation"
    # create a whole directory without the segments
    unseg_dir="data/${data_set}_whole_hires"
    src_dir="data/${data_set}"
    utils/data/convert_data_dir_to_whole.sh "$src_dir" "$unseg_dir"

    echo "$0: Creating the $unseg_dir/reco2file_and_channel file"
    awk '{print $1, $1, "A";}' "$unseg_dir/wav.scp" \
      > "$unseg_dir/reco2file_and_channel"
  fi
  # From here on the whole-recording directory is the one being processed.
  data_set="${data_set}_whole"
elif [ "$stage" -le 1 ]; then
  # Only copy when starting from stage 1: re-copying on a later --stage
  # restart would overwrite the already prepared _hires directory.
  utils/copy_data_dir.sh "data/$data_set" "data/${data_set}_hires"
fi

hires_dir="data/${data_set}_hires"
segmented_dir="data/${segmented_data_set}_hires"

if [ "$stage" -le 2 ]; then
  echo "$0: Extracting features"
  steps/make_mfcc.sh --cmd "$train_cmd" --nj "$num_jobs" \
    --mfcc-config conf/mfcc_hires.conf "$hires_dir"
  steps/compute_cmvn_stats.sh "$hires_dir"
  utils/fix_data_dir.sh "$hires_dir"
  utils/validate_data_dir.sh --no-text "$hires_dir"
fi

if [ "$stage" -le 3 ]; then
  echo "$0: Generating uniform segments with length $window and overlap $overlap."
  # Start from a clean output directory; ':?' aborts instead of expanding to
  # an empty path if the variable is ever unset or empty.
  if [ -d "$segmented_dir" ]; then
    rm -r -- "${segmented_dir:?}"
  fi

  # The sub-segmentation step reads a segments file; generate one when the
  # data dir does not already have it.
  if [ ! -f "$hires_dir/segments" ]; then
    utils/data/get_segments_for_data.sh "$hires_dir" > "$hires_dir/segments"
  fi

  mkdir -p "$segmented_dir"

  # The value passed as --max-remaining-duration is half the window length.
  utils/data/get_uniform_subsegments.py \
    --max-segment-duration="$window" \
    --overlap-duration="$overlap" \
    --max-remaining-duration="$(perl -e "print $window/ 2.0")" \
    "$hires_dir/segments" > "$segmented_dir/sub_segments"

  utils/data/subsegment_data_dir.sh "$hires_dir" \
    "$segmented_dir/sub_segments" "$segmented_dir"
  steps/compute_cmvn_stats.sh "$segmented_dir"
  utils/fix_data_dir.sh "$segmented_dir"
  utils/validate_data_dir.sh --no-text "$segmented_dir"
fi