Blame view
egs/gale_mandarin/s5/local/gale_data_prep_split.sh
2.16 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
#!/bin/bash

# Copyright 2014 (author: Ahmed Ali, Hainan Xu)
# Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal)
# Apache 2.0

# Splits the prepared GALE data into a dev set and a train set.
#
# Usage: local/gale_data_prep_split.sh <gale folder>
#
# Must be run from the recipe directory: utils/ and local/ are referenced
# by relative path.
#
# Reads from <gale folder>:  all, utt2spk, wav.scp
# Reads from the recipe dir: local/test.LDC*, local/bad_utts
# Writes:                    data/local/{dev,train}/
#                              {utt2spk,spk2utt,segments,text,wav.scp}
#                            plus intermediate lists inside <gale folder>
#                            (dup.list, all.orig, all.dev, all.train,
#                            *_utt_list, *.wav.list).
#
# Layout of <gale folder>/all as consumed below: field 2 is the utterance
# id, fields 1, 3 and 4 are copied into the segments file (presumably
# recording id, start and end time -- confirm against the script that
# generates "all"), and fields 5..NF are the transcript words.

if [ $# -ne 1 ]; then
  echo "Arguments should be the <gale folder>" >&2
  exit 1
fi

set -e -o pipefail

# All output data directories are created under data/local.
galeData=$(utils/make_absolute.sh "$1")
mkdir -p data/local
dir=$(utils/make_absolute.sh data/local)

# Some problem with the text data: the same utterance id can appear with
# different transcriptions. Drop every utterance id that occurs more than
# once; the unfiltered file is kept as all.orig.
awk '{print $2}' "$galeData/all" \
  | sort \
  | uniq -d > "$galeData/dup.list"

utils/filter_scp.pl --exclude -f 2 \
  "$galeData/dup.list" "$galeData/all" > "$galeData/all.nodup"

mv "$galeData/all" "$galeData/all.orig"
mv "$galeData/all.nodup" "$galeData/all"

# Lines matching any pattern from local/test.LDC* form the dev set; all
# remaining lines form the train set. Lines containing an entry of
# local/bad_utts (fixed-string match) are removed from both.
# NOTE: under "set -e -o pipefail" an empty selection makes grep return 1
# and aborts the script.
grep -f <(cat local/test.LDC*) "$galeData/all" \
  | grep -v -F -f local/bad_utts > "$galeData/all.dev"

grep -v -f <(cat local/test.LDC*) "$galeData/all" \
  | grep -v -F -f local/bad_utts > "$galeData/all.train"

for x in dev train; do
  outdir=$dir/$x
  file=$galeData/all.$x
  mkdir -p "$outdir"

  # Utterance ids that belong to this split.
  awk '{print $2}' "$file" > "$galeData/${x}_utt_list"

  utils/filter_scp.pl -f 1 "$galeData/${x}_utt_list" "$galeData/utt2spk" \
    > "$outdir/utt2spk"
  utils/utt2spk_to_spk2utt.pl "$outdir/utt2spk" | sort -u > "$outdir/spk2utt"

  # segments: <field 2> <field 1> <field 3> <field 4>
  awk '{print $2 " " $1 " " $3 " " $4}' "$file" | sort -u > "$outdir/segments"

  # text: utterance id followed by the transcript words, one utterance per
  # line. Fields are printed through an explicit "%s" format so that a
  # literal "%" in the data is never interpreted as a format directive.
  awk '{
    printf "%s ", $2
    for (i = 5; i <= NF; i++) {
      printf "%s ", $i
    }
    printf "\n"
  }' "$file" | sort -u > "$outdir/text"

  # Keep only the wav.scp entries whose key (field 1) is named in the
  # second column of this split's segments file.
  awk '{print $2}' "$outdir/segments" | sort -u > "$galeData/$x.wav.list"
  utils/filter_scp.pl -f 1 "$galeData/$x.wav.list" "$galeData/wav.scp" \
    > "$outdir/wav.scp"
done

echo data prep split succeeded