Blame view
egs/lre07/v1/local/split_long_utts.sh
3.5 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
#!/bin/bash max_utt_len=60 # 60 seconds. stage=0 cleanup=true . utils/parse_options.sh if [ $# -ne 2 ]; then echo "Usage: $0 [options] <in-data-dir> <out-data-dir>" echo "e.g.: $0 --max-utt-len 120 data/train data/train_split" echo "This script splits up long utterances into smaller pieces." echo "It assumes the wav.scp contains has a certain form, with .sph" echo "files in it (so the script is not completely general)." exit 1; fi in_dir=$1 dir=$2 for f in $in_dir/{utt2spk,spk2utt,wav.scp,utt2lang}; do if [ ! -f $f ]; then echo "$0: expected input file $f to exist"; exit 1; fi done if [ $stage -le 0 ]; then utils/validate_data_dir.sh --no-text --no-feats $in_dir || exit 1; mkdir -p $dir/temp || exit 1; cat $in_dir/wav.scp | perl -ane ' $_ =~ m:(^\S+) .* (\S+\.sph): || die "bad line $_"; $utt=$1; $sph=$2; open (F, "<$sph") || die "Could not open sph file $sph"; $samp_count = -1; while (<F>) { if (m/sample_rate -i (\d+)/) { $1 == 8000 || die "bad sample rate for $sph: $_"; } if (m/sample_count -i (\d+)/) { $samp_count = $1; } if (m/end_head/) { last; } } close(F); $samp_count > 0 || die "Could not get sample count for sph file $sph "; $secs = $samp_count / 8000; print "$utt $secs "; ' > $dir/temp/utt2len fi if [ $(cat $dir/temp/utt2len | wc -l) -ne $(cat $in_dir/utt2spk | wc -l) ]; then echo "utt2spk and utt2len files have mismatched lengths"; exit 1; fi if [ $(cat $dir/temp/utt2len | wc -l) -ne $(cat $in_dir/utt2lang | wc -l) ]; then echo "utt2spk and utt2lang files have mismatched lengths"; exit 1; fi if [ $stage -le 1 ]; then # Create, in a pipe, a file with lines # <utt-id> <length> <speaker-id> <language-id> # and pipe it into a perl script that outputs the segments file. awk '{print $2}' $in_dir/utt2spk | paste $dir/temp/utt2len - | \ paste - <(awk '{print $2}' $in_dir/utt2lang) | perl -e ' ($max_utt_len, $out_dir) = @ARGV; open(UTT2SPK, ">$out_dir/utt2spk") || die "opening utt2spk file $out_dir/utt2spk"; open(SEGMENTS, ">$out_dir/segments") || die "opening segments file $out_dir/segments"; open(UTT2LANG, ">$out_dir/utt2lang") || die "opening segments file $out_dir/utt2lang"; while(<STDIN>) { ($utt, $len, $speaker, $language) = split(" ", $_); defined $speaker || die "Bad line $_"; $reco = $utt; # old utt-id becomes recording-id. if ($len <= $max_utt_len) { print SEGMENTS "${utt}-1 ${utt} 0 -1 "; print UTT2SPK "${utt}-1 $speaker "; print UTT2LANG "${utt}-1 $language "; } else { # We will now allow split length to exceed max_utt_len. $num_split = int(($len + 0.999*$max_utt_len) / $max_utt_len); $num_split >= 1 || die; $split_len = $len / $num_split; for ($n = 1; $n <= $num_split; $n++) { $n_text = $n; # this will help remain in string-sorted order if ($num_split >= 10 && $n < 10) { $n_text = "0$n_text"; } if ($num_split >= 100 && $n < 100) { $n_text = "00$n_text"; } $t_start = $split_len * ($n - 1); $t_end = $split_len * $n; print SEGMENTS "${utt}-$n_text ${utt} $t_start $t_end "; print UTT2SPK "${utt}-$n_text $speaker "; print UTT2LANG "${utt}-$n_text $language "; } } } close(SEGMENTS)||die; close(UTT2SPK)||die; close(UTT2LANG)||die; ' $max_utt_len $dir fi cp $in_dir/wav.scp $dir/ utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt utils/validate_data_dir.sh --no-text --no-feats $dir || exit 1; $cleanup && rm -r $dir/temp exit 0; |