egs/wsj/s5/utils/data/remove_dup_utts.sh
#!/bin/bash

# Remove excess utterances once they appear more than a specified
# number of times with the same transcription, in a data set.
# E.g. useful for removing excess "uh-huh" from training.

if [ $# != 3 ]; then
  echo "Usage: remove_dup_utts.sh max-count <src-data-dir> <dest-data-dir>"
  echo "e.g.: remove_dup_utts.sh 10 data/train data/train_nodup"
  echo "This script filters out utterances with over-represented transcriptions"
  echo "(such as 'uh-huh'), by limiting the number of repetitions of any given"
  echo "word-sequence to a specified value. It's often used to get subsets for"
  echo "early stages of training."
  exit 1;
fi

maxcount=$1
srcdir=$2
destdir=$3

[ ! -f $srcdir/text ] && echo "$0: Invalid input directory $srcdir" && exit 1;
! mkdir -p $destdir && echo "$0: could not create directory $destdir" && exit 1;
! [ "$maxcount" -gt 1 ] && echo "$0: invalid max-count '$maxcount'" && exit 1;

cp $srcdir/* $destdir

cat $srcdir/text | \
  perl -e '
  $maxcount = shift @ARGV;
  @all = ();
  $p1 = 103349; $p2 = 71147; $k = 0;
  sub random { # our own random number generator: predictable.
    $k = ($k + $p1) % $p2;
    return ($k / $p2);
  }
  # First pass: store all lines and count how often each transcription occurs.
  while(<>) {
    push @all, $_;
    @A = split(" ", $_);
    shift @A;  # remove the utterance-id.
    $text = join(" ", @A);
    $count{$text}++;
  }
  # Second pass: always keep lines whose transcription occurs fewer than
  # max-count times; otherwise keep each line with probability max-count/n,
  # so that about max-count copies survive in expectation.
  foreach $line (@all) {
    @A = split(" ", $line);
    shift @A;
    $text = join(" ", @A);
    $n = $count{$text};
    if ($n < $maxcount || random() < ($maxcount / $n)) {
      print $line;
    }
  }' $maxcount >$destdir/text

echo "Reduced number of utterances from `cat $srcdir/text | wc -l` to `cat $destdir/text | wc -l`"

echo "Using fix_data_dir.sh to reconcile the other files."
utils/fix_data_dir.sh $destdir
rm -r $destdir/.backup

exit 0
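
Below is a minimal smoke test of the expected behavior; it is a sketch, not part of the script. It assumes a Kaldi egs/*/s5 working directory (so utils/ scripts are available) and uses made-up utterance IDs and the hypothetical directories data/toy and data/toy_nodup.

# Create a toy data dir where "uh-huh" occurs 6 times and "hello world" once.
mkdir -p data/toy
for i in 1 2 3 4 5 6; do echo "utt$i uh-huh"; done > data/toy/text
echo "utt7 hello world" >> data/toy/text
# Minimal companion files so fix_data_dir.sh can reconcile the directory:
# map each utterance to itself as its own speaker.
awk '{print $1, $1}' data/toy/text > data/toy/utt2spk
utils/utt2spk_to_spk2utt.pl data/toy/utt2spk > data/toy/spk2utt
# Cap any transcription at about 2 copies.
utils/data/remove_dup_utts.sh 2 data/toy data/toy_nodup
# Each "uh-huh" line survives with probability 2/6, so about 2 should remain;
# the script's predictable generator makes the outcome reproducible across runs.
wc -l data/toy_nodup/text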