subset_data_dir.sh 5.66 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159


#!/bin/bash
# Copyright 2010-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0


# This script operates on a data directory, such as in data/train/.
# See http://kaldi.sourceforge.net/data_prep.html#data_prep_data
# for what these directories contain.

# The script It creates a subset of that data, consisting of some specified
# number of utterances.  (The selected utterances are distributed evenly
# throughout the file, by the program ./subset_scp.pl).

# There are four options, none compatible with any other.

# If you give the --per-spk option, it will attempt to select the supplied
# number of utterances for each speaker (typically you would supply a much
# smaller number in this case).

# If you give the --speakers option, it selects a subset of n randomly
# selected speakers.

# If you give the --shortest option, it will give you the n shortest utterances.

# If you give the --first option, it will just give you the n first utterances.

# If you give the --last option, it will just give you the n last utterances.


shortest=false
perspk=false
first_opt=""
speakers=false
spk_list_specified=false

if [ "$1" == "--per-spk" ]; then
  perspk=true;
  shift;
elif [ "$1" == "--shortest" ]; then
  shortest=true;
  shift;
elif [ "$1" == "--first" ]; then
  first_opt="--first";
  shift;
elif [ "$1" == "--speakers" ]; then
  speakers=true
  shift;
elif [ "$1" == "--last" ]; then
  first_opt="--last";
  shift;
elif [ "$1" == "--spk-list" ]; then
  spk_list_specified=true
  shift;
fi


if [ $# != 3 ]; then
  echo "Usage: "
  echo "  subset_data_dir.sh [--speakers|--shortest|--first|--last|--per-spk] <srcdir> <num-utt> <destdir>"
  echo "  subset_data_dir.sh [--spk-list <speaker-list-file>] <srcdir> <destdir>"
  echo "By default, randomly selects <num-utt> utterances from the data directory."
  echo "With --speakers, randomly selects enough speakers that we have <num-utt> utterances"
  echo "With --first, selects the first <num-utt> utterances"
  echo "With --last, selects the last <num-utt> utterances"
  echo "With --shortest, selects the shortest <num-utt> utterances."
  exit 1;
fi

if $spk_list_specified; then
  spk_list=$1
  srcdir=$2
  destdir=$3
else
  srcdir=$1
  numutt=$2
  destdir=$3
fi


export LC_ALL=C

if [ ! -f $srcdir/utt2spk ]; then
  echo "subset_data_dir.sh: no such file $srcdir/utt2spk" 
  exit 1;
fi

function do_filtering {
  # assumes the utt2spk and spk2utt files already exist.
  [ -f $srcdir/feats.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/feats.scp >$destdir/feats.scp
  [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/wav.scp >$destdir/wav.scp
  [ -f $srcdir/text ] && utils/filter_scp.pl $destdir/utt2spk <$srcdir/text >$destdir/text
  [ -f $srcdir/spk2gender ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/spk2gender >$destdir/spk2gender
  [ -f $srcdir/cmvn.scp ] && utils/filter_scp.pl $destdir/spk2utt <$srcdir/cmvn.scp >$destdir/cmvn.scp
  if [ -f $srcdir/segments ]; then
     utils/filter_scp.pl $destdir/utt2spk <$srcdir/segments >$destdir/segments
     awk '{print $2;}' $destdir/segments | sort | uniq > $destdir/reco # recordings.
     # The next line would override the command above for wav.scp, which would be incorrect.
     [ -f $srcdir/wav.scp ] && utils/filter_scp.pl $destdir/reco <$srcdir/wav.scp >$destdir/wav.scp
     [ -f $srcdir/reco2file_and_channel ] && \
       utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel
     
     # Filter the STM file for proper sclite scoring (this will also remove the comments lines)
     [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm > $destdir/stm
     
     rm $destdir/reco
  fi
  srcutts=`cat $srcdir/utt2spk | wc -l`
  destutts=`cat $destdir/utt2spk | wc -l`
  echo "$0: reducing #utt from $srcutts to $destutts"
}


if $spk_list_specified; then
  mkdir -p $destdir
  utils/filter_scp.pl "$spk_list" $srcdir/spk2utt > $destdir/spk2utt || exit 1;
  utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk || exit 1;
  do_filtering; # bash function.
  exit 0;  
elif $speakers; then
  mkdir -p $destdir
  utils/shuffle_list.pl < $srcdir/spk2utt | awk -v numutt=$numutt '{ if (tot < numutt){ print; } tot += (NF-1); }' | \
    sort > $destdir/spk2utt
  utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
  do_filtering; # bash function.
  exit 0;  
elif $perspk; then
  mkdir -p $destdir
  awk '{ n='$numutt'; printf("%s ",$1); skip=1; while(n*(skip+1) <= NF-1) { skip++; }
         for(x=2; x<=NF && x <= n*skip; x += skip) { printf("%s ", $x); } 
         printf("\n"); }' <$srcdir/spk2utt >$destdir/spk2utt
  utils/spk2utt_to_utt2spk.pl < $destdir/spk2utt > $destdir/utt2spk
  do_filtering; # bash function.
  exit 0;
else
  if [ $numutt -gt `cat $srcdir/feats.scp | wc -l` ]; then
    echo "subset_data_dir.sh: cannot subset to more utterances than you originally had."
    exit 1;
  fi 
  mkdir -p $destdir || exit 1;

  ## scripting note: $shortest evaluates to true or false
  ## so this becomes the command true or false.
  if $shortest; then
    # select the n shortest utterances.
    . ./path.sh
    [ ! -f $srcdir/feats.scp ] && echo "$0: you selected --shortest but no feats.scp exist." && exit 1;
    feat-to-len scp:$srcdir/feats.scp ark,t:$destdir/tmp.len || exit 1;
    sort -n -k2 $destdir/tmp.len | awk '{print $1}' | head -$numutt >$destdir/tmp.uttlist
    utils/filter_scp.pl $destdir/tmp.uttlist $srcdir/utt2spk >$destdir/utt2spk
    rm $destdir/tmp.uttlist $destdir/tmp.len
  else
    utils/subset_scp.pl $first_opt $numutt $srcdir/utt2spk > $destdir/utt2spk || exit 1;
  fi
  utils/utt2spk_to_spk2utt.pl < $destdir/utt2spk > $destdir/spk2utt
  do_filtering;
  exit 0;
fi