combine_short_segments.sh 6 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178


#!/bin/bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script copies and modifies a data directory while combining
# segments whose duration is lower than a specified minimum segment
# length.
#
# Note: this does not work for the wav.scp, since there is no natural way to
# concatenate segments; you have to operate on directories that already have
# features extracted.

#


# begin configuration section
# cleanup: when true, the intermediate file <dir>/utt2utts is deleted at the
# end of the script.  NOTE(review): presumably overridable as
# '--cleanup <true|false>' via utils/parse_options.sh -- confirm.
cleanup=true
# end configuration section

# Stop right away if the option parser cannot be sourced (e.g. the script was
# not started from a directory that contains utils/); otherwise any options
# on the command line would be silently left unparsed.
. utils/parse_options.sh || exit 1

# Exactly three positional arguments must remain after option parsing.
if [ $# -ne 3 ]; then
  echo "Usage: "
  echo "  $0 [options] <srcdir> <min-segment-length-in-seconds> <dir>"
  echo "e.g.:"
  echo " $0 data/train 1.55 data/train_comb"
  echo "Options:"
  echo "  --cleanup <true|false>   # default: true.  If true, remove the"
  echo "                           # intermediate file <dir>/utt2utts at the end."
  exit 1;
fi


# Use the C locale for everything the script runs from here on.
export LC_ALL=C

# Positional arguments: source data dir, minimum segment length in seconds,
# and the output data dir.
srcdir=$1
min_seg_len=$2
dir=$3

if [ "$dir" == "$srcdir" ]; then
  echo "$0: this script requires <srcdir> and <dir> to be different."
  exit 1
fi

# These two inputs are read directly by this script, so they must be present
# and non-empty.
for f in "$srcdir/utt2spk" "$srcdir/feats.scp"; do
  if [ ! -s "$f" ]; then
    echo "$0: expected file $f to exist and be nonempty"
    exit 1
  fi
done

# Every feats.scp line must have exactly two fields (utterance-id and one
# feature location): later on the second field is pasted into a concat-feats
# command line as a single argument.  Stop here if that does not hold, rather
# than going on to write a broken feats.scp.
if ! awk '{if (NF != 2) exit(1);}' <"$srcdir/feats.scp"; then
  echo "$0: could not combine short segments because $srcdir/feats.scp has "
  echo " entries with too many fields"
  exit 1
fi

if ! mkdir -p "$dir"; then
  echo "$0: could not create directory $dir"
  exit 1;
fi

if ! utils/validate_data_dir.sh "$srcdir"; then
  echo "$0: failed to validate input directory $srcdir.  If needed, run   utils/fix_data_dir.sh $srcdir"
  exit 1
fi

# Check that the minimum length parses as a number strictly between 0 and 100.
# The value is handed to python as an argument instead of being spliced into
# the program text, so quotes or other odd characters in it cannot change the
# code that runs.  Fall back to python3 when no 'python' command is installed.
python_cmd=python
command -v python >/dev/null 2>&1 || python_cmd=python3
if ! "$python_cmd" -c 'import sys; x = float(sys.argv[1]); assert(x > 0.0 and x < 100.0)' \
     "$min_seg_len" 2>/dev/null; then
  echo "$0: bad <min-segment-length-in-seconds>: got '$min_seg_len'"
  exit 1
fi

# From here on, stop at the first failing command, including a failure in any
# stage of a pipeline.
set -e
set -o pipefail

# make sure $srcdir/utt2dur exists.
utils/data/get_utt2dur.sh "$srcdir"

# Decide which utterances get combined.  The first two file arguments are
# inputs taken from $srcdir; the last three ($dir/utt2utts, $dir/utt2spk and
# $dir/utt2dur) are read by later steps, so they are presumably written by
# this command.
utils/data/internal/choose_utts_to_combine.py --min-duration="$min_seg_len" \
  "$srcdir/spk2utt" "$srcdir/utt2dur" "$dir/utt2utts" "$dir/utt2spk" "$dir/utt2dur"

utils/utt2spk_to_spk2utt.pl < "$dir/utt2spk" > "$dir/spk2utt"

# create the feats.scp.
# if a line of utt2utts is like 'utt2-comb2 utt2 utt3', then
# the utils/apply_map.pl will create a line that looks like
# 'utt2-comb2 foo.ark:4315 foo.ark:431423'
# and the awk command creates suitable command lines like:
# 'utt2-comb2 concat-feats --print-args=false foo.ark:4315 foo.ark:431423 - |'
# Lines with a single source utterance (NF<=2) are passed through unchanged.
utils/apply_map.pl -f 2- "$srcdir/feats.scp" <"$dir/utt2utts" | \
  awk '{if (NF<=2){print;} else { $1 = $1 " concat-feats --print-args=false"; $NF = $NF " - |"; print; }}' > "$dir/feats.scp"

# create $dir/text by concatenating the source 'text' entries for the original
# utts.
utils/apply_map.pl -f 2- "$srcdir/text" <"$dir/utt2utts" > "$dir/text"

# merge_uniq_sets: group 'uniq' values that have to share one merged value.
#
# stdin:  one group per line, as whitespace-separated 'uniq' values; all the
#         values on a line must end up mapped to the same merged value.
# stdout: one line "<uniq> <merged-uniq>" for every value seen, sorted by
#         <uniq>.
# Returns non-zero if no python interpreter is found or the interpreter fails.
#
# For example, if the input is
#   a b
#   b c
#   d
# then the output is
#   a a
#   b a
#   c a
#   d d
# ... because a and b appear together, and b and c appear together, they have
# to be merged into the same set, and we name that set 'a' (in general, we take
# the lowest string in lexicographical order).
merge_uniq_sets() {
  local py
  # The program below runs under both Python 2 and Python 3, so take
  # whichever interpreter is installed.
  if ! py=$(command -v python || command -v python3); then
    echo "$0: no python interpreter found; cannot merge utt2uniq groups" >&2
    return 1
  fi
  LC_ALL=C "$py" -c '
import sys

uniq2merged = dict()  # uniq value -> name of the merged set it belongs to
equal_pairs = set()   # 2-tuples (a,b) that must end up in the same merged set
for line in sys.stdin:
    members = line.split()  # uniq strings that should map to the same set
    # every uniq value starts out mapped to itself
    for uniq in members:
        uniq2merged.setdefault(uniq, uniq)
    # tie every other member of the line to the first one
    for other in members[1:]:
        equal_pairs.add((members[0], other))

# Keep pushing the smaller name across each pair until nothing changes.
changed = True
while changed:
    changed = False
    for a, b in equal_pairs:
        smallest = min(uniq2merged[a], uniq2merged[b])
        for x in (a, b):
            if uniq2merged[x] != smallest:
                uniq2merged[x] = smallest
                changed = True

for uniq in sorted(uniq2merged):
    sys.stdout.write("%s %s\n" % (uniq, uniq2merged[uniq]))
'
}

if [ -f "$srcdir/utt2uniq" ]; then
  # the utt2uniq file is such that if 2 utts were derived from the same original
  # utt (e.g. by speed perturbing) they map to the same 'uniq' value.  This is
  # so that we can properly hold out validation data for neural net training and
  # know that we're not training on perturbed versions of that utterance.  We
  # need to obtain the utt2uniq file so that if any 2 'new' utts contain any of
  # the same 'old' utts, their 'uniq' values are the same [but otherwise as far
  # as possible, the 'uniq' values are different.]
  #
  # we'll do this by arranging the old 'uniq' values into groups as necessary to
  # capture this property.

  # The following command creates 'uniq_sets', each line of which contains
  # a set of original 'uniq' values, and effectively we assert that they must
  # be grouped together to the same 'uniq' value.
  # the awk command prints a group of the original utterance-ids that
  # are combined together into a single new utterance, and the apply_map
  # command converts those into a list of original 'uniq' values.
  awk '{$1 = ""; print;}' < "$dir/utt2utts" | \
    utils/apply_map.pl "$srcdir/utt2uniq" > "$dir/uniq_sets"

  # The next command creates $dir/uniq_to_orig_uniq, which is a map from the
  # original 'uniq' values to the 'merged' uniq values.
  merge_uniq_sets < "$dir/uniq_sets" > "$dir/uniq_to_orig_uniq"
  rm "$dir/uniq_sets"


  # In the following command, suppose we have a line like:
  # utt1-comb2 utt1 utt2
  # .. the awk command retains only the first original utt, to give
  # utt1-comb2 utt1
  # [we can pick one arbitrarily since we know any of them would map to the same
  # merged value.]
  # the first apply_map.pl command maps the 'utt1' to the 'uniq' value it mapped to
  # in $srcdir, and the second apply_map.pl command maps it to the grouped 'uniq'
  # value produced by merge_uniq_sets.
  awk '{print $1, $2}' < "$dir/utt2utts" | utils/apply_map.pl -f 2 "$srcdir/utt2uniq" | \
    utils/apply_map.pl -f 2 "$dir/uniq_to_orig_uniq" > "$dir/utt2uniq"
  rm "$dir/uniq_to_orig_uniq"
fi

# note: the user will have to recompute the cmvn, as the speakers may have changed.
# Removal is best-effort: the file is normally absent, so a failure here is
# deliberately ignored.
rm "$dir/cmvn.scp" 2>/dev/null || true

# Validate the directory just written, passing --no-wav to the validator.
utils/validate_data_dir.sh --no-wav "$dir"

# $cleanup holds 'true' or 'false' and is run as a command; when it is true
# the intermediate utt2utts map is deleted.
if $cleanup; then
  rm "$dir/utt2utts"
fi