subsegment_data_dir.sh 9.62 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244


#!/bin/bash

# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0


# This script allows you to specify a 'segments' file with segments
# relative to existing utterances, with lines like
#  utterance_foo-1 utterance_foo 7.5 8.2
#  utterance_foo-2 utterance_foo 8.9 10.1
# and a 'text' file with sub-segmented text like
#  utterance_foo-1 hello there
#  utterance_foo-2 how are you
# and combine this with an existing data-dir that was all relative
# to the original utterance-ids like 'utterance_foo', producing
# a new subsegmented output directory.
#
# It does the right thing for you on the various files that the
# data directory contained (except you have to recreate
# the CMVN stats).


# Option defaults.  utils/parse_options.sh is expected to overwrite these from
# the matching command-line options (--segment-end-padding, --cmd, --nj).
segment_end_padding=0.0  # added to the end times written to <destdir>/segments
cmd=run.pl               # forwarded as --cmd to utils/data/get_utt2num_frames.sh
nj=1                     # forwarded as --nj to utils/data/get_utt2num_frames.sh

# Stop right away if the option parser cannot be sourced or reports a failure,
# instead of silently carrying on with the defaults above.
. utils/parse_options.sh || exit 1

# Exactly 3 or 4 positional arguments are accepted; anything else prints the
# usage message and exits with status 1.
if [ $# -ne 3 ] && [ $# -ne 4 ]; then
  echo "Usage: "
  echo "  $0 [options] <srcdir> <subsegments-file> [<text-file>] <destdir>"
  echo "This script sub-segments a data directory.  <subsegments-file> is to"
  echo "have lines of the form <new-utt> <old-utt> <start-time-within-old-utt> <end-time-within-old-utt>"
  echo "and <text-file> is of the form <new-utt> <word1> <word2> ... <wordN>."
  echo "This script appropriately combines the <subsegments-file> with the original"
  echo "segments file, if necessary, and if not, creates a segments file."
  echo "e.g.:"
  echo " $0 [options] data/train exp/tri3b_resegment/segments exp/tri3b_resegment/text data/train_resegmented"
  echo " Options:"
  echo "  --segment-end-padding <padding-time>       # e.g. 0.02.  Default 0.0.  If provided,"
  echo "                                             # we will add this value to the end times of <destdir>/segments"
  echo "                                             # when creating it.  This can be useful to account for"
  echo "                                             # end effects in feature generation.  The reason this is"
  echo "                                             # not just applied to the input segments file, is that"
  echo "                                             # for purposes of computing the num-frames of the parts of"
  echo "                                             # matrices in feats.scp, the padding should not be done."
  echo "  --cmd <cmd>                                # Default run.pl.  Passed as --cmd to utils/data/get_utt2num_frames.sh,"
  echo "                                             # which is run when <srcdir> has a feats.scp."
  echo "  --nj <num-jobs>                            # Default 1.  Passed as --nj to utils/data/get_utt2num_frames.sh."
  echo "  See also: resolve_ctm_overlaps.py"
  exit 1
fi


# Run every child process in the C locale.
LC_ALL=C
export LC_ALL

# The first two positional arguments always have the same meaning.
srcdir=$1
subsegments=$2

# With four arguments a text file sits between the subsegments file and the
# destination directory; otherwise the third argument is the destination.
case $# in
  4)
    add_subsegment_text=true
    new_text=$3
    dir=$4
    # The text file is only required when it was actually given.
    [ -f "$new_text" ] || { echo "$0: no such file $new_text"; exit 1; }
    ;;
  *)
    add_subsegment_text=false
    dir=$3
    ;;
esac

# The subsegments file and the source utt2spk are both required inputs.
for f in "$subsegments" "$srcdir/utt2spk"; do
  if [ ! -f "$f" ]; then
    echo "$0: no such file $f"
    exit 1
  fi
done

# Every later step writes into $dir, so stop here if it cannot be created
# rather than reporting the failure and carrying on.
if ! mkdir -p "$dir"; then
  echo "$0: failed to create directory $dir"
  exit 1
fi

# When a text file was supplied, its first column (the new utterance-ids) has
# to be identical, line for line, to the first column of the subsegments file.
if $add_subsegment_text && \
   ! cmp <(awk '{print $1}' <$subsegments) <(awk '{print $1}' <$new_text); then
  echo "$0: expected the first fields of the files $subsegments and $new_text to be identical"
  exit 1
fi

# Sanity-check the subsegments file: every line needs exactly four fields and
# an end time (field 4) that is greater than the start time (field 3).  awk
# stops at the first offending line and reports it.
awk 'NF != 4 || !($4 > $3) { print("Bad line: " $0); exit(1) }' <$subsegments || {
  echo "$0: failed checking subsegments file $subsegments"
  exit 1
}

# From this point on a failing command, or a failing stage anywhere inside a
# pipeline, aborts the script.
set -e -o pipefail

# Temporary two-column map "<new-utt-id> <old-utt-id>"; it is removed again at
# the very end of the script.
utt_map=$dir/new2old_utt
awk '{print $1, $2}' <$subsegments >$utt_map

# New utt2spk: the second field of the map (the old utt-id) is replaced by
# whatever $srcdir/utt2spk maps it to.
utils/apply_map.pl -f 2 $srcdir/utt2spk <$utt_map >$dir/utt2spk
# The new spk2utt is derived from the new utt2spk.
utils/utt2spk_to_spk2utt.pl <$dir/utt2spk >$dir/spk2utt

# The text supplied by the user (if any) is used verbatim as the new text file.
if $add_subsegment_text; then
  cp $new_text $dir/text
fi

# wav.scp is copied unconditionally.
cp $srcdir/wav.scp $dir

# The remaining recording-level files are optional: copy whichever of them the
# source directory has.
for reco_file in reco2file_and_channel reco2dur; do
  if [ -f $srcdir/$reco_file ]; then
    cp $srcdir/$reco_file $dir
  fi
done

# Create $dir/segments.
#
# awk formats non-integer numbers with "%.6g" by default (OFMT for print,
# CONVFMT when a number is stored back into a field), which silently rounds
# any time that needs more than six significant digits (e.g. 12345.678 becomes
# 12345.7).  Both are raised to "%.10g" below so the computed times keep their
# precision; values that already fit in six digits are printed as before.
if [ -f "$srcdir/segments" ]; then
  # we have to map the segments file.
  # What's going on below is a little subtle.
  # $srcdir/segments has lines like: <old-utt-id> <recording-id> <start-time> <end-time>
  # and $subsegments has lines like: <new-utt-id> <old-utt-id> <start-time> <end-time>
  # The apply-map command replaces <old-utt-id> [the 2nd field of $subsegments]
  # with <recording-id> <start-time> <end-time>.
  # so after that first command we have lines like
  # <new-utt-id> <recording-id> <start-time-of-old-utt-within-recording> <end-time-old-utt-within-recording> \
  #   <start-time-of-new-utt-within-old-utt> <end-time-of-new-utt-within-old-utt>
  # which the awk command turns into:
  # <new-utt-id> <recording-id> <start-time-of-new-utt-within-recording> <end-time-of-new-utt-within-recording>
  # (the end time additionally gets $segment_end_padding added to it).
  utils/apply_map.pl -f 2 "$srcdir/segments" <"$subsegments" | \
    awk -v pad="$segment_end_padding" -v OFMT='%.10g' -v CONVFMT='%.10g' \
      '{ print $1, $2, $5+$3, $6+$3+pad; }' >"$dir/segments"
else
  # the subsegments file just becomes the segments file, with the padding
  # added to the end time (4th field).
  awk -v pad="$segment_end_padding" -v OFMT='%.10g' -v CONVFMT='%.10g' \
    '{$4 += pad; print}' <"$subsegments" >"$dir/segments"
fi

# If the source directory has a utt2uniq file, build the new one by replacing
# the old utt-id (2nd field of the new-to-old map) with its utt2uniq entry.
utt2uniq_src=$srcdir/utt2uniq
if [ -f $utt2uniq_src ]; then
  utils/apply_map.pl -f 2 $utt2uniq_src <$dir/new2old_utt >$dir/utt2uniq
fi

if [ -f "$srcdir/feats.scp" ]; then
  # We want to avoid recomputing the features.   We'll use sub-matrices of the
  # original feature matrices, using the [] notation that is available for
  # matrices in Kaldi.

  # Use the frame shift recorded in $srcdir/frame_shift when that file is
  # non-empty; otherwise ask get_frame_shift.sh for it.
  if [ ! -s "$srcdir/frame_shift" ]; then
    frame_shift=$(utils/data/get_frame_shift.sh "$srcdir") || exit 1
  else
    frame_shift=$(cat "$srcdir/frame_shift")
  fi
  echo "$0: note: frame shift is $frame_shift [affects feats.scp]"

  # add_frame_ranges <scp-file>
  #
  # Writes to stdout one line per sub-segment, consisting of the new utt-id
  # followed by the old utterance's entry from <scp-file> with a frame range
  # appended.  Reads $subsegments, $frame_shift and $dir/utt2max_frames.
  # The return status is that of the pipeline (pipefail is in effect).
  #
  # The subsegments format is <new-utt-id> <old-utt-id> <start-time> <end-time>.
  # e.g. 'utt_foo-1 utt_foo 7.21 8.93'
  # The first awk command replaces this with the format:
  # <new-utt-id> <old-utt-id> <first-frame> <last-frame>
  # e.g. 'utt_foo-1 utt_foo 721 892'
  # and the apply_map.pl command replaces 'utt_foo' (the 2nd field) with its corresponding entry
  # from <scp-file> (e.g. the original feats.scp), so we get a line like:
  # e.g. 'utt_foo-1 foo-bar.ark:514231 721 892'
  # Note: the reason we subtract one from the last time is that it's going to
  # represent the 'last' frame, not the 'end' frame [i.e. not one past the last],
  # in the matlab-like, but zero-indexed [first:last] notion.  For instance, a segment with 1 frame
  # would have start-time 0.00 and end-time 0.01, which would become the frame range
  # [0:0]
  # The second awk command turns this into something like
  # utt_foo-1 foo-bar.ark:514231[721:892]
  # It has to be a bit careful because the format actually allows for more general things
  # like pipes that might contain spaces, so it has to be able to produce output like the
  # following:
  # utt_foo-1 some command|[721:892]
  # The 'end' frame is ensured to not exceed the feature archive size of
  # <old-utt-id>. This is done using the script fix_subsegment_feats.pl.
  # e.g if the number of frames in foo-bar.ark is 891, then the features are
  # truncated to that many frames.
  # utt_foo-1 foo-bar.ark:514231[721:890]
  # Lastly, utils/data/normalize_data_range.pl will only do something nontrivial if
  # the original data-dir already had data-ranges in square brackets.
  add_frame_ranges() {
    awk -v s="$frame_shift" \
        '{print $1, $2, int(($3/s)+0.5), int(($4/s)-0.5);}' <"$subsegments" | \
      utils/apply_map.pl -f 2 "$1" | \
      awk '{for (n=1;n<NF-2;n++) printf("%s ", $n); k=NF-2; l=NF-1; printf("%s[%d:%d]\n", $k, $l, $NF)}' | \
      utils/data/fix_subsegment_feats.pl "$dir/utt2max_frames" | \
      utils/data/normalize_data_range.pl
  }

  # Here, we compute the maximum 'end' frame allowed for each <new-utt-id>.
  # This is equal to the number of frames in the feature archive for <old-utt-id>.
  if [ ! -f "$srcdir/utt2num_frames" ]; then
    echo "$0: WARNING: Could not find $srcdir/utt2num_frames. It might take a long time to run get_utt2num_frames.sh."
    echo "Increase the number of jobs or write this file while extracting features by passing --write-utt2num-frames true to steps/make_mfcc.sh etc."
  fi
  utils/data/get_utt2num_frames.sh --cmd "$cmd" --nj "$nj" "$srcdir"
  awk '{print $1" "$2}' "$subsegments" | \
    utils/apply_map.pl -f 2 "$srcdir/utt2num_frames" > \
    "$dir/utt2max_frames"

  # A bare 'exit' here would return the status of the preceding echo (0) and
  # make the failure look like success, so the status is given explicitly.
  add_frame_ranges "$srcdir/feats.scp" >"$dir/feats.scp" || \
    { echo "Failed to create $dir/feats.scp"; exit 1; }

  # Parse the frame ranges from feats.scp, which is in the form of [first-frame:last-frame]
  # and write the number-of-frames = last-frame - first-frame + 1 for the utterance.
  perl -ne 'm/^(\S+) .+\[(\d+):(\d+)\]$/; print "$1 " . ($3-$2+1) . "\n"' \
    <"$dir/feats.scp" >"$dir/utt2num_frames"

  # Here we add frame ranges to the elements of vad.scp, as we did for rows of feats.scp above.
  if [ -f "$srcdir/vad.scp" ]; then
    add_frame_ranges "$srcdir/vad.scp" >"$dir/vad.scp"
  fi
fi


# A cmvn.scp found in the destination directory is removed; the user is told
# to regenerate it from the features.
if [ -f $dir/cmvn.scp ]; then
  rm $dir/cmvn.scp
  echo "$0: warning: removing $dir/cmvn.scp, you will have to regenerate it from the features."
fi

# Remove the utt2dur file in case it is now invalid -- it can be regenerated
# from the segments file.  Errors (e.g. the file not existing) are deliberately
# ignored.
rm $dir/utt2dur 2>/dev/null || true

# Optional files that are copied over as they are, when the source has them.
for extra in spk2gender glm stm; do
  if [ -f $srcdir/$extra ]; then
    cp $srcdir/$extra $dir
  fi
done

# Files that are deliberately not carried over; just tell the user.
for f in ctm; do
  if [ -f $srcdir/$f ]; then
    echo "$0: not copying $srcdir/$f to $dir because sub-segmenting it is "
    echo " ... not implemented yet (and probably it's not needed.)"
  fi
done

# The temporary new-to-old utterance map is no longer needed.
rm $dir/new2old_utt

echo "$0: subsegmented data from $srcdir to $dir"