segmentation.pl 16.4 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403


#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2013  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# This program is for segmentation of data, e.g. long telephone conversations,
# into short chunks.  The input (stdin) should be a sequence of lines like
# sw0-20348-A  0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 2 2 2 2 2 2 2 2 2 2 ...  2 2 0 0 0
# where there is a number for each frame and the numbers mean 0 for silence, 1
# for noise, laughter and other nonspeech events, and 2 for speech.  This will
# typically be derived from some kind of fast recognition (see
# ../steps/resegment_data.sh), followed by ali-to-phones --per-frame=true and
# then mapping phones to these classes 0, 1 and 2.
#
# The algorithm is as follows:
#  (1) Find contiguous sequences of classes 1 or 2 (i.e. speech and/or noise), with e.g.
#      "1 1 1 2 2" counted as a single contiguous sequence.  Each such sequence is an
#      initial segment.
#  (2) While the proportion of silence in the segments is less than $silence_proportion,
#      add a single silence frame to the left and right of each segment, as long
#      as this does not take us past the ends of the file or into another segment.  
#      At this point, do not merge segments.
#  (3) Merging segments:
#      Get a list of all boundaries between segments that ended up touching each other
#      during phase 2.  Sort them according to the number of silence frames at the boundary,
#      with those with the least silence to be processed first.  Go through the boundaries
#      in order, merging each pair of segments, as long as doing so does not create
#      a segment larger than $max_segment_length.
#  (4) Splitting excessively long segments:
#      For all segments that are longer than $hard_max_segment_length, split them equally
#      into the smallest number of pieces such that the pieces will be no longer than
#      $hard_max_segment_length.  Print a warning.
#  (5) Removing any segments that contain no speech.  (remove segments that have only silence
#      and noise.
#
#  By default, the utterance-ids will be of the form <RECORDING-ID>-<START-TIME>-<END-TIME>,
#  where <START-TIME> and <END-TIME> are measured 0.01 seconds, using fixed-width
#  integers with enough digits to print out all the segments (the number of digits being
#  decided per line of the input).  For instance, if the input recording-id was
#  sw0-20348-A, an example line of the "segments-file" output would be:
#   sw0-20348-A-00124-00298 sw0-20348-A 1.24 2.98
#  (interpreted as <UTTERANCE-ID> <RECORDING-ID> <START-TIME> <END-TIME>)
#  and the number of digits has to be that large because the same recording has
#  a segment something like
#   sw0-20348-A-13491-13606 sw0-20348-A 134.91 136.06
#  The "_" and "-" in the output are separately configurable by means of the
#  --first-separator and --second-separator options.  However, generally speaking,
#  it is safer to use "-" than, say, "_", because "-" appears very early in the
#  ASCII table, and using it as the separator will tend to ensure than when
#  you sort the utterances and the recording-ids they will sort the same way.
#  This matters because recording-ids will often equal speaker-ids, and Kaldi scripts
#  require that the utterance-ids and speaker-ids sort in the "same order".


use Getopt::Long;

$silence_proportion = 0.2; # The amount of silence at the sides of segments is
                           # tuned to give this proportion of silence.

$frame_shift = 0.01; # Affects the interpretation of the options such as max_segment_length,
                     # and the seconds in the "segments" file.
$max_segment_length = 15.0; # Maximum segment length while we are merging segments...
                            # it will not allow merging segments to make segments longer than this.
$hard_max_segment_length = 30.0; # A hard maximum on the segment length; it will
                                 # break segments to get below this, even if there is
                                 # no silence, and print a warning.
$first_separator = "-";   # separator between recording-id and start-time, in utterance-id.
$second_separator = "-";  # separator between start-time and end-time, in utterance-id.
$remove_noise_only_segments = "true";  # boolean option; if true,
                                       # remove segments that have no speech.


GetOptions('silence-proportion:f' => \$silence_proportion,
           'frame-shift:f' => \$frame_shift,
           'max-segment-length:f' => \$max_segment_length,
           'hard-max-segment-length:f' => \$hard_max_segment_length,
           'first-separator:s' => \$first_separator,
           'second-separator:s' => \$second_separator,
           'remove-noise-only-segments:s' => \$remove_noise_only_segments);

if (@ARGV != 0) {
  print STDERR "$0:\n" .
               "Usage: segmentation.pl [options] < per-frame-archive > segments-file\n" .
               "This program is called from steps/resegment_data.sh.  Please see\n" .
               "the extensive comment in the source.  Options:\n" .
               "--silence-proportion <float> (default: $silence_proportion)\n" .
               "--frame-shift <float> (default: $frame_shift, in seconds)\n" .
               "--max-segment-length <float> (default: $max_segment_length, in seconds)\n" .
               "--hard-max-segment-length (default: $hard_max_segment_length, in seconds)\n" .
               "--first-separator <string> (default: $first_separator), affects utterance-ids\n" .
               "--second-separator <string> (default: $second_separator), affects utterance-ids\n" .
               "--remove-noise-only-segments <true|false> (default: true)\n";
  exit 1;
}

($silence_proportion > 0.01 && $silence_proportion < 0.99) ||
  die "Invalid silence-proportion value '$silence_proportion'";
($frame_shift > 0.0001 && $frame_shift <= 1.0) ||
  die "Very strange frame-shift value '$frame_shift'";
($max_segment_length > 1.0 && $max_segment_length < 100.0) ||
  die "Very strange max-segment-length value '$max_segment_length'";
($hard_max_segment_length > 4.0 && $hard_max_segment_length < 500.0) ||
  die "Very strange hard-max-segment-length value '$hard_max_segment_length'";
($hard_max_segment_length >= $max_segment_length) ||
  die "hard-max-segment-length may not be less than max-segment-length";
($remove_noise_only_segments eq 'false' ||
 $remove_noise_only_segments eq 'true') || 
  die "Option --remove-noise-only-segments takes args true or false";


sub get_initial_segments {
  # This operates on the global arrays @A, @S and @N.  It sets the elements of
  # @S to 1 if start of segment, and @E to 1 if end of segment, end of segment
  # being defined as one past the last frame in the segment.

  for (my $n = 0; $n < $N; $n++) {
    if ($A[$n] == 0) {
      if ($n > 0 && $A[$n-1] != 0) {
        $E[$n] = 1;
      }
    } else {
      if ($n == 0 || $A[$n-1] == 0) {
        $S[$n] = 1;
      }
    }
  }
  if ($A[$N-1] != 0) { # Handle the special case
    $E[$N] = 1;        # where the last frame of the file is silence or noise.
  }
}


sub set_silence_proportion {
  $num_nonsil_frames = 0;
  # Get number of frames that are inside segments.  Initially, this will
  # all be non-silence.
  $in_segment = 0;

  my @active_frames = (); # active_frames are segment start/end frames.
  for (my $n = 0; $n <= $N; $n++) {
    if ($n < $N && $S[$n] == 1) {
      $in_segment == 0 || die; 
      $in_segment = 1; 
      push @active_frames, $n;
    }
    if ($E[$n] == 1) { 
      $in_segment == 1 || die; 
      $in_segment = 0; 
      push @active_frames, $n;
    }
    if ($n < $N) {
      ($in_segment == ($A[$n] != 0 ? 1 : 0)) || die; # Just a check.
      if ($in_segment) { $num_nonsil_frames++; }
    }
  }
  $in_segment == 0 || die; # should not be still in a segment after file-end.
  if ($num_nonsil_frames == 0) {
    print STDERR "$0: warning: no segments found for recording $recording_id\n";
    return;
  }
  #(target-segment-frames - num-nonsil-frames) / target-segment-frames =  sil-proportion
  # -> target-segment-frames = (num-nonsil-frames) / (1 - sil-proportion).
  my $target_segment_frames = int($num_nonsil_frames / (1.0 - $silence_proportion));
  my $num_segment_frames = $num_nonsil_frames;
  while ($num_segment_frames < $target_segment_frames) {
    $changed = 0;
    for (my $i = 0; $i < @active_frames; $i++) {
      my $n = $active_frames[$i];
      if ($E[$n] == 1 && $n < $N && $S[$n] != 1) {
        # shift the end of this segment one frame to the right.
        $E[$n] = 0;
        $E[$n+1] = 1;
        $active_frames[$i] = $n + 1;
        $num_segment_frames++;
        $changed = 1;
      }
      if ($n < $N && $S[$n] == 1 && $n > 0 && $E[$n] != 1) {
        # shift the start of this segment one frame to the left
        $S[$n] = 0;
        $S[$n-1] = 1;
        $active_frames[$i] = $n - 1;
        $num_segment_frames++;
        $changed = 1;
      }
      if ($num_segment_frames == $target_segment_frames) {
        last;
      }
    }
    if ($changed == 0) { last; } # avoid an infinite loop.
  }
  if ($num_segment_frames < $target_segment_frames) {
    my $proportion = 
      ($num_segment_frames - $num_nonsil_frames) / $num_segment_frames;
    print STDERR "$0: warning: for recording $recording_id, only got a proportion " .
      "$proportion of silence frames, versus target $silence_proportion\n";
  }
}

sub merge_segments() {
  my @boundaries = ();
  my @num_silence_phones = (); # for any index into @S where there
                               # is a boundary between contiguous segments
                               # (i.e. an index which is both a segment-start
                               # and segment-end index), the number of silence
                               # frames at that boundary (i.e. at the end of the
                               # previous segment and the beginning of the next
                               # one.
  for ($n = 0; $n < $N; $n++) {
    if ($S[$n] == 1 && $E[$n] == 1) {
      push @boundaries, $n;
      my $num_sil = 0;
      my $p;
      # note: here we can count the silence phones without regard to the
      # segment boundaries, since we'll hit nonsilence before we get to
      # the end/beginning of these segments.
      for ($p = $n; $p < $N; $p++) {
        if ($A[$p] == 0) { $num_sil++; }
        else { last; }
      }
      for ($p = $n - 1; $p >= 0; $p--) {
        if ($A[$p] == 0) { $num_sil++; }
        else { last; }
      }
      
      $num_silence_phones[$n] = $num_sil; # should be the num of silence
    }
  }

  # Sort on increasing number of silence-phones, so we join the segments with
  # the smallest amount of silence at the boundary first.
  my @sorted_boundaries = 
    sort { $num_silence_phones[$a] <=> $num_silence_phones[$b] } @boundaries;

  foreach $n (@sorted_boundaries) {
    # Join the segments only if the length of the resulting segment would
    # be no more than $max_segment_length.
    ($S[$n] == 1 && $E[$n] == 1) || die;
    my $num_frames = 2; # total number of frames in the two segments we'll be merging..
                        # start the count from 2 because the loops below do not
                        # count the 1st frame of the segment to the right and
                        # the last frame of the segment to the left.
    my $p;
    for ($p = $n + 1; $p <= @A && $E[$p] == 0; $p++) {
      $num_frames++;
    }
    $E[$p] == 1 || die;
    for ($p = $n - 1; $p >= 0 && $S[$p] == 0; $p--) {
      $num_frames++;
    }
    $S[$p] == 1 || die;
    if ($num_frames * $frame_shift <= $max_segment_length) {
      # Join this pair of segments.
      $S[$n] = 0;
      $E[$n] = 0;
    }
  }
}

sub split_long_segments {
  for (my $n = 0; $n < @A; $n++) {
    if ($S[$n] == 1) { # segment starts here...
      my $p;
      for ($p = $n + 1; $p <= @A; $p++) {
        if ($E[$p] == 1) { last; }
      }
      ($E[$p] == 1) || die;
      my $segment_length = $p - $n;
      my $max_frames = int($hard_max_segment_length / $frame_shift);
      if ($segment_length > $max_frames) {
        # The segment is too long, we need to split it.  First work out
        # how many pieces to split it into.
        # We divide and round up to nearest larger int.
        my $num_pieces = int(($segment_length / $max_frames) + 0.99999);
        my $segment_length_in_seconds = $segment_length * $frame_shift;
        print STDERR "$0: warning: for recording $recording_id, splitting segment of " .
          "length $segment_length_in_seconds seconds into $num_pieces pieces " .
          "(--hard-max-segment-length $hard_max_segment_length)\n";
        my $frames_per_piece = int($segment_length / $num_pieces);
        my $i;
        for ($i = 1; $i < $num_pieces; $i++) {
          my $q = $n + $i * $frames_per_piece;
          # Insert a segment boundary at frame $q.
          $S[$q] = 1;
          $E[$q] = 1;
        }
      }
      if ($p - 1 > $n) {
        $n = $p - 1; # avoids some redundant work.
      }
    }
  }
}

sub remove_noise_only_segments {
  for (my $n = 0; $n < $N; $n++) {
    if ($S[$n] == 1) { # segment starts here...
      my $p;
      my $saw_speech = 0;
      for ($p = $n; $p <= $N; $p++) {
        if ($E[$p] == 1 && $p != $n) { last; }
        if ($A[$p] == 2) { $saw_speech = 1; }
      }
      $E[$p] == 1 || die;
      if (! $saw_speech) { # There was no speech in this segment, so remove it.
        $S[$n] = 0;
        $E[$p] = 0;
      }
      if ($p - 1 > $n) {
        $n = $p - 1; # Avoid some redundant work.
      }
    }
  }
}

sub print_segments {
  # We also do some sanity checking here.
  my @segments = (); # each element will be a string start-time:end-time, in frames.

  $N == @S || die; # check array size.
  ($N+1) == @E || die; # check array size.

  my $max_end_time = 0;

  for (my $n = 0; $n < $N; $n++) {
    if ($E[$n] == 1 && $S[$n] != 1) {
      die "Ending segment before starting it: n=$n.\n";
    }
    if ($S[$n]) {
      my $p;
      for ($p = $n + 1; $p < $N && $E[$p] != 1; $p++) {
        $S[$p] && die; # should not start a segment again, before ending it.
      }
      $E[$p] == 1 || die;
      push @segments, "$n:$p"; # push the start/end times.
      $max_end_time = $p;
      if ($p < $N && $S[$p] == 1) { $n = $p - 1; }
      else { $n = $p; }
      # note: we increment $n again before the next loop instance.
    }
  }

  if (@segments == 0) {
    print STDERR "$0: warning: no segments for recording $recording_id\n";
  }

  # we'll be printing the times out in hundredths of a second (regardless of the
  # value of $frame_shift), and first need to know how many digits we need (we'll be
  # printing with "%05d" or similar, for zero-padding.
  $max_end_time_hundredths_second = int(100.0 * $frame_shift * $max_end_time);
  $num_digits = 1;
  my $i = 1;
  while ($i < $max_end_time_hundredths_second) {
    $i *= 10;
    $num_digits++;
  }
  $format_str = "%0${num_digits}d"; # e.g. "%05d"

  foreach $s (@segments) {
    my ($start,$end) = split(":", $s);
    ($end > $start) || die;
    my $start_seconds = sprintf("%.2f", $frame_shift * $start);
    my $end_seconds = sprintf("%.2f", $frame_shift * $end);
    my $start_str = sprintf($format_str, $start_seconds * 100);
    my $end_str = sprintf($format_str, $end_seconds * 100);
    my $utterance_id = "${recording_id}${first_separator}${start_str}${second_separator}${end_str}";
    print "$utterance_id $recording_id $start_seconds $end_seconds\n"; # <-- Here is where the output happens.
  }
}


while (<STDIN>) {
  @A = split; # split line on whitespace.
  if (@A <= 1) {
    print STDERR "$0: warning: invalid input line $_";
    next;
  }
  $recording_id = shift @A;  # e.g. sw0-12430
  for ($n = 0; $n < @A; $n++) {
    $a = $A[$n];
    if ($a != 0 && $a != 1 && $a != 2) {
      die "Invalid value $a: expecting 0, 1 or 2.  Line is: $_";
    }
    $A[$n] = 0 + $a; # cast to integer, might be a bit faster.
  }
  # The array @S will contain 1 if a segment starts there and 0
  # otherwise.  The array @E will contain 1 if a segment ends there
  # and 0 otherwise.
  $N = @A; # number of elements in @A.  Used globally.
  @S = (0) x $N;         # 0 repeated $N times.
  @E = (0) x ($N + 1);   # 0 repeated $N+1 times (one more since if the last frame is
                         # in a segment, the end-marker goes one past that, at index $N.)

  get_initial_segments();       # stage (1) in the comment above.
  set_silence_proportion();     # stage (2)
  merge_segments();             # stage (3)
  split_long_segments();        # stage (4)
  if ($remove_noise_only_segments eq 'true') {
    remove_noise_only_segments(); # stage (5)
  }
  print_segments();
}