split_long_utterance.sh 4.96 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146


#!/bin/bash

# Copyright 2014  Guoguo Chen
# Apache 2.0

# Begin configuration section.
# Default values; parse_options.sh is sourced below (presumably it lets the
# matching --seg-length / --min-seg-length / --overlap-length command-line
# options override them -- confirm against parse_options.sh).
seg_length=30
min_seg_length=10
overlap_length=5
# End configuration section.

# Log the command line this script was invoked with.
echo "$0 $*"

if [ -f ./path.sh ]; then
  . ./path.sh
fi
. parse_options.sh || exit 1;

# Exactly two positional arguments must be left: the input and output dirs.
if [ $# -ne 2 ]; then
  cat <<EOF
This script truncates the long audio into smaller overlapping segments

Usage: $0 [options] <input-dir> <output-dir>
 e.g.: $0 data/train_si284_long data/train_si284_split

Options:
    --min-seg-length        # minimal segment length
    --seg-length            # length of segments in seconds.
    --overlap-length        # length of overlap in seconds.
EOF
  exit 1;
fi

input_dir=$1
output_dir=$2

# Every one of these files must exist in the input data directory.
for f in spk2utt text utt2spk wav.scp; do
  if [ ! -f "$input_dir/$f" ]; then
    echo "$0: no such file $input_dir/$f"
    exit 1
  fi
done

# Consecutive segments start (seg_length - overlap_length) seconds apart, so
# the segment length has to be strictly larger than the overlap.
if [ ! "$seg_length" -gt "$overlap_length" ]; then
  echo "$0: --seg-length should be longer than --overlap-length."
  exit 1
fi

# Checks if sox is on the path. "command -v" is a shell builtin, so this does
# not depend on an external "which" binary being installed.
if ! sox=$(command -v sox); then
  echo "$0: sox command not found."
  exit 1
fi

# NOTE(review): $sph2pipe is not invoked directly by this script; presumably
# the command pipelines inside wav.scp rely on it -- confirm.
sph2pipe="$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe"
if [ ! -x "$sph2pipe" ]; then
  echo "$0: sph2pipe command not found."
  exit 1
fi

mkdir -p "$output_dir" || exit 1

# spk2gender is optional: copy it only when the input directory provides it.
if [ -f "$input_dir/spk2gender" ]; then
  cp -f "$input_dir/spk2gender" "$output_dir/spk2gender"
fi

# The original transcripts are kept as text.orig; wav.scp is reused unchanged.
# Both copies are mandatory, so stop right away when one of them fails.
cp -f "$input_dir/text" "$output_dir/text.orig" || exit 1
cp -f "$input_dir/wav.scp" "$output_dir/wav.scp" || exit 1

# We assume the audio length in the header is correct and get it from there.
# It is a little bit annoying that old versions of sox do not support the
# following:
#   $audio_cmd | sox --i -D
# so we have to put it in the following format for the old versions:
#   $sox --i -D "|$audio_cmd"
# Another way is to count all the samples to get the duration, but it takes
# longer, so we do not use it here. The command is:
#   $audio_cmd | sox -t wav - -n stat | grep -P "^Length" | awk '{print $1;}'
#
# Note: in the wsj example the process takes a couple of minutes because of
#       the audio file concatenation; in a real case it should be much faster
#       since it just reads the header.
# Splits every original segment (or every whole recording when the input
# directory has no segments file) into overlapping pieces. Writes the new
# utt2spk and segments files plus orig2utt, which lists for each original id
# the ids of the pieces generated from it. The script aborts if perl fails.
perl -e '
  # Arguments: <utt2spk-in> <utt2spk-out> <segments-in> <segments-out>
  #            <orig2utt-out> <sox> <seg-length> <min-seg-length>
  #            <overlap-length>
  # wav.scp is read from STDIN, and only when <segments-in> cannot be opened.
  @ARGV == 9 || die "Error: expected 9 arguments but got " . scalar(@ARGV) . "\n";
  ($u2s_in, $u2s_out, $seg_in,
   $seg_out, $orig2utt, $sox, $slen, $mslen, $olen) = @ARGV;

  # Distance between the starts of two consecutive segments. A non-positive
  # step would keep the splitting loop below from ever advancing.
  $step = $slen - $olen;
  $step > 0 || die "Error: segment length $slen must exceed overlap $olen\n";

  open(UI, "<$u2s_in") || die "Error: fail to open $u2s_in\n";
  open(UO, ">$u2s_out") || die "Error: fail to open $u2s_out\n";
  $have_orig_seg = open(SI, "<$seg_in");  # Original segments file is optional.
  open(SO, ">$seg_out") || die "Error: fail to open $seg_out\n";
  open(UMAP, ">$orig2utt") || die "Error: fail to open $orig2utt\n";

  # If the original segment file exists, we have to work out the segment
  # boundaries from it. Otherwise each recording in wav.scp is treated as one
  # segment that runs from 0 to the duration reported by sox.
  if ($have_orig_seg) {
    while (<SI>) {
      chomp;
      @col = split;
      @col == 4 || die "Error: bad line $_\n";
      ($seg_id, $wav_id, $seg_start, $seg_end) = @col;
      $seg2wav{$seg_id} = $wav_id;
      $seg_start{$seg_id} = $seg_start;
      $seg_end{$seg_id} = $seg_end;
    }
    close(SI);
  } else {
    while (<STDIN>) {
      chomp;
      @col = split;
      @col >= 2 || die "Error: bad line $_ in wav.scp\n";
      if ((@col > 2) &&  ($col[-1] eq "|")) {
        # "<wav-id> <command ...> |": let sox read from the command pipe.
        $wav_id = shift @col; pop @col;
        $audio_cmd = join(" ", @col);
        $duration = `$sox --i -D '\''|$audio_cmd'\''`;
      } else {
        # "<wav-id> <audio-file>"
        @col == 2 || die "Error: bad line $_ in wav.scp\n";
        $wav_id = $col[0];
        $audio_file = $col[1];
        $duration = `$sox --i -D $audio_file`;
      }
      chomp($duration);
      # An empty or non-numeric answer means sox could not read the audio;
      # without this check the recording would silently get no segments.
      $duration =~ /^[+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)?$/ ||
        die "Error: fail to get the duration of $wav_id from sox\n";
      $seg2wav{$wav_id} = $wav_id;
      $seg_start{$wav_id} = 0;
      $seg_end{$wav_id} = $duration;
    }
  }

  while (<UI>) {
    chomp;
    @col = split;
    @col == 2 || die "Error: bad line $_\n";
    $utt2spk{$col[0]} = $col[1];
  }
  close(UI);

  foreach $seg (sort keys %seg2wav) {
    $index = 0;
    print UMAP "$seg";
    while ($seg_start{$seg} + $index * $step < $seg_end{$seg}) {
      $new_seg = $seg . "_" . sprintf("%05d", $index);
      $start = $seg_start{$seg} + $index * $step;
      $end = $start + $slen;
      defined($utt2spk{$seg}) || die "Error: speaker not found for $seg\n";
      print UO "$new_seg $utt2spk{$seg}\n";
      print UMAP " $new_seg";
      $index += 1;
      # The next piece would start at $end - $olen. If it would begin within
      # $mslen seconds of the end of the original segment, make this piece
      # stop exactly at that end and finish, so that no separate trailing
      # piece shorter than $mslen is produced.
      $is_last = ($end - $olen + $mslen >= $seg_end{$seg});
      $end = $seg_end{$seg} if $is_last;
      print SO "$new_seg $seg2wav{$seg} $start $end\n";
      last if $is_last;
    }
    print UMAP "\n";
  }

  # Closing explicitly surfaces buffered write errors (e.g. a full disk).
  close(UO) || die "Error: fail to write $u2s_out\n";
  close(SO) || die "Error: fail to write $seg_out\n";
  close(UMAP) || die "Error: fail to write $orig2utt\n";
' "$input_dir/utt2spk" "$output_dir/utt2spk" \
  "$input_dir/segments" "$output_dir/segments" "$output_dir/orig2utt" \
  "$sox" "$seg_length" "$min_seg_length" "$overlap_length" \
  < "$output_dir/wav.scp" || exit 1

# CAVEAT: We are not dealing with channels here. Each channel should have a
# unique file name in wav.scp.
# Every output line is "<id> <id> A", where <id> is the first
# whitespace-separated field of a wav.scp line. awk splits on tabs as well as
# spaces, so a tab-separated wav.scp also yields the id in both columns.
awk '{print $1" "$1" A";}' "$output_dir/wav.scp" \
  > "$output_dir/reco2file_and_channel" || exit 1

# Report a failure of the clean-up step instead of hiding it behind the
# unconditional "exit 0" below.
utils/fix_data_dir.sh "$output_dir" || exit 1

exit 0;