modify_speaker_info.sh 4.41 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125


#!/bin/bash

# Copyright 2013-2016  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

# This script copies a data directory (like utils/copy_data.sh) while
# modifying (splitting or merging) the speaker information in that data directory.
#
# This is done without looking at the data at all; we use only duration
# constraints and maximum-num-utts-per-speaker to assign contiguous
# sets of utterances to speakers.
#
# This has two general uses:
# (1) when dumping iVectors for training purposes, it's helpful to have
#   a good variety of iVectors, and this can be accomplished by splitting
#   speakers up into multiple copies of those speakers.  We typically
#   use the --utts-per-spk-max 2 option for this.
# (2) when dealing with data that is not diarized, and given that we
#   haven't checked any diarization scripts into Kaldi yet, this
#   script can do a "dumb" diarization that just groups consecutive
#   utterances into groups based on length constraints.
#   There are two cases here:
#      a) With --respect-speaker-info true (the default),
#         it only splits within existing speakers.
#         This is suitable when you have existing speaker
#         info that's meaningful in some way, e.g. represents
#         individual recordings.
#      b) With --respect-speaker-info false,
#         it completely ignores the existing speaker information
#         and constructs new speaker identities based on
#         utterance names.  This is suitable in scenarios when
#         you have a one-to-one map between speakers and
#         utterances.

# begin configuration section
# Default values for the command-line options described in the usage text
# below.  NOTE(review): utils/parse_options.sh is presumably what maps
# --some-option arguments onto these variables -- confirm against that script.
respect_speaker_info=true
seconds_per_spk_max=-1
utts_per_spk_max=-1
# end configuration section

source utils/parse_options.sh

# After option parsing exactly two positional arguments must remain
# (<srcdir> and <destdir>); anything else prints the usage text and exits 1.
if [[ $# -ne 2 ]]; then
  printf '%s\n' \
    "Usage: " \
    "  $0 [options] <srcdir> <destdir>" \
    "e.g.:" \
    " $0 --utts-per-spk-max 2 data/train data/train-max2" \
    "Options" \
    "   --utts-per-spk-max <n>  # number of utterances per speaker maximum," \
    "                           # default -1 (meaning no maximum).  E.g. 2." \
    "   --seconds-per-spk-max <n> # number of seconds per speaker maximum," \
    "                             # default -1 (meaning no maximum).  E.g. 60." \
    "   --respect-speaker-info <true|false>  # If true, respect the" \
    "                                        # existing speaker map (i.e. do not" \
    "                                        # assign utterances from different" \
    "                                        # speakers to the same generated speaker)." \
    "                                        # Default: true." \
    "Note: one or both of the --utts-per-spk-max or --seconds-per-spk-max" \
    "options is required."
  exit 1
fi

# Force the C locale for this script and every program it launches.
export LC_ALL=C

# Positional arguments: the data directory to read and the one to create.
srcdir=$1
destdir=$2

# The result must be written to a separate directory.
if [ "$destdir" == "$srcdir" ]; then
  echo "$0: <srcdir> must be different from <destdir>."
  exit 1
fi

# At least one of the two limits has to be set (seconds != -1, or a positive
# utterance count); without any limit there is nothing to modify, so stop
# here instead of merely printing the message and carrying on.
if [ "$seconds_per_spk_max" == "-1" ] && ! [ "$utts_per_spk_max" -gt 0 ]; then
  echo "$0: one or both of the --utts-per-spk-max or --seconds-per-spk-max options must be provided."
  exit 1
fi

# utt2spk is the input that the speaker remapping reads.
if [ ! -f "$srcdir/utt2spk" ]; then
  echo "$0: no such file $srcdir/utt2spk"
  exit 1
fi

# From here on, stop at the first failing command, and treat a pipeline as
# failed when any of its stages fails.
set -e
set -o pipefail

mkdir -p "$destdir"

# Optional arguments for modify_speaker_info.py.  An array is used so that
# "no option" expands to zero words while a path stays a single word even if
# it contains whitespace.
utt2dur_opts=()
if [ "$seconds_per_spk_max" != -1 ]; then
  # A per-speaker duration limit was requested, so we need the utt2dur file.
  utils/data/get_utt2dur.sh "$srcdir"
  utt2dur_opts=("--utt2dur=$srcdir/utt2dur")
fi

# Produce the new utt2spk: the original map is fed on stdin and the modified
# map is captured from stdout.
utils/data/internal/modify_speaker_info.py \
  "${utt2dur_opts[@]}" \
  "--respect-speaker-info=$respect_speaker_info" \
  "--utts-per-spk-max=$utts_per_spk_max" \
  "--seconds-per-spk-max=$seconds_per_spk_max" \
  <"$srcdir/utt2spk" >"$destdir/utt2spk"

# Derive the matching spk2utt from the freshly written utt2spk.
utils/utt2spk_to_spk2utt.pl <"$destdir/utt2spk" >"$destdir/spk2utt"

# This script won't create the new cmvn.scp, it should be recomputed.
# A cmvn.scp that already sits in the destination is set aside in .backup
# rather than left in place.
if [ -f "$destdir/cmvn.scp" ]; then
  mkdir -p "$destdir/.backup"
  mv "$destdir/cmvn.scp" "$destdir/.backup"
  echo "$0: moving $destdir/cmvn.scp to $destdir/.backup/cmvn.scp"
fi

# These files won't be affected by the change of speaker mapping; copy
# whichever of them exist in the source directory.  Paths are quoted so that
# directory names containing whitespace or glob characters are handled.
for f in feats.scp segments wav.scp reco2file_and_channel text stm glm ctm; do
  if [ -f "$srcdir/$f" ]; then
    cp "$srcdir/$f" "$destdir/"
  fi
done


# Speaker counts before and after, for the summary message.  Only utt2spk is
# checked for in <srcdir>, so when spk2utt is absent count the distinct
# speaker ids (second field) of utt2spk instead; a bare redirection from the
# missing file would abort the script under 'set -e' right before validation.
if [ -f "$srcdir/spk2utt" ]; then
  orig_num_spk=$(wc -l <"$srcdir/spk2utt")
else
  orig_num_spk=$(awk '{print $2}' "$srcdir/utt2spk" | sort -u | wc -l)
fi
new_num_spk=$(wc -l <"$destdir/spk2utt")

echo "$0: copied data from $srcdir to $destdir, number of speakers changed from $orig_num_spk to $new_num_spk"

# Pass a --no-* flag to the validator for each of these files that the
# source directory does not have (and which therefore was not copied).
opts=()
[ -f "$srcdir/feats.scp" ] || opts+=(--no-feats)
[ -f "$srcdir/text" ] || opts+=(--no-text)
[ -f "$srcdir/wav.scp" ] || opts+=(--no-wav)

utils/validate_data_dir.sh "${opts[@]}" "$destdir"