gender_id.sh 6.47 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164


#!/bin/bash

# Copyright    2013  Daniel Povey
#              2014  David Snyder
# Apache 2.0.

# This script gets gender-id information for a set of utterances.
# The output is a file utt2gender in the experimental directory.

# Begin configuration section.
# Defaults for the script's options; the usage text below shows the matching
# --option spellings.
cmd=run.pl        # Program used to launch the jobs.
nj=10             # Number of parallel jobs.
stage=-4          # Stages numbered below this value are skipped.
num_gselect1=20   # Gaussian-selection using diagonal model: number of Gaussians to select
num_gselect2=3    # Gaussian-selection using full-covariance model.
male_prior=0.5    # Prior probability that a speaker is male.
cleanup=true      # Whether to delete the gselect archives at the end.
# End configuration section.

# Log the command line exactly as it was given.
echo "$0 $*"

# Pick up the environment from path.sh when the working directory has one,
# then let parse_options.sh process the command-line options.
[ -f path.sh ] && . ./path.sh
if ! . parse_options.sh; then
  exit 1
fi


# Exactly five positional arguments are required; anything else prints the
# usage text and stops with status 1.
# The option list only names options that have a variable in the
# configuration section at the top of this script (nj, cmd, stage,
# num_gselect1, num_gselect2, male_prior, cleanup), plus --config.
if [ $# != 5 ]; then
  echo "Usage: $0 <gender-independent-ubm-dir> <male-ubm-dir> <female-ubm-dir> <data> <exp-dir>"
  echo " e.g.: $0  exp/ubm_2048 exp/ubm_2048_male exp/ubm_2048_female data/test exp/test_gender"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --nj <n|10>                                      # Number of parallel jobs"
  echo "  --male-prior <p|0.5>                             # Prior probability of male speaker"
  echo "  --cleanup <true,false|true>                      # If true, clean up temporary files"
  echo "  --stage <stage|-4>                               # To control partial reruns"
  echo "  --num-gselect1 <n|20>                            # Number of Gaussians to select using"
  echo "                                                   # diagonal model."
  echo "  --num-gselect2 <n|3>                             # Number of Gaussians to select using"
  echo "                                                   # full-covariance model."
  exit 1;
fi

# Positional arguments, in the order given in the usage text.
ubmdir=$1; male_ubmdir=$2; female_ubmdir=$3
data=$4; dir=$5

# Delta-feature options stored with the gender-independent UBM (empty when the
# file is absent).  When the file exists it is also copied next to the two
# gender-dependent UBMs; a failed copy is not treated as an error.
delta_opts=$(cat $ubmdir/delta_opts 2>/dev/null)
if [ -f $ubmdir/delta_opts ]; then
  for ubm_copy_target in "$male_ubmdir" "$female_ubmdir"; do
    cp $ubmdir/delta_opts $ubm_copy_target/ 2>/dev/null
  done
fi

# Every model and data file used below has to exist before any work starts.
for f in \
    $ubmdir/final.ubm \
    $male_ubmdir/final.ubm \
    $female_ubmdir/final.ubm \
    $data/feats.scp \
    $data/vad.scp; do
  if [ ! -f $f ]; then
    echo "No such file $f" && exit 1;
  fi
done

# Output directory (with its log/ subdirectory) and the per-job split of the
# data directory.
if ! mkdir -p $dir/log; then
  exit 1
fi
utils/split_data.sh $data $nj || exit 1;
sdata=$data/split$nj

# Prints the last field of the "gaussians" line(s) that fgmm-global-info
# reports for the model file given as $1.
ubm_num_gauss() {
  fgmm-global-info --print-args=false $1 | grep gaussians | awk '{print $NF}'
}

# The three UBMs must all have the same number of Gaussians.
ng1=$(ubm_num_gauss $ubmdir/final.ubm)
ng2=$(ubm_num_gauss $male_ubmdir/final.ubm)
ng3=$(ubm_num_gauss $female_ubmdir/final.ubm)
[ $ng1 -eq $ng2 ] && [ $ng1 -eq $ng3 ] || {
  echo "$0:  Number of Gaussians mismatch between speaker-independent, male "
  echo "$0:  and female UBMs: $ng1 vs $ng2 vs $ng3"
  exit 1;
}


## Set up features.
# Per-job feature pipeline, handed to the tools below as a single input
# specifier string.  It chains three commands:
#   add-deltas            with the options read from $ubmdir/delta_opts (if any),
#   apply-cmvn-sliding    with --norm-vars=false --center=true --cmn-window=300,
#   select-voiced-frames  using the job's vad.scp.
# NOTE(review): "JOB" is presumably replaced with the job index by $cmd (see
# the JOB=1:$nj invocations further down) -- confirm against run.pl/queue.pl.
feats="ark,s,cs:add-deltas $delta_opts scp:$sdata/JOB/feats.scp ark:- | apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 ark:- ark:- | select-voiced-frames ark:- scp,s,cs:$sdata/JOB/vad.scp ark:- |"

# Stage -2: run fgmm-global-to-gmm on the gender-independent UBM and store the
# result as $dir/final.dubm, the model read by gmm-gselect in the next stage.
# The log goes to $dir/log/convert.log; a failure stops the script.
if [ $stage -le -2 ]; then
  $cmd $dir/log/convert.log \
    fgmm-global-to-gmm $ubmdir/final.ubm $dir/final.dubm || exit 1;
fi

# Do Gaussian selection using diagonal form of model and then the full-covariance model.
# Even though this leads to, in some sense, less accurate likelihoods, I think it
# may improve the results for the same reason it sometimes helps to use fixed
# Gaussian posteriors rather than posteriors from the adapted model.

# Stage -1: two-pass Gaussian selection, one job per data split.
#   1. gmm-gselect keeps $num_gselect1 Gaussians using $dir/final.dubm.
#   2. fgmm-gselect reads that selection from its stdin (--gselect=ark,s,cs:-)
#      and keeps $num_gselect2 of them using $ubmdir/final.ubm.
# The result is gzipped into $dir/gselect.JOB.gz.  The number of jobs is
# recorded in $dir/num_jobs so that later runs can check it.
# The pipe between the two programs is written as \| so that it reaches $cmd
# as an argument instead of being interpreted by this script; presumably $cmd
# then runs both programs as one pipeline inside each job -- TODO confirm.
if [ $stage -le -1 ]; then
  echo $nj > $dir/num_jobs
  echo "$0: doing Gaussian selection"
  $cmd JOB=1:$nj $dir/log/gselect.JOB.log \
    gmm-gselect --n=$num_gselect1 $dir/final.dubm "$feats" ark:- \| \
    fgmm-gselect --gselect=ark,s,cs:- --n=$num_gselect2 $ubmdir/final.ubm \
      "$feats" "ark:|gzip -c >$dir/gselect.JOB.gz" || exit 1;
fi

# The gselect archives are per job, so the current $nj must equal the value
# stored in $dir/num_jobs.  This check runs at every --stage; it also fails
# when $dir/num_jobs is missing, because the test then has no right operand.
if ! [ $nj -eq $(cat $dir/num_jobs) ]; then
  echo "Number of jobs mismatch" 
  exit 1;
fi


# Stages 0 and 1: score the data with the male UBM and then the female UBM.
# Both stages run fgmm-global-get-frame-likes with --average=true on the same
# per-job features, reuse the Gaussian selection in $dir/gselect.JOB.gz, and
# write one text archive (ark,t) per job: $dir/male_logprob.JOB in stage 0 and
# $dir/female_logprob.JOB in stage 1.
# NOTE(review): --average=true presumably yields a single per-frame-average
# log-likelihood per utterance -- confirm in the tool's usage message.
if [ $stage -le 0 ]; then
  $cmd JOB=1:$nj $dir/log/get_male_logprob.JOB.log \
    fgmm-global-get-frame-likes --average=true \
     "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" $male_ubmdir/final.ubm \
      "$feats" ark,t:$dir/male_logprob.JOB || exit 1;
fi
if [ $stage -le 1 ]; then
  $cmd JOB=1:$nj $dir/log/get_female_logprob.JOB.log \
    fgmm-global-get-frame-likes --average=true \
     "--gselect=ark,s,cs:gunzip -c $dir/gselect.JOB.gz|" $female_ubmdir/final.ubm \
      "$feats" ark,t:$dir/female_logprob.JOB || exit 1;
fi

#######################################
# Joins the male and female score tables row by row.
# Arguments: $1 - male table, $2 - female table; both hold one
#            "<utt-id> <log-prob>" pair per line, in the same order.
# Outputs:   "<utt-id> <male-log-prob> <female-log-prob>" per line on stdout;
#            "Sorting mismatch" on stderr when a row pairs two different ids.
# Returns:   awk's exit status: 1 at the first mismatching row, 0 otherwise.
#######################################
merge_gender_logprobs() {
  # The redirection target has to be the quoted string "/dev/stderr" for awk
  # to send the diagnostic to standard error.
  paste "$1" "$2" | \
    awk '{ if ($1 != $3) { print "Sorting mismatch" > "/dev/stderr"; exit(1); }
           print $1, $2, $4; }'
}

# Stage 2: turn the per-job scores into a gender decision per utterance.
#   $dir/logprob    <utt-id> <male-log-prob> <female-log-prob>
#   $dir/ratio      <utt-id> <posterior probability of "male">
#   $dir/utt2gender <utt-id> <m|f>
if [ $stage -le 2 ]; then

  # Concatenate the per-job archives in job order.
  for j in $(seq $nj); do cat $dir/male_logprob.$j; done > $dir/male_logprob
  for j in $(seq $nj); do cat $dir/female_logprob.$j; done > $dir/female_logprob

  n1=$(wc -l < $dir/male_logprob)
  n2=$(wc -l < $dir/female_logprob)

  if [ $n1 -ne $n2 ]; then
    echo "Number of lines mismatch, male versus female UBM probs: $n1 vs $n2"
    exit 1;
  fi

  merge_gender_logprobs $dir/male_logprob $dir/female_logprob \
    >$dir/logprob || exit 1;

  # Posterior of "male": logistic function of the log prior odds plus the
  # male-minus-female log-likelihood difference.
  awk -v pmale=$male_prior '{lratio = log(pmale/(1-pmale))+$2-$3; print $1, 1/(1+exp(-lratio));}' \
    <$dir/logprob >$dir/ratio || exit 1;

  # Label "m" when the posterior exceeds 0.5, otherwise "f".
  awk '{if ($2 > 0.5) { print $1, "m"; } else { print $1, "f"; }}' \
    <$dir/ratio >$dir/utt2gender || exit 1;
fi

if [ $stage -le 3 ] && [ -f $data/spk2gender ]; then
  utils/apply_map.pl -f 2 $data/spk2gender  <$data/utt2spk | \
    utils/filter_scp.pl $dir/utt2gender > $dir/utt2gender.ref
  n1=$(cat $dir/utt2gender | wc -l)
  n2=$(cat $dir/utt2gender.ref | wc -l)
  ! [ $n1 -eq $n2 ] && echo "Number-of-utterances mismatch $n1 vs $n2" && exit 1;
  ! paste $dir/utt2gender $dir/utt2gender.ref | awk '{if ($1 != $3) { exit(1); }}' && \
     echo "sorting problem, compare $dir/utt2gender and $dir/utt2gender.ref" && exit 1;
  ! paste $dir/utt2gender $dir/utt2gender.ref | awk '{if ($2 != $4) { print; }}' > $dir/utt2gender.incorrect
  n3=$(cat $dir/utt2gender.incorrect | wc -l)
  
  err=$(perl -e "printf('%.2f', (100.0 * $n3 / $n1));")
  echo "Gender-id error rate is $err%" | tee $dir/error_rate
fi


# Delete the per-job Gaussian-selection archives when --cleanup is true.
# The option is compared as a string instead of being executed as a command,
# and -f keeps rm quiet when the archives are already gone (for example on a
# rerun from a later --stage after an earlier run has cleaned up).
if [ "$cleanup" = true ]; then
  rm -f $dir/gselect.*.gz
fi

exit 0;