#!/bin/bash

# Apache 2.0.
# Copyright  2014  Johns Hopkins University (author: Daniel Povey)
#            2014  Guoguo Chen
#            2015  Hainan Xu


# What this script implements is described in the paper:
# "PRONUNCIATION AND SILENCE PROBABILITY MODELING FOR ASR"
# by Guoguo Chen et al, see
# http://www.danielpovey.com/files/2015_interspeech_silprob.pdf

. ./path.sh || exit 1;

# begin configuration
max_normalize=true
# end configuration

. utils/parse_options.sh || exit 1;

set -e

if [[ $# -ne 3 && $# -ne 5 ]]; then
  echo "Usage: $0 [options] <input-dict-dir> <input-pron-counts> \\"
  echo "          [input-sil-counts] [input-bigram-counts] <output-dict-dir>"
  echo " e.g.: $0 data/local/dict \\"
  echo "          exp/tri3/pron_counts_nowb.txt exp/tri3/sil_counts_nowb.txt \\"
  echo "          exp/tri3/pron_bigram_counts_nowb.txt data/local/dict_prons"
  echo " e.g.: $0 data/local/dict \\"
  echo "          exp/tri3/pron_counts_nowb.txt data/local/dict_prons"
  echo ""
  echo "This script takes pronunciation counts, e.g. generated by aligning your training"
  echo "data and getting the prons using steps/get_prons.sh, and creates a modified"
  echo "dictionary directory with pronunciation probabilities. If the [input-sil-counts]"
  echo "parameter is provided, it will also include silprobs in the generated lexicon."
  echo "Options:"
  echo "   --max-normalize   (true|false)             # default true.  If true,"
  echo "                                              # divide each pron-prob by the"
  echo "                                              # most likely pron-prob per word."
  exit 1;
fi

if [ $# -eq 3 ]; then
  srcdir=$1
  pron_counts=$2
  dir=$3
elif [ $# -eq 5 ]; then
  srcdir=$1
  pron_counts=$2
  sil_counts=$3
  bigram_counts=$4
  dir=$5
fi

if [ ! -s $pron_counts ]; then
  echo "$0: expected file $pron_counts to exist";
  exit 1;
fi
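
# The pron counts file should contain one line per pronunciation, in the
# format "count word phone1 phone2 ...", e.g. (a hypothetical entry):
#   964 the DH AH0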

mkdir -p $dir || exit 1;
utils/validate_dict_dir.pl $srcdir;

if [ -f $srcdir/lexicon.txt ]; then
  src_lex=$srcdir/lexicon.txt
  perl -ane 'print join(" ", split(" ", $_)) . "\n";' < $src_lex |\
    sort -u > $dir/lexicon.txt
elif [ -f $srcdir/lexiconp.txt ]; then
  echo "$0: removing the pron-probs from $srcdir/lexiconp.txt to create $dir/lexicon.txt"
  # the awk command removes the pron-prob field; the perl command below
  # normalizes the spaces (avoiding double spaces).
  src_lex=$srcdir/lexiconp.txt
  awk '{$2 = ""; print $0;}' <$srcdir/lexiconp.txt |\
    perl -ane 'print join(" ", split(" " ,$_)) . "\n";' |\
    sort -u > $dir/lexicon.txt || exit 1;
fi
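
# At this point $dir/lexicon.txt contains sorted, space-normalized entries of
# the form "word phone1 phone2 ...", one pronunciation per line.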


# The cat and awk commands below implement add-one smoothing: every
# pronunciation in the lexicon contributes one extra count, so prons that
# were never seen in training still get a small nonzero probability.
cat <(awk '{print 1, $0;}' <$dir/lexicon.txt) $pron_counts | \
  awk '{ count = $1; $1 = ""; word_count[$2] += count; pron_count[$0] += count; pron2word[$0] = $2; }
       END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word];
          print num / den, p } } ' | \
    awk '{ word = $2; $2 = $1; $1 = word; print; }' | grep -v '^<eps>' |\
    sort -k1,1 -k2g,2 -k3 > $dir/lexiconp.txt
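
# A worked example of the smoothing above, with hypothetical counts: if "the"
# appeared 90 times with pron "DH AH0" and 10 times with "DH IY0", the lexicon
# entries add one count each, giving pron-probs 91/102 and 11/102, i.e. lines
# like "the 0.892157 DH AH0" and "the 0.107843 DH IY0" in lexiconp.txt.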


n_old=$(wc -l <$dir/lexicon.txt)
n_new=$(wc -l <$dir/lexiconp.txt)

if [ "$n_old" != "$n_new" ]; then
  echo "$0: number of lines differs from $dir/lexicon.txt $n_old vs $dir/lexiconp.txt $n_new"
  echo "Probably something went wrong (e.g. input prons were generated from a different lexicon"
  echo "than $srcdir, or you used pron_counts.txt when you should have used pron_counts_nowb.txt"
  echo "or something else.  Make sure the prons in $src_lex $pron_counts look"
  echo "the same."
  exit 1;
fi

if $max_normalize; then
  echo "$0: normalizing pronprobs so maximum is 1 for each word."
  cat $dir/lexiconp.txt | awk '{if ($2 > max[$1]) { max[$1] = $2; }} END{for (w in max) { print w, max[w]; }}' > $dir/maxp.txt

  awk -v maxf=$dir/maxp.txt  'BEGIN{ while (getline <maxf) { max[$1] = $2; }} { $2 = $2 / max[$1]; print }' <$dir/lexiconp.txt > $dir/lexicon_tmp.txt || exit 1;

  if ! [ $(wc -l  <$dir/lexicon_tmp.txt)  -eq $(wc -l  <$dir/lexiconp.txt) ]; then
    echo "$0: error max-normalizing pron-probs"
    exit 1;
  fi
  mv $dir/lexicon_tmp.txt $dir/lexiconp.txt
  rm $dir/maxp.txt
fi
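
# Continuing the hypothetical example above, max-normalization divides both
# pron-probs of "the" by the largest one (0.892157), giving 1 for "DH AH0"
# and about 0.12 for "DH IY0"; the most likely pron of each word always ends
# up with pron-prob 1.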

# Create $dir/lexiconp_silprob.txt and $dir/silprob.txt if silence counts file
# exists. The format of $dir/lexiconp_silprob.txt is:
# word pron-prob P(s_r | w)  F(s_l | w) F(n_l | w) pron
#  where:  P(s_r | w) is the probability of silence to the right of the word;
#          F(s_l | w) is a factor that is greater than one if silence to the
#                  left of the word is more probable than average;
#          F(n_l | w) is a factor that is greater than one if nonsilence to
#                  the left of the word is more probable than average.
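# An example line of $dir/lexiconp_silprob.txt (hypothetical values):
#   the 1 0.18 1.02 0.97 DH AH0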
if [ -n "$sil_counts" ]; then
  if [ ! -s "$sil_counts" ]; then
    echo "$0: expected file $sil_counts to exist and not empty" && exit 1;
  fi
  cat $sil_counts | perl -e '
    # Load silence counts
    %sil_wpron = (); %nonsil_wpron = (); %wpron_sil = (); %wpron_nonsil = ();
    $sil_count = 0; $nonsil_count = 0;
    while (<STDIN>) {
      chomp; @col = split; @col >= 5 || die "'$0': bad line \"$_\"\n";
      $wpron = join(" ", @col[4..scalar(@col)-1]);
      ($sil_wpron{$wpron}, $nonsil_wpron{$wpron},
       $wpron_sil{$wpron}, $wpron_nonsil{$wpron}) = @col[0..3];
      $sil_count += $sil_wpron{$wpron}; $nonsil_count += $nonsil_wpron{$wpron};
    }
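
    # Each line of the silence counts file has the format
    #   <sil-before> <nonsil-before> <sil-after> <nonsil-after> <word> <pron ...>
    # where the first two columns count how often silence / nonsilence
    # immediately preceded this word-pronunciation in the alignments, and the
    # next two how often silence / nonsilence immediately followed it.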

    # Open files.
    ($lexiconp, $bigram_counts, $lexiconp_silprob, $silprob) = @ARGV;
    open(LP, "<$lexiconp") || die "'$0': failed to open $lexiconp\n";
    open(WPC, "<$bigram_counts") || die "'$0': failed to open $bigram_counts\n";
    open(SP, ">$silprob") || die "'$0': failed to open $silprob\n";
    open(LPSP, ">$lexiconp_silprob") ||
      die "'$0': failed to open $lexiconp_silprob\n";

    # Computes P(s_r | w) in the paper.
    $lambda2 = 2;             # Smoothing term, \lambda_2 in the paper.
    %P_w_sr = ();
    %all_wprons = ();
    $sil_prob = sprintf("%.2f", $sil_count / ($sil_count + $nonsil_count));
    while (<LP>) {
      chomp; @col = split; @col >= 3 || die "'$0': bad line \"$_\"\n";
      $word = shift @col; $pron_prob = shift @col; $pron = join(" ", @col);
      unshift(@col, $word); $wpron = join(" ", @col);

      $wpron_sil_count = $wpron_sil{$wpron} + $sil_prob * $lambda2;
      $wpron_nonsil_count = $wpron_nonsil{$wpron} + (1 - $sil_prob) * $lambda2;
      $sil_after_prob = sprintf("%.2f",
        $wpron_sil_count / ($wpron_sil_count + $wpron_nonsil_count));
      if ($sil_after_prob == "0.00") { $sil_after_prob = "0.01"; }
      if ($sil_after_prob == "1.00") { $sil_after_prob = "0.99"; }
      $P_w_sr{$wpron} = $sil_after_prob;

      $all_wprons{$wpron} = $pron_prob;
    }
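
    # In the notation of the paper, the loop above computes
    #   P(s_r | w) = (C(w s) + lambda_2 * P(s)) / (C(w) + lambda_2)
    # where C(w s) is the count of silence after w, C(w) = C(w s) + C(w n),
    # and P(s) is the overall silence probability; the result is clamped
    # away from exactly 0 and 1.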

    # Reads the word-pair counts, C(v ? w) in the paper, i.e. the count of
    # wpron v followed by wpron w.
    %wpron_pair_count = ();
    while (<WPC>) {
      chomp; @col = split("\t"); @col == 3 || die "'$0': bad line \"$_\"\n";
      $count = shift @col; $wpron1 = shift @col; $wpron2 = shift @col;
      $key = "${wpron1}\t${wpron2}";
      $wpron_pair_count{$key} = $count;
    }
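
    # The bigram counts file is expected to be tab-separated, with lines
    #   <count>\t<wpron1>\t<wpron2>
    # where each wpron is of the form "word phone1 phone2 ...".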

    # Computes \bar{C}(s w) and \bar{C}(n w) in the paper.
    %bar_C_s_w = ();
    %bar_C_n_w = ();
    foreach my $key (keys %wpron_pair_count) {
      $count = $wpron_pair_count{$key};
      ($wpron1, $wpron2) = split("\t", $key);
      $bar_C_s_w{$wpron2} += $count * $P_w_sr{$wpron1};
      $bar_C_n_w{$wpron2} += $count * (1 - $P_w_sr{$wpron1});
    }
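
    # bar_C_s_w{w} is now the expected count of silence immediately before w,
    #   \bar{C}(s w) = sum_v C(v w) * P(s_r | v),
    # and bar_C_n_w{w} the corresponding expected count of nonsilence.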

    # Computes F(s_l | w) and F(n_l | w) in the paper.
    $lambda3 = 2;             # Smoothing term, \lambda_3 in the paper.
    foreach my $wpron (keys %all_wprons) {
      @col = split(" ", $wpron);
      $word = shift @col;
      $pron = join(" ", @col);
      $pron_prob = $all_wprons{$wpron};

      $F_sl_w = ($sil_wpron{$wpron} + $lambda3) / ($bar_C_s_w{$wpron} + $lambda3);
      $F_nl_w = ($nonsil_wpron{$wpron} + $lambda3) / ($bar_C_n_w{$wpron} + $lambda3);
      $F_sl_w = sprintf("%.2f", $F_sl_w);
      $F_nl_w = sprintf("%.2f", $F_nl_w);
      if ($F_sl_w == "0.00") { $F_sl_w = "0.01"; }
      if ($F_nl_w == "0.00") { $F_nl_w = "0.01"; }

      print LPSP "$word $pron_prob $P_w_sr{$wpron} $F_sl_w $F_nl_w $pron\n";
    }
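
    # So F(s_l | w) = (C(s w) + lambda_3) / (\bar{C}(s w) + lambda_3), the
    # ratio of the observed to the expected count of silence before w,
    # smoothed towards 1 by lambda_3; F(n_l | w) is analogous for nonsilence.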

    # Create silprob.txt
    $BOS_sil_count = $wpron_sil{"<s>"} + $sil_prob * $lambda2;
    $BOS_nonsil_count = $wpron_nonsil{"<s>"} + (1 - $sil_prob) * $lambda2;
    $P_BOS_sr = sprintf("%.2f", $BOS_sil_count / ($BOS_sil_count + $BOS_nonsil_count));
    $F_sl_EOS = ($sil_wpron{"</s>"} + $lambda3) / ($bar_C_s_w{"</s>"} + $lambda3);
    $F_nl_EOS = ($nonsil_wpron{"</s>"} + $lambda3) / ($bar_C_n_w{"</s>"} + $lambda3);
    if ($P_BOS_sr == "1.00") { $P_BOS_sr = "0.99"; }
    if ($P_BOS_sr == "0.00") { $P_BOS_sr = "0.01"; }
    if ($F_sl_EOS == "0.00") { $F_sl_EOS = "0.01"; }
    if ($F_nl_EOS == "0.00") { $F_nl_EOS = "0.01"; }
    print SP "<s> $P_BOS_sr\n</s>_s $F_sl_EOS\n</s>_n $F_nl_EOS\noverall $sil_prob\n";
    ' $dir/lexiconp.txt $bigram_counts $dir/lexiconp_silprob_unsorted.txt $dir/silprob.txt
  sort -k1,1 -k2g,2 -k6 $dir/lexiconp_silprob_unsorted.txt > $dir/lexiconp_silprob.txt
fi

# now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are
# in the same order.
cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | sed 's/  / /g' >$dir/lexicon.txt


# add mandatory files.
for f in silence_phones.txt nonsilence_phones.txt; do
  if [ ! -f $srcdir/$f ]; then
    echo "$0: expected $srcdir/$f to exist."
    exit 1;
  fi
  cp $srcdir/$f $dir/ || exit 1;
done


# copy over optional files, if present; the validate_dict_dir.pl call below
# will complain if anything required is missing.
for f in optional_silence.txt extra_questions.txt; do
  if [ -f $srcdir/$f ]; then
    cp $srcdir/$f $dir || exit 1;
  fi
done


echo "$0: produced dictionary directory with probabilities in $dir/"
echo "$0: validating $dir .."
sleep 1
utils/validate_dict_dir.pl $dir || exit 1;


echo "Some low-probability prons include: "
echo "# sort -k2,2 -n $dir/lexiconp.txt  | head -n 8"

sort -k2,2 -n $dir/lexiconp.txt  | head -n 8

exit 0