Blame view

scripts/rnnlm/change_vocab.sh 2.84 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
  #!/usr/bin/env bash
  
  # Copyright 2018  Xiaohui Zhang
  
  # This script prepares a new rnnlm-dir compatible with a new vocab from a provided word-list,
  # given an existing rnnlm-dir containing a trained rnnlm. Basically, we copy the feature
  # embedding, the trained rnnlm and some config files from the old rnnlm-dir. We then
  # re-generate unigram_probs.txt (a fixed unigram prob is assigned to words outside the original
  # vocab), word_feats.txt and the word embeddings.
  
  # Default option values.
  # NOTE(review): utils/parse_options.sh is not shown here; by Kaldi convention it lets
  # --cmd and --oov-unigram-prob on the command line override these variables -- confirm.
  cmd=run.pl                  # not referenced anywhere else in this script
  oov_unigram_prob=0.0000001  # unigram prob assigned to words absent from the original vocab

  . utils/parse_options.sh

  if [ $# -ne 3 ]; then
    echo "Usage: $0 [options] <word-list> <rnnlm-in-dir> <rnnlm-out-dir>"
    echo "Prepare a new directory <rnnlm-out-dir> with a given <word-list> and a valid <rnnlm-in-dir>."
    echo "  <word-list> is a vocabulary file with mapping to integers."
    echo "Options:"
    echo "  --oov-unigram-prob <prob>   # unigram prob for words not in the original vocab (default: 0.0000001)"
    exit 1
  fi

  # Abort on the first failing command.
  set -e
  [ -f path.sh ] && . ./path.sh
  # Also fail a pipeline when any stage fails (not just the last one), so that e.g. an awk
  # error feeding 'sort' is not masked. Enabled only after path.sh has been sourced so that
  # file runs exactly as before.
  set -o pipefail

  # Positional arguments.
  word_list=$1      # new vocabulary, "<word> <integer-id>" per line
  rnnlm_in_dir=$2   # existing rnnlm directory to copy the model from
  rnnlm_out_dir=$3  # directory to create for the new vocabulary
  
  # Check up front that every file this script reads from <rnnlm-in-dir> exists, so that we
  # fail before creating anything in <rnnlm-out-dir>.
  # config/words.txt is part of the list because the unigram-prob regeneration further down
  # reads it from awk; there a missing file is not an error (getline just returns -1), and
  # every word would silently be given the OOV probability.
  for f in config/features.txt config/data_weights.txt config/oov.txt config/xconfig \
           config/words.txt unigram_probs.txt feat_embedding.final.mat final.raw; do
    if [ ! -f "$rnnlm_in_dir/$f" ]; then
      echo "$0: file $rnnlm_in_dir/$f is not present."
      exit 1
    fi
  done
  
  # Copy the vocabulary-independent parts of the old rnnlm-dir: the config files and the
  # trained model / feature embedding. All paths are quoted so that directories containing
  # whitespace or glob characters work; '--' stops a path starting with '-' being read as
  # an option.
  echo "$0: Copying config directory."
  mkdir -p -- "$rnnlm_out_dir/config"
  for f in features.txt data_weights.txt oov.txt xconfig; do
    cp -- "$rnnlm_in_dir/config/$f" "$rnnlm_out_dir/config"
  done

  # -L: copy what a symlink points to rather than the link itself.
  # NOTE(review): presumably these files can be symlinks to numbered models -- confirm.
  for f in feat_embedding.final.mat final.raw; do
    cp -L -- "$rnnlm_in_dir/$f" "$rnnlm_out_dir/"
  done
  
  echo "$0: Re-generating words.txt, unigram_probs.txt, word_feats.txt and word_embedding.final.mat."
  cp -- "$word_list" "$rnnlm_out_dir/config/words.txt"

  #######################################
  # Append a "<brk> <id>" entry to a words.txt-style file ("<word> <id>" per line),
  # where <id> is the number of lines already in the file.
  # Arguments: $1 - path of the file to modify in place
  # Returns:   0 on success (including when <brk> is already listed and nothing is
  #            appended), non-zero if the file cannot be read or written
  #######################################
  add_brk_symbol() {
    local words_file=$1 brk_id

    # A list that already contains <brk> must not get a second, conflicting entry.
    if awk '$1 == "<brk>" { found = 1 } END { exit !found }' "$words_file"; then
      return 0
    fi

    # If the last line has no terminating newline, add one; otherwise the line count would
    # be one short and "<brk>" would be glued onto the last word's line.
    if [ -s "$words_file" ] && [ -n "$(tail -c 1 "$words_file")" ]; then
      echo >> "$words_file"
    fi

    # awk prints the count without the whitespace padding some 'wc' implementations add.
    brk_id=$(awk 'END { print NR }' "$words_file") || return 1
    echo "<brk> $brk_id" >> "$words_file"
  }

  add_brk_symbol "$rnnlm_out_dir/config/words.txt"
  
  # Generate new unigram_probs.txt. For words within the original vocab, we just take the prob
  # from the original unigram_probs.txt. For new words added, we assign the prob as $oov_unigram_prob.

  #######################################
  # Print "<new-id> <prob>" for every word of a new vocabulary, sorted numerically by id.
  # Arguments:
  #   $1 - old unigram probs file, "<old-id> <prob>" per line
  #   $2 - old vocabulary file,    "<word> <old-id>" per line
  #   $3 - new vocabulary file,    "<word> <new-id>" per line
  #   $4 - prob to print for words that have no prob in the old model
  # Outputs: the new unigram probs on stdout
  # Returns: non-zero if an input file is missing or if awk or sort fails
  #######################################
  regenerate_unigram_probs() (
    # The body runs in a subshell, so pipefail stays local to this function.
    set -o pipefail

    # awk's getline does not treat an unreadable file as an error (it returns -1), which
    # would silently turn every word into an OOV; check explicitly instead.
    for f in "$1" "$2" "$3"; do
      if [ ! -f "$f" ]; then
        echo "$0: file $f is not present." >&2
        return 1
      fi
    done

    awk -v s="$1" -v t="$2" -v oov_prob="$4" \
      'BEGIN {
         # old id -> prob
         while ((getline < s) > 0) { id2prob[$1] = $2; }
         # old word -> prob; skip words whose id has no prob, so that they fall back to
         # oov_prob below instead of producing a line with an empty prob field.
         while ((getline < t) > 0) { if ($2 in id2prob) word2prob[$1] = id2prob[$2]; }
       }
       { if ($1 in word2prob) print $2 " " word2prob[$1]; else print $2 " " oov_prob; }' \
      "$3" | sort -k1,1 -n
  )

  regenerate_unigram_probs "$rnnlm_in_dir/unigram_probs.txt" "$rnnlm_in_dir/config/words.txt" \
    "$rnnlm_out_dir/config/words.txt" "$oov_unigram_prob" > "$rnnlm_out_dir/unigram_probs.txt"
  
  # Write special_symbol_opts.txt from the new words.txt (read on stdin).
  # All paths are quoted so that an output directory containing whitespace or glob
  # characters is passed to each tool as a single argument.
  rnnlm/get_special_symbol_opts.py < "$rnnlm_out_dir/config/words.txt" \
    > "$rnnlm_out_dir/special_symbol_opts.txt"

  # Re-compute words_feats.txt and word embeddings.
  # word_feats.txt is produced from the new words.txt, the copied features.txt and the
  # regenerated unigram probs.
  rnnlm/get_word_features.py --unigram-probs="$rnnlm_out_dir/unigram_probs.txt" --treat-as-bos='#0' \
    "$rnnlm_out_dir/config/words.txt" "$rnnlm_out_dir/config/features.txt" \
    > "$rnnlm_out_dir/word_feats.txt"

  # Inputs: word_feats.txt and the copied feat_embedding.final.mat; output: word_embedding.final.mat.
  rnnlm-get-word-embedding "$rnnlm_out_dir/word_feats.txt" "$rnnlm_out_dir/feat_embedding.final.mat" \
    "$rnnlm_out_dir/word_embedding.final.mat"