Blame view

egs/wsj/s5/utils/subword/prepare_subword_text.sh 1.67 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
  #!/bin/bash
  
  # 2019 Dongji Gao
  
  # This script generates subword text from word text.
  # For example, <noise> international -> <noise> inter@@ nation@@ al
  # @@ here is the separator indicating the position of a subword within a word.
  # A subword directly followed by the separator can only appear at the beginning or middle of a word.
  # "<noise>" here can be reserved if added to the option "--glossaries"
  
  # Begin configuration section
  # These defaults are exposed as the --separator / --glossaries options listed
  # in the usage message; utils/parse_options.sh is expected to overwrite them
  # from the command line.
  separator="@@"
  glossaries=
  # End configuration section

  . utils/parse_options.sh

  echo "$0 $*"

  if [ $# -ne 3 ]; then
    echo "Usage: $0 <word-text> <pair_code> <subword-text>"
    echo "e.g.: $0 data/train/text data/local/pair_code.txt data/train/text_subword"
    echo "    --separator <separator>         # default: @@"
    echo "    --glossaries <reserved-words>   # glossaries are words reserved"
    exit 1;
  fi

  # Positional arguments:
  #   word_text    - input file; field 1 of each line is kept as-is, the
  #                  remaining fields are the words to be split into subwords
  #   pair_code    - BPE codes file handed to apply_bpe.py via -c
  #   subword_text - output file; may be the same path as word_text
  word_text=$1
  pair_code=$2
  subword_text=$3

  if [ ! -f "$word_text" ]; then
    echo "Word text $word_text does not exist."
    exit 1
  fi

  # The separator is matched as a fixed string (-F), not as a regular
  # expression, and "--" protects against separators starting with "-".
  if grep -qF -- "$separator" "$word_text"; then
    echo "$0: Error, word text file contains separator $separator. This might be a subword text file or you need to choose a different separator"
    exit 1
  fi

  # Only pass --glossaries when the user supplied some words. The words are
  # split on whitespace into separate arguments without glob expansion.
  glossaries_opt=()
  if [ -n "$glossaries" ]; then
    read -r -a glossary_words <<< "$glossaries"
    glossaries_opt=(--glossaries "${glossary_words[@]}")
  fi

  # Apply BPE to everything after the first field and stash the result in a
  # temporary file next to the input.
  tmp_sub="${word_text}.sub"
  if ! cut -d ' ' -f2- "$word_text" |
      utils/lang/bpe/apply_bpe.py -c "$pair_code" --separator "$separator" \
        "${glossaries_opt[@]}" > "$tmp_sub"; then
    # Stop before touching the input/output files so a failed run cannot
    # replace good text with a truncated result.
    echo "$0: Error, apply_bpe.py failed on $word_text"
    rm -f "$tmp_sub"
    exit 1
  fi

  # When converting in place, keep the original as <word-text>.old and read the
  # first field from that backup, since the output overwrites the input.
  if [ "$word_text" = "$subword_text" ]; then
    mv "$word_text" "${word_text}.old"
    id_source="${word_text}.old"
  else
    id_source="$word_text"
  fi

  # Re-attach the first field of every line to its subword-segmented text.
  cut -d ' ' -f1 "$id_source" | paste -d ' ' - "$tmp_sub" > "$subword_text"

  rm "$tmp_sub"
  echo "Subword text created."