prepare_subword_text.sh
1.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/bin/bash
# 2019 Dongji Gao
# This script generates subword text from word text.
# For example, <noise> international -> <noise> inter@@ nation@@ al
# "@@" here is the separator that indicates the position of a subword within a word.
# A subword directly followed by the separator can only appear at the beginning or middle of a word.
# "<noise>" here can be kept unsplit if it is passed to the option "--glossaries".
# Begin configuration section
# Defaults for the command-line options listed in the usage message below.
# NOTE(review): presumably utils/parse_options.sh maps "--name value" onto the
# shell variable of the same name, so these must be assigned before it is
# sourced -- confirm against that script.
separator="@@"   # marker that follows every non-final subword of a word
glossaries=      # words that must be kept whole; empty means none
# End configuration section

. utils/parse_options.sh

# Log the command line as it looks at this point.
echo "$0 $@"

# Exactly three positional arguments are required.
if [ $# -ne 3 ]; then
  echo "Usage: utils/prepare_subword_text.sh <word-text> <pair_code> <subword-text>"
  echo "e.g.: utils/prepare_subword_text.sh data/train/text data/local/pair_code.txt data/train/text_subword"
  # The option is spelled like the variable it sets ("separator"); the old
  # text advertised a misspelled "--seperator".
  echo " --separator <separator> # default: @@"
  echo " --glossaries <reserved-words> # glossaries are words reserved"
  exit 1;
fi
# Positional arguments.
word_text=$1      # input: per line, first space-separated field is kept
                  # verbatim, the remaining fields are segmented
pair_code=$2      # BPE codes file handed to apply_bpe.py via -c
subword_text=$3   # output path; may be the same path as $word_text

if [ ! -f "$word_text" ]; then
  echo "Word text $word_text does not exist."
  exit 1
fi

# Refuse input that already contains the separator. -F makes the separator a
# literal string (not a regex) and -e protects a separator starting with "-".
if grep -qF -e "$separator" "$word_text"; then
  echo "$0: Error, word text file contains separator $separator. This might be a subword text file or you need to choose a different separator"
  exit 1
fi

# Build the optional --glossaries arguments as an array so that each reserved
# word is passed as its own argument without being glob-expanded (tokens such
# as "[laughter]" are valid glob patterns). The option is only added when at
# least one word was given.
glossaries_opt=()
glossary_words=()
read -r -a glossary_words <<< "$glossaries"
if [ ${#glossary_words[@]} -gt 0 ]; then
  glossaries_opt=(--glossaries "${glossary_words[@]}")
fi

# Segment everything after the first field. The intermediate result goes to
# ${word_text}.sub and is removed again below.
# NOTE(review): cut prints lines without any delimiter unchanged, so a line
# holding only the first field is segmented as if it were text.
if ! cut -d ' ' -f2- "$word_text" | \
    utils/lang/bpe/apply_bpe.py -c "$pair_code" --separator "$separator" \
      "${glossaries_opt[@]}" > "${word_text}.sub"; then
  echo "$0: Error, apply_bpe.py failed"
  rm -f "${word_text}.sub"
  exit 1
fi

# When writing in place, keep the original as ${word_text}.old and read the
# first column from that backup instead.
first_field_source=$word_text
if [ "$word_text" = "$subword_text" ]; then
  mv "$word_text" "${word_text}.old" || exit 1
  first_field_source=${word_text}.old
fi

# Re-attach the first field of every line to its segmented text.
cut -d ' ' -f1 "$first_field_source" | paste -d ' ' - "${word_text}.sub" > "$subword_text"

rm "${word_text}.sub"
echo "Subword text created."