#!/bin/bash
# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal<jtrmal@gmail.com>)
# License: Apache 2.0

# Begin configuration section.
# Defaults for the options of this script. Following the usual Kaldi
# convention, utils/parse_options.sh (sourced below) lets the caller
# override them as --tag-percentage <value> and --unk <token>.
tag_percentage=0.1   # forwarded to make_unicode_lexicon.py --tag-percentage
unk="<unk>"          # handed to prepare_training_text.pl and prepare_lang.sh

. ./utils/parse_options.sh

# Both positional arguments are mandatory. Testing for "fewer than two"
# (instead of "zero") makes a single-argument call print the usage rather
# than die further down on an unbound $2 once nounset is active.
if [ $# -lt 2 ]; then
  echo "Usage: ./local/prepare_lexicon.sh <dataset_dir/text> <olex>" >&2
  exit 1
fi

# End configuration section
# Abort on any failing command or pipeline stage, and treat unset
# variables as an error.
set -euo pipefail

text=$1   # transcript file to clean (rewritten in place)
out=$2    # output directory for the word lists / lexicon files

mkdir -p "$out"

# Run the transcript through prepare_training_text.pl, keep the untouched
# input as <text>.orig and move the cleaned version into its place.
# All path expansions are quoted so that directories containing spaces or
# glob characters are handled correctly.
local/prepare_training_text.pl "$unk" "$text" > "${text}.clean"
mv "$text" "${text}.orig"
mv "${text}.clean" "$text"

# Let Kaldi re-check the data directory that contains the rewritten text.
utils/fix_data_dir.sh "$(dirname "$text")"

# Vocabulary of the transcript: drop the first space-separated field of
# every line (the utterance id in Kaldi's text format), put one token per
# line and de-duplicate.
cut -f 2- -d ' ' "$text" | tr ' ' '\n' | sort -u > "$out/word_list.raw"

# Split the vocabulary in two:
#   silence_lexicon.txt - a leading "SIL SIL" entry followed by every token
#                         containing "<" (e.g. <unk>), each mapped to itself;
#   word_list.txt       - all remaining tokens.
# The filtering is done with awk rather than grep: grep exits with status 1
# when it selects no line, which under "set -e -o pipefail" would silently
# abort the script for a transcript that has no "<...>" token at all.
awk 'BEGIN { print "SIL", "SIL" } /</ { print $0, $0 }' \
  "$out/word_list.raw" > "$out/silence_lexicon.txt"
awk '!/</' "$out/word_list.raw" > "$out/word_list.txt"


# Produce lexicon.txt and grapheme_map.txt from the word list; the silence
# lexicon is supplied separately through --silence-lexicon.
local/lexicon/make_unicode_lexicon.py --tag-percentage "$tag_percentage" \
  --silence-lexicon "$out/silence_lexicon.txt" \
  "$out/word_list.txt" "$out/lexicon.txt" "$out/grapheme_map.txt"

# Presumably writes the Kaldi dictionary files for this lexicon into $out
# (confirm against local/prepare_unicode_dict.py).
local/prepare_unicode_dict.py --silence-lexicon "$out/silence_lexicon.txt" \
  "$out/lexicon.txt" "$out"

cp "$out/lexicon.txt" "$out/filtered_lexicon.txt"

# Build data/lang from the dictionary directory. The directory passed here
# is the one populated above ($out) instead of a hard-coded data/local, so
# the script also works when <olex> is not data/local; for <olex> equal to
# data/local the invocation is unchanged.
utils/prepare_lang.sh --share-silence-phones true \
  "$out" "$unk" data/local/tmp.lang data/lang