csj_prepare_dict.sh 1.33 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45


#!/bin/bash

# Making dictionary using CSJ data with morpheme analysis.
# Differs from the one in the Kaldi s5 recipe in that it uses lower-case --Arnab (Jan 2013)

# To be run from one directory above this script.

# Load the recipe environment from the current directory.
. ./path.sh

# This script takes no arguments; print usage (to stderr) and stop otherwise.
if [ "$#" -ne 0 ]; then
  echo "Usage: $0" >&2
  exit 1
fi

# Input: directory holding the source lexicon.
# Output: dictionary directory populated by the rest of this script.
srcdir=data/local/train
dir=data/local/dict_nosp
srcdict=$srcdir/lexicon.txt

mkdir -p "$dir" || exit 1

# assume csj_data_prep.sh was done already.
if [ ! -f "$srcdict" ]; then
  echo "No such file $srcdict" >&2
  exit 1
fi

#(2a) Dictionary preparation:
# The source lexicon is taken as-is: it is only copied into the dictionary
# directory (no case conversion or comment stripping happens here).
cat "$srcdict" > "$dir/lexicon1.txt" || exit 1

# Collect every distinct symbol that appears from the second field onward of
# any lexicon line, excluding the two silence phones "sp" and "spn".
# The exclusion is an exact comparison, so a phone whose name merely contains
# the substring "sp" is kept. awk's for-in order is unspecified, hence the
# sort to make the output reproducible.
awk '
  { for (n = 2; n <= NF; n++) phones[$n] = 1 }
  END {
    for (p in phones)
      if (p != "sp" && p != "spn") print p
  }
' "$dir/lexicon1.txt" | LC_ALL=C sort > "$dir/nonsilence_phones.txt"

# An empty phone list means the lexicon was unusable (or awk failed).
if [ ! -s "$dir/nonsilence_phones.txt" ]; then
  echo "$0: no non-silence phones found in $dir/lexicon1.txt" >&2
  exit 1
fi

# Silence phones, one per line: "sp" and "spn".
# (Disabled alternative set kept for reference:)
#( echo sil; echo spn; echo nsn; echo lau ) > $dir/silence_phones.txt
printf '%s\n' sp spn > "$dir/silence_phones.txt" || exit 1

# "sp" is the optional silence phone.
printf '%s\n' sp > "$dir/optional_silence.txt" || exit 1

# No "extra questions" in the input to this setup, as we don't
# have stress or tone; the file is created empty.
: > "$dir/extra_questions.txt" || exit 1

# Add to the lexicon the silences, noises etc.: the entries "<sp> sp" and
# "<unk> spn" are placed ahead of the copied source lexicon.
{
  printf '%s\n' '<sp> sp' '<unk> spn'
  cat -- "$dir/lexicon1.txt"
} > "$dir/lexicon2.txt" || exit 1

# Make lexicon.txt a symlink to lexicon2.txt. The target is relative, so it
# resolves inside $dir. The link is created in place rather than by changing
# into $dir, so a failed directory change can no longer leave a stray link in
# the caller's working directory.
ln -sf lexicon2.txt "$dir/lexicon.txt" || exit 1

echo "Prepared input dictionary and phone-sets for CSJ phase 1."