train_irstlm.sh 2.12 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77


#!/bin/bash

# Copyright 2013  Mirsk Digital ApS (Author: Andreas Kirkedal)
# Apache 2.0

# This script takes data prepared in a corpus-dependent way
# in data/local/, and converts it into the "canonical" form,
# in various subdirectories of data/, e.g. data/lang, data/lang_test_ug,
# data/train_si284, data/train_si84, etc.

# Don't bother doing train_si84 separately (although we have the file lists
# in data/local/) because it's just the first 7138 utterances in train_si284.
# We'll create train_si84 after doing the feature extraction.

. ./path.sh || exit 1;

if [ -z $IRSTLM ] ; then
  export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v ngt >/dev/null 2>&1 ; then
  echo "$0: Error: the IRSTLM is not available or compiled" >&2
  echo "$0: Error: We used to install it by default, but." >&2
  echo "$0: Error: this is no longer the case." >&2
  echo "$0: Error: To install it, go to $KALDI_ROOT/tools" >&2
  echo "$0: Error: and run extras/install_irstlm.sh" >&2
  exit 1
fi

echo "Preparing train and test data"
srcdir=$4
lmdir=$5
tmpdir=data/local/lm_tmp
lang_tmp=data/local/lang_tmp
lexicon=$1
ngram=$2
lm_suffix=$3
mkdir -p $lmdir
mkdir -p $tmpdir


#grep -P -v '^[\s?|\.|\!]*$' $lexicon | grep -v '^ *$' | \
#awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt

# Envelop LM training data in context cues
add-start-end.sh < $lexicon | awk '{if(NF>=3){ printf("%s\n",$0); }}' > $lmdir/lm_input
wait

# Next, for each type of language model, create the corresponding FST
# and the corresponding lang_test_* directory.

echo "Preparing language models for test"

# Create Ngram table
ngt -i=$lmdir/lm_input -n=$ngram -o=$lmdir/train${ngram}.ngt -b=yes
wait
# Estimate trigram and quadrigram models in ARPA format
tlm -tr=$lmdir/train${ngram}.ngt -n=$ngram -lm=wb -o=$lmdir/train${ngram}.arpa
wait


test=data/lang_test_${lm_suffix}

mkdir -p $test
cp -r $srcdir/* $test

cat $lmdir/train${ngram}.arpa | \
  arpa2fst --disambig-symbol=#0 \
           --read-symbol-table=$test/words.txt - $test/G.fst

utils/validate_lang.pl $test || exit 1;

echo "Succeeded in formatting data."
exit 0;
#rm -rf $tmpdir
#rm -f $ccs