#!/bin/bash
# Copyright 2013-2014 Mirsk Digital Aps (Author: Andreas Kirkedal)
# Apache 2.0
# This script extends a source dictionary with espeak pronunciations for the
# words in a new text corpus, trains an IRSTLM n-gram language model on that
# text, and creates the corresponding data/lang_$lm_suffix and
# data/lang_test_* directories.
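# Example invocation (the corpus path and suffix here are hypothetical):
#   local/sprak_train_irstlm.sh data/local/dict data/local/extra_text.txt \
#     ext 3 data/local/extra_lm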
. ./path.sh || exit 1;
if [ -z "$IRSTLM" ] ; then
export IRSTLM=$KALDI_ROOT/tools/irstlm/
fi
export PATH=${PATH}:$IRSTLM/bin
if ! command -v ngt >/dev/null 2>&1 ; then
echo "$0: Error: IRSTLM is not available or not compiled." >&2
echo "$0: Error: We used to install it by default, but this is" >&2
echo "$0: Error: no longer the case. To install it, go to" >&2
echo "$0: Error: $KALDI_ROOT/tools and run extras/install_irstlm.sh." >&2
exit 1
fi
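# Minimal sanity check on the argument count (an added guard); the usage
# string mirrors the positional parameters read below.
if [ $# -ne 5 ]; then
echo "Usage: $0 <src-dict> <new-text> <lm-suffix> <ngram-order> <lm-dir>" >&2
exit 1
fi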
srcdict=$1
newtext=$2
lm_suffix=$3
N=$4
lmdir=$5
extdict=${srcdict}_$lm_suffix
lang_tmp=data/local/lang_tmp
extlang=data/lang_$lm_suffix
mkdir -p $lmdir
if [ ! -d $extdict ];
then
echo "Creating $extdict based on $srcdict"
# Extend the $srcdict to include the new data
mkdir -p $extdict
for f in extra_questions.txt lexicon.txt nonsilence_phones.txt optional_silence.txt silence_phones.txt; do
cp $srcdict/$f $extdict/
done
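# Set aside the existing lexicon: its absence triggers the espeak extension
# below, and the old entries are merged back into the new lexicon.txt.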
mv $extdict/lexicon.txt $extdict/oldlexicon.txt
fi
if [ ! -f $extdict/transcripts.uniq ];
then
# Create the text data for LMs and RNNs
cat $srcdict/transcripts.txt $newtext > $extdict/transcripts.txt
sort -u $extdict/transcripts.txt > $extdict/transcripts.uniq
fi
# Check that espeak is available on the system. espeak is required to extend
# the setup because the original pronunciations were created with espeak and
# then filtered.
if ! command -v espeak >/dev/null 2>&1; then
echo "$0: espeak is not available on your system. Install espeak before proceeding." >&2
exit 1;
fi
if [ ! -f $extdict/lexicon.txt ];
then
# Extend lexicon with pronunciations from espeak
echo "Transcribing $newtext using espeak"
cat $newtext | tr '[:blank:]' '\n' | grep -P -v '^[\s?.!]*$' | sort -u > $extdict/wlist.txt
# Piped so only a number is stored in the variable
nwords=$(wc -l < $extdict/wlist.txt)
nsplit=$((nwords / 8))
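# Guard against a zero chunk size on very small word lists, since
# "split -l 0" would fail (a small added safeguard).
[ $nsplit -lt 1 ] && nsplit=1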
# Split the word list into chunks and run espeak on each chunk in parallel
split -l $nsplit $extdict/wlist.txt $extdict/Wtemp_
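# espeak flags: -q suppresses audio output, -vda selects the Danish voice,
# -x prints phoneme mnemonics to stdout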
for w in $extdict/Wtemp_*; do
(cat $w | espeak -q -vda -x > $w.pho ) &
done
wait
cat $extdict/Wtemp_*.pho > $extdict/plist.txt
rm -f $extdict/Wtemp_*
# Filter transcription
# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
# initial and trailing spaces and collapse 2 or more spaces to one space
cat $extdict/plist.txt | perl -pe 's/\([a-z]{2}\)//g' | perl -pe 's/(.)/\1 /g' | \
  perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | \
  perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | \
  perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | \
  perl -pe 's/^ +| +$//g' > $extdict/plist2.txt
# Some question marks are not caught above
perl -pe 's/ \? / /g' $extdict/plist2.txt > $extdict/plist3.txt
# Pair each word with its phonetic transcription
paste $extdict/wlist.txt $extdict/plist3.txt > $extdict/lexicon1.txt
# Remove entries without a transcription
grep -P "^.+\t.+$" $extdict/lexicon1.txt > $extdict/newlexicon.txt
echo "Combining lexicons"
# Combine lexicons
cat $extdict/oldlexicon.txt $extdict/newlexicon.txt > $extdict/templex
sort -u $extdict/templex > $extdict/lexicon.txt
fi
if [ ! -d $extlang ];
then
# Create new lang_ext dir
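# (utils/prepare_lang.sh builds L.fst, words.txt, phones/ etc. from the
# extended dictionary; "<UNK>" is the OOV symbol)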
utils/prepare_lang.sh $extdict "<UNK>" $lang_tmp $extlang || exit 1;
fi
if [ ! -f $lmdir/extra4.ngt ];
then
echo "Preparing LM data"
# Keep only non-empty lines with at least 4 tokens
grep -P -v '^[\s?.!]*$' $newtext | \
awk '{if(NF>=4){ printf("%s\n",$0); }}' > $lmdir/text.filt
# Envelop LM training data in context cues
add-start-end.sh < $lmdir/text.filt > $lmdir/lm_input
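# add-start-end.sh (shipped with IRSTLM) wraps each line in sentence-boundary
# markers, e.g. a line like "det er godt" becomes "<s> det er godt </s>"
# (illustrative example sentence)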
echo "Creating new binary ngram table $lmdir/extra4.ngt"
ngt -i=$lmdir/lm_input -n=4 -o=$lmdir/extra4.ngt -b=yes
fi
echo "Training ARPA model extra${N}$lm_suffix"
# n=4 above was an arbitrary upper bound for the ngram table, so the
# requested order $N must not exceed 4
tlm -tr=$lmdir/extra4.ngt -n=$N -lm=wb -o=$lmdir/extra${N}$lm_suffix
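# (-lm=wb selects Witten-Bell smoothing in IRSTLM's tlm; the output is an
# ARPA-format LM of order $N)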
# Next, create the corresponding FST
# and the corresponding lang_test_* directory.
test=data/lang_test_${N}${lm_suffix}
mkdir -p $test
cp -r $extlang/* $test/
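# arpa2fst compiles the ARPA-format LM into G.fst; #0 is the disambiguation
# symbol used on backoff arcs, and words.txt maps words to integer IDs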
cat $lmdir/extra${N}$lm_suffix | \
arpa2fst --disambig-symbol=#0 \
--read-symbol-table=$test/words.txt - $test/G.fst
utils/validate_lang.pl $test || exit 1;
exit 0;