run.sh 4.94 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190


#!/bin/bash

# Project-local setup files. $train_cmd and $decode_cmd, used further down,
# are not defined in this file, so they are expected to come from here.
. ./cmd.sh
. ./path.sh

# Stages whose number is lower than this value are skipped.
stage=0

# Directory holding the LDC corpus; the default is valid on the CLSP grid.
datadir=/export/corpora5/LDC/LDC2006S37

# openslr.org hosts both the corpus and the lexicon.
#speech_url="http://www.openslr.org/resources/39/LDC2006S37.tar.gz"
lexicon_url="http://www.openslr.org/resources/34/santiago.tar.gz"

# Where the movie-subtitle text corpus is fetched from.
subtitles_url="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip"

# Sourced before strict mode is switched on.
. utils/parse_options.sh

# Strict mode: stop on errors, on failures inside pipelines, and on
# references to unset variables.
set -euo pipefail


# Leave tmpdir alone: scripts in local/ use this location explicitly.
tmpdir=data/local/tmp

# Stage 0: check that the speech corpus directory exists, fetch the lexicon
# if it is not already present, and download the subtitle text.
if [ "$stage" -le 0 ]; then
  if [ ! -d "$datadir" ]; then
    echo "$0: please download and un-tar http://www.openslr.org/resources/39/LDC2006S37.tar.gz"
    # Name the variable itself; its current value is shown only as a hint.
    echo "  and set datadir (currently '$datadir') to the directory where it is located."
    exit 1
  fi
  # Skip the download when a non-empty santiago.txt is already here
  # (presumably the file unpacked from the lexicon tarball -- confirm).
  if [ ! -s santiago.txt ]; then
    echo "$0: downloading the lexicon"
    # Use the configured URL; the tarball name is its last path component.
    wget -c "$lexicon_url"
    tar -xvzf "${lexicon_url##*/}"
  fi
  # Get data for lm training.  The URL contains '?', so it must be quoted to
  # keep the shell from treating it as a glob pattern.
  local/subs_download.sh "$subtitles_url"
fi

# Stage 1: run local/prepare_data.sh on the corpus directory.
if [ "$stage" -le 1 ]; then
  echo "Making lists for building models."
  # datadir can be overridden by the user, so quote it to survive spaces
  # or glob characters in the path.
  local/prepare_data.sh "$datadir"
fi

# Stage 2: make sure the dictionary directories exist, then run the
# dictionary preparation script.
if [ "$stage" -le 2 ]; then
  mkdir -p \
    data/local/dict \
    "$tmpdir/dict"
  local/prepare_dict.sh
fi

# Stage 3: run utils/prepare_lang.sh on data/local/dict, with data/local/lang
# and data/lang as the remaining directory arguments.
# "<UNK>" is presumably the out-of-vocabulary word symbol -- confirm against
# utils/prepare_lang.sh.
if [ "$stage" -le 3 ]; then
  utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang data/lang
fi

# Stage 4: create the subtitle LM work directory and run the subtitle
# preparation script (which takes no arguments here).
if [ "$stage" -le 4 ]; then
  mkdir -p "$tmpdir/subs/lm"
  local/subs_prepare_data.pl
fi

# Stage 5: run local/prepare_lm.sh on the in-vocabulary subtitle text.
if [ "$stage" -le 5 ]; then
  # Descriptive progress message in place of a leftover debug marker.
  echo "$0: preparing the language model."
  local/prepare_lm.sh "$tmpdir/subs/lm/in_vocabulary.txt"
fi

# Stage 6: run utils/format_lm.sh with data/lang, the trigram ARPA file and
# the lexicon, naming data/lang_test as the last argument.
if [ "$stage" -le 6 ]; then
  # Descriptive progress message in place of a leftover debug marker.
  echo "$0: formatting the language model into data/lang_test."
  utils/format_lm.sh \
    data/lang data/local/lm/trigram.arpa.gz data/local/dict/lexicon.txt \
    data/lang_test
fi

# Stage 7: for each data partition, compute MFCC features and CMVN
# statistics, running utils/fix_data_dir.sh after each of the two steps.
if [ "$stage" -le 7 ]; then
  echo "$0: extracting acoustic features."
  mkdir -p exp

  for part in native nonnative test devtest train; do
    part_dir=data/$part

    # Remove a cmvn.scp left over from an earlier run, if there is one.
    [ ! -e "$part_dir/cmvn.scp" ] || rm "$part_dir/cmvn.scp"

    steps/make_mfcc.sh --cmd "$train_cmd" --nj 4 "$part_dir"
    utils/fix_data_dir.sh "$part_dir"
    steps/compute_cmvn_stats.sh "$part_dir"
    utils/fix_data_dir.sh "$part_dir"
  done
fi

# Stage 8: train the monophone system, then build its decoding graph and
# decode every evaluation set in the background.
if [ "$stage" -le 8 ]; then
  echo "$0 monophone training"
  steps/train_mono.sh --nj 8 --cmd "$train_cmd" data/train data/lang exp/mono || exit 1;

  # evaluation
  # Runs as a background subshell so the script can move on; a failure in
  # here ends only the subshell, and the bare `wait` calls later in the
  # script do not report its exit status.
  (
    # make decoding graph for monophones
    utils/mkgraph.sh data/lang_test exp/mono exp/mono/graph || exit 1;

    # test monophones
    # Pass the configured decode command, as the tri3b decoding does, so
    # that every decode in this script is dispatched the same way.
    for x in native nonnative devtest test; do
      steps/decode.sh --nj 8 --cmd "$decode_cmd" \
        exp/mono/graph data/$x exp/mono/decode_${x} || exit 1;
    done
  ) &
fi

# Stage 9: align the training data with the monophone model, train the
# delta triphone system in exp/tri1, then decode with it in the background.
if [ "$stage" -le 9 ]; then
  # align with monophones
  steps/align_si.sh --nj 8 --cmd "$train_cmd" \
    data/train data/lang exp/mono exp/mono_ali

  echo "$0 Starting  triphone training in exp/tri1"
  steps/train_deltas.sh  --cmd "$train_cmd" --cluster-thresh 100 \
    1500 25000 data/train data/lang exp/mono_ali exp/tri1

  wait  # wait for the previous decoding jobs to finish in case there's just one
        # machine.

  # Background subshell: a failure in here ends only the subshell, and a
  # bare `wait` does not report its exit status.
  (
    utils/mkgraph.sh \
      data/lang_test exp/tri1 exp/tri1/graph || exit 1;

    # Pass the configured decode command, as the tri3b decoding does, so
    # that every decode in this script is dispatched the same way.
    for x in native nonnative devtest test; do
      steps/decode.sh --nj 8 --cmd "$decode_cmd" \
        exp/tri1/graph data/$x exp/tri1/decode_${x} || exit 1;
    done
  ) &

fi

# Stage 10: align the training data with tri1, train the LDA+MLLT system in
# exp/tri2b, then decode with it in the background.
if [ "$stage" -le 10 ]; then
  echo "$0: Starting delta system alignment"
  steps/align_si.sh \
    --nj 8 --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_ali

  echo "$0: starting lda+mllt triphone training in exp/tri2b"

  # Pass the configured training command, like the other training steps in
  # this script, instead of silently using the script's built-in default.
  steps/train_lda_mllt.sh \
    --cmd "$train_cmd" \
    --splice-opts "--left-context=3 --right-context=3" \
    2000 30000 data/train data/lang exp/tri1_ali exp/tri2b

  wait  # wait for the previous decoding jobs to finish in case there's just one
        # machine.

  # Background subshell: a failure in here ends only the subshell, and a
  # bare `wait` does not report its exit status.
  (
    utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph || exit 1;

    # Pass the configured decode command, as the tri3b decoding does, so
    # that every decode in this script is dispatched the same way.
    for x in native nonnative devtest test; do
      steps/decode.sh --nj 8 --cmd "$decode_cmd" \
        exp/tri2b/graph data/$x exp/tri2b/decode_${x} || exit 1;
    done
  ) &
fi

# Stage 11: align with tri2b, train the SAT system in exp/tri3b, align the
# training data with it, then decode with it in the background.
if [ "$stage" -le 11 ]; then
  echo "$0: Starting LDA+MLLT system alignment"

  steps/align_si.sh --use-graphs true --nj 8 --cmd "$train_cmd" \
    data/train data/lang exp/tri2b exp/tri2b_ali

  echo "$0 Starting (SAT) triphone training in exp/tri3b"
  steps/train_sat.sh --cmd "$train_cmd" \
    3100 50000 data/train data/lang exp/tri2b_ali exp/tri3b

  echo "$0 Starting exp/tri3b_ali"
  steps/align_fmllr.sh --nj 8 --cmd "$train_cmd" \
    data/train data/lang exp/tri3b exp/tri3b_ali

  # Let decoding jobs started by earlier stages finish first, in case
  # everything runs on a single machine.
  wait

  (
    # Decoding graph for the SAT models.
    utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph || exit 1;

    for dset in native nonnative devtest test; do
      echo "$0: decoding $dset with tri3b models."
      steps/decode_fmllr.sh --nj 8 --cmd "$decode_cmd" \
        exp/tri3b/graph "data/$dset" "exp/tri3b/decode_$dset"
    done
  ) &
fi

# Stage 12: hand over to the chain-model recipe script.
if [ "$stage" -le 12 ]; then
  echo "$0: train and test chain models."
  local/chain/run_tdnn.sh
fi

# Block until every background job started above has finished.
wait