Blame view

egs/yomdle_tamil/v1/local/semisup/run_semisup.sh 2.25 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
  #!/bin/bash
  
  # Copyright 2017  Vimal Manohar
  #           2018  Ashish Arora
  # Apache 2.0
  
  # This script demonstrates semi-supervised training using 25k line images of 
  # supervised data and 22k line images of unsupervised data.
  # We assume the supervised data is in data/train and unsupervised data
  # is in data/train_unsup. 
  # For LM training, we use 5 million lines of Tamil text.
  
  # Abort on the first failing command, including a failure in any stage of
  # a pipeline.
  set -eo pipefail

  # Defaults. They are assigned before utils/parse_options.sh is sourced;
  # presumably that script lets --stage / --nj / --exp-root override them
  # (TODO confirm against utils/parse_options.sh).
  stage=0
  nj=30
  exp_root=exp/semisup_100k

  # Project-local environment and option handling, sourced in this order.
  # NOTE(review): $cmd, used by the feature-extraction step, is not assigned
  # in this file -- presumably cmd.sh defines it; confirm.
  . ./cmd.sh
  . ./path.sh
  . ./utils/parse_options.sh
  
  # The unsupervised data directory is created unconditionally, even when
  # stage 0 itself is skipped.
  mkdir -p data/train_unsup/data

  # Stage 0: run local/semisup/process_data.py on data/download/ with the
  # train_unsup split list, writing to data/train_unsup, then run
  # image/fix_data_dir.sh on the result.
  # "$stage" is quoted so the test never undergoes word splitting/globbing.
  if [ "$stage" -le 0 ]; then
    echo "stage 0: Processing train unsupervised data...$(date)"
    local/semisup/process_data.py data/download/ \
      data/local/splits/train_unsup.txt \
      data/train_unsup
    image/fix_data_dir.sh data/train_unsup
  fi
  
  # Stage 1: per-image frame counts and allowed lengths, then feature
  # extraction and CMVN statistics for data/train_unsup, followed by a final
  # image/fix_data_dir.sh pass. The feature dimension passed to both the
  # frame-count and the extraction step is 40.
  # "$stage" and "$nj" are quoted so each always expands to exactly one word.
  if [ "$stage" -le 1 ]; then
    echo "stage 1: Obtaining image groups. calling get_image2num_frames..."
    image/get_image2num_frames.py --feat-dim 40 data/train_unsup
    image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train_unsup
    echo "Extracting features and calling compute_cmvn_stats: $(date) "
    local/extract_features.sh --nj "$nj" --cmd "$cmd" --feat-dim 40 data/train_unsup
    steps/compute_cmvn_stats.sh data/train_unsup || exit 1
    image/fix_data_dir.sh data/train_unsup
  fi
  
  # Stop early if any required input file is missing.
  # NOTE(review): the later stages read data/train_aug, which is not checked
  # here -- confirm whether it should be added to this list.
  for f in data/train/utt2spk data/train_unsup/utt2spk \
    data/train/text; do
    if [ ! -f "$f" ]; then
      # Diagnostics go to stderr so they are not mixed into captured stdout.
      echo "$0: Could not find $f" >&2
      exit 1
    fi
  done
  
  # Prepare semi-supervised train set: utils/combine_data.sh is given
  # data/semisup100k_250k followed by data/train_aug and data/train_unsup.
  # Note this step shares the stage number 1 with feature extraction.
  # "$stage" is quoted so the test never undergoes word splitting/globbing.
  if [ "$stage" -le 1 ]; then
    utils/combine_data.sh data/semisup100k_250k \
      data/train_aug data/train_unsup || exit 1
  fi
  
  ###############################################################################
  # Semi-supervised training using 25k line images of supervised data and
  # 22k line images of unsupervised data. We use tree, lattices
  # and seed chain system from the previous stage.
  ###############################################################################
  # Stage 2: launch the semi-supervised chain training script with train_aug
  # as the supervised set and train_unsup as the unsupervised set. The seed
  # chain model, lattice and tree directories are the fixed exp/chain paths
  # below; outputs are rooted at $exp_root.
  # "$stage" and "$exp_root" are quoted so each expands to exactly one word,
  # even if an overridden exp root contains spaces.
  if [ "$stage" -le 2 ]; then
    local/semisup/chain/run_cnn_chainali_semisupervised_1b.sh \
      --supervised-set train_aug \
      --unsupervised-set train_unsup \
      --sup-chain-dir exp/chain/cnn_e2eali_1b \
      --sup-lat-dir exp/chain/e2e_train_lats \
      --sup-tree-dir exp/chain/tree_e2e \
      --chain-affix "" \
      --tdnn-affix _semisup_1a \
      --exp-root "$exp_root" || exit 1
  fi