Blame view

egs/heroico/s5/local/prepare_data.sh 3.53 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
  #!/bin/bash
  
  # Copyright 2017 John Morgan
  # Apache 2.0.
  
  # Usage: local/prepare_data.sh [--stage <n>] <corpus-dir>
  #
  # <corpus-dir> is the directory that contains data/transcripts/*.txt.
  # Lists are written under data/local/tmp and data/{train,devtest,test,...}.

  . ./cmd.sh
  . ./path.sh

  # Option defaults. Variables must exist before parse_options.sh is sourced
  # for the matching --<name> option to be recognised (Kaldi convention).
  # datadir is declared here (empty) so that a --datadir option, which the
  # earlier "datadir=$1" form presumably allowed as a side effect, still works.
  stage=0
  datadir=

  . ./utils/parse_options.sh

  # Read the positional argument only AFTER option parsing: parse_options.sh
  # is expected to shift the leading "--name value" pairs away.  Reading $1
  # before that point made "--stage 1 <corpus-dir>" set datadir to "--stage".
  if [ $# -ge 1 ]; then
    datadir=$1
  fi

  if [ -z "$datadir" ]; then
    echo "Usage: $0 [--stage <n>] <corpus-dir>" >&2
    exit 1
  fi

  set -e
  set -o pipefail

  # scratch area for all intermediate lists
  tmpdir=data/local/tmp

  # acoustic models are trained on the heroico corpus
  # testing is done on the usma corpus
  # heroico consists of 2 parts: answers and recordings (recited)

  answers_transcripts=$datadir/data/transcripts/heroico-answers.txt
  recordings_transcripts=$datadir/data/transcripts/heroico-recordings.txt

  # usma is all recited
  usma_transcripts=$datadir/data/transcripts/usma-prompts.txt
  
  # Stage 0: make the acoustic model training lists for heroico.
  # All path expansions are quoted so that a corpus directory containing
  # whitespace or glob characters is passed through as a single word.
  if [ "$stage" -le 0 ]; then
    mkdir -p "$tmpdir/heroico" "$tmpdir/usma"

    local/get_wav_list.sh "$datadir/data"

    # make separate lists for heroico answers and recordings
    # the transcripts are converted from ISO-8859-1 to UTF-8 and stripped of
    # carriage returns before being handed to the list-making scripts.
    # NOTE: this export is never undone, so it stays in effect for every
    # later command in this script (including stage 1 when both stages run).
    export LC_ALL=en_US.UTF-8
    iconv -f ISO-8859-1 -t UTF-8 <"$answers_transcripts" \
      | tr -d '\r' \
      | local/heroico_answers_make_lists.pl

    utils/fix_data_dir.sh "$tmpdir/heroico/answers"

    iconv -f ISO-8859-1 -t UTF-8 <"$recordings_transcripts" \
      | tr -d '\r' \
      | local/heroico_recordings_make_lists.pl

    utils/fix_data_dir.sh "$tmpdir/heroico/recordings/train"
    utils/fix_data_dir.sh "$tmpdir/heroico/recordings/devtest"

    # consolidate heroico lists
    mkdir -p "$tmpdir/heroico/lists/train" "$tmpdir/heroico/lists/devtest"

    for x in wav.scp utt2spk text; do
      # train = answers + recordings/train, de-duplicated on the first field
      cat "$tmpdir/heroico/answers/$x" "$tmpdir/heroico/recordings/train/$x" \
        | tr -d '\r' \
        | sort -k1,1 -u >"$tmpdir/heroico/lists/train/$x"

      # devtest = recordings/devtest only
      tr -d '\r' <"$tmpdir/heroico/recordings/devtest/$x" \
        | sort -k1,1 -u >"$tmpdir/heroico/lists/devtest/$x"
    done

    utils/fix_data_dir.sh "$tmpdir/heroico/lists/train"
    utils/fix_data_dir.sh "$tmpdir/heroico/lists/devtest"
  fi
  
  # Stage 1: build the usma test lists and populate the final data/ directories
  # (train, devtest, test, native, nonnative).
  if [ "$stage" -le 1 ]; then
    #  make separate lists for usma (US military academy) native and nonnative
    # Both pipelines are intentionally identical up to the final script:
    # tr -d '\r' already removes every carriage return, so no extra
    # dos2unix pass (and no dependency on that tool) is needed.
    iconv -f ISO-8859-1 -t UTF-8 <"$usma_transcripts" \
      | tr -d '\r' \
      | local/usma_native_make_lists.pl

    iconv -f ISO-8859-1 -t UTF-8 <"$usma_transcripts" \
      | tr -d '\r' \
      | local/usma_nonnative_make_lists.pl

    # sorted copies of the per-group lists, then cleaned by fix_data_dir.sh
    for n in native nonnative; do
      mkdir -p "$tmpdir/usma/$n/lists"
      for x in wav.scp utt2spk text; do
        sort "$tmpdir/usma/$n/$x" >"$tmpdir/usma/$n/lists/$x"
      done

      utils/fix_data_dir.sh "$tmpdir/usma/$n/lists"
    done

    mkdir -p data/train "$tmpdir/lists/train" data/devtest "$tmpdir/lists/devtest"

    # get training lists
    for x in wav.scp utt2spk text; do
      cat "$tmpdir/heroico/answers/$x" "$tmpdir/heroico/recordings/train/$x" \
        | tr -d '\r' >"$tmpdir/lists/train/$x"
      sort "$tmpdir/lists/train/$x" >"data/train/$x"
    done

    # get devtest lists
    for x in wav.scp utt2spk text; do
      tr -d '\r' <"$tmpdir/heroico/lists/devtest/$x" >"$tmpdir/lists/devtest/$x"
      sort "$tmpdir/lists/devtest/$x" >"data/devtest/$x"
    done

    utils/utt2spk_to_spk2utt.pl data/train/utt2spk | sort >data/train/spk2utt
    utils/utt2spk_to_spk2utt.pl data/devtest/utt2spk | sort >data/devtest/spk2utt

    utils/fix_data_dir.sh data/train
    utils/fix_data_dir.sh data/devtest

    # make testing  lists
    mkdir -p data/test data/native data/nonnative "$tmpdir/usma/lists"

    for x in wav.scp text utt2spk; do
      # test = native followed by nonnative (taken from the cleaned lists/)
      cat "$tmpdir/usma/native/lists/$x" "$tmpdir/usma/nonnative/lists/$x" \
        >"$tmpdir/usma/lists/$x"

      cat "$tmpdir/usma/lists/$x" >"data/test/$x"

      # per-group test sets are sorted from the raw per-group lists
      for n in native nonnative; do
        sort "$tmpdir/usma/$n/$x" >"data/$n/$x"
      done
    done

    for n in native nonnative test; do
      utils/utt2spk_to_spk2utt.pl "data/$n/utt2spk" | sort >"data/$n/spk2utt"
      utils/fix_data_dir.sh "data/$n"
    done
  fi