Blame view

egs/tunisian_msa/s5/local/prepare_data.sh 3.98 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
  #!/bin/bash  
  
  # Copyright 2018 John Morgan
  # Apache 2.0.
  
  # ---- configuration ----
  # Scratch directory for intermediate lists (relative path).
  tmpdir="data/local/tmp"
  # The corpus is looked up below the directory the script is started from.
  download_dir="$(pwd)"
  # Separate scratch areas for the Tunisian and the Libyan material.
  tmp_tunis="${tmpdir}/tunis"
  tmp_libyan="${tmpdir}/libyan"
  # Root of the corpus data (speech and transcripts are addressed below it).
  data_dir="${download_dir}/Tunisian_MSA/data"
  # Location of the Libyan MSA test data.
  libyan_src="${data_dir}/speech/test/Libyan_MSA"
  # ---- end of configuration ----
  
  # Process the Tunisian MSA devtest data.
  #
  # For each devtest speaker directory: list its wav files, run
  # local/devtest_recordings_make_lists.pl on the devtest transcripts, then
  # merge the per-speaker wav.scp, utt2spk and text into data/devtest.

  mkdir -p data/devtest

  # Start from empty output files.  The merge loop below appends, so without
  # this a second run of the script would add to the lists left behind by the
  # previous run instead of replacing them.
  for x in wav.scp utt2spk text; do
    : > "data/devtest/$x"
  done

  for s in devtest/CTELLONE/Recordings_Arabic/6 devtest/CTELLTHREE/Recordings_Arabic/10; do
    echo "$0: looking for wav files for $s."
    mkdir -p "$tmp_tunis/$s"

    # get list of wav files
    # (paths are quoted: data_dir is derived from the working directory and
    # may contain whitespace)
    find "$data_dir/speech/$s" -type f -name "*.wav" \
      | grep Recordings_Arabic > "$tmp_tunis/$s/wav.txt"

    # Presumably writes wav.scp, utt2spk and text under $tmp_tunis/$s, which
    # is where they are read from below -- confirm against the Perl script.
    local/devtest_recordings_make_lists.pl \
      "$data_dir/transcripts/devtest/recordings.tsv" "$s" tunis

    # Replace tabs by spaces while merging into the final lists.
    for x in wav.scp utt2spk text; do
      tr '\t' ' ' < "$tmp_tunis/$s/$x" >> "data/devtest/$x"
    done
  done

  utils/utt2spk_to_spk2utt.pl data/devtest/utt2spk | sort > data/devtest/spk2utt

  utils/fix_data_dir.sh data/devtest
  
  # Training data has two parts, each with its own transcript file:
  # answers and recordings (recited).
  answers_transcripts="${data_dir}/transcripts/train/answers.tsv"
  recordings_transcripts="${data_dir}/transcripts/train/recordings.tsv"

  # Transcript files for the test recordings: the three Libyan sets
  # (cls, lfi, srj) below libyan_src, and mbt below data_dir.
  cls_rec_tr="${libyan_src}/cls/data/transcripts/recordings/cls_recordings.tsv"
  lfi_rec_tr="${libyan_src}/lfi/data/transcripts/recordings/lfi_recordings.tsv"
  srj_rec_tr="${libyan_src}/srj/data/transcripts/recordings/srj_recordings.tsv"
  mbt_rec_tr="${data_dir}/transcripts/test/mbt/recordings/mbt_recordings.tsv"
  
  # Make the wav file lists used for acoustic model training.
  mkdir -p "$tmp_tunis"

  # The data collection laptops had names like CTELLONE CTELLTWO ...
  # Every laptop directory under speech/train is searched for wav files.
  #
  # Each list is written through a single '>' redirection on the whole loop,
  # so it is rebuilt from scratch on every run.  Appending inside the loop
  # would duplicate every path when the script is run a second time.

  # wav files for recited speech (path contains "Recordings")
  for machine in CTELLONE CTELLTWO CTELLTHREE CTELLFOUR CTELLFIVE; do
    find "$data_dir/speech/train/$machine" -type f -name "*.wav" \
      | grep Recordings
  done > "$tmp_tunis/recordings_wav.txt"

  # wav files for Answers (path contains "Answers")
  for machine in CTELLONE CTELLTWO CTELLTHREE CTELLFOUR CTELLFIVE; do
    find "$data_dir/speech/train/$machine" -type f -name "*.wav" \
      | grep Answers
  done > "$tmp_tunis/answers_wav.txt"
  
  # Make separate transcription lists for answers and recordings.
  # LC_ALL is exported here, so it applies to every command run from this
  # point on in the script.
  export LC_ALL=en_US.UTF-8

  # The transcript paths are absolute (built from the working directory), so
  # they are quoted to survive whitespace in that path.
  # The two Perl scripts presumably write their lists under $tmp_tunis/answers
  # and $tmp_tunis/recordings, which are the directories used below -- confirm
  # against the scripts.
  local/answers_make_lists.pl "$answers_transcripts"

  utils/fix_data_dir.sh "$tmp_tunis/answers"

  local/recordings_make_lists.pl "$recordings_transcripts"

  utils/fix_data_dir.sh "$tmp_tunis/recordings"

  # Consolidate lists:
  # acoustic models will be trained on both recited and prompted speech.
  mkdir -p "$tmp_tunis/lists"

  for x in wav.scp utt2spk text; do
    cat "$tmp_tunis/answers/$x" "$tmp_tunis/recordings/$x" > "$tmp_tunis/lists/$x"
  done

  utils/fix_data_dir.sh "$tmp_tunis/lists"

  # Write the final training lists: sorted, with tabs replaced by spaces.
  mkdir -p data/train
  for x in wav.scp utt2spk text; do
    sort "$tmp_tunis/lists/$x" | tr '\t' ' ' > "data/train/$x"
  done

  utils/utt2spk_to_spk2utt.pl data/train/utt2spk | sort > data/train/spk2utt

  utils/fix_data_dir.sh data/train
  
  # Process the Libyan MSA data: one pass per set (cls, lfi, srj).
  mkdir -p "$tmp_libyan"

  for s in cls lfi srj; do
    mkdir -p "$tmp_libyan/$s"

    # Get the list of wav files whose path contains "recordings".
    # libyan_src is an absolute path built from the working directory, so it
    # is quoted to survive whitespace in that path.
    find "$libyan_src/$s" -type f -name "*.wav" \
      | grep recordings > "$tmp_libyan/$s/recordings_wav.txt"

    echo "$0: making recordings list for $s"
    # Arguments: transcript file of the set, set name, corpus tag "libyan".
    local/test_recordings_make_lists.pl \
      "$libyan_src/$s/data/transcripts/recordings/${s}_recordings.tsv" "$s" libyan
  done
  
  # Process the Tunisian MSA test data (the mbt set).

  mkdir -p "$tmp_tunis/mbt"

  # Get the list of wav files whose path contains "recordings".
  # data_dir is an absolute path built from the working directory, so it is
  # quoted to survive whitespace in that path.
  find "$data_dir/speech/test/mbt" -type f -name "*.wav" \
    | grep recordings > "$tmp_tunis/mbt/recordings_wav.txt"

  echo "$0: making recordings list for mbt"
  # Arguments: transcript file of the set, set name, corpus tag "tunis".
  local/test_recordings_make_lists.pl \
    "$data_dir/transcripts/test/mbt/recordings/mbt_recordings.tsv" mbt tunis
  
  # Assemble data/test from the Libyan sets (cls, lfi, srj) followed by the
  # Tunisian mbt set, replacing tabs by spaces on the way.
  mkdir -p data/test

  # Each output file is produced by one grouped '>' redirection, so it is
  # rebuilt from scratch on every run.  Appending with '>>' would keep the
  # entries of a previous run and add the new ones after them.
  for x in wav.scp utt2spk text; do
    {
      # get the Libyan files
      for s in cls lfi srj; do
        tr '\t' ' ' < "$tmp_libyan/$s/recordings/$x"
      done
      # then the Tunisian mbt files
      tr '\t' ' ' < "$tmp_tunis/mbt/recordings/$x"
    } > "data/test/$x"
  done

  utils/utt2spk_to_spk2utt.pl data/test/utt2spk | sort > data/test/spk2utt

  utils/fix_data_dir.sh data/test