Blame view

egs/callhome_egyptian/s5/local/callhome_data_prep.sh 8.1 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
  #!/bin/bash
  #
  # Johns Hopkins University : (Gaurav Kumar)
  # The input is the Callhome Egyptian Arabic Dataset which contains *.sph files
  # In addition the transcripts are needed as well.
  
  #TODO: Rewrite intro, copyright stuff and dir information
  # To be run from one directory above this script.
  
  # First stage to execute: each "stage N" section below runs only when
  # stage <= N.
  stage=0

  # Use the C locale for every tool invoked from here on.
  export LC_ALL=C

  # At least six arguments are required; the spot checks further down expect
  # directories named LDC97S45, LDC97T19, LDC2002S37, LDC2002T38, LDC2002S22
  # and LDC2002T39.
  if [ $# -lt 6 ]; then
     # The message used to be broken across two lines in the middle of "see".
     echo "Arguments should be the location of the Callhome Egyptian Arabic Speech and Transcript Directories, see ../run.sh for example."
     exit 1;
  fi
  
  # Absolute locations used by the rest of the script, all derived from the
  # directory the script is started in.
  cdir=$(pwd)
  dir=${cdir}/data/local/data
  tmpdir=${cdir}/data/local/tmp
  local=${cdir}/local
  utils=${cdir}/utils

  # Create the output and scratch directories when they do not exist yet.
  mkdir -p $dir $tmpdir
  
  # Load the recipe environment; it has to provide KALDI_ROOT.
  . ./path.sh || exit 1; # Needed for KALDI_ROOT
  export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin

  # sph2pipe is referenced by the generated wav.scp, so it must be executable.
  sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
  if [ ! -x "$sph2pipe" ]; then
     echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
     exit 1;
  fi

  # The statements that follow use paths relative to $dir (e.g. "rm -r links/"),
  # so stop here instead of running them in the wrong directory if cd fails.
  cd "$dir" || exit 1
  
  # Make directory of links to the ECA data.  This relies on the command
  # line arguments being absolute pathnames.
  # Any previous link directory is discarded first; the error is silenced
  # because the directory does not exist on a first run.
  rm -r links/ 2>/dev/null
  mkdir links/
  # "$@" keeps every argument as one word, so corpus paths containing spaces
  # or glob characters are linked as given instead of being re-split.
  ln -s "$@" links
  
  # Basic spot checks to see if we got the data that we needed.
  # Every corpus must be linked under its LDC catalogue name, once for the
  # speech directory and once for the transcript directory.
  if [ ! -d links/LDC97S45 ] || [ ! -d links/LDC97T19 ]; then
          echo "The speech and the data directories need to be named LDC97S45 and LDC97T19 respectively"
          exit 1;
  fi
  # A stray "o" command used to sit between the echo and the exit here; it
  # only produced a "command not found" error and has been removed.
  if [ ! -d links/LDC2002S37 ] || [ ! -d links/LDC2002T38 ]; then
          echo "The Callhome supplement directories need to be named LDC2002S37 and LDC2002T38."
          exit 1;
  fi
  if [ ! -d links/LDC2002S22 ] || [ ! -d links/LDC2002T39 ]; then
          echo "The H5-ECA directories need to be named LDC2002S22 and LDC2002T39."
          exit 1;
  fi
  
  # The main speech corpus must contain all three partitions.
  for part in DEVTEST EVLTEST TRAIN; do
    if [ ! -d links/LDC97S45/CALLHOME/ARABIC/$part ]; then
      echo "Dev, Eval or Train directories missing or not properly organised within the speech data dir"
      exit 1
    fi
  done

  # The same three partitions must exist below the transcript corpus.
  for part in devtest evaltest train; do
    if [ ! -d links/LDC97T19/callhome_arabic_trans_970711/transcrp/$part ]; then
      echo "Transcript directories missing or not properly organised"
      exit 1
    fi
  done

  # Callhome supplement: speech, then transcripts.
  [ -d links/LDC2002S37/SPEECH ] || {
    echo "Callhome supplement directories missing or not properly organised within the speech data dir"
    exit 1
  }
  [ -d links/LDC2002T38/ch_ara_transcr_suppl/transcr ] || {
    echo "Callhome supplement Transcript directories missing or not properly organised"
    exit 1
  }

  # H5 set: speech, then transcripts.
  [ -d links/LDC2002S22/SPEECH ] || {
    echo "H5 directories missing or not properly organised within the speech data dir"
    exit 1
  }
  [ -d links/LDC2002T39/transcr ] || {
    echo "H5 Transcript directories missing or not properly organised"
    exit 1
  }
  
  # Main corpus: audio partitions (LDC97S45) ...
  speech_train=${dir}/links/LDC97S45/CALLHOME/ARABIC/TRAIN
  speech_dev=${dir}/links/LDC97S45/CALLHOME/ARABIC/DEVTEST
  speech_test=${dir}/links/LDC97S45/CALLHOME/ARABIC/EVLTEST
  # ... and the romanised transcripts of the same partitions (LDC97T19).
  transcripts_train=${dir}/links/LDC97T19/callhome_arabic_trans_970711/transcrp/train/roman
  transcripts_dev=${dir}/links/LDC97T19/callhome_arabic_trans_970711/transcrp/devtest/roman
  transcripts_test=${dir}/links/LDC97T19/callhome_arabic_trans_970711/transcrp/evaltest/roman
  # Callhome supplement.
  speech_sup=${dir}/links/LDC2002S37/SPEECH
  transcripts_sup=${dir}/links/LDC2002T38/ch_ara_transcr_suppl/transcr
  # H5 set.
  speech_h5=${dir}/links/LDC2002S22/SPEECH
  transcripts_h5=${dir}/links/LDC2002T39/transcr

  # Number of audio (*.SPH) and transcript (*.txt) files below each location;
  # the name match ignores case.
  fcount_train=$(find $speech_train -iname '*.SPH' | wc -l)
  fcount_dev=$(find $speech_dev -iname '*.SPH' | wc -l)
  fcount_test=$(find $speech_test -iname '*.SPH' | wc -l)
  fcount_t_train=$(find $transcripts_train -iname '*.txt' | wc -l)
  fcount_t_dev=$(find $transcripts_dev -iname '*.txt' | wc -l)
  fcount_t_test=$(find $transcripts_test -iname '*.txt' | wc -l)
  fcount_sup=$(find $speech_sup -iname '*.SPH' | wc -l)
  fcount_t_sup=$(find $transcripts_sup -iname '*.txt' | wc -l)
  fcount_h5=$(find $speech_h5 -iname '*.SPH' | wc -l)
  fcount_t_h5=$(find $transcripts_h5 -iname '*.txt' | wc -l)
  
  # Verify the file counts gathered above.
  # Main corpus: 80 train, 20 dev and 20 test files, for audio and for
  # transcripts alike.
  if [ $fcount_train != 80 ] || [ $fcount_dev != 20 ] || [ $fcount_test != 20 ] \
     || [ $fcount_t_train != 80 ] || [ $fcount_t_dev != 20 ] || [ $fcount_t_test != 20 ]; then
    echo "Incorrect number of files in the data directories"
    echo "The paritions should contain 80/20/20 files"
    exit 1
  fi

  # Callhome supplement: 20 audio files and 20 transcripts.
  if [ $fcount_sup != 20 ] || [ $fcount_t_sup != 20 ]; then
    echo "Incorrect number of files in the ECA sup data directories"
    echo "The paritions should contain 20/20 files"
    exit 1
  fi

  # H5 set: 20 audio files and 20 transcripts.
  if [ $fcount_h5 != 20 ] || [ $fcount_t_h5 != 20 ]; then
    echo "Incorrect number of files in the H5 data directories"
    echo "The paritions should contain 20/20 files"
    exit 1
  fi
  
  if [ $stage -le 0 ]; then
    # Stage 0: write one list naming every audio file of all five sets ...
    for audio_root in $speech_train $speech_dev $speech_test $speech_sup $speech_h5; do
      find $audio_root -iname '*.sph'
    done > $tmpdir/callhome_train_sph.flist

    # ... and a second list naming every transcript file, in the same order
    # of sets.
    for trans_root in $transcripts_train $transcripts_dev $transcripts_test $transcripts_sup $transcripts_h5; do
      find $trans_root -iname '*.txt'
    done > $tmpdir/callhome_train_transcripts.flist
  fi
  
  if [ ${stage} -le 1 ]; then
    # Stage 1: run the transcript conversion script on the scratch directory,
    # then move the reco2file_and_channel file found there into train_all.
    # NOTE(review): callhome_make_trans.pl presumably also writes text.1,
    # which stage 2 reads -- confirm against that script.
    ${local}/callhome_make_trans.pl ${tmpdir}
    mkdir -p ${dir}/train_all
    mv ${tmpdir}/reco2file_and_channel ${dir}/train_all/
  fi
  
  if [ $stage -le 2 ]; then
    sort $tmpdir/text.1 | grep -v '((' | \
    awk '{if (NF > 1){ print; }}' | \
    sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \
    sed 's:<lname>\([^<]*\)<\/lname>:\1:g' | \
    sed 's:<lname[\/]*>::g' | \
    sed 's:<laugh>[^<]*<\/laugh>:[laughter]:g' | \
    sed 's:<\s*cough[\/]*>:[noise]:g' | \
    sed 's:<sneeze[\/]*>:[noise]:g' | \
    sed 's:<breath[\/]*>:[noise]:g' | \
    sed 's:<lipsmack[\/]*>:[noise]:g' | \
    sed 's:<background>[^<]*<\/background>:[noise]:g' | \
    sed -r 's:<[/]?background[/]?>:[noise]:g' | \
    #One more time to take care of nested stuff
    sed 's:<laugh>[^<]*<\/laugh>:[laughter]:g' | \
    sed -r 's:<[/]?laugh[/]?>:[laughter]:g' | \
    #now handle the exceptions, find a cleaner way to do this?
    sed 's:<foreign langenglish::g' | \
    sed 's:</foreign::g' | \
    sed -r 's:<[/]?foreing\s*\w*>::g' | \
    sed 's:</b::g' | \
    sed 's:<foreign langengullís>::g' | \
    sed 's:foreign>::g' | \
    sed 's:>::g' | \
    #How do you handle numbers?
    grep -v '()' | \
    #Now go after the non-printable characters
    sed -r 's:¿::g' > $tmpdir/text.2
  
    cp $tmpdir/text.2 $dir/train_all/text
  
  
    #Create segments file and utt2spk file
    ! cat $dir/train_all/text | perl -ane 'm:([^-]+)-([AB])-(\S+): || die "Bad line $_;"; print "$1-$2-$3 $1-$2
  "; ' > $dir/train_all/utt2spk \
    && echo "Error producing utt2spk file" && exit 1;
  
    # Remove utterances that have the same start and end time. Corresponding text entries will be removed when use
    # fix_data_dir.sh and validate_data_dir.sh later
    cat $dir/train_all/text | perl -ane 'm:((\S+-[AB])-(\d+)-(\d+))\s: || die; $utt = $1; $reco = $2;
   $s = sprintf("%.2f", 0.01*$3); $e = sprintf("%.2f", 0.01*$4); print "$utt $reco $s $e
  "; ' | \
     awk '{if (!(NF != 4 || ($4 <= $3 && $4 != -1))) {print $0}}' >$dir/train_all/segments
  
    $utils/utt2spk_to_spk2utt.pl <$dir/train_all/utt2spk > $dir/train_all/spk2utt
  fi
  
  # Read "<recording-id> <sph-path>" lines on stdin and print two wav.scp
  # entries per line: "<id>-A" extracts channel 1 and "<id>-B" channel 2 as
  # wav through the sph2pipe binary given as $1. The "\n" escapes matter:
  # a raw line break inside an awk string literal is a syntax error.
  callhome_wav_scp_entries() {
    awk -v sph2pipe="$1" '{
      printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2);
      printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);
    }'
  }

  if [ "$stage" -le 3 ]; then
    # Map each audio path to its lower-cased base name. The extension match
    # ignores case because the file list was collected with "find -iname".
    perl -ane 'm:/([^/]+)\.SPH$:i || die "bad line $_; ";  print lc($1)," $_"; ' \
      < "$tmpdir/callhome_train_sph.flist" > "$tmpdir/sph.scp"

    # One entry per channel, sorted and de-duplicated on the recording id.
    callhome_wav_scp_entries "$sph2pipe" < "$tmpdir/sph.scp" | \
      sort -k1,1 -u > "$dir/train_all/wav.scp" || exit 1;
  fi
  
  if [ "$stage" -le 4 ]; then
    # Build the speaker to gender map.
    # NOTE(review): the original comment said the temporary speaker/gender
    # file is created by fsp_make_trans.pl; this script only calls
    # callhome_make_trans.pl (stage 1), so that is presumably the producer --
    # confirm.
    # The helper runs from the start directory; stop if it cannot be entered.
    cd "$cdir" || exit 1
    #TODO: needs to be rewritten
    "$local/callhome_make_spk2gender" > "$dir/train_all/spk2gender"
  fi

  echo "CALLHOME ECA Data preparation succeeded."