Blame view

egs/reverb/s5/local/prepare_real_data.sh 5.31 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
  #!/bin/bash
  #
  # Copyright 2018 Johns Hopkins University (Author: Shinji Watanabe)
  # Copyright 2018 Johns Hopkins University (Author: Aswin Shanmugam Subramanian)
  # Apache 2.0
  # This script is adapted from data preparation scripts in the Kaldi reverb recipe
  # https://github.com/kaldi-asr/kaldi/tree/master/egs/reverb/s5/local
  
  # Begin configuration section.
  # Base directory for processed audio; the WPE-enhanced wav paths written
  # later in this script are built as ${wavdir}/WPE/<N>ch/...
  wavdir="${PWD}/wav"
  # End configuration section
  # Command-line option handling (presumably lets callers override the
  # variables above, e.g. --wavdir <path>; confirm in utils/parse_options.sh).
  . ./utils/parse_options.sh

  # Environment setup for the recipe (contents not visible from this script).
  . ./path.sh

  # Log the invocation on stderr.
  echo "$0" "$@" >&2
  # Exactly one positional argument is accepted: the REVERB corpus directory.
  if [[ $# -ne 1 ]]; then
    echo "$0" "$@" >&2
    echo "$0: Error: wrong number of arguments" >&2
    echo -e "Usage:
    $0 [opts] <reverb-dir>" >&2
    echo -e "eg:
    $0 /export/corpora5/REVERB_2014/REVERB" >&2
    exit 1
  fi
  
  # From here on, abort on the first failing command; pipefail makes a
  # pipeline fail when any of its stages fails, not only the last one.
  set -e -o pipefail

  # Root of the REVERB corpus (the single positional argument).
  reverb=$1

  # working directory: intermediate lists and transcripts are written here
  dir=${PWD}/data/local/data
  # Quoted so that a working directory containing whitespace still yields a
  # single directory instead of several split ones.
  mkdir -p "${dir}"
  
  #######################################
  # MLF transcription correction (taken from the HTK baseline script).
  # Reads an MLF label stream on stdin and writes the corrected stream to
  # stdout. Uses GNU sed escapes (\x0D, \x60, \n).
  # Arguments: none
  # Outputs:   corrected MLF text on stdout
  #######################################
  fix_mlf_transcripts() {
    sed -e '
      # dos to unix line feed conversion
      s/\x0D$//' \
      -e '
      # remove the grave accent character (0x60)
      s/\x60//g' \
      -e "
      # fix the single quote for the word yield
      # and the quoted ROOTS
      # e.g. yield' --> yield
      # reason: YIELD' is not in dict, while YIELD is
      s/YIELD'/YIELD/g
      s/'ROOTS'/ROOTS/g
      s/'WHERE/WHERE/g
      s/PEOPLE'/PEOPLE/g
      s/SIT'/SIT/g
      s/'DOMINEE/DOMINEE/g
      s/CHURCH'/CHURCH/g" \
      -e '
      # fix the single missing full stop issue at the end of an utterance
      # e.g. I. C. N should be  I. C. N.
      # reason: N is not in dict, while N. is
      /^[A-Z]$/ {
        # append the next line to the pattern space
        N
        # search for a single dot on the second line
        /\n\./ {
          # found it - add the missing full stop to the letter and keep
          # the utterance-final dot line as it was
          s/\([A-Z]\)\n\./\1\.\n\./
        }
      }'
  }

  # Convert the corrected dev (dt) and eval (et) MLF files into plain
  # transcript files ${dir}/dt.txt and ${dir}/et.txt.
  for task in dt et; do
    case "${task}" in
      dt) mlf=${reverb}/MC_WSJ_AV_Dev/mlf/WSJ.mlf ;;
      et) mlf=${reverb}/MC_WSJ_AV_Eval/mlf/WSJ.mlf ;;
    esac
    fix_mlf_transcripts < "${mlf}" \
      | perl local/mlf2text.pl > "${dir}/${task}.txt"
  done
  
  
  # Token handed to local/normalize_transcript.pl as its only argument.
  noiseword="<NOISE>"

  # Perl programs applied to each task file. Every input line must match
  # <path>/<prefix>_(T......).wav; the lower-cased T... part becomes the id.
  #   wav_entry_pl: prints "<id> <dir><line>", where $dir comes from perl -s.
  #   utt_id_pl:    prints "<id>" only, one id per line.
  wav_entry_pl='while(<>){m:^\S+/[\w\-]*_(T\w{6,7})\.wav$: || die "Bad line $_"; $id = lc $1; print "$id $dir$_";}'
  utt_id_pl='while(<>){m:^\S+/[\w\-]*_(T\w{6,7})\.wav$: || die "Bad line $_"; $id = lc $1; print "$id\n";}'

  #######################################
  # List the RealData task files of one task set.
  # Arguments: $1 - directory holding the task files
  #            $2 - task set name (dt or et)
  # Outputs:   matching file names (no directory part), one per line;
  #            nothing when the directory is missing or has no match
  #######################################
  real_task_files() {
    local list_dir=$1 task_name=$2 path name
    for path in "${list_dir}"/*; do
      name=${path##*/}
      if [[ ${name} == *RealData* && ${name} == *"_${task_name}_"* ]]; then
        printf '%s\n' "${name}"
      fi
    done
  }

  for nch in 1 2 8; do
    taskdir=data/local/reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/${nch}ch
    for task in dt et; do
      case "${task}" in
        dt) corpus=MC_WSJ_AV_Dev ;;
        et) corpus=MC_WSJ_AV_Eval ;;
      esac
      audiodir=${reverb}/${corpus}
      audiodir_wpe=${wavdir}/WPE/${nch}ch/${corpus}
      prefix=${dir}/${task}_real_${nch}ch
      task_files=$(real_task_files "${taskdir}" "${task}")

      # make a wav list: once for the original corpus audio and once for the
      # WPE-processed audio. The sed step rewrites each id to
      # <first 3 chars>_<task file name>_<id>.
      for x in ${task_files}; do
        perl -se "${wav_entry_pl}" -- -dir="${audiodir}" "${taskdir}/${x}" \
          | sed -e "s/^\(...\)/\1_${x}_\1/"
      done > "${prefix}_wav.scp"
      for x in ${task_files}; do
        perl -se "${wav_entry_pl}" -- -dir="${audiodir_wpe}" "${taskdir}/${x}" \
          | sed -e "s/^\(...\)/\1_${x}_\1/"
      done > "${prefix}_wpe_wav.scp"

      # make a transcript: ids are passed to local/find_transcripts_txt.pl
      # together with ${dir}/${task}.txt (presumably it looks each id up
      # there; confirm in that script), then renamed like the wav ids.
      for x in ${task_files}; do
        perl -se "${utt_id_pl}" "${taskdir}/${x}" \
          | perl local/find_transcripts_txt.pl "${dir}/${task}.txt" \
          | sed -e "s/^\(...\)/\1_${x}_\1/"
      done > "${prefix}.trans1" || exit 1
      local/normalize_transcript.pl "${noiseword}" \
        < "${prefix}.trans1" > "${prefix}.txt" || exit 1

      # Make the utt2spk and spk2utt files. The speaker is the part of the
      # utterance id before the first underscore.
      awk '{split($1, parts, "_"); print $1 " " parts[1]}' "${prefix}_wav.scp" \
        > "${prefix}.utt2spk" || exit 1
      ./utils/utt2spk_to_spk2utt.pl < "${prefix}.utt2spk" \
        > "${prefix}.spk2utt" || exit 1
    done
  done
  
  #######################################
  # Finish one data directory whose wav.scp has already been written: copy
  # the sorted text, utt2spk and spk2utt lists into it and run
  # utils/fix_data_dir.sh on the result.
  # Arguments: $1 - source prefix; reads $1.txt, $1.utt2spk and $1.spk2utt
  #            $2 - destination data directory (must exist)
  #######################################
  copy_real_lists() {
    local src=$1 dest=$2
    sort "${src}.txt"     > "${dest}/text"
    sort "${src}.utt2spk" > "${dest}/utt2spk"
    sort "${src}.spk2utt" > "${dest}/spk2utt"
    ./utils/fix_data_dir.sh "${dest}"
  }

  # finally copy the above files to the data directory
  for nch in 1 2 8; do
    for task in dt et; do
      multi=${dir}/${task}_real_${nch}ch
      single=${dir}/${task}_real_1ch

      # Unprocessed audio, using the lists generated for ${nch} channels.
      datadir=data/${task}_real_${nch}ch
      mkdir -p "${datadir}"
      sort "${multi}_wav.scp" > "${datadir}/wav.scp"
      copy_real_lists "${multi}" "${datadir}"

      # Beamformed audio (multi-channel only): start from the 1ch WPE list,
      # swap the channel suffix -<digit>_ for -bf${nch}_ and point the paths
      # at the ${nch}ch WPE directory.
      if [ "${nch}" != 1 ]; then
        datadir=data/${task}_real_${nch}ch_beamformit
        mkdir -p "${datadir}"
        sort "${single}_wpe_wav.scp" \
          | sed -e "s/-[1-8]_/-bf${nch}_/" -e "s/WPE\/1ch/WPE\/${nch}ch/" \
          > "${datadir}/wav.scp"
        copy_real_lists "${single}" "${datadir}"
      fi

      # WPE-processed audio: 1ch list with paths redirected to the ${nch}ch
      # WPE directory; text and speaker maps are the 1ch ones.
      datadir=data/${task}_real_${nch}ch_wpe
      mkdir -p "${datadir}"
      sort "${single}_wpe_wav.scp" \
        | sed -e "s/WPE\/1ch/WPE\/${nch}ch/" > "${datadir}/wav.scp"
      copy_real_lists "${single}" "${datadir}"
    done
  done