Blame view

egs/chime4/s5_1ch/local/simu_noisy_chime4_data_prep.sh 5.26 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
  #!/bin/bash
  set -e
  
  # Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
  # Apache 2.0.
  
  # This is modified from the script in standard Kaldi recipe to account
  # for the way the WSJ data is structured on the Edinburgh systems.
  # - Arnab Ghoshal, 29/05/12
  
  # Modified from the script for CHiME2 baseline
  # Shinji Watanabe 02/13/2015
  # Modified to use data of six channels
  # Szu-Jui Chen 09/29/2017
  
  # Purpose: build wav.scp / text / utt2spk / spk2utt for the simulated noisy
  # CHiME4 sets (tr05, dt05 and optionally et05), using channels CH1-CH6.
  # Usage:   <this script> <corpus-directory>
  
  # Config:
  eval_flag=true # make it true when the evaluation data are released
  
  # NOTE(review): parse_options.sh presumably lets a command-line option
  # override the config variable above -- confirm against utils/parse_options.sh.
  . utils/parse_options.sh || exit 1;
  
  # Exactly one positional argument (the corpus root) is required.
  if [ $# -ne 1 ]; then
    # The format uses \n escapes instead of literal line breaks so the message
    # does not depend on how this file happens to be indented.
    printf '\nUSAGE: %s <corpus-directory>\n\n' "$(basename "$0")"
    echo "The argument should be the top-level Chime4 directory."
    echo "It is assumed that there will be a 'data' subdirectory"
    echo "within the top-level corpus directory."
    exit 1;
  fi
  
  echo "$0 $@"  # Print the command line for logging
  
  # Corpus inputs: audio and transcriptions both live under <corpus>/data.
  corpus_data=$1/data
  audio_dir=$corpus_data/audio/16kHz/isolated
  trans_dir=$corpus_data/transcriptions
  
  echo "extract all channels (CH[1-6].wav) for noisy data"
  
  # Working directories and helper-script locations, all anchored at the
  # directory the script was started from.
  here=$(pwd)
  dir=$here/data/local/data
  lmdir=$here/data/local/nist_lm
  mkdir -p $dir $lmdir
  local=$here/local
  utils=$here/utils
  
  . ./path.sh # Needed for KALDI_ROOT
  export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
  sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
  # Abort early when the sph2pipe binary is missing or not executable.
  # The path is quoted: unquoted, a KALDI_ROOT containing whitespace would make
  # the test itself error out, which "if" treats as false and so the check
  # would be skipped without any message.
  if [ ! -x "$sph2pipe" ]; then
    echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
    exit 1;
  fi
  
  # Data sets to prepare: training and development always, plus the
  # evaluation set when eval_flag is true.
  list_set="tr05_simu_noisy dt05_simu_noisy"
  if $eval_flag; then
    list_set="$list_set et05_simu_noisy"
  fi
  
  cd $dir
  
  # For every set, list the CH1-CH6 wav files whose path mentions one of the
  # four simulated environments (bus, caf, ped, str), sorted and de-duplicated.
  flist_prefixes="tr05 dt05"
  if $eval_flag; then
    flist_prefixes="$flist_prefixes et05"
  fi
  for prefix in $flist_prefixes; do
    find $audio_dir -name '*CH[1-6].wav' \
      | grep "${prefix}_bus_simu\|${prefix}_caf_simu\|${prefix}_ped_simu\|${prefix}_str_simu" \
      | sort -u > ${prefix}_simu_noisy.flist
  done
  
  # Copy the corpus-provided transcription files (<set>_simu.dot_all) into the
  # working directory as <set>_simu.dot; et05 only when eval_flag is true.
  dot_prefixes="dt05"
  if $eval_flag; then
    dot_prefixes="$dot_prefixes et05"
  fi
  for prefix in $dot_prefixes; do
    cp $trans_dir/${prefix}_simu.dot_all ${prefix}_simu.dot
  done
  
  # Turn each file list into "<utt-id> <wav-path>" lines (<set>_wav.scp.temp).
  # NOTE(review): the field arithmetic below assumes wav basenames shaped like
  # AAA_BBB_CCC.CHn.wav -- confirm against the corpus layout.
  for x in $list_set; do
    # basename of every path, with the first ".wav" replaced by "_SIMU"
    awk -F'[/]' '{print $NF}' < "$x.flist" \
      | sed -e 's/\.wav/_SIMU/' > ${x}_wav.id.temp
    # channel tag: 2nd "."-separated piece of the 3rd "_"-separated field
    awk -F'_' '{print $3}' < "${x}_wav.id.temp" \
      | awk -F'.' '{print $2}' > $x.ch
    # leading "_"-separated field
    awk -F'_' '{print $1}' < "${x}_wav.id.temp" > $x.part1
    # everything after the first three characters and the "_" following them
    sed -e 's/^..._//' < "${x}_wav.id.temp" > $x.part2
    # utt-id = <part1>_<channel>_<part2>
    paste -d"_" $x.part1 $x.ch $x.part2 > ${x}_wav.ids
    # pair ids with paths; order by first "_" field, then from the third onward
    paste -d" " ${x}_wav.ids $x.flist \
      | sort -t_ -k1,1 -k3 > ${x}_wav.scp.temp
  done
  
  # make a transcription from dot
  # The simulated training data is generated from the original WSJ0 data, so
  # its transcripts are looked up through the WSJ0 dot files in dot_files.flist.
  [ -e dot_files.flist ] || {
    echo "Could not find $dir/dot_files.flist files, first run local/clean_wsj0_data_prep.sh";
    exit 1;
  }
  # Third "_" field of each scp line, lower-cased, is handed to
  # find_noisy_transcripts.pl; the first column of its output is dropped.
  awk -F'[_]' '{print $3}' < tr05_simu_noisy_wav.scp.temp \
    | tr '[A-Z]' '[a-z]' \
    | $local/find_noisy_transcripts.pl dot_files.flist \
    | cut -f 2- -d" " > tr05_simu_noisy.txt
  # Utterance ids are the first column of the scp list.
  cut -f 1 -d" " < tr05_simu_noisy_wav.scp.temp > tr05_simu_noisy.ids
  paste -d" " tr05_simu_noisy.ids tr05_simu_noisy.txt \
    | sort -t_ -k1,1 -k3 > tr05_simu_noisy.trans1
  # dt05 and et05 simulation data are generated from the CHiME4 booth recording
  # and we use CHiME4 dot files
  
  # simu_dot_to_trans1 <set>
  #   Reads   <set>_simu.dot in the current directory, one utterance per line
  #           in the form "TRANSCRIPT (UTTID)".
  #   Writes  <set>_simu_noisy.ids     "UTTID.CH1_SIMU" per utterance
  #           <set>_simu_noisy.txt     transcript with the trailing "(UTTID)" removed
  #           <set>_simu_noisy.trans1  "UTTID.CHn_SIMU TRANSCRIPT" for n = 1..6, sorted
  #   Only the final "(...)" group on a line is treated as the utterance id, so
  #   parentheses inside the transcript itself are left alone.
  simu_dot_to_trans1() {
    local set_name=$1
    local dot_file=${set_name}_simu.dot
    local out=${set_name}_simu_noisy
    # id: unwrap the trailing "(UTTID)" and take the last field
    sed -e 's/(\([^()]*\))[[:space:]]*$/\1/' "$dot_file" \
      | awk '{print $NF ".CH1_SIMU"}' > "$out.ids"
    # text: drop the trailing "(UTTID)"
    sed -e 's/([^()]*)[[:space:]]*$//' "$dot_file" > "$out.txt"
    # replicate every CH1 line for channels 1..6 by rewriting the id suffix
    paste -d" " "$out.ids" "$out.txt" \
      | awk '{
          for (ch = 1; ch <= 6; ch++) {
            line = $0
            sub(/\.CH1_SIMU/, ".CH" ch "_SIMU", line)
            print line
          }
        }' \
      | sort -k 1 > "$out.trans1"
  }
  
  simu_dot_to_trans1 dt05
  if $eval_flag; then
    simu_dot_to_trans1 et05
  fi
  
  # Do some basic normalization steps.  At this point we don't remove OOVs--
  # that will be done inside the training scripts, as we'd like to make the
  # data-preparation stage independent of the specific lexicon used.
  #
  # For each set this rebuilds <set>.trans1 so that its first column is the
  # utterance id from <set>_wav.scp.temp, then writes the normalized, sorted
  # transcript to <set>.txt.
  # Word handed to normalize_transcript.pl as its argument.
  noiseword="<NOISE>";
  for x in $list_set;do
    # Column 1 of the wav list: the utterance ids.
    cat ${x}_wav.scp.temp | awk '{print $1}' > $x.txt.part1
    # Transcript text only: blank out the id column of trans1 and trim the
    # leading whitespace that leaves behind.
    cat $x.trans1 | awk '{$1=""; print $0}' | sed 's/^[ \t]*//g' > $x.txt.part2
    # Re-pair ids and text purely by line position, overwriting trans1 (part2
    # was fully written by the previous command, so this is safe).
    # NOTE(review): this relies on <set>_wav.scp.temp and the old trans1
    # listing utterances in the same order -- confirm.
    paste -d" " $x.txt.part1 $x.txt.part2 > $x.trans1
    # NOTE(review): unless pipefail is enabled by a sourced file, "|| exit 1"
    # only observes the exit status of the final "sort".
    cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
      | sort > $x.txt || exit 1;
  done
  
  # Make the utt2spk and spk2utt files.
  for x in $list_set; do
    scp=${x}_wav.scp
    # final wav list, plain-sorted
    sort $scp.temp > $scp
    # speaker label: the first two "_"-separated pieces of each line
    awk -F'_' '{print $1"_"$2}' < "$scp" > $x.spk
    # utterance id: the first whitespace-separated column
    awk '{print $1}' < "$scp" > $x.utt
    paste -d" " $x.utt $x.spk > $x.utt2spk
    $utils/utt2spk_to_spk2utt.pl < "$x.utt2spk" > $x.spk2utt || exit 1;
  done
  
  # copying data to data/...
  # Each "src-suffix:dest-name" pair maps a working file <set><src-suffix> to
  # its name inside the set's output directory two levels up.
  for x in $list_set; do
    outdir=../../$x
    mkdir -p $outdir
    for pair in _wav.scp:wav.scp .txt:text .spk2utt:spk2utt .utt2spk:utt2spk; do
      cp ${x}${pair%%:*} $outdir/${pair#*:} || exit 1;
    done
  done
  
  # clean up temp files
  rm *.temp
  rm *.part1 *.part2
  
  echo "Data preparation succeeded"