Blame view

egs/chime5/s5/local/prepare_data.sh 4.7 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
  #!/bin/bash
  #
  # Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe, Yenda Trmal)
  # Apache 2.0
  
  # Begin configuration section.
  # Defaults for the two tunables used further down in this script.
  # NOTE(review): they are presumably overridable as --mictype / --cleanup via
  # utils/parse_options.sh, which is sourced below but not shown here - confirm.
  mictype=worn   # "worn", "ref", or anything else (treated as an array-mic name)
  cleanup=true   # when true, intermediate files are removed once wav.scp/text exist
  # End configuration section
  . ./utils/parse_options.sh
  
  . ./path.sh
  
  # Log the invocation (script name plus remaining arguments) to stderr.
  echo >&2 "$0" "$@"
  
  # Exactly three positional arguments are required; otherwise print the usage
  # text to stderr and stop with a non-zero status.
  [ "$#" -eq 3 ] || {
    echo "$0" "$@"
    echo "$0: Error: wrong number of arguments"
    echo -e "Usage:
    $0 [opts] <audio-dir> <json-transcript-dir> <output-dir>"
    echo -e "eg:
    $0 /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train"
    exit 1
  } >&2
  
  set -e -o pipefail
  
  adir=$1   # directory searched (recursively, following symlinks) for *.wav
  jdir=$2   # directory searched for *.json transcripts
  dir=$3    # output data directory
  
  # With 'set -e -o pipefail' a failing find inside $(...) would abort the
  # script before the explanatory messages below are printed, so test for the
  # directories explicitly first.
  for d in "$adir" "$jdir"; do
    if [ ! -d "$d" ]; then
      echo >&2 "$0: Error: input directory $d does not exist."
      echo >&2 "That implies you have supplied a wrong path to the data."
      exit 1
    fi
  done
  
  # Paths are quoted so that directories containing spaces or glob characters
  # are passed to find as a single argument.
  json_count=$(find -L "$jdir" -name "*.json" | wc -l)
  wav_count=$(find -L "$adir" -name "*.wav" | wc -l)
  
  if [ "$json_count" -eq 0 ]; then
    echo >&2 "We expect that the directory $jdir will contain json files."
    echo >&2 "That implies you have supplied a wrong path to the data."
    exit 1
  fi
  if [ "$wav_count" -eq 0 ]; then
    echo >&2 "We expect that the directory $adir will contain wav files."
    echo >&2 "That implies you have supplied a wrong path to the data."
    exit 1
  fi
  
  echo "$0: Converting transcription to text"
  
  # Run local/json2text.py on every file matching <jdir>/*json and normalise
  # its combined output into <dir>/text.orig. The three substitutions are
  # applied in order to each line by a single sed process:
  #   1. "[inaudible" followed by any run of digits, spaces and dashes and a
  #      closing "]" is collapsed to the plain tag "[inaudible]"
  #   2. a free-standing " - " is replaced by a single space
  #   3. "mm-" loses its trailing dash
  # All path expansions are quoted so directories with spaces keep working.
  mkdir -p "$dir"
  for file in "$jdir"/*json; do
    ./local/json2text.py --mictype "$mictype" "$file"
  done | \
    sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" \
        -e 's/ - / /g' \
        -e 's/mm-/mm/g' > "$dir/text.orig"
  
  echo "$0: Creating datadir $dir for type=\"$mictype\""
  
  # Build <dir>/wav.scp and <dir>/text. The layout of both files depends on
  # the microphone type selected with $mictype.
  if [ "$mictype" = "worn" ]; then
    # Convert the filenames to wav.scp format, using the basename of the file
    # as the wav.scp key with .L and .R appended for the left and right
    # channel, i.e. each file gets two entries. A file named S03_P09.wav
    # yields the keys P09_S03.L and P09_S03.R (the two '_' fields are swapped).
    # The line terminator is written as an explicit \n escape so that no
    # source-code indentation can leak into the generated wav.scp.
    find -L "$adir" -name "S[0-9]*_P[0-9]*.wav" | \
      perl -ne '{
        chomp;
        $path = $_;
        next unless $path;
        @F = split "/", $path;
        ($f = $F[@F-1]) =~ s/\.wav$//;
        @F = split "_", $f;
        print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n";
        print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n";
      }' | sort > "$dir/wav.scp"
  
    # Generate the transcripts for both channels. An original line such as
    #   P09_S03-0006072-0006147 gimme the baker
    # is emitted twice (after trailing spaces are stripped):
    #   P09_S03.L-0006072-0006147 gimme the baker
    #   P09_S03.R-0006072-0006147 gimme the baker
    sed -n 's/  *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' "$dir/text.orig" | \
      sort > "$dir/text"
  elif [ "$mictype" = "ref" ]; then
    # Fixed reference array.
  
    # First create the text file (".ENH" is inserted before the first '-' of
    # every line); it is then used to decide which wav files to keep.
    perl -ne 's/-/.ENH-/;print;' "$dir/text.orig" | sort > "$dir/text"
  
    find -L "$adir" | grep "\.wav" | sort > "$dir/wav.flist"
    # Keep only the wav paths that match one of the "<field2>_<field3>"
    # prefixes occurring in the utterance IDs of $dir/text. The unique
    # prefixes are handed to grep as a pattern list (-f), which avoids
    # building an "-e pat -e pat ..." argument string through word splitting.
    grep -f <(cut -f 1 -d"-" "$dir/text" | \
                awk -F"_" '{print $2 "_" $3}' | \
                sed -e "s/\.ENH//" | sort -u) \
      "$dir/wav.flist" > "$dir/wav.flist2"
    # wav.scp key = file basename with ".wav" replaced by ".ENH".
    paste -d" " \
      <(awk -F "/" '{print $NF}' "$dir/wav.flist2" | sed -e "s/\.wav/.ENH/") \
      "$dir/wav.flist2" | sort > "$dir/wav.scp"
  else
    # Array mic case: any wav file whose path contains $mictype
    # (case-insensitively) is used. The wav.scp key is the basename of the
    # file without the ".wav" suffix.
    find -L "$adir" -name "*.wav" -ipath "*${mictype}*" |\
      perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav//;print "$F[$#F] $p";' |\
      sort -u > "$dir/wav.scp"
  
    # Convert the transcripts from
    #   P09_S03-0006072-0006147 gimme the baker
    # to per-channel transcripts by inserting .CH1 ... .CH4 before the
    # first '-', e.g.
    #   P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker
    #   P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker
    #   P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker
    #   P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker
    perl -ne '$l=$_;
      for($i=1; $i<=4; $i++) {
        ($x=$l)=~ s/-/.CH\Q$i\E-/;
        print $x;}' "$dir/text.orig" | sort > "$dir/text"
  
  fi
  # Remove intermediate files (text.orig, wav.scp.*, wav.flist) when requested.
  $cleanup && rm -f "$dir"/text.* "$dir"/wav.scp.* "$dir/wav.flist"
  
  # Prepare 'segments', 'utt2spk', 'spk2utt'
  
  #######################################
  # Turn the lines of a 'text' file into 'segments' lines.
  # For every input line the first space-separated field (the utterance ID,
  # shaped like <recording>-<start>-<end>) is converted to
  #   <utterance-id> <recording-id> <start/100> <end/100>
  # with both times printed as zero-padded %08.2f values.
  # NOTE(review): the division by 100 suggests the IDs carry times in
  # hundredths of a second - confirm against local/json2text.py.
  # The second "_<UPPERCASE>." group on the line (a location tag such as
  # _NOLOCATION. inside the recording ID) is reduced to ".", and for every
  # microphone type other than "worn" the leading " Pxx_" speaker prefix of
  # the recording ID is dropped as well.
  # Arguments: $1 - microphone type ("worn", "ref" or an array name)
  # Inputs:    'text' lines on stdin
  # Outputs:   'segments' lines on stdout
  #######################################
  chime5_make_segments() {
    local mic=$1
    # The awk format string ends in an explicit \n escape: awk does not
    # accept a raw line break inside a string constant.
    cut -d" " -f 1 |
      awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |
      sed -e "s/_[A-Z]*\././2" |
      if [ "$mic" = "worn" ]; then
        cat
      else
        sed -e 's/ P.._/ /'
      fi
  }
  
  # The input redirection comes first so nothing is written when text is missing.
  chime5_make_segments "$mictype" < "$dir/text" > "$dir/segments"
  # utt2spk: one "<utterance-id> <speaker-id>" line per segment, where the
  # speaker ID is the utterance ID up to (not including) its first '_'.
  # The record separator is an explicit \n escape so that each output line
  # starts in column 0 regardless of how this script is indented.
  cut -f 1 -d ' ' "$dir/segments" | \
    perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > "$dir/utt2spk"
  
  # spk2utt is derived from utt2spk by the utils helper script.
  utils/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt"
  
  # Check that data dirs are okay!
  utils/validate_data_dir.sh --no-feats "$dir" || exit 1