prepare_data.sh 4.7 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136


#!/bin/bash
#
# Copyright  2017  Johns Hopkins University (Author: Shinji Watanabe, Yenda Trmal)
# Apache 2.0

# Begin configuration section.
# Defaults for the optional settings. NOTE(review): parse_options.sh is not
# part of this file; presumably it lets callers override these as
# "--mictype <value>" / "--cleanup <true|false>" before the positional args.
mictype=worn # worn, ref or others (any other value is matched against the wav path)
cleanup=true # when true, intermediate files in <output-dir> are removed
# End configuration section
. ./utils/parse_options.sh  # accept the [opts] shown in the usage message below

. ./path.sh

# Log the invocation (script name and remaining arguments) on stderr.
echo >&2 "$0" "$@"
# Exactly three positional arguments are required.
if [ $# -ne 3 ] ; then
  echo >&2 "$0: Error: wrong number of arguments"
  printf >&2 'Usage:\n  %s [opts] <audio-dir> <json-transcript-dir> <output-dir>\n' "$0"
  printf >&2 'eg:\n  %s /corpora/chime5/audio/train /corpora/chime5/transcriptions/train data/train\n' "$0"
  exit 1
fi

# From here on any failing command (or pipeline stage) aborts the script.
set -e -o pipefail

# Positional arguments (the argument count was validated above).
adir=$1   # searched recursively for *.wav files
jdir=$2   # holds the *.json transcripts
dir=$3    # output data directory

# Count the inputs up front so that a wrong path yields a readable error.
# The paths are quoted so that an empty or space-containing argument is not
# silently dropped or split (an empty $jdir used to make find scan the cwd).
# "|| true": with errexit+pipefail a failing find (e.g. a missing directory)
# would otherwise abort the script before the explanations below are printed;
# find still reports its own error on stderr and the count simply ends up 0.
json_count=$( { find -L "$jdir" -name "*.json" || true; } | wc -l )
wav_count=$( { find -L "$adir" -name "*.wav" || true; } | wc -l )

if [ "$json_count" -eq 0 ]; then
  echo >&2 "We expect that the directory $jdir will contain json files."
  echo >&2 "That implies you have supplied a wrong path to the data."
  exit 1
fi
if [ "$wav_count" -eq 0 ]; then
  echo >&2 "We expect that the directory $adir will contain wav files."
  echo >&2 "That implies you have supplied a wrong path to the data."
  exit 1
fi

echo "$0: Converting transcription to text"

mkdir -p "$dir"
# Run json2text.py on every transcript directly inside $jdir and normalise the
# concatenated output in a single sed pass (the expressions are applied to
# each line in this order):
#   "[inaudible" followed by digits/spaces/dashes and "]"  -> "[inaudible]"
#   " - "                                                   -> " "
#   "mm-"                                                   -> "mm"
# The result is the intermediate transcript $dir/text.orig.
for file in "$jdir"/*json; do
  ./local/json2text.py --mictype "$mictype" "$file"
done |
  sed -e "s/\[inaudible[- 0-9]*\]/[inaudible]/g" \
      -e 's/ - / /g' \
      -e 's/mm-/mm/g' > "$dir/text.orig"

echo "$0: Creating datadir $dir for type=\"$mictype\""

# Build $dir/wav.scp and $dir/text; the layout depends on the microphone type.
if [ "$mictype" == "worn" ]; then
  # convert the filenames to wav.scp format: the basename S<..>_P<..>.wav is
  # reordered to P<..>_S<..> and used as the wav.scp key, with .L and .R added
  # for the left and right channel,
  # i.e. each file will have two entries (left and right channel)
  find -L "$adir" -name "S[0-9]*_P[0-9]*.wav" |
    perl -ne '{
      chomp;
      $path = $_;
      next unless $path;
      @F = split "/", $path;
      # strip only the literal ".wav" extension at the end of the basename
      ($f = $F[@F-1]) =~ s/\.wav$//;
      @F = split "_", $f;
      print "${F[1]}_${F[0]}.L sox $path -t wav - remix 1 |\n";
      print "${F[1]}_${F[0]}.R sox $path -t wav - remix 2 |\n";
    }' | sort > "$dir/wav.scp"

  # generate the transcripts for both left and right channel
  # from the original transcript in the form
  # P09_S03-0006072-0006147 gimme the baker
  # create left and right channel transcript
  # P09_S03.L-0006072-0006147 gimme the baker
  # P09_S03.R-0006072-0006147 gimme the baker
  # (trailing spaces are stripped first; only the first "-" is rewritten)
  sed -n 's/  *$//; h; s/-/\.L-/p; g; s/-/\.R-/p' "$dir/text.orig" | sort > "$dir/text"
elif [ "$mictype" == "ref" ]; then
  # fixed reference array

  # first get a text, which will be used to extract reference arrays
  # (".ENH" is inserted in front of the first "-" of every line)
  perl -ne 's/-/.ENH-/;print;' "$dir/text.orig" | sort > "$dir/text"

  # every path below $adir that ends in ".wav"
  find -L "$adir" | grep '\.wav$' | sort > "$dir/wav.flist"
  # Keep only the wav files of the reference arrays: one grep pattern per
  # distinct "<field2>_<field3>" of the utterance ids ("_"-separated, with any
  # ".ENH" removed). The patterns are passed with -f so that an empty list
  # matches nothing (grep then fails and the script stops) instead of turning
  # the file name into the pattern and reading stdin.
  grep -f <(cut -f 1 -d"-" "$dir/text" | awk -F"_" '{print $2 "_" $3}' |
              sed -e "s/\.ENH//" | sort -u) \
    "$dir/wav.flist" > "$dir/wav.flist2"
  # wav.scp: "<basename with .wav replaced by .ENH> <path>"
  paste -d" " \
    <(awk -F "/" '{print $NF}' "$dir/wav.flist2" | sed -e 's/\.wav$/.ENH/') \
    "$dir/wav.flist2" | sort > "$dir/wav.scp"
else
  # array mic case
  # convert the filenames to wav.scp format, use the basename of the file
  # as the wav.scp key; only wav files whose path contains $mictype
  # (case-insensitively) are used
  find -L "$adir" -name "*.wav" -ipath "*${mictype}*" |
    perl -ne '$p=$_;chomp $_;@F=split "/";$F[$#F]=~s/\.wav$//;print "$F[$#F] $p";' |
    sort -u > "$dir/wav.scp"

  # replicate every transcript line for channels 1..4 by inserting ".CH<n>"
  # in front of the first "-", e.g. (NOTE(review): assumes json2text.py
  # already emits the array/location part of the id - confirm)
  # P09_S03_U01_NOLOCATION-0006072-0006147 gimme the baker
  # becomes the per-channel transcripts
  # P09_S03_U01_NOLOCATION.CH1-0006072-0006147 gimme the baker
  # P09_S03_U01_NOLOCATION.CH2-0006072-0006147 gimme the baker
  # P09_S03_U01_NOLOCATION.CH3-0006072-0006147 gimme the baker
  # P09_S03_U01_NOLOCATION.CH4-0006072-0006147 gimme the baker
  perl -ne '$l=$_;
    for($i=1; $i<=4; $i++) {
      ($x=$l)=~ s/-/.CH\Q$i\E-/;
      print $x;}' "$dir/text.orig" | sort > "$dir/text"

fi
# Remove the intermediate files when cleanup is "true". The value is compared
# rather than executed, so a malformed setting cannot run as a command, and
# wav.flist2 (written for mictype=ref) is removed together with wav.flist.
if [ "$cleanup" == "true" ]; then
  rm -f -- "$dir"/text.* "$dir"/wav.scp.* "$dir"/wav.flist "$dir"/wav.flist2
fi

# Prepare 'segments', 'utt2spk', 'spk2utt'

#######################################
# Convert a Kaldi "text" stream into "segments" lines.
# For every input line the utterance id (first space-separated field, shaped
# <id>-<start>-<end>) yields
#   <utt-id> <recording-id> <start/100> <end/100>
# with both times printed as %08.2f. The recording id is <id>; its
# "_<UPPERCASE>." part, if any, is collapsed to "." (2nd match on the line,
# i.e. the copy inside the recording id) and, unless the mictype is "worn",
# the leading "P??_" is removed from it as well.
# Arguments: $1 - microphone type
# Inputs:    text lines on stdin
# Outputs:   segments lines on stdout
#######################################
text_to_segments() {
  local mic=$1
  cut -d" " -f 1 |
    awk -F"-" '{printf("%s %s %08.2f %08.2f\n", $0, $1, $2/100.0, $3/100.0)}' |
    sed -e "s/_[A-Z]*\././2" |
    if [ "$mic" == "worn" ]; then
      # worn recording ids keep their speaker prefix
      cat
    else
      sed -e 's/ P.._/ /'
    fi
}

text_to_segments "$mictype" < "$dir/text" > "$dir/segments"
# utt2spk: "<utt-id> <speaker>", where the speaker is the utterance id with
# everything from its first "_" onwards removed.
cut -f 1 -d ' ' "$dir/segments" |
  perl -ne 'chomp;$utt=$_;s/_.*//;print "$utt $_\n";' > "$dir/utt2spk"

# spk2utt is derived from utt2spk by the helper script.
utils/utt2spk_to_spk2utt.pl "$dir/utt2spk" > "$dir/spk2utt"

# Check that data dirs are okay!
utils/validate_data_dir.sh --no-feats "$dir" || exit 1