aspire_data_prep.sh 5.44 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156


#!/bin/bash
# Copyright 2015  Johns Hopkins University (Author: Vijayaditya Peddinti)
# Apache 2.0.

# Prepares the ASpIRE dev and dev-test sets as data directories under data/.
# The variables assigned before utils/parse_options.sh is sourced are the
# script defaults.
# NOTE(review): parse_options.sh presumably lets callers override them from
# the command line (--stage, --aspire-data, --mean-rms) -- confirm against
# that helper.
set -e

# First stage to run; every later section is guarded by [ $stage -le N ].
stage=0
# Location of aspire data.
aspire_data=/export/corpora/LDC/LDC2017S21/IARPA-ASpIRE-Dev-Sets-v2.0/data  # for JHU

mean_rms=0.0417 # determined from the mean rms value of data/train_rvb/mean_rms
. ./path.sh # Needed for KALDI_ROOT

. utils/parse_options.sh

# Derived only after option parsing so they follow the final aspire_data value.
dev_transcript=$aspire_data/dev_and_dev_test_STM_files
dev_audio=$aspire_data/dev_and_dev_test_audio/ASpIRE_single_dev
test_audio=$aspire_data/dev_and_dev_test_audio/ASpIRE_single_dev_test

# The glm file is copied into the prepared data directories later on, so fail
# early when it is missing. The exit is unconditional: it must not depend on
# whether the preceding echo succeeded.
if [ ! -f "$aspire_data/my_english.glm" ]; then
  echo "Expected to find the glm file, provided in ASpIRE challenge."
  echo "Please provide the glm file in $aspire_data."
  exit 1
fi

# (1) Collect the transcript and audio file lists and sanity-check their sizes.
tmpdir=$(pwd)/data/local/data
mkdir -p "$tmpdir"
if [ "$stage" -le 0 ]; then

  find "$dev_transcript/" -name 'dev.stm' > "$tmpdir/transcripts.flist"
  find "$dev_audio/" -name '*.wav' > "$tmpdir/wav.flist"
  find "$test_audio/" -name '*.wav' > "$tmpdir/wav_test.flist"

  # Count the distinct first-column values over all listed dev.stm files.
  # The files are read one at a time from the list, so an empty list yields 0
  # instead of leaving awk waiting on stdin, and paths are never word-split.
  # sort -u is used because uniq alone only merges adjacent duplicates.
  n=$(
    while IFS= read -r stm || [ -n "$stm" ]; do
      awk '{print $1}' "$stm"
    done < "$tmpdir/transcripts.flist" | sort -u | wc -l
  )
  if [ "$n" -ne 30 ]; then
    echo "Expected to find 30 transcript files in the aspire_single_dev_transcript directory, found $n"
    exit 1;
  fi

  # One line per .wav file found under the dev audio directory.
  n=$(wc -l < "$tmpdir/wav.flist")
  if [ "$n" -ne 30 ]; then
    echo "Expected to find 30 .wav files in the aspire_single_dev directory, found $n"
    exit 1;
  fi

  # One line per .wav file found under the dev-test audio directory.
  n=$(wc -l < "$tmpdir/wav_test.flist")
  if [ "$n" -ne 60 ]; then
    echo "Expected to find 60 .wav files in the aspire_single_dev_test data, found $n"
    exit 1;
  fi
fi

# create the dev_aspire files
dev=data/dev_aspire
if [ "$stage" -le 1 ]; then
  mkdir -p "$dev"

# transcription file format
# single_074f59de 1 single_074f59de 497.775 506.595 um everybody can't get their needs met in in in in a in a negotiations or to to their satisfaction but at least you're attemptin
#
# Fields as used below: 1 = file id, 2 = second component of the speaker id,
# 4 and 5 = segment start and end time, 6 onwards = the words.
#
# Files written:
#   $tmpdir/text.1  <utt-id> <words>                 (unsorted raw transcript)
#   $dev/stm        <spk-id> A <spk-id> <start> <end> <words>
#   $dev/utt2spk    <utt-id> <spk-id>                (sorted)
#   $dev/segments   <utt-id> <file-id>-1 <start> <end>  (sorted)

  python -c "
import sys

utt2spk = []
segments = []

with open('$tmpdir/transcripts.flist', 'r') as flist:
  stm_paths = [p.strip() for p in flist if p.strip()]

# The with-blocks make sure every handle is closed even if a line fails to parse.
with open('$tmpdir/text.1', 'w') as trans_file, open('$dev/stm', 'w') as stm_file:
  for stm_path in stm_paths:
    with open(stm_path) as stm_in:
      for line in stm_in:
        parts = line.split()
        # Blank lines and STM comment lines (starting with ;;) carry no segment.
        if not parts or parts[0].startswith(';;'):
          continue
        file_id = parts[0]
        # Times are scaled by 1000 and zero-padded to keep utterance ids fixed-width.
        utt_id = '{0}-{1}-{2:06}-{3:06}'.format(parts[0], parts[1], int(float(parts[3]) * 1000), int(float(parts[4]) * 1000))
        spk_id = '{0}-{1}'.format(parts[0], parts[1])
        stm_file.write('{0} A {0} {1}\n'.format(spk_id, ' '.join(parts[3:])))
        trans_file.write('{0} {1}\n'.format(utt_id, ' '.join(parts[5:])))
        utt2spk.append('{0} {1}\n'.format(utt_id, spk_id))
        segments.append('{0} {1}-1 {2} {3}\n'.format(utt_id, file_id, parts[3], parts[4]))

# segments is sorted the same way as utt2spk so both list utterances in the
# same order (Kaldi data directories are expected to be sorted by utterance id).
utt2spk.sort()
segments.sort()
with open('$dev/utt2spk', 'w') as utt2spk_file:
  utt2spk_file.write(''.join(utt2spk))
with open('$dev/segments', 'w') as segments_file:
  segments_file.write(''.join(segments))
" || exit 1
fi

if [ "$stage" -le 2 ]; then
  # Build the final transcript from the raw one:
  #   - sort the lines,
  #   - drop every utterance whose text contains '((',
  #   - drop utterances that have an id but no words (NF <= 1),
  #   - rename [laugh] to [laughter] and map the remaining event tags to
  #     [noise]. All substitutions run in one sed process, applied in the
  #     order listed.
  sort "$tmpdir/text.1" | grep -v '((' | \
    awk 'NF > 1' | \
    sed -e 's:\[laugh\]:[laughter]:g' \
        -e 's:\[sigh\]:[noise]:g' \
        -e 's:\[cough\]:[noise]:g' \
        -e 's:\[mn\]:[noise]:g' \
        -e 's:\[breath\]:[noise]:g' \
        -e 's:\[lipsmack\]:[noise]:g' > "$tmpdir/text.2"
  cp "$tmpdir/text.2" "$dev/text"

  # Derive the speaker-to-utterance map from the utt2spk file in $dev.
  utils/utt2spk_to_spk2utt.pl < "$dev/utt2spk" > "$dev/spk2utt"
fi

if [ "$stage" -le 3 ]; then
  # convert to absolute path
  # (read line by line so paths are not word-split; stdin of the helper is
  # detached so it can never consume the list being read)
  while IFS= read -r f || [ -n "$f" ]; do
    [ -n "$f" ] || continue
    utils/make_absolute.sh "$f" < /dev/null
  done < "$tmpdir/wav.flist" > "$tmpdir/wav_abs.flist"

  # Write one wav.scp entry per recording: '<file>-1 sox --vol <gain> <path> ... |'.
  # The gain is mean_rms divided by the RMS amplitude that 'sox <path> -n stat'
  # reports on stderr. The pipeline runs in a subshell with pipefail so that a
  # failure of the python step is not hidden by the exit status of sort.
  # The python code runs under both Python 2 and Python 3.
  (
    set -o pipefail
    python -c "
import os, re, subprocess, sys

for line in sys.stdin:
  wav = line.strip()
  if not wav:
    continue
  proc = subprocess.Popen(['sox', wav, '-n', 'stat'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  out, err = proc.communicate()
  # Python 3 returns bytes here; the regular expression below needs text.
  if not isinstance(err, str):
    err = err.decode('utf-8', 'replace')
  rms = float(re.split(r'RMS\s+amplitude:', err)[1].split()[0])
  out_rms = $mean_rms / rms
  file_id = os.path.splitext(os.path.basename(wav))[0] + '-1'
  print('{0} sox --vol {1} {2} -r 8000 -t wav - |'.format(file_id, out_rms, wav))
" < "$tmpdir/wav_abs.flist" | sort -k1,1 -u > "$dev/wav.scp"
  ) || exit 1

  # Each recording id maps to itself as file name, on channel A.
  awk '{printf("%s %s A\n", $1, $1)}' "$dev/wav.scp" > "$dev/reco2file_and_channel"
  cp "$aspire_data/my_english.glm" "$dev/glm"
fi

# prepare test data
if [ "$stage" -le 4 ]; then
  for dataset in test ; do
    test=data/${dataset}_aspire
    mkdir -p "$test"

    # convert to absolute path
    # (read line by line so paths are not word-split; stdin of the helper is
    # detached so it can never consume the list being read)
    while IFS= read -r f || [ -n "$f" ]; do
      [ -n "$f" ] || continue
      utils/make_absolute.sh "$f" < /dev/null
    done < "$tmpdir/wav_${dataset}.flist" > "$tmpdir/wav_${dataset}_abs.flist"

    # Write one wav.scp entry per recording: '<file>-1 sox --vol <gain> <path> ... |'.
    # The gain is mean_rms divided by the RMS amplitude that 'sox <path> -n stat'
    # reports on stderr. The pipeline runs in a subshell with pipefail so that
    # a failure of the python step is not hidden by the exit status of sort.
    # The python code runs under both Python 2 and Python 3.
    (
      set -o pipefail
      python -c "
import os, re, subprocess, sys

for line in sys.stdin:
  wav = line.strip()
  if not wav:
    continue
  proc = subprocess.Popen(['sox', wav, '-n', 'stat'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  out, err = proc.communicate()
  # Python 3 returns bytes here; the regular expression below needs text.
  if not isinstance(err, str):
    err = err.decode('utf-8', 'replace')
  rms = float(re.split(r'RMS\s+amplitude:', err)[1].split()[0])
  out_rms = $mean_rms / rms
  file_id = os.path.splitext(os.path.basename(wav))[0] + '-1'
  print('{0} sox --vol {1} {2} -r 8000 -t wav - |'.format(file_id, out_rms, wav))
" < "$tmpdir/wav_${dataset}_abs.flist" | sort -k1,1 -u > "$test/wav.scp"
    ) || exit 1

    # No segmentation or speaker information is used here: every recording is
    # its own utterance and its own speaker, so utt2spk and spk2utt have
    # identical contents.
    awk '{printf("%s %s\n", $1, $1)}' "$test/wav.scp" > "$test/utt2spk"
    cp "$test/utt2spk" "$test/spk2utt"
    awk '{printf("%s %s A\n", $1, $1)}' "$test/wav.scp" > "$test/reco2file_and_channel"
    cp "$aspire_data/my_english.glm" "$test/glm"
  done
fi

echo "Aspire dev/test/eval data preparation succeeded"