prepare_data.sh 3.98 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141


#!/bin/bash  

# Copyright 2018 John Morgan
# Apache 2.0.

# ---- configuration ----
# scratch area for the intermediate lists
tmpdir="data/local/tmp"
# the corpus is looked up relative to the directory the script is run from
download_dir="$(pwd)"
# one scratch directory per corpus
tmp_tunis="${tmpdir}/tunis"
tmp_libyan="${tmpdir}/libyan"
# root of the Tunisian_MSA data
data_dir="${download_dir}/Tunisian_MSA/data"
# location of the Libyan test data
libyan_src="${data_dir}/speech/test/Libyan_MSA"
# ---- end of configuration ----

# process the Tunisian MSA devtest data

# The loop below appends to data/devtest/* once per subdirectory, so start
# from empty files; otherwise running the script a second time would
# duplicate every entry left over from the previous run.
mkdir -p data/devtest
for x in wav.scp utt2spk text; do
  : > "data/devtest/$x"
done

for s in devtest/CTELLONE/Recordings_Arabic/6 devtest/CTELLTHREE/Recordings_Arabic/10; do
  echo "$0: looking for wav files for $s."
  mkdir -p "$tmp_tunis/$s"

  # get list of wav files
  # ($data_dir is derived from $(pwd), hence the quoting)
  find "$data_dir/speech/$s" -type f -name "*.wav" \
    | grep Recordings_Arabic > "$tmp_tunis/$s/wav.txt"

  # expected to leave wav.scp, utt2spk and text in $tmp_tunis/$s,
  # which are read back just below
  local/devtest_recordings_make_lists.pl \
    "$data_dir/transcripts/devtest/recordings.tsv" "$s" tunis

  for x in wav.scp utt2spk text; do
    # replace the tab separators with spaces while collecting the lists
    tr '\t' ' ' < "$tmp_tunis/$s/$x" >> "data/devtest/$x"
  done
done

utils/utt2spk_to_spk2utt.pl data/devtest/utt2spk | sort > data/devtest/spk2utt

utils/fix_data_dir.sh data/devtest

# The training data has two parts: answers and recordings (recited speech).
answers_transcripts="${data_dir}/transcripts/train/answers.tsv"
recordings_transcripts="${data_dir}/transcripts/train/recordings.tsv"

# Transcript files of the test data, one per test set.
# NOTE(review): these four variables are not referenced again in this script.
cls_rec_tr="${libyan_src}/cls/data/transcripts/recordings/cls_recordings.tsv"
lfi_rec_tr="${libyan_src}/lfi/data/transcripts/recordings/lfi_recordings.tsv"
srj_rec_tr="${libyan_src}/srj/data/transcripts/recordings/srj_recordings.tsv"
mbt_rec_tr="${data_dir}/transcripts/test/mbt/recordings/mbt_recordings.tsv"

# make acoustic model training lists
mkdir -p "$tmp_tunis"

# get wav file names
#
# Each list is produced by piping the whole per-machine loop into one grep
# that writes with ">", so the list is rebuilt from scratch on every run
# instead of growing each time the script is executed again.
# The paths are quoted because $data_dir is derived from $(pwd).

# for recited speech
# the data collection laptops had names like CTELLONE CTELLTWO ...
for machine in CTELLONE CTELLTWO CTELLTHREE CTELLFOUR CTELLFIVE; do
  find "$data_dir/speech/train/$machine" -type f -name "*.wav"
done | grep Recordings > "$tmp_tunis/recordings_wav.txt"

# get file names for Answers
for machine in CTELLONE CTELLTWO CTELLTHREE CTELLFOUR CTELLFIVE; do
  find "$data_dir/speech/train/$machine" -type f -name "*.wav"
done | grep Answers > "$tmp_tunis/answers_wav.txt"

# make separate transcription lists for answers and recordings
# NOTE(review): this locale stays exported for the rest of the script, so it
# also applies to the sort calls further down.
export LC_ALL=en_US.UTF-8

# The transcript paths are derived from $(pwd); they are quoted so that a
# working directory containing whitespace is still passed as one argument.
local/answers_make_lists.pl "$answers_transcripts"

utils/fix_data_dir.sh "$tmp_tunis/answers"

local/recordings_make_lists.pl "$recordings_transcripts"

utils/fix_data_dir.sh "$tmp_tunis/recordings"

# consolidate lists
# acoustic models will be trained on both recited and prompted speech
mkdir -p "$tmp_tunis/lists"

for x in wav.scp utt2spk text; do
  cat "$tmp_tunis/answers/$x" "$tmp_tunis/recordings/$x" > "$tmp_tunis/lists/$x"
done

utils/fix_data_dir.sh "$tmp_tunis/lists"

# get training lists
mkdir -p data/train
for x in wav.scp utt2spk text; do
  # sort the consolidated list, then replace the tab separators with spaces
  sort "$tmp_tunis/lists/$x" | tr '\t' ' ' > "data/train/$x"
done

utils/utt2spk_to_spk2utt.pl data/train/utt2spk | sort > data/train/spk2utt

utils/fix_data_dir.sh data/train

# process the Libyan MSA data
mkdir -p "$tmp_libyan"

for s in cls lfi srj; do
  mkdir -p "$tmp_libyan/$s"

  # get list of wav files
  # $libyan_src is derived from $(pwd); it is quoted so that a working
  # directory containing whitespace is passed as a single argument
  find "$libyan_src/$s" -type f -name "*.wav" \
    | grep recordings > "$tmp_libyan/$s/recordings_wav.txt"

  echo "$0: making recordings list for $s"
  local/test_recordings_make_lists.pl \
    "$libyan_src/$s/data/transcripts/recordings/${s}_recordings.tsv" "$s" libyan
done

# process the Tunisian MSA test data

mkdir -p "$tmp_tunis/mbt"

# get list of wav files
# $data_dir is derived from $(pwd); it is quoted so that a working
# directory containing whitespace is passed as a single argument
find "$data_dir/speech/test/mbt" -type f -name "*.wav" \
  | grep recordings > "$tmp_tunis/mbt/recordings_wav.txt"

echo "$0: making recordings list for mbt"
local/test_recordings_make_lists.pl \
  "$data_dir/transcripts/test/mbt/recordings/mbt_recordings.tsv" mbt tunis

# Assemble data/test from the three Libyan sets followed by the Tunisian
# mbt set.
mkdir -p data/test

# Each output file is written once with ">", in the order cls, lfi, srj, mbt,
# so it is rebuilt from scratch on every run. Appending with ">>" would
# duplicate all entries whenever the script is executed again.
for x in wav.scp utt2spk text; do
  cat \
    "$tmp_libyan/cls/recordings/$x" \
    "$tmp_libyan/lfi/recordings/$x" \
    "$tmp_libyan/srj/recordings/$x" \
    "$tmp_tunis/mbt/recordings/$x" \
    | tr '\t' ' ' > "data/test/$x"   # tab separators become spaces
done

utils/utt2spk_to_spk2utt.pl data/test/utt2spk | sort > data/test/spk2utt

utils/fix_data_dir.sh data/test