#!/bin/bash
# Copyright 2009-2012  Microsoft Corporation; Johns Hopkins University (Author: Daniel Povey)
# Copyright 2013-2014  Mirsk Digital ApS (Author: Andreas Kirkedal)
# Copyright 2016 KTH Royal Institute of Technology (Author: Emelie Kullmann)
# Apache 2.0.
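#
# Downloads the Swedish sprakbanken speech corpus (16 kHz) from www.nb.no,
# unpacks it under data/local/data/download, and prepares Kaldi-style file
# lists, LM training text and wav.scp/utt2spk/text for the train and test sets.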
dir=$(pwd)/data/local/data
lmdir=$(pwd)/data/local/transcript_lm
traindir=$(pwd)/data/local/trainsrc
testdir=$(pwd)/data/local/testsrc
rm -rf $lmdir $traindir $testdir
mkdir -p $dir $lmdir $traindir $testdir
local=$(pwd)/local
utils=$(pwd)/utils
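# path.sh sets up KALDI_ROOT and puts the Kaldi binaries on PATH;
# KALDI_ROOT is used below as a fallback location for sph2pipe.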
. ./path.sh
# Check that python3 is available on the system; this recipe relies on
# python3 because it uses UTF-8 as its internal string representation.
if ! which python3 >&/dev/null; then
echo "python3 is not installed; to install it you should probably do:"
echo "  sudo apt-get install python3"
exit 1
fi
mkdir -p $dir/download/0467-1 $dir/download/0467-2 $dir/download/0467-3
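# Fetch each part of the corpus unless its tarball is already present, so the
# script can be re-run without downloading everything again.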
echo "Downloading and unpacking sprakbanken to $dir/download. This will take a while."
if [ ! -f $dir/download/sve.16khz.0467-1.tar.gz ]; then
( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-1.tar.gz --directory-prefix=$dir/download )
fi
if [ ! -f $dir/download/sve.16khz.0467-2.tar.gz ]; then
( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-2.tar.gz --directory-prefix=$dir/download )
fi
if [ ! -f $dir/download/sve.16khz.0467-3.tar.gz ]; then
( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0467-3.tar.gz --directory-prefix=$dir/download )
fi
if [ ! -f $dir/download/sve.16khz.0468.tar.gz ]; then
( wget --tries 100 http://www.nb.no/sbfil/talegjenkjenning/16kHz/sve.16khz.0468.tar.gz --directory-prefix=$dir/download )
fi
echo "Corpus files downloaded."
if [ ! -d $dir/download/0468 ]; then
echo "Unpacking files."
mkdir -p $dir/download/0468
tar -xzf $dir/download/sve.16khz.0467-1.tar.gz -C $dir/download/0467-1
tar -xzf $dir/download/sve.16khz.0467-2.tar.gz -C $dir/download/0467-2
tar -xzf $dir/download/sve.16khz.0467-3.tar.gz -C $dir/download/0467-3
tar -xzf $dir/download/sve.16khz.0468.tar.gz -C $dir/download/0468
echo "Corpus unpacked successfully."
fi
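# Locate sph2pipe, either on PATH or in the Kaldi tools directory; it is
# passed on to data_prep.py below because the corpus "wav" files are really
# sph files.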
sph2pipe=$(which sph2pipe) || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x $sph2pipe ]; then
echo "Could not find (or execute) the sph2pipe program at $sph2pipe . Did you run 'make' in the tools directory?";
exit 1;
fi
echo "Found sph2pipe."
echo "Converting downloaded files to a format consumable by Kaldi scripts."
rm -rf $dir/corpus_processed
mkdir -p $dir/corpus_processed/training/0467-1 $dir/corpus_processed/training/0467-2 $dir/corpus_processed/training/0467-3
# Create parallel file lists and text files, but keep sound files in the same location to save disk space
# Writes the lists to data/local/data (~ 310h)
echo "Creating parallel data for training data."
python3 $local/sprak2kaldi.py $dir/download/0467-1 $dir/corpus_processed/training/0467-1 # ~140h
python3 $local/sprak2kaldi.py $dir/download/0467-2 $dir/corpus_processed/training/0467-2 # ~125h
python3 $local/sprak2kaldi.py $dir/download/0467-3 $dir/corpus_processed/training/0467-3 # ~128h
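# One session directory in 0467-1 contains a space in its name; rename it and
# its transcript files so that whitespace-separated file lists stay intact.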
mv $dir/corpus_processed/training/0467-1/'r4670118.791213 8232' $dir/corpus_processed/training/0467-1/'r4670118.791213_8232'
for f in $dir/corpus_processed/training/0467-1/r4670118.791213_8232/*.txt; do
mv "$f" "${f// /_}";
done
(
# Ditto test set (~ 93h)
echo "Creating parallel data for test data."
rm -rf $dir/corpus_processed/test/0468
mkdir -p $dir/corpus_processed/test/0468
python3 $local/sprak2kaldi.py $dir/download/0468 $dir/corpus_processed/test/0468
)
# Create the LM training data
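# Concatenate all training transcripts, normalise them and keep the unique
# sentences as LM training text in $lmdir/transcripts.uniq.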
(
echo "Writing the LM text to file and normalising."
cat $dir/corpus_processed/training/0467-1/txtlist $dir/corpus_processed/training/0467-2/txtlist $dir/corpus_processed/training/0467-3/txtlist | while read -r l; do cat "$l"; done > $lmdir/lmsents
python3 local/normalize_transcript.py $lmdir/lmsents $lmdir/lmsents.norm
sort -u $lmdir/lmsents.norm > $lmdir/transcripts.uniq
)
# Combine training file lists
echo "Combine file lists."
cat $dir/corpus_processed/training/0467-1/txtlist $dir/corpus_processed/training/0467-2/txtlist $dir/corpus_processed/training/0467-3/txtlist > $dir/traintxtfiles
cat $dir/corpus_processed/training/0467-1/sndlist $dir/corpus_processed/training/0467-2/sndlist $dir/corpus_processed/training/0467-3/sndlist > $dir/trainsndfiles
# Move test file lists to the right location
cp $dir/corpus_processed/test/0468/txtlist $dir/testtxtfiles
cp $dir/corpus_processed/test/0468/sndlist $dir/testsndfiles
# Write wav.scp, utt2spk and text.unnormalised for the train and test sets.
# sph2pipe is used because the wav files are actually sph files.
echo "Creating wav.scp, utt2spk and text.unnormalised for train and test"
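# data_prep.py takes the text and sound file lists plus sph2pipe and writes
# wav.scp, utt2spk and text.unnormalised into the given output directory.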
python3 $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe
python3 $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe
# Create the main data sets
local/create_datasets.sh $testdir data/test
local/create_datasets.sh $traindir data/train
echo "Data preparation succeeded"