Blame view

egs/sprakbanken_swe/s5/local/sprak_data_prep.sh 5.19 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
  #!/bin/bash
  
  # Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
  # Copyright 2013-2014  Mirsk Digital Aps (Author: Andreas Kirkedal)
  # Copyright 2016 KTH Royal Institute of Technology (Author: Emelie Kullmann)
  # Apache 2.0.
  
  
  # Working directories, all rooted at the directory the script is started from:
  #   dir      - downloads, processed corpus and intermediate file lists
  #   lmdir    - text used for language-model training
  #   traindir - intermediate files for the training set
  #   testdir  - intermediate files for the test set
  dir=$(pwd)/data/local/data
  lmdir=$(pwd)/data/local/transcript_lm
  traindir=$(pwd)/data/local/trainsrc
  testdir=$(pwd)/data/local/testsrc
  # Start from clean output directories; $dir itself is kept so that existing
  # downloads can be reused. Only directories defined above are listed: this
  # script defines no dev directory, and an undefined name in an 'rm -rf' list
  # would delete whatever an inherited environment variable happens to hold.
  rm -rf -- "$lmdir" "$traindir" "$testdir"
  mkdir -p -- "$dir" "$lmdir" "$traindir" "$testdir"
  local=$(pwd)/local
  utils=$(pwd)/utils
  
  
  # Pull in the recipe's environment settings from path.sh in the current directory.
  # NOTE(review): presumably this is where KALDI_ROOT is defined -- confirm.
  source ./path.sh

  # The Python helpers of this recipe are expected to run under Python 3, chosen
  # because its strings are Unicode by default.
  # The availability check below is currently disabled; if re-enabled it only
  # prints an installation hint, it does not install anything.

  #if ! which python3 >&/dev/null; then
  #  echo "Python3 is not installed, to install it you should probably do:"
  #  echo "sudo apt-get install python3" || exit 1;
  #fi
  
  # Make sure the extraction directories for the three training parts exist.
  # mkdir -p is idempotent, so this is safe on re-runs and also repairs a
  # download directory that exists but lacks some of its sub-directories.
  mkdir -p "$dir/download/0467-1" "$dir/download/0467-2" "$dir/download/0467-3" || exit 1

  echo "Downloading and unpacking sprakbanken to $dir/corpus_processed. This will take a while."

  # Fetch every corpus archive that is not present yet. Each archive is checked
  # under its own name, so the 0468 archive is fetched even when the 0467
  # archives are already there.
  for part in 0467-1 0467-2 0467-3 0468; do
      archive=sve.16khz.$part.tar.gz
      if [ ! -f "$dir/download/$archive" ]; then
          if ! wget --tries 100 "http://www.nb.no/sbfil/talegjenkjenning/16kHz/$archive" --directory-prefix="$dir/download"; then
              echo "$0: failed to download $archive" >&2
              # Drop a partial file so that the next run retries instead of
              # treating the incomplete archive as already downloaded.
              rm -f -- "$dir/download/$archive"
              exit 1
          fi
      fi
  done

  echo "Corpus files downloaded."
  
  # Unpack the archives unless that has been done already. The 0468 directory
  # serves as the "already unpacked" marker, so it is created last and removed
  # again if its extraction fails; otherwise a failed run would be skipped on
  # the next invocation.
  if [ ! -d "$dir/download/0468" ]; then
      echo "Unpacking files."
      for part in 0467-1 0467-2 0467-3; do
          # tar -C needs an existing directory to change into.
          mkdir -p "$dir/download/$part" || exit 1
          tar -xzf "$dir/download/sve.16khz.$part.tar.gz" -C "$dir/download/$part" || exit 1
      done

      mkdir -p "$dir/download/0468" || exit 1
      if ! tar -xzf "$dir/download/sve.16khz.0468.tar.gz" -C "$dir/download/0468"; then
          # ${dir:?} aborts rather than touching /download/0468 if $dir is empty.
          rm -rf -- "${dir:?}/download/0468"
          exit 1
      fi

      echo "Corpus unpacked succesfully."
  fi
  
  # Locate sph2pipe: prefer one found on PATH, otherwise fall back to the copy
  # under $KALDI_ROOT/tools. The resulting path is stored in $sph2pipe.
  sph2pipe=$(command -v sph2pipe) || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
  # Quoted so that an installation path containing spaces is tested as one word.
  if [ ! -x "$sph2pipe" ]; then
      echo "Could not find (or execute) the sph2pipe program at $sph2pipe . Did you run 'make' in the tools directory?" >&2
      exit 1
  fi

  echo "done"
  
  echo "Converting downloaded files to a format consumable by Kaldi scripts."

  # Rebuild the processed corpus from scratch. ${dir:?} aborts instead of
  # deleting /corpus_processed should $dir ever be empty or unset.
  rm -rf -- "${dir:?}/corpus_processed"

  # Create parallel file lists and text files, but keep sound files in the same
  # location to save disk space. The lists end up below
  # $dir/corpus_processed/training/<part>.
  echo "Creating parallel data for training data."
  # Sizes noted for the three parts: ~140h, ~125h and ~128h.
  for part in 0467-1 0467-2 0467-3; do
      mkdir -p "$dir/corpus_processed/training/$part" || exit 1
      python "$local/sprak2kaldi.py" "$dir/download/$part" "$dir/corpus_processed/training/$part" || exit 1
  done

  # One session directory has a space in its name; replace the space with an
  # underscore, and do the same for the .txt file names inside that directory.
  # NOTE(review): the txtlist/sndlist files are produced before this rename --
  # confirm that they do not refer to the old names.
  session_parent=$dir/corpus_processed/training/0467-1
  if [ -d "$session_parent/r4670118.791213 8232" ]; then
      mv -- "$session_parent/r4670118.791213 8232" "$session_parent/r4670118.791213_8232" || exit 1
  fi
  for f in "$session_parent"/r4670118.791213_8232/*.txt; do
      # Skip the literal pattern that is left over when nothing matches.
      [ -e "$f" ] || continue
      name=${f##*/}
      case $name in
          # Only the file name is rewritten; spaces elsewhere in the path
          # (e.g. in the working directory) must be left alone.
          *" "*) mv -- "$f" "${f%/*}/${name// /_}" || exit 1 ;;
      esac
  done
  
  # Ditto test set (~ 93h): recreate the output directory and convert the
  # unpacked 0468 part with the same converter that is used for training data.
  echo "Creating parallel data for test data."
  # ${dir:?} aborts instead of removing a path below / if $dir is empty or unset.
  rm -rf -- "${dir:?}/corpus_processed/test/0468"
  mkdir -p "$dir/corpus_processed/test/0468" || exit 1
  python "$local/sprak2kaldi.py" "$dir/download/0468" "$dir/corpus_processed/test/0468" || exit 1
  
  
  # Create the LM training data: concatenate all training transcripts,
  # normalise them and keep one copy of every distinct line.
  echo "Writing the LM text to file and normalising."
  # Every line of a txtlist is taken as the path of one transcript file.
  # 'read -r' keeps backslashes literal; surrounding blanks are still trimmed.
  cat "$dir/corpus_processed/training/0467-1/txtlist" \
      "$dir/corpus_processed/training/0467-2/txtlist" \
      "$dir/corpus_processed/training/0467-3/txtlist" \
      | while read -r txtfile; do
            # A blank line must not reach cat: without an operand cat would
            # read the remainder of the list from the pipe instead of a file.
            [ -n "$txtfile" ] || continue
            cat -- "$txtfile"
        done > "$lmdir/lmsents"
  python local/normalize_transcript.py "$lmdir/lmsents" "$lmdir/lmsents.norm" || exit 1
  sort -u "$lmdir/lmsents.norm" > "$lmdir/transcripts.uniq" || exit 1
  
  # Combine the per-part training lists into one text-file list and one
  # sound-file list, and put the test lists next to them under $dir.
  echo "Combine file lists."
  train_lists=$dir/corpus_processed/training
  for kind in txt snd; do
      cat "$train_lists/0467-1/${kind}list" \
          "$train_lists/0467-2/${kind}list" \
          "$train_lists/0467-3/${kind}list" > "$dir/train${kind}files" || exit 1
      # The test set consists of a single part, so its list is copied as it is.
      cp -- "$dir/corpus_processed/test/0468/${kind}list" "$dir/test${kind}files" || exit 1
  done
  
  # Run data_prep.py once for the training lists and once for the test lists;
  # it is expected to write wav.scp, utt2spk and text.unnormalised into the
  # given output directory. The sph2pipe path is passed along because the wav
  # files are actually sph files.
  # Only train and test lists are processed here, although the progress
  # message below also mentions a dev set.
  echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev" 
  python3 "$local/data_prep.py" "$dir/traintxtfiles" "$traindir" "$dir/trainsndfiles" "$sph2pipe" || exit 1
  python3 "$local/data_prep.py" "$dir/testtxtfiles" "$testdir" "$dir/testsndfiles" "$sph2pipe" || exit 1
  
  
  
  # Create the main data sets from the intermediate test and training
  # directories. Stop on the first failure so that the success message below
  # is only printed when both data sets were actually created.
  local/create_datasets.sh "$testdir" data/test || exit 1
  local/create_datasets.sh "$traindir" data/train || exit 1

  echo "Data preparation succeeded"