Blame view

egs/sprakbanken/s5/local/sprak_data_prep.sh 5.51 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
  #!/bin/bash
  
  # Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
  # Copyright 2013-2014  Mirsk Digital Aps (Author: Andreas Kirkedal)
  # Copyright 2015-2016  Andreas Kirkedal
  # Apache 2.0.
  
  
  # Working directories, all rooted at the directory the recipe is run from:
  #   dir      - downloaded archives, processed corpus and combined file lists
  #   lmdir    - text used for language-model training
  #   traindir, testdir, devdir - per-set source data handed to
  #                               local/create_datasets.sh later in this script
  dir=$(pwd)/data/local/data
  lmdir=$(pwd)/data/local/transcript_lm
  traindir=$(pwd)/data/local/trainsrc
  testdir=$(pwd)/data/local/testsrc
  devdir=$(pwd)/data/local/devsrc

  # Start from empty output directories. $dir is deliberately not removed
  # because it holds the downloaded archives. The paths are quoted so that a
  # working directory containing whitespace cannot make rm -rf act on
  # unintended, word-split paths.
  rm -rf -- "$lmdir" "$traindir" "$testdir" "$devdir"
  mkdir -p -- "$dir" "$lmdir" "$traindir" "$testdir" "$devdir"
  local=$(pwd)/local
  utils=$(pwd)/utils

  # Load the recipe environment (KALDI_ROOT, referenced further down, is
  # presumably defined there).
  . ./path.sh
  
  # The two 0565 archives are unpacked into their own sub-directories, which
  # must exist before tar -C is run. Create them unconditionally so that a
  # pre-existing (e.g. manually populated) download directory also works.
  mkdir -p -- "$dir/download/0565-1" "$dir/download/0565-2"

  echo "Downloading and unpacking sprakbanken to $dir/corpus_processed. This will take a while. The connection closes every 50-60 seconds and the repo maintainers do not have othersuggestions than increasing the number of retries."

  # Fetch every archive that is not already present.
  #  * All three downloads use --tries 100, since the server drops the
  #    connection frequently (see the message above).
  #  * Data is written to "<archive>.part" and renamed only once wget reports
  #    success, so an interrupted download is never mistaken for a complete
  #    archive on the next run; --continue lets a rerun resume the .part file.
  #  * A failed download aborts the script instead of being ignored.
  for archive in da.16kHz.0565-1.tar.gz da.16kHz.0565-2.tar.gz da.16kHz.0611.tar.gz; do
      if [ ! -f "$dir/download/$archive" ]; then
          if ! wget --tries 100 --continue \
                  --output-document="$dir/download/$archive.part" \
                  "http://www.nb.no/sbfil/talegjenkjenning/16kHz/$archive"; then
              echo "$0: failed to download $archive" >&2
              exit 1
          fi
          mv -- "$dir/download/$archive.part" "$dir/download/$archive" || exit 1
      fi
  done

  echo "Corpus files downloaded."
  
  # Unpack the archives once. The existence of download/0611 is used as the
  # marker that unpacking has already been completed; it is created last, so a
  # failed unpack is retried on the next run.
  if [ ! -d "$dir/download/0611" ]; then
      echo "Unpacking files."
      # tar -C needs the target directories to exist.
      mkdir -p -- "$dir/download/0565-1" "$dir/download/0565-2"
      tar -xzf "$dir/download/da.16kHz.0565-1.tar.gz" -C "$dir/download/0565-1" \
          || { echo "$0: failed to unpack da.16kHz.0565-1.tar.gz" >&2; exit 1; }
      tar -xzf "$dir/download/da.16kHz.0565-2.tar.gz" -C "$dir/download/0565-2" \
          || { echo "$0: failed to unpack da.16kHz.0565-2.tar.gz" >&2; exit 1; }
      tar -xzf "$dir/download/da.16kHz.0611.tar.gz" -C "$dir/download" \
          || { echo "$0: failed to unpack da.16kHz.0611.tar.gz" >&2; exit 1; }

      # The 0611 archive is expected to unpack into a directory named
      # "da 0611 test"; rename it to a name without spaces so the rest of
      # the script can refer to it as download/0611.
      mv -- "$dir/download/da 0611 test" "$dir/download/0611" \
          || { echo "$0: could not rename the unpacked 0611 directory" >&2; exit 1; }
      echo "Corpus unpacked succesfully."
  fi
  
  
  # Locate the sph2pipe converter: prefer one found on PATH, otherwise fall
  # back to the copy under the Kaldi tools directory. `command -v` is the
  # shell builtin for this lookup; unlike the external `which`, it is always
  # available and its exit status is reliable.
  sph2pipe=$(command -v sph2pipe) || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
  if [ ! -x "$sph2pipe" ]; then
     echo "Could not find (or execute) the sph2pipe program at $sph2pipe" >&2
     exit 1
  fi
  
  
  echo "Converting downloaded files to a format consumable by Kaldi scripts."

  # Rebuild the processed-corpus tree from scratch. ${dir:?} aborts rather
  # than running rm -rf on "/corpus_processed" if dir is ever empty.
  rm -rf -- "${dir:?}/corpus_processed"
  mkdir -p -- "$dir/corpus_processed/training/0565-1" \
              "$dir/corpus_processed/training/0565-2" \
              "$dir/corpus_processed/training/0611_Stasjon05"


  # Create parallel file lists and text files, but keep sound files in the same location to save disk space
  # Writes the lists to data/local/data (~ 310h)
  # Every conversion is checked: a failure stops the script here instead of
  # surfacing later as missing txtlist/sndlist files.
  echo "Creating parallel data for training data."
  python "$local/sprak2kaldi.py" "$dir/download/0565-1" "$dir/corpus_processed/training/0565-1" || exit 1   # ~130h
  python "$local/sprak2kaldi.py" "$dir/download/0565-2" "$dir/corpus_processed/training/0565-2" || exit 1   # ~115h
  python "$local/sprak2kaldi.py" "$dir/download/0611/Stasjon05" "$dir/corpus_processed/training/0611_Stasjon05" || exit 1  # ~51h

  # Ditto dev set (~ 16h), taken from Stasjon03.
  # Run in the main shell (not a subshell) so that `exit 1` really aborts
  # the script.
  echo "Creating parallel data for development data."
  rm -rf -- "${dir:?}/corpus_processed/dev03"
  mkdir -p -- "$dir/corpus_processed/dev03"
  python "$local/sprak2kaldi.py" "$dir/download/0611/Stasjon03" "$dir/corpus_processed/dev03" || exit 1

  # Ditto test set (about 9 hours), taken from Stasjon06.
  echo "Creating parallel data for test data."
  rm -rf -- "${dir:?}/corpus_processed/test06"
  mkdir -p -- "$dir/corpus_processed/test06"
  python "$local/sprak2kaldi.py" "$dir/download/0611/Stasjon06" "$dir/corpus_processed/test06" || exit 1
  
  # Create the LM training data.
  # The test and dev data are disjoint from the training data, so the training
  # transcripts can be used as LM text. Only the 0565-1 and 0565-2 transcripts
  # are read here.

  # Because the training data is read aloud, there are many occurrences of the
  # same sentence and a bias towards the domain. A second version in which
  # the sentences are unique is written as well, to reduce that bias.

  echo "Writing the LM text to file and normalising."
  # Each line of a txtlist is the path of one transcript file; concatenate the
  # contents of all of them. IFS= and -r keep the path exactly as written
  # (no whitespace trimming, no backslash processing).
  cat "$dir/corpus_processed/training/0565-1/txtlist" "$dir/corpus_processed/training/0565-2/txtlist" \
      | while IFS= read -r txtfile; do cat -- "$txtfile"; done > "$lmdir/lmsents"
  python local/normalize_transcript.py local/norm_dk/numbersLow.tbl "$lmdir/lmsents" "$lmdir/lmsents.norm" || exit 1
  local/norm_dk/format_text.sh lm "$lmdir/lmsents.norm" > "$lmdir/transcripts.txt" || exit 1
  # De-duplicated variant of the LM text.
  sort -u "$lmdir/transcripts.txt" > "$lmdir/transcripts.uniq" || exit 1
  
  # Combine the training file lists of the three training parts into one
  # transcript list and one sound-file list. A missing input list aborts the
  # script rather than silently producing a truncated combined list.
  echo "Combine file lists."
  cat "$dir/corpus_processed/training/0565-1/txtlist" \
      "$dir/corpus_processed/training/0565-2/txtlist" \
      "$dir/corpus_processed/training/0611_Stasjon05/txtlist" > "$dir/traintxtfiles" || exit 1
  cat "$dir/corpus_processed/training/0565-1/sndlist" \
      "$dir/corpus_processed/training/0565-2/sndlist" \
      "$dir/corpus_processed/training/0611_Stasjon05/sndlist" > "$dir/trainsndfiles" || exit 1

  # Copy the dev file lists to the right location
  cp -- "$dir/corpus_processed/dev03/txtlist" "$dir/devtxtfiles" || exit 1
  cp -- "$dir/corpus_processed/dev03/sndlist" "$dir/devsndfiles" || exit 1

  # Copy the test file lists to the right location
  cp -- "$dir/corpus_processed/test06/txtlist" "$dir/testtxtfiles" || exit 1
  cp -- "$dir/corpus_processed/test06/sndlist" "$dir/testsndfiles" || exit 1
  
  # Write wav.scp, utt2spk and text.unnormalised for the train, test and dev sets.
  # The path to sph2pipe is passed along because the wav files are actually sph files.
  # Each step is checked so that the final success message is only printed
  # (and the script only exits 0) when every step actually succeeded.
  echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev"
  python "$local/data_prep.py" "$dir/traintxtfiles" "$traindir" "$dir/trainsndfiles" "$sph2pipe" || exit 1
  python "$local/data_prep.py" "$dir/testtxtfiles" "$testdir" "$dir/testsndfiles" "$sph2pipe" || exit 1
  python "$local/data_prep.py" "$dir/devtxtfiles" "$devdir" "$dir/devsndfiles" "$sph2pipe" || exit 1


  # Create the main data sets
  local/create_datasets.sh "$testdir" data/test || exit 1
  local/create_datasets.sh "$devdir" data/dev || exit 1
  local/create_datasets.sh "$traindir" data/train || exit 1

  ## TODO

  # Extract gender from spl files
  # Decide how to handle cases with no gender specification

  echo "Data preparation succeeded"