Yannick Estève / ONTRAC-Kaldi

Blame view

egs/sprakbanken_swe/s5/local/sprak_data_prep.sh 5.19 KB
  #!/bin/bash
  
  # Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
  # Copyright 2013-2014  Mirsk Digital Aps (Author: Andreas Kirkedal)
  # Copyright 2016 KTH Royal Institute of Technology (Author: Emelie Kullmann)
  # Apache 2.0.
  
  
  # Absolute working directories, rooted at the directory the recipe is run from.
  dir="$(pwd)/data/local/data"
  lmdir="$(pwd)/data/local/transcript_lm"
  traindir="$(pwd)/data/local/trainsrc"
  testdir="$(pwd)/data/local/testsrc"

  # Wipe and recreate the LM/train/test staging directories; $dir itself is
  # not wiped. Only variables defined above are listed here: an unset or
  # environment-inherited variable must never reach 'rm -rf'.
  rm -rf -- "${lmdir:?}" "${traindir:?}" "${testdir:?}"
  mkdir -p -- "$dir" "$lmdir" "$traindir" "$testdir"

  # Locations of the recipe-local scripts and the shared utils directory.
  local="$(pwd)/local"
  utils="$(pwd)/utils"


  # Source the recipe's path.sh from the current directory.
  . ./path.sh
  
  # Checks whether python3 is available on the system and, if it is not, tells the
  # user how to install it (the check below is currently commented out).
  # This recipe currently relies on version 3 because Python 3 strings are
  # Unicode by default.
  
  #if ! which python3 >&/dev/null; then
  #  echo "Python3 is not installed, to install it you should probably do:"
  #  echo "sudo apt-get install python3" || exit 1;
  #fi
  
  # Unpack targets for the three training archives. Created unconditionally
  # (mkdir -p is a no-op when they exist) so that a pre-existing download
  # directory lacking these subdirectories still works.
  # 0468 is intentionally NOT created here: further down, the absence of
  # $dir/download/0468 is what triggers the unpacking step.
  mkdir -p "$dir/download/0467-1" "$dir/download/0467-2" "$dir/download/0467-3"

  echo "Downloading and unpacking sprakbanken to $dir/corpus_processed. This will take a while."

  # Fetch every archive that is not already present. Each archive is checked
  # under its own name, so 0468 is downloaded even when 0467-1 already exists.
  for part in 0467-1 0467-2 0467-3 0468; do
      archive=sve.16khz.$part.tar.gz
      if [ ! -f "$dir/download/$archive" ]; then
          if ! wget --tries 100 "http://www.nb.no/sbfil/talegjenkjenning/16kHz/$archive" --directory-prefix="$dir/download"; then
              echo "$0: failed to download $archive" >&2
              # Drop any partial file so that the existence check above does
              # not mistake it for a complete archive on the next run.
              rm -f -- "$dir/download/$archive"
              exit 1
          fi
      fi
  done

  echo "Corpus files downloaded."
  
  # Unpack the four archives, each into its own subdirectory of $dir/download.
  # The existence of $dir/download/0468 doubles as the "already unpacked"
  # marker, so 0468 is handled last and only created once the 0467-* archives
  # have been extracted.
  if [ ! -d "$dir/download/0468" ]; then
      echo "Unpacking files."
      for part in 0467-1 0467-2 0467-3 0468; do
          # tar -C requires the target directory to exist.
          mkdir -p "$dir/download/$part"
          if ! tar -xzf "$dir/download/sve.16khz.$part.tar.gz" -C "$dir/download/$part"; then
              echo "$0: failed to unpack sve.16khz.$part.tar.gz" >&2
              # Remove the marker directory so that a rerun retries unpacking.
              rm -rf -- "${dir:?}/download/0468"
              exit 1
          fi
      done

      echo "Corpus unpacked succesfully."
  fi
  
  # Locate sph2pipe: prefer the one found on PATH, otherwise fall back to the
  # copy under $KALDI_ROOT/tools.
  sph2pipe=$(which sph2pipe) || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
  # Quoted so that a path containing whitespace is tested as one word; an
  # unquoted test would error out and skip the exit below.
  if [ ! -x "$sph2pipe" ]; then
     echo "Could not find (or execute) the sph2pipe program at $sph2pipe . Did you run 'make' in the tools directory?";
     exit 1;
  fi

  echo "done"
  
  echo "Converting downloaded files to a format consumable by Kaldi scripts."

  # Rebuild the processed corpus from scratch on every run.
  rm -rf -- "${dir:?}/corpus_processed"
  mkdir -p "$dir/corpus_processed/training/0467-1" \
           "$dir/corpus_processed/training/0467-2" \
           "$dir/corpus_processed/training/0467-3"

  # Create parallel file lists and text files, but keep sound files in the same location to save disk space
  # Writes the lists to data/local/data (~ 310h)
  # Per part: 0467-1 ~140h, 0467-2 ~125h, 0467-3 ~128h
  # NOTE(review): this calls 'python' while data_prep.py further down is run
  # with 'python3' -- confirm which interpreter sprak2kaldi.py expects.
  echo "Creating parallel data for training data."
  for part in 0467-1 0467-2 0467-3; do
      python "$local/sprak2kaldi.py" "$dir/download/$part" "$dir/corpus_processed/training/$part" || exit 1
  done

  # One directory in 0467-1 has a space in its name; rename it, then replace
  # spaces with underscores in the names of the .txt files it contains.
  spaced_dir="$dir/corpus_processed/training/0467-1/r4670118.791213 8232"
  fixed_dir="$dir/corpus_processed/training/0467-1/r4670118.791213_8232"
  mv "$spaced_dir" "$fixed_dir"
  for f in "$fixed_dir"/*.txt; do
      # Skip the literal pattern left behind by a glob that matched nothing.
      [ -e "$f" ] || continue
      # Only the file name is rewritten, never the directory part of the path.
      name=${f##*/}
      fixed_name=${name// /_}
      [ "$name" != "$fixed_name" ] || continue
      mv "$f" "$fixed_dir/$fixed_name"
  done
  
  # Ditto test set (~ 93h)
  # Run in the main shell rather than a subshell so that a failing conversion
  # stops the whole script.
  echo "Creating parallel data for test data."
  rm -rf -- "${dir:?}/corpus_processed/test/0468"
  mkdir -p "$dir/corpus_processed/test/0468"
  python "$local/sprak2kaldi.py" "$dir/download/0468" "$dir/corpus_processed/test/0468" || exit 1
  
  
  # Create the LM training data
  echo "Writing the LM text to file and normalising."
  # Each line of the three training txtlists names a transcript file; their
  # contents are concatenated into lmsents. 'read -r' keeps backslashes intact
  # and the quoted "$txtfile" keeps a path with spaces as a single argument.
  cat "$dir/corpus_processed/training/0467-1/txtlist" \
      "$dir/corpus_processed/training/0467-2/txtlist" \
      "$dir/corpus_processed/training/0467-3/txtlist" \
      | while read -r txtfile; do cat "$txtfile"; done > "$lmdir/lmsents"
  # Normalise the text, then keep one copy of every distinct line.
  python local/normalize_transcript.py "$lmdir/lmsents" "$lmdir/lmsents.norm" || exit 1
  sort -u "$lmdir/lmsents.norm" > "$lmdir/transcripts.uniq"
  
  # Combine training file lists
  echo "Combine file lists."
  # For each list kind (txt, snd), concatenate the three per-part lists into
  # $dir/traintxtfiles and $dir/trainsndfiles respectively.
  for kind in txt snd; do
      cat "$dir/corpus_processed/training/0467-1/${kind}list" \
          "$dir/corpus_processed/training/0467-2/${kind}list" \
          "$dir/corpus_processed/training/0467-3/${kind}list" \
          > "$dir/train${kind}files"
  done


  # Copy test file lists to the right location
  cp "$dir/corpus_processed/test/0468/txtlist" "$dir/testtxtfiles"
  cp "$dir/corpus_processed/test/0468/sndlist" "$dir/testsndfiles"
  
  # Write wav.scp, utt2spk and text.unnormalised for the train and test sets.
  # Use sph2pipe because the wav files are actually sph files
  # Every step below aborts the script on failure so that the final success
  # message is only printed when all of them completed.
  echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev" 
  python3 "$local/data_prep.py" "$dir/traintxtfiles" "$traindir" "$dir/trainsndfiles" "$sph2pipe" || exit 1
  python3 "$local/data_prep.py" "$dir/testtxtfiles" "$testdir" "$dir/testsndfiles" "$sph2pipe" || exit 1



  # Create the main data sets
  local/create_datasets.sh "$testdir" data/test || exit 1
  local/create_datasets.sh "$traindir" data/train || exit 1




  echo "Data preparation succeeded"