Yannick Estève / ONTRAC-Kaldi

Blame view

egs/sprakbanken/s5/local/sprak_data_prep.sh 5.51 KB
  #!/bin/bash
  
  # Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
  # Copyright 2013-2014  Mirsk Digital Aps (Author: Andreas Kirkedal)
  # Copyright 2015-2016  Andreas Kirkedal
  # Apache 2.0.
  
  
  # Working locations, all rooted at the directory the script is started from.
  # NOTE(review): the relative ./path.sh below suggests the script is meant to
  # be started from the recipe root — confirm.
  dir=$(pwd)/data/local/data
  lmdir=$(pwd)/data/local/transcript_lm
  traindir=$(pwd)/data/local/trainsrc
  testdir=$(pwd)/data/local/testsrc
  devdir=$(pwd)/data/local/devsrc

  # Start every run with empty LM/train/test/dev source directories. $dir is
  # deliberately not removed; it is only created if missing.
  # The paths are quoted so a working directory containing whitespace cannot
  # make rm -rf operate on unintended paths.
  rm -rf -- "$lmdir" "$traindir" "$testdir" "$devdir"
  mkdir -p -- "$dir" "$lmdir" "$traindir" "$testdir" "$devdir"
  local=$(pwd)/local
  utils=$(pwd)/utils

  # Load the recipe environment.
  . ./path.sh
  
  # Make sure the download area and the two unpack targets exist. mkdir -p is
  # a no-op for directories that are already there, so this also repairs a
  # download directory that exists without its subdirectories.
  mkdir -p -- "$dir/download/0565-1" "$dir/download/0565-2"

  echo "Downloading and unpacking sprakbanken to $dir/corpus_processed. This will take a while. The connection closes every 50-60 seconds and the repo maintainers do not have othersuggestions than increasing the number of retries."

  # Fetch each corpus archive unless a file of that name is already present.
  # All three downloads use the same raised retry count. A failed download
  # aborts the script and removes the incomplete file, because the existence
  # test above would otherwise treat a truncated archive as complete on the
  # next run.
  for archive in da.16kHz.0565-1.tar.gz da.16kHz.0565-2.tar.gz da.16kHz.0611.tar.gz; do
      if [ ! -f "$dir/download/$archive" ]; then
          if ! wget --tries 100 "http://www.nb.no/sbfil/talegjenkjenning/16kHz/$archive" --directory-prefix="$dir/download"; then
              echo "Download of $archive failed; removing the incomplete file." >&2
              rm -f -- "$dir/download/$archive"
              exit 1
          fi
      fi
  done

  echo "Corpus files downloaded."
  
  # Unpack the archives once. The directory $dir/download/0611 is created by
  # the last step below, so its presence marks a completed unpack.
  if [ ! -d "$dir/download/0611" ]; then
      echo "Unpacking files."
      # tar -C requires the target directories to exist.
      mkdir -p -- "$dir/download/0565-1" "$dir/download/0565-2"
      tar -xzf "$dir/download/da.16kHz.0565-1.tar.gz" -C "$dir/download/0565-1" || exit 1
      tar -xzf "$dir/download/da.16kHz.0565-2.tar.gz" -C "$dir/download/0565-2" || exit 1
      tar -xzf "$dir/download/da.16kHz.0611.tar.gz" -C "$dir/download" || exit 1

      # The 0611 archive is expected to unpack into a directory named
      # "da 0611 test"; rename it to "0611", the space-free name the later
      # conversion steps read from.
      mv "$dir/download/da 0611 test" "$dir/download/0611" || exit 1
      echo "Corpus unpacked succesfully."
  fi
  
  
  # Locate sph2pipe: prefer one found on PATH, otherwise fall back to the copy
  # under $KALDI_ROOT/tools (KALDI_ROOT is presumably set by path.sh — confirm).
  # command -v is the shell builtin lookup and avoids depending on `which`.
  sph2pipe=$(command -v sph2pipe) || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
  # Quoted so the test stays well-formed for paths containing whitespace.
  if [ ! -x "$sph2pipe" ]; then
      echo "Could not find (or execute) the sph2pipe program at $sph2pipe"
      exit 1
  fi
  
  
  echo "Converting downloaded files to a format consumable by Kaldi scripts."

  # Rebuild the processed-corpus tree from scratch on every run.
  # ${dir:?} aborts instead of touching /corpus_processed if dir is empty.
  rm -rf -- "${dir:?}/corpus_processed"
  mkdir -p -- "$dir/corpus_processed/training/0565-1" "$dir/corpus_processed/training/0565-2" "$dir/corpus_processed/training/0611_Stasjon05"


  # Create parallel file lists and text files, but keep sound files in the same location to save disk space
  # Writes the lists to data/local/data (~ 310h)
  # Each conversion aborts the script on failure, matching the dev/test
  # conversions of the same script.
  echo "Creating parallel data for training data."
  python "$local/sprak2kaldi.py" "$dir/download/0565-1" "$dir/corpus_processed/training/0565-1" || exit 1   # ~130h
  python "$local/sprak2kaldi.py" "$dir/download/0565-2" "$dir/corpus_processed/training/0565-2" || exit 1   # ~115h
  python "$local/sprak2kaldi.py" "$dir/download/0611/Stasjon05" "$dir/corpus_processed/training/0611_Stasjon05" || exit 1  # ~51h
  
  # Dev set (~ 16h): convert Stasjon03 into corpus_processed/dev03.
  # The steps run in the main shell rather than in a subshell so that
  # "|| exit 1" stops the whole script, not just the subshell.
  echo "Creating parallel data for development data."
  rm -rf -- "${dir:?}/corpus_processed/dev03"
  mkdir -p -- "$dir/corpus_processed/dev03"
  python "$local/sprak2kaldi.py" "$dir/download/0611/Stasjon03" "$dir/corpus_processed/dev03" || exit 1

  # Test set (about 9 hours): convert Stasjon06 into corpus_processed/test06.
  echo "Creating parallel data for test data."
  rm -rf -- "${dir:?}/corpus_processed/test06"
  mkdir -p -- "$dir/corpus_processed/test06"
  python "$local/sprak2kaldi.py" "$dir/download/0611/Stasjon06" "$dir/corpus_processed/test06" || exit 1
  
  # Create the LM training data from the 0565-1 and 0565-2 training
  # transcripts. (The original note here states that test and dev data are
  # disjoint from the training data.)

  # Because the training data is read aloud, there are many occurrences of the
  # same sentence and a bias towards the domain. transcripts.uniq is a version
  # where the sentences are unique, to reduce that bias.

  echo "Writing the LM text to file and normalising."
  # Each txtlist line is taken as the path of one transcript file
  # (assumes one path per line — confirm against sprak2kaldi.py); the contents
  # of all listed files are concatenated into lmsents. read -r keeps
  # backslashes literal and the quoted "$l" keeps a path with spaces intact.
  cat "$dir/corpus_processed/training/0565-1/txtlist" "$dir/corpus_processed/training/0565-2/txtlist" \
      | while read -r l; do cat "$l"; done > "$lmdir/lmsents"
  python local/normalize_transcript.py local/norm_dk/numbersLow.tbl "$lmdir/lmsents" "$lmdir/lmsents.norm" || exit 1
  local/norm_dk/format_text.sh lm "$lmdir/lmsents.norm" > "$lmdir/transcripts.txt"
  sort -u "$lmdir/transcripts.txt" > "$lmdir/transcripts.uniq"

  # An empty result means one of the steps above produced nothing; stop here
  # rather than continue with no LM text.
  if [ ! -s "$lmdir/transcripts.uniq" ]; then
      echo "LM text preparation produced an empty $lmdir/transcripts.uniq" >&2
      exit 1
  fi
  
  # Combine the three training file lists into one transcript list and one
  # sound-file list. A missing input list aborts the script.
  echo "Combine file lists."
  cat "$dir/corpus_processed/training/0565-1/txtlist" "$dir/corpus_processed/training/0565-2/txtlist" "$dir/corpus_processed/training/0611_Stasjon05/txtlist" > "$dir/traintxtfiles" || exit 1
  cat "$dir/corpus_processed/training/0565-1/sndlist" "$dir/corpus_processed/training/0565-2/sndlist" "$dir/corpus_processed/training/0611_Stasjon05/sndlist" > "$dir/trainsndfiles" || exit 1

  # Copy the dev file lists to the right location
  cp "$dir/corpus_processed/dev03/txtlist" "$dir/devtxtfiles" || exit 1
  cp "$dir/corpus_processed/dev03/sndlist" "$dir/devsndfiles" || exit 1

  # Copy the test file lists to the right location
  cp "$dir/corpus_processed/test06/txtlist" "$dir/testtxtfiles" || exit 1
  cp "$dir/corpus_processed/test06/sndlist" "$dir/testsndfiles" || exit 1
  
  # Write wav.scp, utt2spk and text.unnormalised for the train, test and dev sets.
  # sph2pipe is passed along because the wav files are actually sph files.
  # Each step aborts the script on failure so that the success message at the
  # end is only printed when every step completed.
  echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev"
  python "$local/data_prep.py" "$dir/traintxtfiles" "$traindir" "$dir/trainsndfiles" "$sph2pipe" || exit 1
  python "$local/data_prep.py" "$dir/testtxtfiles" "$testdir" "$dir/testsndfiles" "$sph2pipe" || exit 1
  python "$local/data_prep.py" "$dir/devtxtfiles" "$devdir" "$dir/devsndfiles" "$sph2pipe" || exit 1


  # Create the main data sets
  # NOTE(review): assumes create_datasets.sh returns non-zero only on
  # failure — confirm.
  local/create_datasets.sh "$testdir" data/test || exit 1
  local/create_datasets.sh "$devdir" data/dev || exit 1
  local/create_datasets.sh "$traindir" data/train || exit 1

  ## TODO

  # Extract gender from spl files
  # Decide how to handle cases with no gender specification

  echo "Data preparation succeeded"