# sprak_data_prep.sh 5.51 KB

# Copyright 2009-2012  Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
# Copyright 2013-2014  Mirsk Digital Aps (Author: Andreas Kirkedal)
# Copyright 2015-2016  Andreas Kirkedal
# Apache 2.0.

# Start from clean LM/train/test/dev working directories.
# dir, lmdir, traindir, testdir and devdir are not assigned in the visible part
# of this script, so they must already be set when we get here. ${var:?} aborts
# with a message instead of letting rm/mkdir run on empty paths.
rm -rf -- "${lmdir:?}" "${traindir:?}" "${testdir:?}" "${devdir:?}"
mkdir -p -- "${dir:?}" "$lmdir" "$traindir" "$testdir" "$devdir" || exit 1

# Load the recipe environment (the sph2pipe fallback path further down reads
# $KALDI_ROOT, which is presumably exported by path.sh -- confirm).
. ./path.sh

# Download the three Sprakbanken archives into $dir/download, skipping any
# archive file that is already present.
if [ ! -d "$dir/download" ]; then
    mkdir -p "$dir/download/0565-1" "$dir/download/0565-2"
fi

echo "Downloading and unpacking sprakbanken to $dir/corpus_processed. This will take a while. The connection closes every 50-60 seconds and the repo maintainers do not have othersuggestions than increasing the number of retries."

# All archives get the same high retry count, because the server drops the
# connection regularly (see the message above). Abort if a download still
# fails: nothing further down can work without the archives.
for archive in da.16kHz.0565-1.tar.gz da.16kHz.0565-2.tar.gz da.16kHz.0611.tar.gz; do
    if [ ! -f "$dir/download/$archive" ]; then
        ( wget --tries 100 "http://www.nb.no/sbfil/talegjenkjenning/16kHz/$archive" --directory-prefix="$dir/download" ) || exit 1
    fi
done

echo "Corpus files downloaded."

# Unpack the archives unless $dir/download/0611 already exists; that directory
# is only created by the final mv below, so it marks a finished unpack.
if [ ! -d "$dir/download/0611" ]; then
    echo "Unpacking files."
    # tar -C needs the target directories to exist.
    mkdir -p "$dir/download/0565-1" "$dir/download/0565-2" || exit 1
    tar -xzf "$dir/download/da.16kHz.0565-1.tar.gz" -C "$dir/download/0565-1" || exit 1
    tar -xzf "$dir/download/da.16kHz.0565-2.tar.gz" -C "$dir/download/0565-2" || exit 1
    tar -xzf "$dir/download/da.16kHz.0611.tar.gz" -C "$dir/download" || exit 1

    # The 0611 archive is expected to unpack into a directory named
    # "da 0611 test" (with spaces); rename it to 0611, the name the
    # conversion steps below read from.
    mv "$dir/download/da 0611 test" "$dir/download/0611" || exit 1
    echo "Corpus unpacked succesfully."
fi

# Locate sph2pipe: prefer the one on PATH, otherwise fall back to the copy
# under $KALDI_ROOT/tools. Abort if the result is not an executable file.
sph2pipe=$(command -v sph2pipe) || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
# Quoted so that an empty or space-containing path cannot slip past the test.
if [ ! -x "$sph2pipe" ]; then
    echo "Could not find (or execute) the sph2pipe program at $sph2pipe" >&2
    exit 1
fi

echo "Converting downloaded files to a format consumable by Kaldi scripts."

# Rebuild the processed-corpus tree from scratch. ${dir:?} keeps rm -rf from
# resolving to /corpus_processed if dir is empty.
rm -rf -- "${dir:?}/corpus_processed"
mkdir -p "$dir/corpus_processed/training/0565-1" \
         "$dir/corpus_processed/training/0565-2" \
         "$dir/corpus_processed/training/0611_Stasjon05" || exit 1

# Create parallel file lists and text files, but keep sound files in the same location to save disk space
# Writes the lists to data/local/data (~ 310h)
# Each conversion aborts the script on failure, like the dev/test conversions.
echo "Creating parallel data for training data."
python "$local/sprak2kaldi.py" "$dir/download/0565-1" "$dir/corpus_processed/training/0565-1" || exit 1   # ~130h
python "$local/sprak2kaldi.py" "$dir/download/0565-2" "$dir/corpus_processed/training/0565-2" || exit 1   # ~115h
python "$local/sprak2kaldi.py" "$dir/download/0611/Stasjon05" "$dir/corpus_processed/training/0611_Stasjon05" || exit 1  # ~51h

# Ditto dev set (~ 16h): Stasjon03 -> corpus_processed/dev03
echo "Creating parallel data for development data."
rm -rf -- "${dir:?}/corpus_processed/dev03"
mkdir -p "$dir/corpus_processed/dev03"
python "$local/sprak2kaldi.py" "$dir/download/0611/Stasjon03" "$dir/corpus_processed/dev03" || exit 1

# Ditto test set (about 9 hours): Stasjon06 -> corpus_processed/test06
echo "Creating parallel data for test data."
rm -rf -- "${dir:?}/corpus_processed/test06"
mkdir -p "$dir/corpus_processed/test06"
python "$local/sprak2kaldi.py" "$dir/download/0611/Stasjon06" "$dir/corpus_processed/test06" || exit 1

# Create the LM training data.
# The LM text is built from the 0565-1 and 0565-2 training transcripts only;
# the dev and test transcripts are not included.

# Because the training data is read aloud, there are many occurrences of the
# same sentence and a bias towards the domain. Also write a version in which
# the sentences are unique, to reduce that bias.

echo "Writing the LM text to file and normalising."
# Each txtlist line names one transcript file; concatenate their contents.
# IFS= / read -r keep file names with backslashes or surrounding blanks intact.
cat "$dir/corpus_processed/training/0565-1/txtlist" "$dir/corpus_processed/training/0565-2/txtlist" \
    | while IFS= read -r txtfile; do cat "$txtfile"; done > "$lmdir/lmsents"
python local/normalize_transcript.py local/norm_dk/numbersLow.tbl "$lmdir/lmsents" "$lmdir/lmsents.norm" || exit 1
local/norm_dk/format_text.sh lm "$lmdir/lmsents.norm" > "$lmdir/transcripts.txt" || exit 1
# De-duplicated variant of the LM text.
sort -u "$lmdir/transcripts.txt" > "$lmdir/transcripts.uniq"

# Combine the per-corpus training file lists into one transcript list and one
# sound-file list. Paths are quoted so a workspace path containing blanks is
# not word-split.
echo "Combine file lists."
train_root="$dir/corpus_processed/training"
cat "$train_root/0565-1/txtlist" "$train_root/0565-2/txtlist" "$train_root/0611_Stasjon05/txtlist" > "$dir/traintxtfiles"
cat "$train_root/0565-1/sndlist" "$train_root/0565-2/sndlist" "$train_root/0611_Stasjon05/sndlist" > "$dir/trainsndfiles"

# Copy the dev file lists to the right location
cp "$dir/corpus_processed/dev03/txtlist" "$dir/devtxtfiles"
cp "$dir/corpus_processed/dev03/sndlist" "$dir/devsndfiles"

# Copy the test file lists to the right location
cp "$dir/corpus_processed/test06/txtlist" "$dir/testtxtfiles"
cp "$dir/corpus_processed/test06/sndlist" "$dir/testsndfiles"

# Write wav.scp, utt2spk and text.unnormalised for the train, test and dev sets.
# sph2pipe is passed to data_prep.py because the wav files are actually sph files.
# Abort on the first failure so later steps never run on incomplete output.
echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev" 
python "$local/data_prep.py" "$dir/traintxtfiles" "$traindir" "$dir/trainsndfiles" "$sph2pipe" || exit 1
python "$local/data_prep.py" "$dir/testtxtfiles" "$testdir" "$dir/testsndfiles" "$sph2pipe" || exit 1
python "$local/data_prep.py" "$dir/devtxtfiles" "$devdir" "$dir/devsndfiles" "$sph2pipe" || exit 1

# Create the main data sets. Each call aborts the script on failure so the
# success message below is only printed when all three data sets were built.
local/create_datasets.sh "$testdir" data/test || exit 1
local/create_datasets.sh "$devdir" data/dev || exit 1
local/create_datasets.sh "$traindir" data/train || exit 1


# TODO: Extract gender from spl files
# TODO: Decide how to handle cases with no gender specification

echo "Data preparation succeeded"