Yannick Estève / ONTRAC-Kaldi

Blame view

egs/gp/s1/local/gp_data_prep.sh 4.55 KB
  #!/bin/bash -u
  
  # Copyright 2012  Arnab Ghoshal
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABILITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # Stop at the first command that exits with a non-zero status.
  set -e
  
  # Print the given message(s) on stderr, with backslash escapes interpreted,
  # and terminate the script with exit status 1.
  error_exit() {
    echo -e "$@" 1>&2
    exit 1
  }
  
  # Extract DIR from an option of the form --name=DIR and print the absolute
  # path of DIR on stdout.
  # Arguments: $1 - an option string such as "--work-dir=some/dir"
  # Outputs:   the absolute path of DIR (as reported by pwd) on stdout
  # Exits (via error_exit) when DIR is missing, is not a directory, or cannot
  # be entered.
  function read_dirname () {
    local dir_name abs_path
    # Everything after the first '=' is the directory; no '=' means no value.
    case "$1" in
      *=*) dir_name=${1#*=} ;;
      *)   dir_name= ;;
    esac
    [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory";
    # Assign separately from 'local' so that a failing cd is not masked.
    abs_path=$(cd "$dir_name" 2>/dev/null && pwd) \
      || error_exit "Cannot change to directory '$dir_name'";
    printf '%s\n' "$abs_path"
  }
  
  PROG=$(basename -- "$0")

  # Help text. Line breaks and tabs are written as \n and \t escapes and are
  # expanded by 'echo -e'; the variable must therefore always be expanded
  # inside double quotes so that its layout survives word splitting.
  usage="Usage: $PROG <arguments>\n"
  usage+="Prepare train, dev, eval file lists for a language.\n\n"
  usage+="Required arguments:\n"
  usage+="  --config-dir=DIR\tDirectory containing the necessary config files\n"
  usage+="  --corpus-dir=DIR\tDirectory for the GlobalPhone corpus\n"
  usage+="  --lm-dir=DIR\t\tDirectory containing language models\n"
  usage+="  --work-dir=DIR\t\tWorking directory"

  # Parse the command line.
  # Arguments: the script's arguments ("$@")
  # Globals:   sets CONFDIR, GPDIR, LMDIR and WDIR to the value printed by
  #            read_dirname for the matching --*-dir=DIR option.
  # Exits 0 after printing the usage when --help is given; exits 1 (usage on
  # stderr via error_exit) when fewer than four arguments are given, and
  # exits 1 on an unknown argument.
  function parse_args () {
    local arg
    # Honour --help before the argument-count check, otherwise it can never
    # be reached with fewer than four arguments.
    for arg in "$@"; do
      if [ "$arg" = "--help" ]; then
        echo -e "$usage"
        exit 0
      fi
    done

    [ $# -ge 4 ] || error_exit "$usage"

    while [ $# -gt 0 ]; do
      case "$1" in
        --config-dir=*) CONFDIR=$(read_dirname "$1") ;;
        --corpus-dir=*) GPDIR=$(read_dirname "$1") ;;
        --lm-dir=*)     LMDIR=$(read_dirname "$1") ;;
        --work-dir=*)   WDIR=$(read_dirname "$1") ;;
        *)  echo "Unknown argument: $1, exiting"; echo -e "$usage"; exit 1 ;;
      esac
      shift
    done
  }

  parse_args "$@"
  
  # (1) Check that the config files are in place. The directory variables are
  # quoted so that paths containing whitespace or glob characters still work.
  cd "$CONFDIR"
  [ -f dev_spk.list ] || error_exit "$PROG: Dev-set speaker list not found.";
  [ -f eval_spk.list ] || error_exit "$PROG: Eval-set speaker list not found.";
  [ -f lang_codes.txt ] || error_exit "$PROG: Mapping for language name to 2-letter code not found.";

  # The remaining steps write to paths relative to the working directory.
  cd "$WDIR"
  # path.sh is optional; when present it sets the PATH to contain the
  # necessary executables.
  if [ -f path.sh ]; then
    . ./path.sh
  fi
  
  # (2) Get the various file lists (for audio, transcription, etc.) for each
  # language. The jobs run in parallel since this does audio conversion (to
  # figure out which files cannot be processed) and takes some time to run.
  # The stderr of each job goes to data/<LANG>/prep_flists.log.
  flist_pids=()
  flist_langs=()
  for LCODE in GE PO SP SW; do
    mkdir -p "data/$LCODE"
    gp_prep_flists.sh --corpus-dir="$GPDIR" --dev-spk="$CONFDIR/dev_spk.list" \
      --eval-spk="$CONFDIR/eval_spk.list" --lang-map="$CONFDIR/lang_codes.txt" \
      --work-dir=data "$LCODE" 2>"data/$LCODE/prep_flists.log" &
    flist_pids+=("$!")
    flist_langs+=("$LCODE")
  done

  # A bare 'wait' discards the exit status of the background jobs, and errexit
  # does not apply to them, so wait for every job individually and collect the
  # languages whose job failed. All jobs are allowed to finish first so that
  # every log file is complete.
  failed_langs=""
  for idx in "${!flist_pids[@]}"; do
    wait "${flist_pids[$idx]}" || failed_langs="$failed_langs ${flist_langs[$idx]}"
  done
  [ -z "$failed_langs" ] \
    || error_exit "$PROG: gp_prep_flists.sh failed for:$failed_langs (see data/<LANG>/prep_flists.log)"
  
  # (3) Normalize the dictionary and transcripts.

  # Read a lexicon (word<TAB>pronunciation per line) on stdin and write a
  # phone symbol table ("symbol integer" per line) on stdout. <eps>, SIL and
  # SPN get the ids 0, 1 and 2 (adding silence and spoken noise to the list);
  # the sorted, unique phones are numbered from 3. On each pronunciation
  # everything from the first '_' onwards is dropped before splitting on
  # spaces.
  # If using word-boundary markers on phones, use this in the awk command:
  #        { printf("%s_WB %d\n", $1, N++); }
  # If using position markers on phones, use these in the awk command:
  #        { printf("%s_B %d\n", $1, N++); }
  #        { printf("%s_E %d\n", $1, N++); }
  #        { printf("%s_S %d\n", $1, N++); }
  function make_phone_table () {
    cut -f2 | sed -e "s?_.*??g" | tr ' ' '\n' | sort -u \
      | awk 'BEGIN{ print "<eps> 0"; print "SIL 1"; print "SPN 2"; N=3; }
             { printf("%s %d\n", $1, N++); }'
  }

  # Read a lexicon (word<TAB>pronunciation per line) on stdin and write a
  # word symbol table on stdout: <eps> is 0, the sorted, unique words are
  # numbered from 1, and the symbol #0 gets the next free id.
  function make_word_table () {
    cut -f1 | sort -u \
      | awk 'BEGIN{print "<eps> 0";} {printf("%s %d\n", $1, NR);}
             END{printf("#0 %d\n", NR+1);}'
  }

  for LCODE in GE PO SP SW; do
    ldir="data/$LCODE/local"
    # Second field of the lang_codes.txt line(s) matching the language code.
    full_name=$(awk -v code="$LCODE" '$0 ~ code {print $2}' "$CONFDIR/lang_codes.txt")
    "gp_norm_dict_${LCODE}.pl" -i "$GPDIR/Dictionaries/${LCODE}/${full_name}-GPDict.txt" \
      | sort -u > "$ldir/lexicon_nosil_${LCODE}.txt"
    # Full lexicon = silence and unknown-word entries followed by the
    # normalized dictionary.
    { printf '!SIL\tSIL\n<UNK>\tSPN\n'; cat "$ldir/lexicon_nosil_${LCODE}.txt"; } \
      > "$ldir/lexicon_${LCODE}.txt"

    # add disambig symbols to the lexicon:
    ndisambig=$(add_lex_disambig.pl "$ldir/lexicon_${LCODE}.txt" "$ldir/lexicon_disambig_${LCODE}.txt")
    ndisambig=$((ndisambig + 1))  # add one disambig symbol for silence
    echo "$ndisambig" > "$ldir/lex_ndisambig"

    # Get the list of phones and map them to integers.
    make_phone_table < "$ldir/lexicon_nosil_${LCODE}.txt" > "$ldir/phones.txt"

    # Get the list of words:
    make_word_table < "$ldir/lexicon_${LCODE}.txt" > "$ldir/words.txt"

    for x in train dev eval; do
      "gp_norm_trans_${LCODE}.pl" -i "$ldir/${x}_${LCODE}.trans" \
        > "$ldir/${x}_${LCODE}.trans2"
    done

  done
  
  # (4) Normalize the LMs - this is very Edinburgh-specific since we have some
  # LMs that came with the GlobalPhone corpus. The directories are quoted so
  # that paths containing whitespace are passed as a single argument each.
  gp_prep_lms_edin.sh --lm-dir="$LMDIR" --work-dir="$WDIR"

  echo "Finished data preparation."