Yannick Estève / ONTRAC-Kaldi

Blame view

egs/gp/s1/local/gp_format_data.sh 6.24 KB
  #!/bin/bash -u
  
  # Copyright 2012  Arnab Ghoshal
  # Copyright 2010-2011  Microsoft Corporation
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # Strict mode.  'nounset' is enabled here in addition to the '-u' on the
  # interpreter line: options given on the interpreter line are dropped when
  # the script is started as 'bash <script>', whereas 'set' always applies.
  set -o errexit   # abort on the first command that exits non-zero
  set -o nounset   # expanding an unset variable is an error
  set -o pipefail  # a pipeline fails if any of its stages fails
  
  # Report a fatal error and stop.
  # Arguments: the message words; they are printed on stderr separated by
  #            single spaces, with backslash escapes (\t, \n, ...) interpreted
  #            by 'echo -e'.
  # Returns:   never returns; terminates the current (sub)shell with status 1.
  error_exit() {
    echo -e "$@" 1>&2
    exit 1
  }
  
  # Extract DIR from an argument of the form --option=DIR and print its
  # absolute path.
  # Arguments: $1 - an '--option=DIR' string; everything after the first '='
  #                 is taken as the directory name.
  # Outputs:   the absolute path of DIR on stdout.
  # Returns:   does not return on error: error_exit terminates the current
  #            (sub)shell with status 1 if DIR is not a directory or cannot
  #            be entered.
  read_dirname() {
    local dir_name resolved
    case "$1" in
      *=*) dir_name=${1#*=} ;;
      *)   dir_name= ;;
    esac
    [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory"
    # Declaration and assignment are kept separate so that a failing 'cd' is
    # not masked by the exit status of 'local'.  CDPATH is cleared so that
    # 'cd' neither searches elsewhere nor prints the directory itself.
    resolved=$(CDPATH= cd -- "$dir_name" 2>/dev/null && pwd) \
      || error_exit "Cannot change to directory '$dir_name'"
    # Quoted so that whitespace inside the path is preserved verbatim.
    printf '%s\n' "$resolved"
  }
  
  # Script name as shown in the usage text.  "$0" is quoted and preceded by
  # '--' so that a path containing whitespace, or starting with '-', is still
  # passed to basename as a single operand.
  PROG=$(basename -- "$0")

  # Help text.  The \t sequences are stored literally; they turn into tabs
  # only when the text is printed with 'echo -e'.
  usage="Usage: $PROG <arguments>
  
  Prepare train, dev, eval file lists for a language.
  
  
  Required arguments:
  
    --hmm-proto=FILE\tPrototype of the HMM topology
  
    --work-dir=DIR\t\tWorking directory
  
  ";
  
  # Parse the command line.  --hmm-proto=FILE and --work-dir=DIR are both
  # required; --help prints the usage text on stdout and exits with status 0.
  # Sets: PROTO (absolute path of the prototype file), WDIR (as printed by
  # read_dirname).
  if [ $# -lt 2 ]; then
    # Fewer than two arguments cannot supply both required options.  A lone
    # --help is still honoured rather than being reported as an error.
    if [ "${1:-}" = "--help" ]; then
      echo -e "$usage"
      exit 0
    fi
    # "$usage" is quoted so the multi-line text is not collapsed to one line.
    error_exit "$usage"
  fi

  while [ $# -gt 0 ]; do
    case "$1" in
      --help)
        echo -e "$usage"
        exit 0
        ;;
      --hmm-proto=*)
        PROTO=${1#*=}
        [ -f "$PROTO" ] || error_exit "Cannot find HMM prototype file '$PROTO'"
        # The script changes to the working directory before the prototype
        # is read, so anchor a relative path to the directory it was just
        # validated against.
        case "$PROTO" in
          /*) ;;
          *) PROTO="$PWD/$PROTO" ;;
        esac
        shift
        ;;
      --work-dir=*)
        WDIR=$(read_dirname "$1") || exit 1
        shift
        ;;
      *)
        echo "Unknown argument: $1, exiting"
        error_exit "$usage"
        ;;
    esac
  done

  # Two arguments may still fail to provide both options (e.g. one of them
  # given twice), so check each explicitly.
  [ -n "${PROTO:-}" ] \
    || error_exit "Required argument --hmm-proto is missing\n\n$usage"
  [ -n "${WDIR:-}" ] \
    || error_exit "Required argument --work-dir is missing or unusable\n\n$usage"
  
  # All 'data/...' paths below are relative to the working directory, and
  # path.sh is sourced from there (presumably it puts the recipe and Kaldi
  # tools on PATH -- confirm against path.sh).
  # WDIR is quoted so a directory name with whitespace stays one argument;
  # ':?' aborts with a message instead of letting an empty value send 'cd'
  # somewhere unintended.
  cd -- "${WDIR:?working directory (--work-dir) is not set}"
  . ./path.sh
  
  echo "Preparing train data"

  # For every language code, populate data/<LCODE>/{train,dev,eval} and
  # data/<LCODE>/lang from the files found in data/<LCODE>/local.
  for LCODE in GE PO SP SW; do
    local_dir=data/$LCODE/local
    lang_dir=data/$LCODE/lang

  # (0) Create a directory to contain files needed in training:
    for x in train dev eval; do
      mkdir -p "data/$LCODE/$x"
      cp "$local_dir/${x}_${LCODE}_wav.scp" "data/$LCODE/$x/wav.scp"
      cp "$local_dir/${x}_${LCODE}.trans2" "data/$LCODE/$x/text"
      cp "$local_dir/${x}_${LCODE}.spk2utt" "data/$LCODE/$x/spk2utt"
      cp "$local_dir/${x}_${LCODE}.utt2spk" "data/$LCODE/$x/utt2spk"
    done

    mkdir -p "$lang_dir"
    # Plain 'cp SRC DIR/' instead of the GNU-only 'cp SRC -t DIR/'.
    cp "$local_dir/phones.txt" "$lang_dir/"
    cp "$local_dir/words.txt" "$lang_dir/"

  # (1) Generate colon-separated lists of silence and non-silence phones, and
  #     the file 'oov.txt' containing a word that all OOVs map to during
  #     training.
    silphones="SIL SPN"
    silphones.pl "$lang_dir/phones.txt" "$silphones" \
      "$lang_dir/silphones.csl" "$lang_dir/nonsilphones.csl"
    echo "<UNK>" > "$lang_dir/oov.txt"

  # (2) Create the L.fst without disambiguation symbols, for use in training.
    make_lexicon_fst.pl "$local_dir/lexicon_${LCODE}.txt" 0.5 SIL \
      | fstcompile --isymbols="$lang_dir/phones.txt" \
        --osymbols="$lang_dir/words.txt" --keep_isymbols=false \
        --keep_osymbols=false \
      | fstarcsort --sort_type=olabel > "$lang_dir/L.fst"

  # (3) Create phonesets_mono.txt and roots.txt, then derive
  #     phonesets_cluster.txt by dropping every line that contains SIL.
  #     (Generation of extra_questions.txt is currently disabled.)
    gp_make_questions.pl -i "$lang_dir/phones.txt" \
      -m "$lang_dir/phonesets_mono.txt" -r "$lang_dir/roots.txt"
    # gp_extra_questions_${LCODE}.pl -i data/$LCODE/lang/phones.txt \
    #   -e data/$LCODE/lang/extra_questions.txt
    grep -v SIL "$lang_dir/phonesets_mono.txt" \
      > "$lang_dir/phonesets_cluster.txt"

  # (4), Finally, for training, create the HMM topology prototype:
    silphonelist=$(sed 's/:/ /g' "$lang_dir/silphones.csl")
    nonsilphonelist=$(sed 's/:/ /g' "$lang_dir/nonsilphones.csl")
    # NONSILENCEPHONES must be substituted first: SILENCEPHONES is a substring
    # of it and would otherwise match inside that placeholder.  "$PROTO" is
    # quoted so a prototype path containing whitespace is not word-split.
    sed -e "s:NONSILENCEPHONES:$nonsilphonelist:" \
      -e "s:SILENCEPHONES:$silphonelist:" "$PROTO" > "$lang_dir/topo"

  done
  
  echo "Preparing test data"

  # For every language code, build data/<LCODE>/lang_test: the symbol tables
  # and lexicon FSTs with disambiguation and alignment symbols.
  for LCODE in GE PO SP SW; do
    src_local=data/$LCODE/local
    train_lang=data/$LCODE/lang
    test_lang=data/$LCODE/lang_test

  # (0) Copy over some files common to train and test:
    mkdir -p "$test_lang"
    for f in phones.txt words.txt L.fst silphones.csl nonsilphones.csl; do
      cp -t "$test_lang/" "$train_lang/$f"
    done

  # (1) Create a list of phones including the disambiguation symbols.
  #     --include-zero includes the #0 symbol that is passed from G.fst
  #     $ndisambig is expanded unquoted, as before, so that any whitespace
  #     around the number stored in lex_ndisambig is dropped.
    ndisambig=$(cat "$src_local/lex_ndisambig")
    add_disambig.pl --include-zero "$test_lang/phones.txt" $ndisambig \
      > "$test_lang/phones_disambig.txt"
    cp -t "$train_lang/" "$test_lang/phones_disambig.txt"  # for MMI.

  # (2) Create the lexicon FST with disambiguation symbols. There is an extra
  #     step where we create a loop to "pass through" the disambiguation
  #     symbols from G.fst.  The second column of the '#0' line in each symbol
  #     table is what gets handed to fstaddselfloops.
    phone_disambig_symbol=$(grep '#0' "$test_lang/phones_disambig.txt" \
      | awk '{print $2}')
    word_disambig_symbol=$(grep '#0' "$test_lang/words.txt" \
      | awk '{print $2}')

    make_lexicon_fst.pl "$src_local/lexicon_disambig_${LCODE}.txt" 0.5 SIL \
      '#'$ndisambig \
      | fstcompile --isymbols="$test_lang/phones_disambig.txt" \
        --osymbols="$test_lang/words.txt" --keep_isymbols=false \
        --keep_osymbols=false \
      | fstaddselfloops "echo $phone_disambig_symbol |" \
        "echo $word_disambig_symbol |" \
      | fstarcsort --sort_type=olabel > "$test_lang/L_disambig.fst"

    # Needed for discriminative training
    cp -t "$train_lang/" "$test_lang/L_disambig.fst"

  # (3) Create L_align.fst, which is as L.fst but with alignment symbols (#1
  #     and #2 at the beginning and end of words, on the input side). These
  #     are used to work out word boundaries. Useful if we ever need to create
  #     ctm's.  The awk step rewrites each lexicon line "w p1 ... pn" as
  #     "w #1 p1 ... pn #2".
    awk '{
           printf("%s #1 ", $1)
           for (i = 2; i <= NF; i++)
             printf("%s ", $i)
           print "#2"
         }' "$src_local/lexicon_${LCODE}.txt" \
      | make_lexicon_fst.pl - 0.5 SIL \
      | fstcompile --isymbols="$test_lang/phones_disambig.txt" \
        --osymbols="$test_lang/words.txt" --keep_isymbols=false \
        --keep_osymbols=false \
      | fstarcsort --sort_type=olabel > "$test_lang/L_align.fst"

  done
  
  # Convert the different available language models to FSTs, and create
  # separate decoding configurations for each.  This step is very Edinburgh
  # specific.

  # TODO(arnab): The core formatting is done in a format_lm function inside
  # this script, which will be common across setups, so it can probably be
  # taken out and put as a separate script in the utils directory.
  gp_format_lms_edin.sh data

  # Reached only if every step above succeeded (errexit is in effect).
  printf '%s\n' "Succeeded in formatting data."