Blame view

egs/gp/s5/local/gp_data_prep.sh 3.46 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
  #!/bin/bash -u
  
  # Copyright 2012  Arnab Ghoshal
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # Abort on the first failing command and on any use of an unset variable.
  # nounset is repeated here because the "-u" given on the shebang line is
  # ignored when the script is started as "bash gp_data_prep.sh".
  set -o errexit -o nounset
  
  # Print the given message(s) to stderr, with backslash escapes interpreted
  # (echo -e), then terminate the current shell with exit status 1.
  error_exit() {
    echo -e "$@" 1>&2
    exit 1
  }
  
  # Resolve the directory named in a "--option=DIR" argument to an absolute path.
  # Arguments: $1 - option string; everything after the first "=" is the path
  #                 (an argument without "=" is taken as the path itself)
  # Outputs:   the absolute path on stdout
  # Exits with status 1 (via error_exit) if the path is not a directory or
  # cannot be entered.
  function read_dirname () {
    local dir_name resolved
    dir_name=${1#*=}
    [ -d "$dir_name" ] || error_exit "Argument '$dir_name' not a directory";
    # Declared separately from the assignment: "local var=$(cmd)" would hide a
    # failing cd behind the exit status of "local". CDPATH is cleared so that a
    # relative name is resolved against the current directory only.
    resolved=$(CDPATH= cd -- "$dir_name" 2>/dev/null && pwd) \
      || error_exit "Cannot change to directory '$dir_name'";
    printf '%s\n' "$resolved"
  }
  
  # Name of this script (without directory), used in usage and error messages.
  # "$0" is quoted so that an invocation path containing spaces is handled.
  PROG=$(basename -- "$0");
  # Help text. It contains \t escapes, so it must be printed with "echo -e"
  # and expanded inside double quotes to keep its line breaks.
  usage="Usage: $PROG <arguments>
  
  Prepare train, dev, eval file lists for a language.
  
  e.g.: $PROG --config-dir=conf --corpus-dir=corpus --languages=\"GE PO SP\"
  
  
  Required arguments:
  
    --config-dir=DIR\tDirectory containing the necessary config files
  
    --corpus-dir=DIR\tDirectory for the GlobalPhone corpus
  
    --languages=STR\tSpace separated list of two letter language codes
  
  ";
  
  # Fewer than three arguments cannot supply all required options. A lone
  # --help is let through so that the loop below can answer it with status 0.
  if [ $# -lt 3 ] && [ "${1:-}" != "--help" ]; then
    error_exit "$usage";
  fi

  # Parse the --name=value options. "$usage" and "$1" are quoted so that the
  # help text keeps its line breaks and values containing spaces stay intact.
  while [ $# -gt 0 ];
  do
    case "$1" in
    --help) echo -e "$usage"; exit 0 ;;
    --config-dir=*)
    CONFDIR=$(read_dirname "$1") || exit 1; shift ;;
    --corpus-dir=*)
    GPDIR=$(read_dirname "$1") || exit 1; shift ;;
    --languages=*)
    # Parameter expansion instead of expr: expr returns non-zero for an empty
    # value, which would end the script silently under errexit.
    LANGUAGES=${1#*=}; shift ;;
    *)  echo "Unknown argument: $1, exiting"; echo -e "$usage"; exit 1 ;;
    esac
  done

  # All three options are required; report a missing one here rather than
  # failing later on an unset variable.
  [ -n "${CONFDIR:-}" ] || error_exit "$PROG: missing required argument --config-dir\n\n$usage";
  [ -n "${GPDIR:-}" ] || error_exit "$PROG: missing required argument --corpus-dir\n\n$usage";
  [ -n "${LANGUAGES:-}" ] || error_exit "$PROG: missing required argument --languages\n\n$usage";
  
  # (1) Check that the config files are in place. The files are tested through
  # their quoted full paths rather than by pushd-ing into an unquoted $CONFDIR,
  # so a config directory whose name contains whitespace works and the current
  # directory is never changed.
  [ -f "$CONFDIR/dev_spk.list" ] || error_exit "$PROG: Dev-set speaker list not found.";
  [ -f "$CONFDIR/eval_spk.list" ] || error_exit "$PROG: Eval-set speaker list not found.";
  [ -f "$CONFDIR/lang_codes.txt" ] || error_exit "$PROG: Mapping for language name to 2-letter code not found.";

  # path.sh is optional; when present in the current directory it is sourced.
  [ -f path.sh ] && . ./path.sh  # Sets the PATH to contain necessary executables
  
  # (2) get the various file lists (for audio, transcription, etc.) for the
  # specified language.
  printf "Preparing file lists ... "
  flist_jobs=""   # space-separated "<language>:<pid>" entries, one per job
  for L in $LANGUAGES; do
    mkdir -p "data/$L/local/data"
    # Running these in parallel since this does audio conversion (to figure out
    # which files cannot be processed) and takes some time to run.
    local/gp_prep_flists.sh --corpus-dir="$GPDIR" --dev-spk="$CONFDIR/dev_spk.list" \
      --eval-spk="$CONFDIR/eval_spk.list" --lang-map="$CONFDIR/lang_codes.txt" \
      --work-dir=data "$L" >& "data/$L/prep_flists.log" &
    flist_jobs="$flist_jobs $L:$!"
  done
  # A bare "wait" returns 0 whatever the jobs did, and errexit does not apply
  # to background jobs, so each PID is waited for individually and the
  # languages whose job failed are collected.
  flist_failed=""
  for job in $flist_jobs; do
    wait "${job##*:}" || flist_failed="$flist_failed ${job%%:*}"
  done
  [ -z "$flist_failed" ] || error_exit "\n$PROG: gp_prep_flists.sh failed for:$flist_failed (see data/<lang>/prep_flists.log)";
  echo "Done"
  
  # (3) Normalize the transcripts.
  # For every language the script local/gp_norm_trans_<LANG>.pl is run on each
  # <set>_<LANG>.trans1 file and its stdout is stored as <set>_<LANG>.txt.
  for L in $LANGUAGES
  do
    printf "Language - ${L}: normalizing transcripts ... "
    trans_dir="data/$L/local/data"
    for x in train dev eval
    do
      "local/gp_norm_trans_$L.pl" -i "$trans_dir/${x}_$L.trans1" > "$trans_dir/${x}_$L.txt"
    done
    echo "Done"
  done
  
  # (4) Create the directories that contain the files needed in training and
  # testing: for each language and each of train/dev/eval, the prepared files
  # are copied from data/<LANG>/local/data into data/<LANG>/<set>/.
  for L in $LANGUAGES
  do
    printf "Language - ${L}: formatting train/test data ... "
    for x in train dev eval
    do
      src_prefix="data/$L/local/data/${x}_$L"
      dest_dir="data/$L/$x"
      mkdir -p "$dest_dir"
      cp "${src_prefix}_wav.scp" "$dest_dir/wav.scp"
      cp "${src_prefix}.txt" "$dest_dir/text"
      cp "${src_prefix}.spk2utt" "$dest_dir/spk2utt"
      cp "${src_prefix}.utt2spk" "$dest_dir/utt2spk"
    done
    echo "Done"
  done


  echo "Finished data preparation."