Blame view

egs/timit/s5/local/timit_data_prep.sh 4.69 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
  #!/bin/bash
  
  # Copyright 2013   (Authors: Bagher BabaAli, Daniel Povey, Arnab Ghoshal)
  #           2014   Brno University of Technology (Author: Karel Vesely)
  # Apache 2.0.
  
  # Usage: timit_data_prep.sh <timit-root>
  # Exactly one argument is expected: the root directory of the TIMIT corpus.
  if [ $# -ne 1 ]; then
     echo "Argument should be the Timit directory, see ../run.sh for example."
     exit 1;
  fi

  # Output and helper directories, all resolved relative to the directory the
  # script is started from.
  dir=$(pwd)/data/local/data
  lmdir=$(pwd)/data/local/nist_lm
  mkdir -p "$dir" "$lmdir"
  local=$(pwd)/local
  utils=$(pwd)/utils
  conf=$(pwd)/conf

  # path.sh is needed for KALDI_ROOT; nothing below works without it.
  . ./path.sh || { echo "$0: failed to source ./path.sh" >&2; exit 1; }
  export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
  sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
  if [ ! -x "$sph2pipe" ]; then
     echo "Could not find (or execute) the sph2pipe program at $sph2pipe";
     exit 1;
  fi

  # The speaker lists for the test and dev sets must be present. (The
  # previous code called an undefined 'error_exit' helper here, so a missing
  # list was reported as "command not found" and the script carried on.)
  if [ ! -f "$conf/test_spk.list" ]; then
    echo "$0: Eval-set speaker list not found." >&2
    exit 1
  fi
  if [ ! -f "$conf/dev_spk.list" ]; then
    echo "$0: dev-set speaker list not found." >&2
    exit 1
  fi

  # First check if the train & test directories exist (these can either be
  # upper- or lower-cased).
  if { [ ! -d "$1/TRAIN" ] || [ ! -d "$1/TEST" ]; } \
     && { [ ! -d "$1/train" ] || [ ! -d "$1/test" ]; }; then
    echo "timit_data_prep.sh: Spot check of command line argument failed"
    echo "Command line argument must be absolute pathname to TIMIT directory"
    echo "with name like /export/corpora5/LDC/LDC93S1/timit/TIMIT"
    exit 1;
  fi
  
  # Now check what case the directory structure is. Lower case is assumed
  # unless an upper-cased TRAIN directory is found.
  uppercased=false
  train_dir=train
  test_dir=test
  if [ -d "$1/TRAIN" ]; then
    uppercased=true
    train_dir=TRAIN
    test_dir=TEST
  fi

  # Scratch directory for intermediate lists; removed on any exit path.
  tmpdir=$(mktemp -d /tmp/kaldi.XXXX) || {
    echo "$0: could not create a temporary directory" >&2
    exit 1
  }
  trap 'rm -rf "$tmpdir"' EXIT

  # Get the list of speakers. The list of speakers in the 24-speaker core test
  # set and the 50-speaker development set must be supplied to the script. All
  # speakers in the 'train' directory are used for training.
  # The dev/test lists are case-folded to match the corpus directory layout.
  if $uppercased; then
    tr '[:lower:]' '[:upper:]' < "$conf/dev_spk.list" > "$tmpdir/dev_spk"
    tr '[:lower:]' '[:upper:]' < "$conf/test_spk.list" > "$tmpdir/test_spk"
    for spk_path in "$1"/TRAIN/DR*/*; do
      [ -e "$spk_path" ] || continue   # glob matched nothing
      printf '%s\n' "${spk_path##*/}"  # keep only the speaker directory name
    done > "$tmpdir/train_spk"
  else
    tr '[:upper:]' '[:lower:]' < "$conf/dev_spk.list" > "$tmpdir/dev_spk"
    tr '[:upper:]' '[:lower:]' < "$conf/test_spk.list" > "$tmpdir/test_spk"
    for spk_path in "$1"/train/dr*/*; do
      [ -e "$spk_path" ] || continue   # glob matched nothing
      printf '%s\n' "${spk_path##*/}"  # keep only the speaker directory name
    done > "$tmpdir/train_spk"
  fi

  # An empty pattern file would make the later 'grep -f' match nothing and
  # silently produce an empty training set, so stop here instead.
  if [ ! -s "$tmpdir/train_spk" ]; then
    echo "$0: no training speakers found under $1/$train_dir" >&2
    exit 1
  fi
  
  # All per-subset output files are written into $dir.
  cd "$dir" || exit 1
  for x in train dev test; do
    # First, find the list of audio files (use only si & sx utterances).
    # Note: train & test sets are under different directories, but doing find on
    # both and grepping for the speakers will work correctly.
    find "$1/$train_dir" "$1/$test_dir" -not \( -iname 'SA*' \) -iname '*.WAV' \
      | grep -f "$tmpdir/${x}_spk" > "${x}_sph.flist"

    # Utterance id = <speaker-dir>_<file-stem>, taken from the last two path
    # components.
    sed -e 's:.*/\(.*\)/\(.*\)\.\(WAV\|wav\)$:\1_\2:' "${x}_sph.flist" \
      > "$tmpdir/${x}_sph.uttids"
    paste "$tmpdir/${x}_sph.uttids" "${x}_sph.flist" \
      | sort -k1,1 > "${x}_sph.scp"

    awk '{print $1}' < "${x}_sph.scp" > "${x}.uttids"

    # Now, Convert the transcripts into our format (no normalization yet)
    # Get the transcripts: each line of the output contains an utterance
    # ID followed by the transcript.
    find "$1/$train_dir" "$1/$test_dir" -not \( -iname 'SA*' \) -iname '*.PHN' \
      | grep -f "$tmpdir/${x}_spk" > "$tmpdir/${x}_phn.flist"
    sed -e 's:.*/\(.*\)/\(.*\)\.\(PHN\|phn\)$:\1_\2:' "$tmpdir/${x}_phn.flist" \
      > "$tmpdir/${x}_phn.uttids"
    while IFS= read -r line; do
      if [ ! -f "$line" ]; then
        echo "$0: Cannot find transcription file '$line'" >&2
        exit 1
      fi
      # Third column of each .PHN line is the phone label; join all labels of
      # one file onto a single line and strip the trailing blanks.
      cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;'
    done < "$tmpdir/${x}_phn.flist" > "$tmpdir/${x}_phn.trans"
    paste "$tmpdir/${x}_phn.uttids" "$tmpdir/${x}_phn.trans" \
      | sort -k1,1 > "${x}.trans"

    # Do normalization steps.
    "$local/timit_norm_trans.pl" -i - -m "$conf/phones.60-48-39.map" -to 48 \
      < "${x}.trans" | sort > "$x.text" || exit 1;

    # Create wav.scp: each entry is a piped sph2pipe command producing wav.
    awk -v sph2pipe="$sph2pipe" \
      '{printf("%s %s -f wav %s |\n", $1, sph2pipe, $2);}' \
      < "${x}_sph.scp" > "${x}_wav.scp"

    # Make the utt2spk and spk2utt files.
    cut -f1 -d'_' "$x.uttids" | paste -d' ' "$x.uttids" - > "$x.utt2spk"
    "$utils/utt2spk_to_spk2utt.pl" < "$x.utt2spk" > "$x.spk2utt" || exit 1;

    # Prepare gender mapping: the first character of the speaker id,
    # lower-cased, is written as the gender.
    awk '{print $1}' < "$x.spk2utt" \
      | perl -ane 'chop; m:^.:; $g = lc($&); print "$_ $g\n";' > "$x.spk2gender"

    # Prepare STM file for sclite:
    wav-to-duration --read-entire-file=true "scp:${x}_wav.scp" "ark,t:${x}_dur.ark" || exit 1
    awk -v dur="${x}_dur.ark" \
    'BEGIN{
       # "> 0" stops on EOF (0) and on read errors (-1) alike.
       while ((getline < dur) > 0) { durH[$1]=$2; }
       print ";; LABEL \"O\" \"Overall\" \"Overall\"";
       print ";; LABEL \"F\" \"Female\" \"Female speakers\"";
       print ";; LABEL \"M\" \"Male\" \"Male speakers\"";
     }
     { wav=$1; spk=wav; sub(/_.*/,"",spk); $1=""; ref=$0;
       # awk strings are 1-indexed; fold case so upper-cased corpora work too.
       gender=(tolower(substr(spk,1,1)) == "f" ? "F" : "M");
       printf("%s 1 %s 0.0 %f <O,%s> %s\n", wav, spk, durH[wav], gender, ref);
     }
    ' "${x}.text" > "${x}.stm" || exit 1

    # Create dummy GLM file for sclite:
    echo ';; empty.glm
    [FAKE]     =>  %HESITATION     / [ ] __ [ ] ;; hesitation token
    ' > "${x}.glm"
  done

  echo "Data preparation succeeded"