Blame view
egs/timit/s5/local/timit_data_prep.sh
4.69 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
#!/bin/bash

# Copyright 2013   (Authors: Bagher BabaAli, Daniel Povey, Arnab Ghoshal)
#           2014   Brno University of Technology (Author: Karel Vesely)
# Apache 2.0.

# Prepares the TIMIT corpus for the recipe.
#
# Usage: local/timit_data_prep.sh <timit-dir>
#
# Must be run from the recipe directory: it sources ./path.sh (for KALDI_ROOT)
# and reads ./conf/{dev,test}_spk.list and ./conf/phones.60-48-39.map.
#
# For each of x in {train, dev, test} the following files are written to
# data/local/data:
#   ${x}_sph.flist ${x}_sph.scp ${x}.uttids ${x}.trans ${x}.text ${x}_wav.scp
#   ${x}.utt2spk ${x}.spk2utt ${x}.spk2gender ${x}_dur.ark ${x}.stm ${x}.glm

PROG=$(basename "$0")

# Print the message given as $1 on stderr and abort the script.
error_exit() {
  echo "$1" >&2
  exit 1
}

if [ $# -ne 1 ]; then
  echo "Argument should be the Timit directory, see ../run.sh for example."
  exit 1
fi

# Root of the TIMIT corpus (the single command line argument).
timit=$1

# Without this, the "|| exit 1" guards below would only see the exit status
# of the last command of each pipeline.
set -o pipefail

dir=$(pwd)/data/local/data
lmdir=$(pwd)/data/local/nist_lm
mkdir -p "$dir" "$lmdir"
local=$(pwd)/local
utils=$(pwd)/utils
conf=$(pwd)/conf

. ./path.sh || error_exit "$PROG: Could not source ./path.sh" # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe"
  exit 1
fi

[ -f "$conf/test_spk.list" ] || error_exit "$PROG: Eval-set speaker list not found."
[ -f "$conf/dev_spk.list" ] || error_exit "$PROG: dev-set speaker list not found."

# First check if the train & test directories exist (these can either be
# upper- or lower-cased).
if { [ ! -d "$timit/TRAIN" ] || [ ! -d "$timit/TEST" ]; } &&
   { [ ! -d "$timit/train" ] || [ ! -d "$timit/test" ]; }; then
  echo "timit_data_prep.sh: Spot check of command line argument failed"
  echo "Command line argument must be absolute pathname to TIMIT directory"
  echo "with name like /export/corpora5/LDC/LDC93S1/timit/TIMIT"
  exit 1
fi

# Now check what case the directory structure is.
uppercased=false
train_dir=train
test_dir=test
if [ -d "$timit/TRAIN" ]; then
  uppercased=true
  train_dir=TRAIN
  test_dir=TEST
fi

tmpdir=$(mktemp -d /tmp/kaldi.XXXX) \
  || error_exit "$PROG: Could not create a temporary directory."
trap 'rm -rf "$tmpdir"' EXIT

# Get the list of speakers. The list of speakers in the 24-speaker core test
# set and the 50-speaker development set must be supplied to the script. All
# speakers in the 'train' directory are used for training.
# The supplied lists are converted to the same case as the corpus directories
# so that they can be used as grep patterns on the file paths below.
if $uppercased; then
  tr '[:lower:]' '[:upper:]' < "$conf/dev_spk.list" > "$tmpdir/dev_spk"
  tr '[:lower:]' '[:upper:]' < "$conf/test_spk.list" > "$tmpdir/test_spk"
  ls -d "$timit"/TRAIN/DR*/* | sed -e "s:^.*/::" > "$tmpdir/train_spk"
else
  tr '[:upper:]' '[:lower:]' < "$conf/dev_spk.list" > "$tmpdir/dev_spk"
  tr '[:upper:]' '[:lower:]' < "$conf/test_spk.list" > "$tmpdir/test_spk"
  ls -d "$timit"/train/dr*/* | sed -e "s:^.*/::" > "$tmpdir/train_spk"
fi

# All output files below are written relative to $dir.
cd "$dir" || error_exit "$PROG: Could not cd to $dir"

for x in train dev test; do
  # First, find the list of audio files (use only si & sx utterances).
  # Note: train & test sets are under different directories, but doing find on
  # both and grepping for the speakers will work correctly.
  find "$timit/$train_dir" "$timit/$test_dir" \
      -not \( -iname 'SA*' \) -iname '*.WAV' \
    | grep -f "$tmpdir/${x}_spk" > "${x}_sph.flist"

  # Utterance id is <speaker-dir>_<file-basename>.
  sed -e 's:.*/\(.*\)/\(.*\).\(WAV\|wav\)$:\1_\2:' "${x}_sph.flist" \
    > "$tmpdir/${x}_sph.uttids"
  paste "$tmpdir/${x}_sph.uttids" "${x}_sph.flist" \
    | sort -k1,1 > "${x}_sph.scp"

  awk '{print $1}' "${x}_sph.scp" > "${x}.uttids"

  # Now, Convert the transcripts into our format (no normalization yet)
  # Get the transcripts: each line of the output contains an utterance
  # ID followed by the transcript.
  find "$timit/$train_dir" "$timit/$test_dir" \
      -not \( -iname 'SA*' \) -iname '*.PHN' \
    | grep -f "$tmpdir/${x}_spk" > "$tmpdir/${x}_phn.flist"
  sed -e 's:.*/\(.*\)/\(.*\).\(PHN\|phn\)$:\1_\2:' "$tmpdir/${x}_phn.flist" \
    > "$tmpdir/${x}_phn.uttids"
  while IFS= read -r line; do
    [ -f "$line" ] || error_exit "Cannot find transcription file '$line'"
    # Third space-separated field of every line, joined onto a single line;
    # the trailing blanks left by tr are replaced with one newline.
    cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;'
  done < "$tmpdir/${x}_phn.flist" > "$tmpdir/${x}_phn.trans"
  paste "$tmpdir/${x}_phn.uttids" "$tmpdir/${x}_phn.trans" \
    | sort -k1,1 > "${x}.trans"

  # Do normalization steps.
  "$local/timit_norm_trans.pl" -i - -m "$conf/phones.60-48-39.map" -to 48 \
      < "${x}.trans" \
    | sort > "$x.text" || exit 1

  # Create wav.scp: "<utt-id> <sph2pipe> -f wav <sphere-file> |"
  # The sph2pipe path is passed with -v so that it is never interpreted as
  # part of the printf format string.
  awk -v sph2pipe="$sph2pipe" \
    '{printf("%s %s -f wav %s |\n", $1, sph2pipe, $2);}' \
    < "${x}_sph.scp" > "${x}_wav.scp"

  # Make the utt2spk and spk2utt files.
  cut -f1 -d'_' "$x.uttids" | paste -d' ' "$x.uttids" - > "$x.utt2spk"
  "$utils/utt2spk_to_spk2utt.pl" < "$x.utt2spk" > "$x.spk2utt" || exit 1

  # Prepare gender mapping: the lower-cased first character of the speaker id.
  awk '{print $1}' "$x.spk2utt" \
    | perl -ane 'chop; m:^.:; $g = lc($&); print "$_ $g\n";' > "$x.spk2gender"

  # Prepare STM file for sclite:
  wav-to-duration --read-entire-file=true "scp:${x}_wav.scp" "ark,t:${x}_dur.ark" || exit 1
  # Notes on the awk program:
  #  * "(getline < dur) > 0" stops on end-of-file AND on a read error
  #    (getline returns -1 then, which a bare "while(getline < dur)" treats
  #    as true forever).
  #  * awk strings are 1-indexed; substr(spk,0,1) is empty in gawk/mawk, which
  #    labelled every speaker "M". The first character is also lower-cased so
  #    that upper-cased corpora agree with the spk2gender file above.
  awk -v dur="${x}_dur.ark" \
  'BEGIN{
     while((getline < dur) > 0) { durH[$1]=$2; }
     print ";; LABEL \"O\" \"Overall\" \"Overall\"";
     print ";; LABEL \"F\" \"Female\" \"Female speakers\"";
     print ";; LABEL \"M\" \"Male\" \"Male speakers\"";
   }
   { wav=$1; spk=wav; sub(/_.*/,"",spk); $1=""; ref=$0;
     gender=(tolower(substr(spk,1,1)) == "f" ? "F" : "M");
     printf("%s 1 %s 0.0 %f <O,%s> %s\n", wav, spk, durH[wav], gender, ref);
   }
  ' "${x}.text" > "${x}.stm" || exit 1

  # Create dummy GLM file for sclite:
  echo ';; empty.glm
  [FAKE] => %HESITATION / [ ] __ [ ] ;; hesitation token
  ' > "${x}.glm"
done

echo "Data preparation succeeded"