Blame view
egs/gp/s5/local/gp_prep_flists.sh
5.88 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
#!/bin/bash

# Copyright 2012  Arnab Ghoshal

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Prepares the train, dev and eval file lists, wav.scp, transcripts, utt2spk
# and spk2utt for one GlobalPhone language, identified by its 2-letter code.
# Output goes to <work-dir>/<code>/local/data and <work-dir>/<code>/wav.

# Options are set here rather than on the shebang line so that they also take
# effect when the script is invoked as "bash gp_prep_flists.sh ...".
set -o errexit
set -o nounset
set -o pipefail

# Prints the absolute path of the directory given in an argument of the form
# --option=DIR. Writes an error to stderr and exits with status 1 if DIR is
# not a directory. Intended to be called inside a command substitution.
function read_dirname () {
  local dir_name=${1#*=}
  local abs_path
  if [ ! -d "$dir_name" ]; then
    echo "Argument '$dir_name' not a directory" >&2
    exit 1
  fi
  # Declaration and assignment are separate so a failing cd is not masked.
  abs_path=$(cd "$dir_name" 2>/dev/null && pwd) || exit 1
  echo "$abs_path"
}

PROG=$(basename "$0")

# Prints the usage message to stdout.
function print_usage () {
  printf 'Usage: %s <arguments> <2-letter language code>\n' "$PROG"
  printf 'Prepare train, dev, eval file lists for a language.\n\n'
  printf 'Required arguments:\n'
  printf '  --corpus-dir=DIR\tDirectory for the GlobalPhone corpus\n'
  printf '  --dev-spk=FILE\t\tDev set speaker list\n'
  printf '  --eval-spk=FILE\tEval set speaker list\n'
  printf '  --lang-map=FILE\tMapping from 2-letter language code to full name\n'
  printf '  --work-dir=DIR\t\tPlace to write the files (in a subdirectory with the 2-letter language code)\n'
}

# Writes one speaker prefix per line ("<LCODE><speaker>_") for the current
# language. $1 is the speaker list file, $2 the output file.
function write_spk_prefixes () {
  # tr puts each speaker ID on its own line so that sed can add the language
  # prefix and the trailing "_" to every ID individually.
  grep "^$LCODE" "$1" | cut -f2- | tr ' ' '\n' \
    | sed -e "s?^?$LCODE?" -e 's?$?_?' > "$2"
}

GPDIR=
WDIR=
DEVSPK=
EVALSPK=
LANGMAP=
LCODE=

while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      print_usage
      exit 0
      ;;
    --corpus-dir=*)
      GPDIR=$(read_dirname "$1")
      shift
      ;;
    --work-dir=*)
      WDIR=$(read_dirname "$1")
      shift
      ;;
    --dev-spk=*)
      DEVSPK=${1#*=}
      shift
      ;;
    --eval-spk=*)
      EVALSPK=${1#*=}
      shift
      ;;
    --lang-map=*)
      LANGMAP=${1#*=}
      shift
      ;;
    ??)
      LCODE=$1
      shift
      ;;
    *)
      echo "Unknown argument: $1, exiting" >&2
      print_usage >&2
      exit 1
      ;;
  esac
done

# All five options and the language code are required.
if [ -z "$GPDIR" ] || [ -z "$WDIR" ] || [ -z "$DEVSPK" ] \
  || [ -z "$EVALSPK" ] || [ -z "$LANGMAP" ] || [ -z "$LCODE" ]; then
  print_usage >&2
  exit 1
fi

for list_file in "$DEVSPK" "$EVALSPK" "$LANGMAP"; do
  if [ ! -r "$list_file" ]; then
    echo "$PROG: cannot read file '$list_file'" >&2
    exit 1
  fi
done

if [ -f path.sh ]; then
  . ./path.sh  # Sets the PATH to contain necessary executables
fi

tmpdir=$(mktemp -d /tmp/kaldi.XXXXXX)
trap 'rm -rf "$tmpdir"' EXIT

write_spk_prefixes "$DEVSPK" "$tmpdir/dev_spk"
write_spk_prefixes "$EVALSPK" "$tmpdir/eval_spk"

# Currently the Dev/Eval info is missing for some languages and is marked
# by either TBA or XXX in the speaker list. We are currently not processing
# such languages.
if grep -E 'XXX|TBA' "$tmpdir/dev_spk"; then
  echo "Dev speaker list not defined. File contents:"
  cat "$tmpdir/dev_spk"
  exit 1
fi
if grep -E 'XXX|TBA' "$tmpdir/eval_spk"; then
  echo "Eval speaker list not defined. File contents:"
  cat "$tmpdir/eval_spk"
  exit 1
fi

# We are going to use the 2-letter codes throughout, but the top-level
# directories of the GlobalPhone corpus use the full names of languages.
# The code is compared against the first field exactly, and passed with -v
# so that it is never interpreted as part of the awk program.
full_name=$(awk -v code="$LCODE" '$1 == code {print $2; exit}' "$LANGMAP")
if [ -z "$full_name" ]; then
  echo "$PROG: no entry for language code '$LCODE' in '$LANGMAP'" >&2
  exit 1
fi

# Entries of the adc directory are taken to be the speaker IDs.
ls "$GPDIR/$full_name/adc" | sed -e "s?^?$LCODE?" -e 's?$?_?' \
  > "$tmpdir/all_spk"
# Training speakers are all speakers that are in neither dev nor eval.
grep -v -f "$tmpdir/dev_spk" -f "$tmpdir/eval_spk" "$tmpdir/all_spk" \
  > "$tmpdir/train_spk"

# Prefer the rmn transcripts; fall back to trl only when no rmn exist.
# A failing find is tolerated, since some transcripts may not exist.
trans=$tmpdir/trans_rmn.list
find "$GPDIR/$full_name/rmn" -name '*.rmn' > "$tmpdir/trans_rmn.list" || true
num_trans_rmn=$(wc -l "$tmpdir/trans_rmn.list" | awk '{print $1}')
find "$GPDIR/$full_name/trl" -name '*.trl' > "$tmpdir/trans_trl.list" || true
num_trans_trl=$(wc -l "$tmpdir/trans_trl.list" | awk '{print $1}')

if [ "$num_trans_rmn" -eq 0 ]; then
  echo "No rmn found for $LCODE: using trl (possibly ISO 8859 encoded)"
  trans=$tmpdir/trans_trl.list
  if [ "$num_trans_trl" -eq 0 ]; then
    echo "Error: no trl transcripts found for $LCODE"
    exit 1
  fi
elif [ "$num_trans_trl" -eq 0 ]; then
  echo "No trl found for $LCODE: using rmn (GlobalPhone style ASCII encoded)"
elif [ "$num_trans_trl" -ne "$num_trans_rmn" ]; then
  echo "Warning: # of rmn ($num_trans_rmn) and # of trl ($num_trans_trl) do not match."
  echo "There is possibly an error. Using rmn transcripts."
fi

ODIR=$WDIR/$LCODE/local/data      # Directory to write file lists & transcripts
mkdir -p "$ODIR" "$WDIR/$LCODE/wav"  # Directory for WAV files

# The list of audio files is the same for every subset, so find it once.
find "$GPDIR/$full_name/adc" -name "${LCODE}*.adc.shn" \
  > "$tmpdir/all_adc.list"

for x in dev eval train; do
  grep -f "$tmpdir/${x}_spk" "$tmpdir/all_adc.list" \
    > "$ODIR/${x}_${LCODE}.flist"

  # The audio conversion is done here since some files cannot be converted,
  # and those need to be removed from the file lists.
  # Unfortunately this needs to be done here, since sox doesn't play nice when
  # called directly from compute-mfcc-feats as a piped command.
  gp_convert_audio.sh --input-list="$ODIR/${x}_${LCODE}.flist" \
    --output-dir="$WDIR/$LCODE/wav" \
    --output-list="$ODIR/${x}_${LCODE}_wav.flist"

  # Get the utterance IDs for the audio files successfully converted to WAV
  sed -e "s?.*/??" -e 's?\.wav$??' "$ODIR/${x}_${LCODE}_wav.flist" \
    > "$tmpdir/${x}_basenames_wav"
  paste "$tmpdir/${x}_basenames_wav" "$ODIR/${x}_${LCODE}_wav.flist" \
    | sort -k1,1 > "$tmpdir/${x}_${LCODE}_wav.scp"
  cut -f1 "$tmpdir/${x}_${LCODE}_wav.scp" > "$tmpdir/${x}_basenames_wav2"

  # Now, get the transcripts: each line of the output contains an utterance
  # ID followed by the transcript.
  sed -e 's?_$??' "$tmpdir/${x}_spk" | grep -f - "$trans" \
    | gp_extract_transcripts.pl | sort -k1,1 > "$tmpdir/${x}_${LCODE}.trans"

  # Intersect the set of utterances with transcripts with the set of those
  # with valid audio.
  cut -f1 "$tmpdir/${x}_${LCODE}.trans" \
    | join "$tmpdir/${x}_basenames_wav2" - > "$tmpdir/${x}_basenames"

  # Get the common set of WAV files and transcripts.
  join "$tmpdir/${x}_basenames" "$tmpdir/${x}_${LCODE}_wav.scp" \
    > "$ODIR/${x}_${LCODE}_wav.scp"
  join "$tmpdir/${x}_basenames" "$tmpdir/${x}_${LCODE}.trans" \
    > "$ODIR/${x}_${LCODE}.trans1"

  # The speaker ID is the part of the utterance ID before the first "_".
  sed -e 's?_.*$??' "$tmpdir/${x}_basenames" \
    | paste -d' ' "$tmpdir/${x}_basenames" - \
    > "$ODIR/${x}_${LCODE}.utt2spk"
  utt2spk_to_spk2utt.pl "$ODIR/${x}_${LCODE}.utt2spk" \
    > "$ODIR/${x}_${LCODE}.spk2utt" || exit 1
done