Blame view
egs/gp/s1/local/gp_prep_flists.sh
4.8 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 |
#!/bin/bash

# Copyright 2012  Arnab Ghoshal

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Prepares train/dev/eval file lists, wav.scp, transcripts, utt2spk and
# spk2utt for one GlobalPhone language. The dev and eval speakers come from
# the supplied speaker lists; every other speaker found under the corpus
# "adc" directory goes to the training set.
#
# External helpers expected on PATH: gp_convert_audio.sh,
# gp_extract_transcripts.pl, utt2spk_to_spk2utt.pl.

# Options are set here rather than on the shebang line so that they also
# apply when the script is started as "bash gp_prep_flists.sh ...".
set -o nounset
set -o errexit
set -o pipefail

#######################################
# Strips the "--option=" prefix from an argument, checks that the remainder
# names a directory, and prints that directory's absolute path.
# Arguments: $1 - an argument of the form --some-option=DIR
# Outputs:   absolute path of DIR on stdout; error message on stderr
# Returns:   exits with status 1 if DIR is not an accessible directory
#            (the function is meant to be called in a command substitution,
#            so this terminates the subshell and errexit stops the caller)
#######################################
function read_dirname () {
  local dir_name=${1#*=}
  if [ ! -d "$dir_name" ]; then
    echo "Argument '$dir_name' not a directory" >&2
    exit 1
  fi
  # Declaration and assignment are separate so that a failing cd is not
  # masked by the exit status of 'local'.
  local abs_path
  abs_path=$(cd "$dir_name" 2>/dev/null && pwd) || exit 1
  printf '%s\n' "$abs_path"
}

#######################################
# Turns the speaker-list entry for the current language into one grep
# pattern per speaker, of the form "<LCODE><speaker>_".
# Globals:   LCODE (read)
# Arguments: $1 - speaker list file; lines start with the language code,
#                 then a tab, then space-separated speaker IDs
#            $2 - output file for the patterns
#######################################
function make_spk_patterns () {
  local spk_list=$1
  local out_file=$2
  # One speaker per line is required: sed adds the prefix/suffix per line,
  # and grep -f treats every line as a separate pattern.
  grep -- "^$LCODE" "$spk_list" | cut -f2- | tr ' ' '\n' \
    | sed -e "s?^?$LCODE?" -e 's?$?_?' > "$out_file"
}

#######################################
# Aborts if a speaker-pattern file contains the placeholders XXX or TBA,
# which mark languages whose dev/eval split is not defined yet.
# Arguments: $1 - label used in the message ("Dev" or "Eval")
#            $2 - speaker-pattern file to check
#######################################
function check_spk_defined () {
  local label=$1
  local spk_file=$2
  if grep -q -E 'XXX|TBA' "$spk_file"; then
    echo "$label speaker list not defined. File contents:" >&2
    cat "$spk_file" >&2
    exit 1
  fi
}

PROG=$(basename "$0")
usage="Usage: $PROG <arguments> <2-letter language code>
Prepare train, dev, eval file lists for a language.

Required arguments:
  --corpus-dir=DIR\tDirectory for the GlobalPhone corpus
  --dev-spk=FILE\t\tDev set speaker list
  --eval-spk=FILE\tEval set speaker list
  --lang-map=FILE\tMapping from 2-letter language code to full name
  --work-dir=DIR\t\tPlace to write the files (in a subdirectory with the 2-letter language code)
"

if [ $# -lt 6 ]; then
  # %b interprets the \t escapes; quoting keeps the line breaks intact.
  printf '%b\n' "$usage" >&2
  exit 1
fi

while [ $# -gt 0 ]; do
  case "$1" in
    --help)
      printf '%b\n' "$usage"
      exit 0
      ;;
    --corpus-dir=*)
      GPDIR=$(read_dirname "$1")
      shift
      ;;
    --work-dir=*)
      WDIR=$(read_dirname "$1")
      shift
      ;;
    --dev-spk=*)
      DEVSPK=${1#*=}
      shift
      ;;
    --eval-spk=*)
      EVALSPK=${1#*=}
      shift
      ;;
    --lang-map=*)
      LANGMAP=${1#*=}
      shift
      ;;
    ??)
      LCODE=$1
      shift
      ;;
    *)
      echo "Unknown argument: $1, exiting" >&2
      printf '%b\n' "$usage" >&2
      exit 1
      ;;
  esac
done

# The argument-count check above does not guarantee that each required
# option was actually given (or given a non-empty value); fail with a clear
# message instead of an "unbound variable" error further down.
: "${GPDIR:?--corpus-dir=DIR is required}"
: "${WDIR:?--work-dir=DIR is required}"
: "${DEVSPK:?--dev-spk=FILE is required}"
: "${EVALSPK:?--eval-spk=FILE is required}"
: "${LANGMAP:?--lang-map=FILE is required}"
: "${LCODE:?the 2-letter language code is required}"

tmpdir=$(mktemp -d)
trap 'rm -rf -- "$tmpdir"' EXIT

make_spk_patterns "$DEVSPK" "$tmpdir/dev_spk"
make_spk_patterns "$EVALSPK" "$tmpdir/eval_spk"

# Currently the Dev/Eval info is missing for some languages and is marked
# by either TBA or XXX in the speaker list. We are currently not processing
# such languages.
check_spk_defined "Dev" "$tmpdir/dev_spk"
check_spk_defined "Eval" "$tmpdir/eval_spk"

# We are going to use the 2-letter codes throughout, but the top-level
# directories of the GlobalPhone corpus use the full names of languages.
# NOTE(review): the code is matched anywhere on the line, as before; if the
# map always has the code in column 1, '$1 == lc' would be stricter — confirm.
full_name=$(awk -v lc="$LCODE" '$0 ~ lc {print $2}' "$LANGMAP")
if [ -z "$full_name" ]; then
  echo "Language code '$LCODE' not found in '$LANGMAP'" >&2
  exit 1
fi

# Entries of the adc directory become speaker patterns in the same
# "<LCODE><entry>_" form as the dev/eval patterns.
ls "$GPDIR/$full_name/adc" | sed -e "s?^?$LCODE?" -e 's?$?_?' \
  > "$tmpdir/all_spk"
# Training speakers are all speakers that are in neither dev nor eval.
grep -v -f "$tmpdir/dev_spk" -f "$tmpdir/eval_spk" "$tmpdir/all_spk" \
  > "$tmpdir/train_spk"
find "$GPDIR/$full_name/rmn" -name '*.rmn' > "$tmpdir/trans.list"

ODIR=$WDIR/$LCODE/local          # Directory to write file lists & transcripts
mkdir -p "$ODIR" "$WDIR/$LCODE/wav"  # Directory for WAV files

for x in dev eval train; do
  find "$GPDIR/$full_name/adc" -name "${LCODE}*\.adc\.shn" \
    | grep -f "$tmpdir/${x}_spk" > "$ODIR/${x}_${LCODE}.flist"
  # The audio conversion is done here since some files cannot be converted,
  # and those need to be removed from the file lists.
  gp_convert_audio.sh --input-list="$ODIR/${x}_${LCODE}.flist" \
    --output-dir="$WDIR/$LCODE/wav" \
    --output-list="$ODIR/${x}_${LCODE}_wav.flist"

  # Get the utterance IDs for the audio files successfully converted to WAV
  sed -e "s?.*/??" -e 's?\.wav$??' "$ODIR/${x}_${LCODE}_wav.flist" \
    > "$tmpdir/${x}_basenames_wav"
  paste "$tmpdir/${x}_basenames_wav" "$ODIR/${x}_${LCODE}_wav.flist" \
    | sort -k1,1 > "$tmpdir/${x}_${LCODE}_wav.scp"
  cut -f1 "$tmpdir/${x}_${LCODE}_wav.scp" > "$tmpdir/${x}_basenames_wav2"

  # Now, get the transcripts: each line of the output contains an utterance
  # ID followed by the transcript.
  sed -e 's?_$??' "$tmpdir/${x}_spk" | grep -f - "$tmpdir/trans.list" \
    | gp_extract_transcripts.pl | sort -k1,1 > "$tmpdir/${x}_${LCODE}.trans"

  # Intersect the set of utterances with transcripts with the set of those
  # with valid audio.
  cut -f1 "$tmpdir/${x}_${LCODE}.trans" \
    | join "$tmpdir/${x}_basenames_wav2" - > "$tmpdir/${x}_basenames"
  # Get the common set of WAV files and transcripts.
  join "$tmpdir/${x}_basenames" "$tmpdir/${x}_${LCODE}_wav.scp" \
    > "$ODIR/${x}_${LCODE}_wav.scp"
  join "$tmpdir/${x}_basenames" "$tmpdir/${x}_${LCODE}.trans" \
    > "$ODIR/${x}_${LCODE}.trans"

  # The speaker ID is the utterance ID up to its first underscore.
  sed -e 's?_.*$??' "$tmpdir/${x}_basenames" \
    | paste -d' ' "$tmpdir/${x}_basenames" - \
    > "$ODIR/${x}_${LCODE}.utt2spk"
  utt2spk_to_spk2utt.pl "$ODIR/${x}_${LCODE}.utt2spk" \
    > "$ODIR/${x}_${LCODE}.spk2utt" || exit 1
done