egs/tedlium/s5_r2_wsj/local/prepare_data.sh
#!/bin/bash
#
# Copyright 2014  Nickolay V. Shmyrev
#           2014  Brno University of Technology (Author: Karel Vesely)
#           2016  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0
#
# To be run from one directory above this script.

. ./path.sh

export LC_ALL=C

set -e
set -o pipefail
set -u

# Prepare: dev, test, train
for set in dev test train; do
  dir=data/$set.orig
  mkdir -p $dir

  # Merge transcripts into a single 'stm' file, do some mappings:
  # - <F0_M> -> <o,f0,male>   : map dev stm labels to be coherent with train + test,
  # - <F0_F> -> <o,f0,female> : --||--
  # - (2) -> null             : remove pronunciation variants in transcripts, keep in dictionary
  # - <sil> -> null           : remove marked <sil>, it is modelled implicitly (in kaldi)
  # - (...) -> null           : remove utterance names from end-lines of train
  # - it 's -> it's           : merge words that contain apostrophe (if compound in dictionary, local/join_suffix.py)
  { # Add STM header, so sclite can prepare the '.lur' file
    echo ';;
;; LABEL "o" "Overall" "Overall results"
;; LABEL "f0" "f0" "Wideband channel"
;; LABEL "f2" "f2" "Telephone channel"
;; LABEL "male" "Male" "Male Talkers"
;; LABEL "female" "Female" "Female Talkers"
;;'
    # Process the STMs
    cat db/TEDLIUM_release2/$set/stm/*.stm | sort -k1,1 -k2,2 -k4,4n | \
      sed -e 's:<F0_M>:<o,f0,male>:' \
          -e 's:<F0_F>:<o,f0,female>:' \
          -e 's:([0-9])::g' \
          -e 's:<sil>::g' \
          -e 's:([^ ]*)$::' | \
      awk '{ $2 = "A"; print $0; }'
  } | local/join_suffix.py > data/$set.orig/stm

  # Prepare 'text' file
  # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary
  cat $dir/stm | grep -v -e 'ignore_time_segment_in_scoring' -e ';;' | \
    awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100);
           for (i=7;i<=NF;i++) { printf(" %s", $i); } printf("\n"); }' | \
    tr '{}' '[]' | sort -k1,1 > $dir/text.orig
  cat $dir/text.orig | awk '{if (NF > 1) print $0}' | \
    local/normalize_transcript.pl '<NOISE>' | awk '{if (NF > 1) print $0}' \
    > $dir/text || exit 1

  # Prepare 'segments', 'utt2spk', 'spk2utt'
  cat $dir/text | cut -d" " -f 1 | awk -F"-" '{printf("%s %s %07.2f %07.2f\n", $0, $1, $2/100.0, $3/100.0)}' > $dir/segments

  cat $dir/segments | awk '{print $1, $2}' > $dir/utt2spk

  cat $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt

  # Prepare 'wav.scp', 'reco2file_and_channel'
  cat $dir/spk2utt | awk -v set=$set -v pwd=$PWD '{ printf("%s sph2pipe -f wav -p %s/db/TEDLIUM_release2/%s/sph/%s.sph |\n", $1, pwd, set, $1); }' > $dir/wav.scp
  cat $dir/wav.scp | awk '{ print $1, $1, "A"; }' > $dir/reco2file_and_channel

  # Create empty 'glm' file
  echo ';; empty.glm
  [FAKE]  =>  %HESITATION  / [ ] __ [ ] ;; hesitation token
  ' > data/$set.orig/glm

  # The training set seems to not have enough silence padding in the segmentations,
  # especially at the beginning of segments.  Extend the times.
  if [ $set == "train" ]; then
    mv data/$set.orig/segments data/$set.orig/segments.temp
    utils/data/extend_segment_times.py --start-padding=0.15 \
      --end-padding=0.1 <data/$set.orig/segments.temp >data/$set.orig/segments || exit 1
    rm data/$set.orig/segments.temp
  fi

  # Check that data dirs are okay!
  utils/validate_data_dir.sh --no-feats $dir || exit 1
done
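
# A minimal sketch of how the utterance-ids and 'segments' entries produced above
# relate to each other (the talk name 'AlGore_2009' is a hypothetical example, not
# taken from the corpus). Running e.g.
#   echo "AlGore_2009-0001650-0002015" | \
#     awk -F"-" '{printf("%s %s %07.2f %07.2f\n", $0, $1, $2/100.0, $3/100.0)}'
# would print:
#   AlGore_2009-0001650-0002015 AlGore_2009 0016.50 0020.15
# i.e. start/end times are stored inside the utterance-id as zero-padded
# centiseconds and converted back to seconds (two decimals) in 'segments'.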