Blame view
egs/wsj/s5/utils/combine_data.sh
4.11 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
#!/bin/bash
# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
#           2014  David Snyder

# This script combines the data from multiple source directories into
# a single destination directory.
#
# Usage:
#   combine_data.sh [--extra-files 'file1 file2'] [--skip-fix true|false] \
#       <dest-data-dir> <src-data-dir1> <src-data-dir2> ...
#
# Behavior, as implemented below:
#   * <dest-data-dir> is removed and re-created.
#   * Every source directory must contain utt2spk.
#   * utt2uniq and segments are combined if at least one source has them;
#     missing ones are generated for the sources that lack them.
#   * All other known files (and --extra-files) are combined only when they
#     are present in every source directory.
#   * spk2utt is regenerated from the combined utt2spk, and, unless
#     --skip-fix is true, utils/fix_data_dir.sh is run on the result.

# See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information
# about what these directories contain.

# Begin configuration section.
extra_files=   # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..."
skip_fix=false # skip the fix_data_dir.sh in the end
# End configuration section.

echo "$0 $*"  # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# -lt 2 ]; then
  echo "Usage: combine_data.sh [--extra-files 'file1 file2'] <dest-data-dir> <src-data-dir1> <src-data-dir2> ..."
  echo "Note, files that don't appear in all source dirs will not be combined,"
  echo "with the exception of utt2uniq and segments, which are created where necessary."
  exit 1
fi

dest=$1
shift
# From here on, "$@" holds exactly the list of source data directories.

# Start from a clean destination; the error is suppressed because the
# directory legitimately may not exist yet.
rm -r -- "$dest" 2>/dev/null
mkdir -p -- "$dest"

# Byte-wise collation so that all the 'sort' calls below are reproducible.
export LC_ALL=C

for dir in "$@"; do
  if [ ! -f "$dir/utt2spk" ]; then
    echo "$0: no such file $dir/utt2spk"
    exit 1;
  fi
done

# Check that frame_shift are compatible, where present together with features.
dir_with_frame_shift=
for dir in "$@"; do
  if [[ -f "$dir/feats.scp" && -f "$dir/frame_shift" ]]; then
    if [[ -n "$dir_with_frame_shift" ]] &&
       ! cmp -s "$dir_with_frame_shift/frame_shift" "$dir/frame_shift"; then
      echo "$0:error: different frame_shift in directories $dir and " \
           "$dir_with_frame_shift. Cannot combine features."
      exit 1;
    fi
    dir_with_frame_shift=$dir
  fi
done

# W.r.t. utt2uniq file the script has different behavior compared to other files
# it is not compulsory for it to exist in src directories, but if it exists in
# even one it should exist in all. We will create the files where necessary
has_utt2uniq=false
for in_dir in "$@"; do
  if [ -f "$in_dir/utt2uniq" ]; then
    has_utt2uniq=true
    break
  fi
done

if $has_utt2uniq; then
  # we are going to create an utt2uniq file in the destdir
  for in_dir in "$@"; do
    if [ ! -f "$in_dir/utt2uniq" ]; then
      # we assume that utt2uniq is a one to one mapping; one "<utt> <utt>"
      # entry per line (the trailing newline is required for a valid file).
      awk '{printf("%s %s\n", $1, $1);}' "$in_dir/utt2spk"
    else
      cat "$in_dir/utt2uniq"
    fi
  done | sort -k1 > "$dest/utt2uniq"
  echo "$0: combined utt2uniq"
else
  echo "$0 [info]: not combining utt2uniq as it does not exist"
fi
# some of the old scripts might provide utt2uniq as an extrafile, so just remove it
extra_files=${extra_files//utt2uniq/}

# segments are treated similarly to utt2uniq. If it exists in some, but not all
# src directories, then we generate segments where necessary.
has_segments=false
for in_dir in "$@"; do
  if [ -f "$in_dir/segments" ]; then
    has_segments=true
    break
  fi
done

if $has_segments; then
  # pipefail so that a failure while generating segments for one of the
  # sources aborts the script instead of leaving a truncated segments file.
  set -o pipefail
  for in_dir in "$@"; do
    if [ ! -f "$in_dir/segments" ]; then
      # stdout is the pipe into 'sort', so the info message must go to stderr.
      echo "$0 [info]: will generate missing segments for $in_dir" 1>&2
      utils/data/get_segments_for_data.sh "$in_dir" || exit 1
    else
      cat "$in_dir/segments"
    fi
  done | sort -k1 > "$dest/segments" || exit 1;
  set +o pipefail
  echo "$0: combined segments"
else
  echo "$0 [info]: not combining segments as it does not exist"
fi

# $extra_files is deliberately left unquoted: it is a whitespace-separated
# list of file names supplied via --extra-files.
# shellcheck disable=SC2086
for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
  exists_somewhere=false
  absent_somewhere=false
  for d in "$@"; do
    if [ -f "$d/$file" ]; then
      exists_somewhere=true
    else
      absent_somewhere=true
    fi
  done

  if ! $absent_somewhere; then
    # The file is present in every source directory: concatenate and sort.
    set -o pipefail
    ( for f in "$@"; do cat "$f/$file"; done ) | sort -k1 > "$dest/$file" || exit 1;
    set +o pipefail
    echo "$0: combined $file"
  else
    if ! $exists_somewhere; then
      echo "$0 [info]: not combining $file as it does not exist"
    else
      echo "$0 [info]: **not combining $file as it does not exist everywhere**"
    fi
  fi
done

utils/utt2spk_to_spk2utt.pl <"$dest/utt2spk" >"$dest/spk2utt"

# All sources that carry both feats.scp and frame_shift were verified above to
# agree, so copying the last one seen is sufficient.
if [[ -n "$dir_with_frame_shift" ]]; then
  cp -- "$dir_with_frame_shift/frame_shift" "$dest"
fi

if ! $skip_fix ; then
  utils/fix_data_dir.sh "$dest" || exit 1;
fi

exit 0