Blame view

egs/wsj/s5/utils/combine_data.sh 4.11 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
  #!/bin/bash
  # Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.
  #           2014  David Snyder
  
  # This script combines the data from multiple source directories into
  # a single destination directory.
  
  # See http://kaldi-asr.org/doc/data_prep.html#data_prep_data for information
  # about what these directories contain.
  
  # Begin configuration section.
  # Both variables are assigned before parse_options.sh is sourced; presumably
  # that script lets command-line options (e.g. --extra-files, see the usage
  # text below) override them -- confirm against parse_options.sh.
  skip_fix=false  # skip the fix_data_dir.sh in the end
  extra_files=    # specify additional files in 'src-data-dir' to merge, ex. "file1 file2 ..."
  # End configuration section.

  # Log the full command line before any option is consumed.
  echo "$0 $*"

  # path.sh is optional; parse_options.sh is mandatory.
  if [ -f path.sh ]; then
    . ./path.sh
  fi
  . parse_options.sh || exit 1

  # At least a destination and one source directory are required.
  if (( $# < 2 )); then
    printf '%s\n' \
      "Usage: combine_data.sh [--extra-files 'file1 file2'] <dest-data-dir> <src-data-dir1> <src-data-dir2> ..." \
      "Note, files that don't appear in all source dirs will not be combined," \
      "with the exception of utt2uniq and segments, which are created where necessary."
    exit 1
  fi
  
  # The first positional argument is the destination directory; after the
  # shift the positional parameters hold only the source data directories.
  dest=$1
  shift

  # Not referenced again in the visible part of this script; kept unchanged.
  first_src=$1

  # Start from an empty destination.  The path is quoted and preceded by "--"
  # so that a name containing whitespace, glob characters or a leading dash can
  # never make rm delete anything other than the destination itself.
  # Errors are silenced on purpose: the directory usually does not exist yet.
  rm -r -- "$dest" 2>/dev/null
  # Nothing below can succeed without the destination, so stop right away
  # instead of failing later on the first redirection into it.
  mkdir -p -- "$dest" || exit 1

  # Byte-wise, locale-independent ordering for the sort calls further down.
  export LC_ALL=C

  # Every source directory must provide at least utt2spk.
  # NOTE(review): $* is left unquoted so the argument list is word-split exactly
  # as before (source directory names must therefore not contain whitespace);
  # confirm no caller passes several directories in one argument before
  # switching to "$@".
  for dir in $*; do
    if [ ! -f "$dir/utt2spk" ]; then
      echo "$0: no such file $dir/utt2spk"
      exit 1
    fi
  done
  
  # Verify that all source directories which carry both feats.scp and a
  # frame_shift file agree on the frame_shift.  Each such directory is compared
  # with the previous one; dir_with_frame_shift ends up naming the last of them
  # and stays empty when there is none.
  dir_with_frame_shift=
  for dir in $*; do
    # Directories without features or without a frame_shift are not compared.
    [[ -f $dir/feats.scp && -f $dir/frame_shift ]] || continue

    if [[ -n $dir_with_frame_shift ]]; then
      if ! cmp -s "$dir_with_frame_shift/frame_shift" "$dir/frame_shift"; then
        echo "$0:error: different frame_shift in directories $dir and " \
             "$dir_with_frame_shift. Cannot combine features."
        exit 1
      fi
    fi
    dir_with_frame_shift=$dir
  done
  
  # utt2uniq is handled differently from the other files: it is not compulsory
  # in the source directories, but as soon as one of them has it the combined
  # directory gets one too, and it is synthesized for the directories lacking it.
  # NOTE(review): $* is left unquoted on purpose so the source list is
  # word-split exactly as in the rest of the script.
  has_utt2uniq=false
  for in_dir in $*; do
    if [ -f "$in_dir/utt2uniq" ]; then
      has_utt2uniq=true
      break
    fi
  done

  # Prints the utt2uniq lines of data directory $1 on stdout.  When the
  # directory has no utt2uniq file, every utterance id (first field of utt2spk)
  # is mapped to itself, i.e. utt2uniq is assumed to be a one-to-one mapping.
  _combine_data_utt2uniq() {
    if [ -f "$1/utt2uniq" ]; then
      cat "$1/utt2uniq"
    else
      # The format must use the two-character escape \n: awk rejects a string
      # literal that contains a physical line break.  Input is redirected so
      # the path is never interpreted by awk as a var=value operand.
      awk '{printf("%s %s\n", $1, $1);}' < "$1/utt2spk"
    fi
  }

  if $has_utt2uniq; then
    # we are going to create an utt2uniq file in the destdir
    for in_dir in $*; do
      _combine_data_utt2uniq "$in_dir"
    done | sort -k1 > "$dest/utt2uniq"
    echo "$0: combined utt2uniq"
  else
    echo "$0 [info]: not combining utt2uniq as it does not exist"
  fi
  # some of the old scripts might provide utt2uniq as an extrafile, so just remove it
  extra_files=${extra_files//utt2uniq/}
  
  # segments is treated like utt2uniq: if it exists in some, but not all,
  # source directories, it is generated for the directories that lack it.
  # NOTE(review): $* is left unquoted on purpose so the source list is
  # word-split exactly as in the rest of the script.
  has_segments=false
  for in_dir in $*; do
    if [ -f "$in_dir/segments" ]; then
      has_segments=true
      break
    fi
  done

  # Prints the segments of data directory $1 on stdout: the existing file when
  # there is one, otherwise the output of utils/data/get_segments_for_data.sh.
  # Returns the exit status of whichever command produced the output.
  _combine_data_segments() {
    if [ -f "$1/segments" ]; then
      cat "$1/segments"
    else
      echo "$0 [info]: will generate missing segments for $1" 1>&2
      utils/data/get_segments_for_data.sh "$1"
    fi
  }

  if $has_segments; then
    # pipefail is enabled only inside this subshell, so a directory whose
    # segments cannot be read or generated aborts the script instead of
    # silently yielding a truncated segments file.
    (
      set -o pipefail
      for in_dir in $*; do
        _combine_data_segments "$in_dir" || exit 1
      done | sort -k1 > "$dest/segments"
    ) || exit 1
    echo "$0: combined segments"
  else
    echo "$0 [info]: not combining segments as it does not exist"
  fi
  
  # Combine the remaining files.  A file is merged only when every source
  # directory has it; otherwise it is skipped with an informational message.
  # $extra_files is deliberately unquoted: it is a whitespace-separated list.
  # NOTE(review): $* is left unquoted on purpose so the source list is
  # word-split exactly as in the rest of the script.
  for file in utt2spk utt2lang utt2dur utt2num_frames reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
    exists_somewhere=false
    absent_somewhere=false
    for d in $*; do
      if [ -f "$d/$file" ]; then
        exists_somewhere=true
      else
        absent_somewhere=true
      fi
    done

    if ! $absent_somewhere; then
      # pipefail is enabled only inside the subshell, so the option state of
      # the surrounding script is left exactly as it was.
      (
        set -o pipefail
        for d in $*; do cat "$d/$file"; done | sort -k1 > "$dest/$file"
      ) || exit 1
      echo "$0: combined $file"
    elif $exists_somewhere; then
      echo "$0 [info]: **not combining $file as it does not exist everywhere**"
    else
      echo "$0 [info]: not combining $file as it does not exist"
    fi
  done
  
  # Derive spk2utt from the combined utt2spk.  A failure is fatal: otherwise
  # the script could report success with an empty or truncated spk2utt when
  # the final fix-up step is skipped.
  utils/utt2spk_to_spk2utt.pl < "$dest/utt2spk" > "$dest/spk2utt" || exit 1

  # Carry over frame_shift from the last source directory that had both
  # feats.scp and frame_shift (empty when there was none).
  if [[ -n $dir_with_frame_shift ]]; then
    cp -- "$dir_with_frame_shift/frame_shift" "$dest" || exit 1
  fi

  # $skip_fix holds the word "true" or "false" and is run as a command.
  if ! $skip_fix; then
    utils/fix_data_dir.sh "$dest" || exit 1
  fi

  exit 0