Blame view

Scripts/utils/fix_data_dir.sh 5.26 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
  #!/bin/bash
  
  # This script makes sure that only the segments present in
  # all of "feats.scp", "wav.scp" [if present], "segments" [if present],
  # "text", and "utt2spk" are present in any of them.
  # It puts the original contents of data-dir into
  # data-dir/.backup
  
  # Validate the command line: exactly one argument, the data directory.
  if [ $# != 1 ]; then
    echo "Usage: fix_data_dir.sh data-dir"
    exit 1
  fi

  data=$1

  # Validate the inputs BEFORE creating anything.  "mkdir -p" creates missing
  # parent directories, so running it first would silently create a mistyped
  # data directory and the "no such directory" check could never fire.
  [ ! -d "$data" ] && echo "$0: no such directory $data" && exit 1;

  [ ! -f "$data/utt2spk" ] && echo "$0: no such file $data/utt2spk" && exit 1;

  # Directory that receives copies of the data files before they are modified.
  mkdir -p "$data/.backup" || exit 1

  # Scratch space for intermediate id lists; removed on exit or on a signal.
  # Abort if it cannot be created: an empty $tmpdir would turn every
  # "$tmpdir/..." path below into a path under "/".
  tmpdir=$(mktemp -d) || exit 1
  trap 'rm -rf "$tmpdir"' EXIT HUP INT PIPE TERM

  # Byte-wise collation so that sort/cmp results do not depend on the locale.
  export LC_ALL=C
  
  
  # Ensure a table file is sorted on its first field with unique first fields.
  #
  # The file is compared with the output of "sort -k1,1 -u"; if they differ
  # the file is replaced by the sorted, de-duplicated version and a message is
  # printed to stdout.
  # Arguments:
  #   $1 - path of the file to check (rewritten in place when needed)
  # Returns:
  #   1 if the file could not be sorted (the file is then left untouched),
  #   otherwise the status of the final mv/rm.
  function check_sorted {
    local file=$1
    local sorted="$file.tmp"
    # Never replace the file with the output of a sort that failed.
    if ! sort -k1,1 -u <"$file" >"$sorted"; then
      echo "$0: could not sort file $1" >&2
      rm -f "$sorted"
      return 1
    fi
    if ! cmp -s "$file" "$sorted"; then
      echo "$0: file $1 is not in sorted order or not unique, sorting it"
      mv "$sorted" "$file"
    else
      rm "$sorted"
    fi
  }
  
  # Back up every known data file that exists, then normalize it so that it is
  # sorted on its first field with unique keys (see check_sorted).
  for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp reco2file_and_channel spk2gender; do
    [ -f "$data/$x" ] || continue
    # Never start rewriting a file in place unless its copy was actually made.
    if ! cp "$data/$x" "$data/.backup/$x"; then
      echo "$0: could not back up $data/$x to $data/.backup/$x"
      exit 1
    fi
    check_sorted "$data/$x"
  done
  
  
  # Filter a table in place through utils/filter_scp.pl.
  #
  # Arguments:
  #   $1 - filter: id list handed to utils/filter_scp.pl as its first argument
  #   $2 - file_to_filter: table that is overwritten with the filtered output
  # Outputs:
  #   Prints a message to stdout when the number of lines changed.
  # Returns:
  #   1 if the working copy could not be made (the table is left untouched).
  function filter_file {
    local filter=$1
    local file_to_filter=$2
    local unfiltered="${file_to_filter}.tmp"
    local length1 length2
    # The table is overwritten below, so refuse to continue without a copy.
    cp "$file_to_filter" "$unfiltered" || return 1
    utils/filter_scp.pl "$filter" "$unfiltered" > "$file_to_filter"
    if ! cmp -s "$unfiltered" "$file_to_filter"; then
      length1=$(wc -l <"$unfiltered")
      length2=$(wc -l <"$file_to_filter")
      if [ "$length1" -ne "$length2" ]; then
        echo "$0: filtered $file_to_filter from $length1 to $length2 lines based on filter $filter."
      fi
    fi
    # The copy was only needed as filter input and for the comparison above;
    # remove it so that no stray "*.tmp" files pile up in the data directory.
    rm -f "$unfiltered"
  }
  
  # Make segments, wav.scp and reco2file_and_channel agree on recording-ids.
  #
  # We call this once before the stage when we filter on utterance-id, and
  # once after.  It does nothing when $data/segments does not exist.
  # Globals:
  #   data   (read) - data directory whose files are rewritten in place
  #   tmpdir (read) - scratch directory holding the list of recording-ids
  # Exits with status 1 if segments exists without wav.scp, or if no
  # recording-id can be read from segments.
  function filter_recordings {
    # Without a segments file there is nothing keyed by recording-id here.
    [ -f "$data/segments" ] || return 0

    # We have a segments file -> we need to filter this and the file wav.scp,
    # and reco2file_and_channel, if it exists, to make sure they have the same
    # list of recording-ids.
    if [ ! -f "$data/wav.scp" ]; then
      echo "$0: $data/segments exists but not $data/wav.scp"
      exit 1;
    fi

    # Recording-ids referenced by segments (its second column) ...
    awk '{print $2}' < "$data/segments" | sort -u > "$tmpdir/recordings"
    [ ! -s "$tmpdir/recordings" ] && \
      echo "Empty list of recordings (bad file $data/segments)?" && exit 1;
    # ... narrowed down by utils/filter_scp.pl against wav.scp.
    utils/filter_scp.pl "$data/wav.scp" "$tmpdir/recordings" > "$tmpdir/recordings.tmp"
    mv "$tmpdir/recordings.tmp" "$tmpdir/recordings"

    # Swap the first two columns so the recording-id comes first (presumably
    # because utils/filter_scp.pl keys on the first field -- confirm), filter,
    # then swap the columns back.
    cp "$data/segments" "$data/segments.tmp"
    awk '{print $2, $1, $3, $4}' < "$data/segments.tmp" > "$data/segments"
    filter_file "$tmpdir/recordings" "$data/segments"
    cp "$data/segments" "$data/segments.tmp"
    awk '{print $2, $1, $3, $4}' < "$data/segments.tmp" > "$data/segments"
    # Do not leave the swap scratch file behind in the data directory.
    rm -f "$data/segments.tmp"

    filter_file "$tmpdir/recordings" "$data/wav.scp"
    if [ -f "$data/reco2file_and_channel" ]; then
      filter_file "$tmpdir/recordings" "$data/reco2file_and_channel"
    fi
  }
  
  # Make spk2utt, utt2spk, cmvn.scp and spk2gender agree on speaker-ids.
  #
  # Globals:
  #   data   (read) - data directory whose files are rewritten in place
  #   tmpdir (read) - scratch directory holding the list of speaker-ids
  function filter_speakers {
    # throughout this program, we regard utt2spk as primary and spk2utt as derived, so...
    utils/utt2spk_to_spk2utt.pl "$data/utt2spk" > "$data/spk2utt"

    if [ -f "$data/cmvn.scp" ]; then
      # Speaker-ids of spk2utt, narrowed down by utils/filter_scp.pl against
      # cmvn.scp.
      awk '{print $1}' < "$data/spk2utt" > "$tmpdir/speakers"
      utils/filter_scp.pl "$data/cmvn.scp" "$tmpdir/speakers" > "$tmpdir/speakers.tmp"
      mv "$tmpdir/speakers.tmp" "$tmpdir/speakers"

      filter_file "$tmpdir/speakers" "$data/cmvn.scp"
      filter_file "$tmpdir/speakers" "$data/spk2utt"
      # spk2utt may have lost speakers, so rebuild utt2spk from it.
      utils/spk2utt_to_utt2spk.pl "$data/spk2utt" > "$data/utt2spk"
    fi
    if [ -f "$data/spk2gender" ]; then
      # We don't handle the case when the spk2gender does not cover all speakers.
      awk '{print $1}' < "$data/spk2utt" > "$tmpdir/speakers"
      filter_file "$tmpdir/speakers" "$data/spk2gender"
    fi
  }
  
  # Restrict the utterance-indexed files to a common set of utterance-ids.
  #
  # The utterance-ids of utt2spk are filtered through utils/filter_scp.pl
  # against each of feats.scp, text, segments (and wav.scp when there is no
  # segments file) that exists; utt2spk and those files are then filtered down
  # to the surviving ids.
  # Globals:
  #   data   (read) - data directory whose files are rewritten in place
  #   tmpdir (read) - scratch directory for the id list and working copies
  # Exits with status 1 if utt2spk/spk2utt fail the sort checks or if no
  # utterance remains.
  function filter_utts {
    local x maybe_wav nutts nfeats ntext
    awk '{print $1}' < "$data/utt2spk" > "$tmpdir/utts"

    # Do a check.
    if ! sort < "$data/utt2spk" | cmp - "$data/utt2spk"; then
      echo "utt2spk is not in sorted order (fix this yourself)"
      exit 1
    fi

    if ! sort -k2 < "$data/utt2spk" | cmp - "$data/utt2spk"; then
      echo "utt2spk is not in sorted order when sorted first on speaker-id "
      echo "(fix this by making speaker-ids prefixes of utt-ids)"
      exit 1
    fi

    if ! sort < "$data/spk2utt" | cmp - "$data/spk2utt"; then
      echo "spk2utt is not in sorted order (fix this yourself)"
      exit 1
    fi

    maybe_wav=
    [ ! -f "$data/segments" ] && maybe_wav=wav.scp  # wav indexed by utts only if segments does not exist.
    # $maybe_wav is deliberately unquoted: when empty it must vanish from the list.
    for x in feats.scp text segments $maybe_wav; do
      if [ -f "$data/$x" ]; then
        utils/filter_scp.pl "$data/$x" "$tmpdir/utts" > "$tmpdir/utts.tmp"
        mv "$tmpdir/utts.tmp" "$tmpdir/utts"
      fi
    done
    if [ ! -s "$tmpdir/utts" ]; then
      echo "fix_data_dir.sh: no utterances remained: not proceeding further."
      rm "$tmpdir/utts" && exit 1
    fi

    nutts=$(wc -l < "$tmpdir/utts")
    if [ -f "$data/feats.scp" ]; then
      nfeats=$(wc -l < "$data/feats.scp")
    else
      nfeats=0
    fi
    # text is optional (see the loop above); count it as 0 lines when absent
    # instead of letting the read fail with an error message.
    if [ -f "$data/text" ]; then
      ntext=$(wc -l < "$data/text")
    else
      ntext=0
    fi
    if [ "$nutts" -ne "$nfeats" ] || [ "$nutts" -ne "$ntext" ]; then
      echo "fix_data_dir.sh: kept $nutts utterances, vs. $nfeats features and $ntext transcriptions."
    else
      echo "fix_data_dir.sh: kept all $nutts utterances."
    fi

    for x in utt2spk feats.scp text segments $maybe_wav; do
      if [ -f "$data/$x" ]; then
        # An existing $data/.backup/$x is kept as it is, so that .backup keeps
        # the contents saved earlier rather than this already-processed
        # intermediate.  Only create a backup here when none exists yet.
        [ -f "$data/.backup/$x" ] || cp "$data/$x" "$data/.backup/$x"
        mv "$data/$x" "$tmpdir/unfiltered"
        utils/filter_scp.pl "$tmpdir/utts" "$tmpdir/unfiltered" > "$data/$x"
      fi
    done
    rm -f "$tmpdir/unfiltered"

  }
  
  # Run the consistency passes in order; recordings are reconciled both before
  # and after the utterance-level pass.
  filter_recordings
  filter_speakers
  filter_utts
  filter_recordings

  # utt2spk is regarded as primary and spk2utt as derived, so regenerate
  # spk2utt from the final utt2spk.
  utils/utt2spk_to_spk2utt.pl "$data/utt2spk" > "$data/spk2utt"

  echo "fix_data_dir.sh: old files are kept in $data/.backup"