Blame view

egs/wsj/s5/utils/data/get_reco2dur.sh 4.92 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
  #!/bin/bash
  
  # Copyright 2016  Johns Hopkins University (author: Daniel Povey)
  #           2018  Andrea Carmantini
  # Apache 2.0
  
  # This script operates on a data directory, such as data/train/, and adds the
  # reco2dur file if it does not already exist.  The file 'reco2dur' maps from
  # recording-id to the duration of the recording in seconds.  If utterance-ids are
  # the same as recording-ids, the durations are copied from the utt2dur file;
  # otherwise this script works them out from the 'wav.scp' file (it first tries
  # interrogating the sphere headers and, if this fails, runs wav-to-duration on
  # the recordings).
  # We could use durations from the segments file, but that is not the duration of
  # the recordings but the sum of utterance lengths (silence in between could be
  # excluded from segments).
  # For the sum of utterance lengths:
  # awk 'FNR==NR{uttdur[$1]=$2;next}
  # { for(i=2;i<=NF;i++){dur+=uttdur[$i];}
  #   print $1 FS dur; dur=0  }'  $data/utt2dur $data/reco2utt
  
  
  # Option defaults.  The usage text below shows the --frame-shift form, so these
  # are presumably overridden as --<name> <value> by utils/parse_options.sh
  # (sourced below; not visible here -- confirm against that file).
  frame_shift=0.01  # kept so --frame-shift is still accepted; not referenced elsewhere in this script
  cmd=run.pl        # launcher used for the parallel wav-to-duration jobs
  nj=4              # number of parallel wav-to-duration jobs

  . utils/parse_options.sh
  . ./path.sh

  # Exactly one positional argument (the data directory) is required.
  if [ $# -ne 1 ]; then
    echo "Usage: $0 [options] <datadir>"
    echo "e.g.:"
    echo " $0 data/train"
    echo " Options:"
    echo " --frame-shift      # frame shift in seconds (default: 0.01).  Accepted for"
    echo "                    # compatibility; not used when computing reco2dur."
    echo " --cmd              # how to run the wav-to-duration jobs (default: run.pl)."
    echo " --nj               # number of parallel wav-to-duration jobs (default: 4)."
    exit 1
  fi

  # Make the text tools invoked below behave in a locale-independent way.
  export LC_ALL=C

  data=$1
  
  
  # Nothing to do if a non-empty reco2dur with one line per wav.scp entry already
  # exists.  The -f test keeps a missing wav.scp from turning the line-count
  # comparison into a shell error; the code below deals with that case.
  # All expansions are quoted so that a data directory containing whitespace works.
  if [ -s "$data/reco2dur" ] && [ -f "$data/wav.scp" ] && \
    [ "$(wc -l < "$data/wav.scp")" -eq "$(wc -l < "$data/reco2dur")" ]; then
    echo "$0: $data/reco2dur already exists with the expected length.  We won't recompute it."
    exit 0;
  fi
  
  # Work out $data/reco2dur by the first method that applies:
  #   1. utt2dur exists, has as many lines as utt2spk, and there is no segments
  #      file: copy utt2dur to reco2dur.
  #   2. wav.scp exists: read the durations from the sphere-file headers, and if
  #      that is not possible run wav-to-duration in $nj parallel jobs.
  #   3. otherwise: fail.
  if [ -s "$data/utt2dur" ] && [ -f "$data/utt2spk" ] && \
     [ "$(wc -l < "$data/utt2spk")" -eq "$(wc -l < "$data/utt2dur")" ] && \
     [ ! -s "$data/segments" ]; then

    echo "$0: $data/wav.scp indexed by utt-id; copying utt2dur to reco2dur"
    # A failed copy is an error; do not fall through as if reco2dur had been written.
    if ! cp "$data/utt2dur" "$data/reco2dur"; then
      echo "$0: failed to copy $data/utt2dur to $data/reco2dur"
      exit 1
    fi
    exit 0

  elif [ -f "$data/wav.scp" ]; then
    echo "$0: obtaining durations from recordings"

    # if the wav.scp contains only lines of the form
    # utt1  /foo/bar/sph2pipe -f wav /baz/foo.sph |
    # the duration is computed from the sample_rate and sample_count fields of
    # each sphere header.  The perl program exits non-zero on the first line of
    # any other form (or on an unreadable/unparsable header), which selects the
    # wav-to-duration fallback below.
    # NOTE: the program is inside single quotes, so it must not contain any.
    if perl -e '
       while (<>) {
         s/\|\s*$/ |/;  # make sure final | is preceded by space.
         @A = split;
         if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ &&
               $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); }
         $reco = $A[0]; $sphere_file = $A[4];

         if (!open(F, "<", $sphere_file)) { die "Error opening sphere file $sphere_file"; }
         $sample_rate = -1;  $sample_count = -1;
         # Look at no more than the first 31 lines, stopping at end_head or at
         # end of file so that we do not read into the audio data.
         for ($n = 0; $n <= 30; $n++) {
            $line = <F>;
            last if !defined($line);
            if ($line =~ m/sample_rate -i (\d+)/) { $sample_rate = $1; }
            if ($line =~ m/sample_count -i (\d+)/) { $sample_count = $1; }
            if ($line =~ m/end_head/) { last; }
         }
         close(F);
         # A zero sample rate is rejected too, since we divide by it below.
         if ($sample_rate <= 0 || $sample_count == -1) {
           die "could not parse sphere header from $sphere_file";
         }
         $duration = $sample_count * 1.0 / $sample_rate;
         print "$reco $duration\n";
       } ' < "$data/wav.scp" > "$data/reco2dur"; then
      echo "$0: successfully obtained recording lengths from sphere-file headers"
    else
      echo "$0: could not get recording lengths from sphere-file headers, using wav-to-duration"
      if ! command -v wav-to-duration >/dev/null; then
        echo  "$0: wav-to-duration is not on your path"
        exit 1;
      fi

      # wav.scp entries that pipe through sox with a speed effect make
      # wav-to-duration read each whole file instead of only its header.
      read_entire_file=false
      if grep -q 'sox.*speed' "$data/wav.scp"; then
        read_entire_file=true
        echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow."
        echo "... It is much faster if you call get_reco2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or "
        echo "... perturb_data_dir_speed_3way.sh."
      fi

      num_recos=$(wc -l < "$data/wav.scp")
      if [ "$num_recos" -eq 0 ]; then
        echo "$0: $data/wav.scp is empty"
        exit 1
      fi
      # Never use more jobs than there are recordings.
      if [ "$nj" -gt "$num_recos" ]; then
        nj=$num_recos
      fi

      # One sub-directory per job, each receiving its own piece of wav.scp.
      temp_data_dir=$data/wav${nj}split
      wavscps=()
      for ((n = 1; n <= nj; n++)); do
        if ! mkdir -p "$temp_data_dir/$n"; then
          echo "$0: could not create directory $temp_data_dir/$n"
          exit 1
        fi
        wavscps+=("$temp_data_dir/$n/wav.scp")
      done

      # Presumably distributes the lines of wav.scp over the listed output files.
      if ! utils/split_scp.pl "$data/wav.scp" "${wavscps[@]}"; then
        echo "$0: failed to split $data/wav.scp"
        exit 1
      fi

      # $cmd is deliberately left unquoted: it may consist of a launcher plus
      # its own options.
      $cmd "JOB=1:$nj" "$data/log/get_reco_durations.JOB.log" \
        wav-to-duration --read-entire-file="$read_entire_file" \
        "scp:$temp_data_dir/JOB/wav.scp" "ark,t:$temp_data_dir/JOB/reco2dur" || \
          { echo "$0: there was a problem getting the durations"; exit 1; }

      # Concatenate the per-job results in job order.
      for ((n = 1; n <= nj; n++)); do
        cat "$temp_data_dir/$n/reco2dur"
      done > "$data/reco2dur"

      # The split directory only exists in this branch, so it is removed here.
      rm -r "$temp_data_dir"
    fi
  else
    echo "$0: Expected $data/wav.scp to exist"
    exit 1
  fi
  
  # Sanity check: reco2dur should have one line per wav.scp entry.  A mismatch
  # only produces a warning unless fewer than half of the recordings received a
  # duration, in which case the script fails.
  len1=$(wc -l < "$data/wav.scp")
  len2=$(wc -l < "$data/reco2dur")
  if [ "$len1" != "$len2" ]; then
    echo "$0: warning: length of reco2dur does not equal that of wav.scp, $len2 != $len1"
    # $(( )) replaces the deprecated $[ ] arithmetic expansion.
    if [ "$len1" -gt "$(( len2 * 2 ))" ]; then
      echo "$0: less than half of recordings got a duration: failing."
      exit 1
    fi
  fi

  echo "$0: computed $data/reco2dur"

  exit 0