egs/iam/v2/local/make_features.py

  #!/usr/bin/env python3
  
  # Copyright      2017  Chun Chieh Chang
  #                2017  Ashish Arora
  #                2017  Yiwen Shao
  #                2018  Hossein Hadian
  
  """ This script converts images to Kaldi-format feature matrices. The input to
      this script is the path to a data directory, e.g. "data/train". This script
      reads the images listed in images.scp and writes them to standard output
      (by default) as Kaldi-formatted matrices (in text form). It also scales the
      images so they have the same height (via --feat-dim). It can optionally pad
      the images (on left/right sides) with white pixels.
      If an 'image2num_frames' file is found in the data dir, it will be used
      to enforce the images to have the specified length in that file by padding
      white pixels (the --padding option will be ignored in this case). This relates
      to end2end chain training.
      eg. local/make_features.py data/train --feat-dim 40
  """
  import random
  import argparse
  import os
  import sys
  import numpy as np
  from scipy import misc
  from scipy.ndimage.interpolation import affine_transform
  import math
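  # Restore default SIGPIPE handling so that piping the output into another
  # process does not raise BrokenPipeError when the reader exits early.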
  from signal import signal, SIGPIPE, SIG_DFL
  signal(SIGPIPE, SIG_DFL)
  
  parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and
                                                  writes them to standard output in text format.""")
  parser.add_argument('images_scp_path', type=str,
                      help='Path of images.scp file')
  parser.add_argument('--allowed_len_file_path', type=str, default=None,
                      help='If supplied, each image will be padded to reach the '
                      'target length (this overrides --padding).')
  parser.add_argument('--out-ark', type=str, default='-',
                      help='Where to write the output feature file')
  parser.add_argument('--feat-dim', type=int, default=40,
                      help='Size to scale the height of all images')
  parser.add_argument('--padding', type=int, default=5,
                      help='Number of white pixels to pad on the left '
                      'and right sides of the image.')
  parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False,
                     help="Flip the image left-right for right to left languages")
  parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False,
                     help="performs image augmentation")
  args = parser.parse_args()
  
  
  def write_kaldi_matrix(file_handle, matrix, key):
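      """Write 'matrix' (a 2-D array) to 'file_handle' as a Kaldi text-format
      matrix under the utterance id 'key'."""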
      file_handle.write(key + " [ ")
      num_rows = len(matrix)
      if num_rows == 0:
          raise Exception("Matrix is empty")
      num_cols = len(matrix[0])
  
      for row_index in range(len(matrix)):
          if num_cols != len(matrix[row_index]):
              raise Exception("All the rows of a matrix are expected to "
                              "have the same length")
          file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index])))
          if row_index != num_rows - 1:
              file_handle.write("
  ")
      file_handle.write(" ]
  ")
  
  
  def horizontal_pad(im, allowed_lengths = None):
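      """Pad the image with white (255) pixels on the left and right. If
      'allowed_lengths' is given, pad up to the smallest allowed length that is
      larger than the image width, and return None if no such length exists;
      otherwise pad both sides with 'args.padding' pixels."""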
      if allowed_lengths is None:
          left_padding = right_padding = args.padding
      else:  # Find an allowed length for the image
          imlen = im.shape[1] # width
          allowed_len = 0
          for l in allowed_lengths:
              if l > imlen:
                  allowed_len = l
                  break
          if allowed_len == 0:
              #  No allowed length was found for the image (the image is too long)
              return None
          padding = allowed_len - imlen
          left_padding = int(padding // 2)
          right_padding = padding - left_padding
      dim_y = im.shape[0] # height
      im_pad = np.concatenate((255 * np.ones((dim_y, left_padding),
                                             dtype=int), im), axis=1)
      im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding),
                                                      dtype=int)), axis=1)
      return im_pad1
  
  def get_scaled_image_aug(im, mode='normal'):
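      """Scale the image so that its height equals args.feat_dim. In 'normal'
      mode this is a plain resize; otherwise the image is first downscaled to a
      random height between 10 and 30 pixels and then upscaled back to the
      target size (a simple augmentation)."""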
      scale_size = args.feat_dim
      sx = im.shape[1]
      sy = im.shape[0]
      scale = (1.0 * scale_size) / sy
      nx = int(scale_size)
      ny = int(scale * sx) 
      scale_size = random.randint(10, 30)
      scale = (1.0 * scale_size) / sy
      down_nx = int(scale_size)
      down_ny = int(scale * sx)
      if mode == 'normal':
          im = misc.imresize(im, (nx, ny))
          return im
      else:
          im_scaled_down = misc.imresize(im, (down_nx, down_ny))
          im_scaled_up = misc.imresize(im_scaled_down, (nx, ny))
          return im_scaled_up
  
  def contrast_normalization(im, low_pct, high_pct):
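      """Stretch the image contrast: pixel values below the 'low_pct' quantile
      become black (0), values above the 'high_pct' quantile become white (255),
      and the values in between are rescaled linearly."""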
      element_number = im.size
      rows = im.shape[0]
      cols = im.shape[1]
      im_contrast = np.zeros(shape=im.shape)
      low_index = int(low_pct * element_number)
      high_index = int(high_pct * element_number)
      sorted_im = np.sort(im, axis=None)
      low_thred = sorted_im[low_index]
      high_thred = sorted_im[high_index]
      for i in range(rows):
          for j in range(cols):
              if im[i, j] > high_thred:
                  im_contrast[i, j] = 255  # lightest to white
              elif im[i, j] < low_thred:
                  im_contrast[i, j] = 0  # darkest to black
              else:
                  # linear normalization
                  im_contrast[i, j] = (im[i, j] - low_thred) * \
                      255 / (high_thred - low_thred)
      return im_contrast
  
  
  def geometric_moment(frame, p, q):
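      """Compute the raw image moment m_pq of 'frame'."""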
      m = 0
      for i in range(frame.shape[1]):
          for j in range(frame.shape[0]):
              m += (i ** p) * (j ** q) * frame[i][j]
      return m
  
  
  def central_moment(frame, p, q):
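      """Compute the central image moment u_pq of 'frame' about its centroid."""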
      u = 0
      x_bar = geometric_moment(frame, 1, 0) / \
          geometric_moment(frame, 0, 0)  # m10/m00
      y_bar = geometric_moment(frame, 0, 1) / \
          geometric_moment(frame, 0, 0)  # m01/m00
      for i in range(frame.shape[1]):
          for j in range(frame.shape[0]):
              u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j]
      return u
  
  
  def height_normalization(frame, w, h):
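      """Resample 'frame' onto a w x h grid centered on its centroid, with the
      sampling extent set from the second-order central moments."""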
      frame_normalized = np.zeros(shape=(h, w))
      alpha = 4
      x_bar = geometric_moment(frame, 1, 0) / \
          geometric_moment(frame, 0, 0)  # m10/m00
      y_bar = geometric_moment(frame, 0, 1) / \
          geometric_moment(frame, 0, 0)  # m01/m00
      sigma_x = (alpha * ((central_moment(frame, 2, 0) /
                           geometric_moment(frame, 0, 0)) ** .5))  # alpha * sqrt(u20/m00)
      sigma_y = (alpha * ((central_moment(frame, 0, 2) /
                           geometric_moment(frame, 0, 0)) ** .5))  # alpha * sqrt(u02/m00)
      for x in range(w):
          for y in range(h):
              i = int((x / w - 0.5) * sigma_x + x_bar)
              j = int((y / h - 0.5) * sigma_y + y_bar)
              frame_normalized[x][y] = frame[i][j]
      return frame_normalized
  
  
  def find_slant_project(im):
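      """Estimate the slant of the text by projecting dark pixels (value < 100)
      along shear angles from -45 to 44 degrees and picking the angle whose
      projection histogram has the largest standard deviation. Returns the
      negated best angle, i.e. the correction to apply."""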
      rows = im.shape[0]
      cols = im.shape[1]
      std_max = 0
      alpha_max = 0
      col_disp = np.zeros(90, int)
      proj = np.zeros(shape=(90, cols + 2 * rows), dtype=int)
      for r in range(rows):
          for alpha in range(-45, 45, 1):
              col_disp[alpha] = int(r * math.tan(alpha / 180.0 * math.pi))
          for c in range(cols):
              if im[r, c] < 100:
                  for alpha in range(-45, 45, 1):
                      proj[alpha + 45, c + col_disp[alpha] + rows] += 1
      for alpha in range(-45, 45, 1):
          proj_histogram, bin_array = np.histogram(proj[alpha + 45, :], bins=10)
          proj_std = np.std(proj_histogram)
          if proj_std > std_max:
              std_max = proj_std
              alpha_max = alpha
      return -alpha_max
  
  
  def horizontal_shear(im, degree):
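      """Shear the image horizontally by 'degree' degrees, padding it with white
      pixels first so the sheared content is not clipped."""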
      rad = degree / 180.0 * math.pi
      padding_x = int(abs(np.tan(rad)) * im.shape[0])
      padding_y = im.shape[0]
      if rad > 0:
          im_pad = np.concatenate(
              (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1)
      elif rad < 0:
          im_pad = np.concatenate(
              (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1)
      else:
          im_pad = im
      shear_matrix = np.array([[1, 0],
                               [np.tan(rad), 1]])
      sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0)
      return sheared_im
  
  
  ### main ###
  random.seed(1)
  data_list_path = args.images_scp_path
  if args.out_ark == '-':
      out_fh = sys.stdout
  else:
      out_fh = open(args.out_ark,'w')
  
  allowed_lengths = None
  allowed_len_handle = args.allowed_len_file_path
  if allowed_len_handle is not None and os.path.isfile(allowed_len_handle):
      print("Found 'allowed_lengths.txt' file...", file=sys.stderr)
      allowed_lengths = []
      with open(allowed_len_handle) as f:
          for line in f:
              allowed_lengths.append(int(line.strip()))
      print("Read {} allowed lengths and will apply them to the "
            "features.".format(len(allowed_lengths)), file=sys.stderr)
  
  num_fail = 0
  num_ok = 0
  aug_setting = ['normal', 'scaled']
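  # For each image listed in images.scp: optionally flip it, optionally apply
  # augmentation (contrast normalization plus slant estimation and shearing),
  # scale it to the target height, pad it horizontally, and write the
  # transposed matrix (pixel values scaled to [0, 1]) as a Kaldi text matrix.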
  with open(data_list_path) as f:
      for line in f:
          line = line.strip()
          line_vect = line.split(' ')
          image_id = line_vect[0]
          image_path = line_vect[1]
          im = misc.imread(image_path)
          if args.fliplr:
              im = np.fliplr(im)
          if args.augment:
              im_aug = get_scaled_image_aug(im, aug_setting[0])
              im_contrast = contrast_normalization(im_aug, 0.05, 0.2)
              slant_degree = find_slant_project(im_contrast)
              im_sheared = horizontal_shear(im_contrast, slant_degree)
              im_aug = im_sheared
          else:
              im_aug = get_scaled_image_aug(im, aug_setting[0])
          im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths)
          if im_horizontal_padded is None:
              num_fail += 1
              continue
          data = np.transpose(im_horizontal_padded, (1, 0))
          data = np.divide(data, 255.0)
          num_ok += 1
          write_kaldi_matrix(out_fh, data, image_id)
  
  print('Generated features for {} images. Failed for {} (image too '
        'long).'.format(num_ok, num_fail), file=sys.stderr)