Blame view
egs/iam/v2/local/make_features.py
9.85 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 |
#!/usr/bin/env python3

# Copyright      2017  Chun Chieh Chang
#                2017  Ashish Arora
#                2017  Yiwen Shao
#                2018  Hossein Hadian

""" This script converts images to Kaldi-format feature matrices.

    The input to this script is the path of an images.scp file, whose lines
    have the form "<image-id> <image-path>". The script reads the listed
    images and writes them to standard output (by default) as Kaldi-formatted
    matrices (in text form). It also scales the images so they all have the
    same height (via --feat-dim). It can optionally pad the images (on the
    left/right sides) with white pixels.

    If --allowed_len_file_path names an existing file (one integer length per
    line), every image is padded with white pixels up to the first listed
    length that is greater than its width, and images that are wider than
    every listed length are skipped (the --padding option is ignored in this
    case). This relates to end2end chain training.

    eg. local/make_features.py data/train/images.scp --feat-dim 40
"""
import random
import argparse
import os
import sys
import scipy.io as sio
import numpy as np
import math
from signal import signal, SIGPIPE, SIG_DFL

try:
    from scipy import misc
except ImportError:
    # Recent SciPy releases no longer ship scipy.misc. Keep the module
    # importable; the image helpers fail with a clear message on first use.
    misc = None

# `scipy.ndimage.interpolation` is a deprecated alias namespace; the function
# itself is exported from `scipy.ndimage`.
from scipy.ndimage import affine_transform


def _str_to_bool(value):
    """Interpret a command-line value as a boolean (only 'true' is True)."""
    return str(value).lower() == 'true'


parser = argparse.ArgumentParser(
    description="""Converts images (in 'dir'/images.scp) to features and
                   writes them to standard output in text format.""")
parser.add_argument('images_scp_path', type=str,
                    help='Path of images.scp file')
parser.add_argument('--allowed_len_file_path', type=str, default=None,
                    help='If supplied, each images will be padded to reach '
                         'the target length (this overrides --padding).')
parser.add_argument('--out-ark', type=str, default='-',
                    help='Where to write the output feature file')
parser.add_argument('--feat-dim', type=int, default=40,
                    help='Size to scale the height of all images')
parser.add_argument('--padding', type=int, default=5,
                    help='Number of white pixels to pad on the left '
                         'and right side of the image.')
parser.add_argument('--fliplr', type=_str_to_bool, default=False,
                    help="Flip the image left-right for right to left "
                         "languages")
parser.add_argument("--augment", type=_str_to_bool, default=False,
                    help="performs image augmentation")

# Parsed command-line options. Populated by main(); the helpers below fall
# back to it when their optional keyword arguments are not given.
args = None


def _scipy_misc_function(name):
    """Return scipy.misc.<name>, or raise ImportError if SciPy lacks it."""
    func = getattr(misc, name, None) if misc is not None else None
    if func is None:
        raise ImportError(
            "scipy.misc.{0} is not available in the installed SciPy; this "
            "script needs an older SciPy release (with Pillow installed) "
            "that still provides it.".format(name))
    return func


def write_kaldi_matrix(file_handle, matrix, key):
    """Write `matrix` to `file_handle` as a Kaldi text-format matrix.

    The output is "<key> [ <row>\\n<row>\\n... ]\\n": elements of a row are
    separated by spaces and rows by newlines, which is how the Kaldi text
    reader tells rows apart.

    Args:
        file_handle: object with a write(str) method.
        matrix: non-empty sequence of equal-length rows (e.g. a 2-D array).
        key: utterance/image id written in front of the matrix.

    Raises:
        Exception: if the matrix is empty or its rows differ in length.
            Nothing is written to `file_handle` in that case.
    """
    num_rows = len(matrix)
    if num_rows == 0:
        raise Exception("Matrix is empty")
    num_cols = len(matrix[0])

    # Validate and format everything first so that a malformed matrix never
    # leaves a half-written entry in the output.
    formatted_rows = []
    for row in matrix:
        if len(row) != num_cols:
            raise Exception("All the rows of a matrix are expected to "
                            "have the same length")
        formatted_rows.append(" ".join(map(str, row)))
    file_handle.write(key + " [ " + "\n".join(formatted_rows) + " ]\n")


def horizontal_pad(im, allowed_lengths=None, padding=None):
    """Pad `im` on the left and right with white (255) pixels.

    Args:
        im: 2-D image array (height x width).
        allowed_lengths: optional iterable of target widths. The first entry
            (in iteration order) that is strictly greater than the image
            width is used as the padded width, with the extra pixels split
            between the two sides (the odd pixel goes to the right).
        padding: pixels added to each side when `allowed_lengths` is None.
            Defaults to the --padding command-line option.

    Returns:
        The padded image, or None if `allowed_lengths` was given and the
        image is at least as wide as every allowed length.
    """
    if allowed_lengths is None:
        if padding is None:
            padding = args.padding
        left_padding = right_padding = padding
    else:
        imlen = im.shape[1]  # width
        allowed_len = next((l for l in allowed_lengths if l > imlen), None)
        if allowed_len is None:
            # No allowed length was found for the image (it is too long).
            return None
        total_padding = allowed_len - imlen
        left_padding = int(total_padding // 2)
        right_padding = total_padding - left_padding

    dim_y = im.shape[0]  # height
    left_block = np.full((dim_y, left_padding), 255, dtype=int)
    right_block = np.full((dim_y, right_padding), 255, dtype=int)
    return np.concatenate((left_block, im, right_block), axis=1)


def get_scaled_image_aug(im, mode='normal', feat_dim=None):
    """Scale `im` to the feature height, keeping its aspect ratio.

    Args:
        im: 2-D image array (height x width).
        mode: 'normal' resizes directly to the target height. Any other
            value first shrinks the image to a random height in [10, 30]
            and then resizes it to the target size (a blurring
            augmentation).
        feat_dim: target height. Defaults to the --feat-dim option.

    Returns:
        The resized image as returned by scipy.misc.imresize.

    Raises:
        ImportError: if the installed SciPy has no scipy.misc.imresize.
    """
    imresize = _scipy_misc_function('imresize')
    if feat_dim is None:
        feat_dim = args.feat_dim
    src_height = im.shape[0]
    src_width = im.shape[1]

    target_height = int(feat_dim)
    target_width = int((1.0 * feat_dim) / src_height * src_width)

    # Drawn on every call (even in 'normal' mode) so that the sequence of
    # random numbers consumed per image does not depend on the mode.
    small_height = random.randint(10, 30)
    small_width = int((1.0 * small_height) / src_height * src_width)

    if mode == 'normal':
        return imresize(im, (target_height, target_width))
    im_scaled_down = imresize(im, (small_height, small_width))
    return imresize(im_scaled_down, (target_height, target_width))


def contrast_normalization(im, low_pct, high_pct):
    """Stretch the intensities of `im` between two percentile thresholds.

    Pixels above the `high_pct` threshold become 255, pixels below the
    `low_pct` threshold become 0, and the remaining pixels are mapped
    linearly onto [0, 255].

    Args:
        im: image array.
        low_pct: fraction (0..1) of the sorted pixels giving the low
            threshold.
        high_pct: fraction (0..1) of the sorted pixels giving the high
            threshold.

    Returns:
        A float array with the shape of `im`. If both thresholds are equal
        (e.g. a flat image) the pixels equal to the threshold keep their
        original value instead of becoming NaN.
    """
    im = np.asarray(im)
    element_number = im.size
    if element_number == 0:
        return np.zeros(shape=im.shape)

    # Work in float: integer pixel types (e.g. uint8) would otherwise be at
    # risk of wrapping around in `(pixel - low) * 255`.
    values = im.astype(float)
    sorted_values = np.sort(values, axis=None)
    last_index = element_number - 1
    # Clamp so that a fraction of 1.0 selects the largest pixel instead of
    # indexing one past the end.
    low_thred = sorted_values[min(int(low_pct * element_number), last_index)]
    high_thred = sorted_values[min(int(high_pct * element_number),
                                   last_index)]

    span = high_thred - low_thred
    if span > 0:
        im_contrast = (values - low_thred) * 255 / span
    else:
        im_contrast = values.copy()
    im_contrast[values < low_thred] = 0  # darkest to black
    # Applied last so that it wins if the thresholds are inverted, matching
    # the order in which the conditions have always been tested.
    im_contrast[values > high_thred] = 255  # lightest to white
    return im_contrast


def geometric_moment(frame, p, q):
    """Return the raw image moment m_pq = sum_x sum_y x^p * y^q * f(x, y).

    x is the column index and y the row index of the 2-D array `frame`.
    """
    frame = np.asarray(frame, dtype=float)
    height, width = frame.shape
    x_weights = np.arange(width, dtype=float) ** p
    y_weights = np.arange(height, dtype=float) ** q
    return float(np.dot(y_weights, np.dot(frame, x_weights)))


def central_moment(frame, p, q):
    """Return the central moment u_pq of `frame` about its centroid.

    x is the column index and y the row index of the 2-D array `frame`.

    Raises:
        ZeroDivisionError: if the pixels of `frame` sum to zero.
    """
    frame = np.asarray(frame, dtype=float)
    height, width = frame.shape
    m00 = geometric_moment(frame, 0, 0)
    x_bar = geometric_moment(frame, 1, 0) / m00  # m10/m00
    y_bar = geometric_moment(frame, 0, 1) / m00  # m01/m00
    x_weights = (np.arange(width, dtype=float) - x_bar) ** p
    y_weights = (np.arange(height, dtype=float) - y_bar) ** q
    return float(np.dot(y_weights, np.dot(frame, x_weights)))


def height_normalization(frame, w, h):
    """Resample `frame` to `h` rows by `w` columns around its centroid.

    The sampling window is centred on the intensity centroid and spans
    alpha (= 4) standard deviations of the intensity distribution along
    each axis. Sample positions that fall outside `frame` are clamped to
    the nearest edge pixel.

    Args:
        frame: 2-D image array (height x width).
        w: output width in pixels.
        h: output height in pixels.

    Returns:
        A float array of shape (h, w).

    Raises:
        ZeroDivisionError: if the pixels of `frame` sum to zero.
    """
    frame = np.asarray(frame)
    alpha = 4
    m00 = geometric_moment(frame, 0, 0)
    x_bar = geometric_moment(frame, 1, 0) / m00  # m10/m00
    y_bar = geometric_moment(frame, 0, 1) / m00  # m01/m00
    sigma_x = alpha * (central_moment(frame, 2, 0) / m00) ** .5
    sigma_y = alpha * (central_moment(frame, 0, 2) / m00) ** .5

    # Source column for every output column, and source row for every
    # output row (truncated toward zero, then clamped into the frame).
    src_cols = ((np.arange(w) / float(w) - 0.5) * sigma_x + x_bar).astype(int)
    src_rows = ((np.arange(h) / float(h) - 0.5) * sigma_y + y_bar).astype(int)
    src_cols = np.clip(src_cols, 0, frame.shape[1] - 1)
    src_rows = np.clip(src_rows, 0, frame.shape[0] - 1)
    return frame[np.ix_(src_rows, src_cols)].astype(float)


def find_slant_project(im):
    """Estimate the slant of the writing in `im`, in degrees.

    For every angle in [-45, 44] the dark pixels (value < 100) are projected
    onto the horizontal axis along that angle; the angle whose projection
    profile has the most uneven 10-bin histogram (largest standard
    deviation) is selected, the first such angle winning ties.

    Args:
        im: 2-D image array (height x width).

    Returns:
        The negated selected angle, i.e. the shear (in degrees) to pass to
        horizontal_shear(). Returns 0 when the image has no dark pixels.
    """
    im = np.asarray(im)
    rows = im.shape[0]
    cols = im.shape[1]
    dark_rows, dark_cols = np.nonzero(im < 100)
    if dark_rows.size == 0:
        # Nothing to measure: every angle scores the same, so do not shear.
        return 0

    profile_len = cols + 2 * rows
    std_max = 0
    alpha_max = 0
    for alpha in range(-45, 45, 1):
        # Horizontal displacement of each dark pixel for this angle,
        # truncated toward zero; `+ rows` keeps the index non-negative.
        col_disp = (dark_rows * math.tan(alpha / 180.0 * math.pi)).astype(int)
        proj = np.bincount(dark_cols + col_disp + rows,
                           minlength=profile_len)
        proj_histogram, _ = np.histogram(proj, bins=10)
        proj_std = np.std(proj_histogram)
        if proj_std > std_max:
            std_max = proj_std
            alpha_max = alpha
    return -alpha_max


def horizontal_shear(im, degree):
    """Shear `im` horizontally by `degree` degrees.

    The image is first widened with white (255) columns — on the left for a
    positive angle, on the right for a negative one — and then transformed
    with scipy.ndimage.affine_transform, filling uncovered pixels with 255.

    Args:
        im: 2-D image array (height x width).
        degree: shear angle in degrees.

    Returns:
        The sheared image; its width grows by int(|tan(degree)| * height).
    """
    rad = degree / 180.0 * math.pi
    height = im.shape[0]
    extra_cols = int(abs(np.tan(rad)) * height)
    if rad > 0:
        white = 255 * np.ones((height, extra_cols), dtype=int)
        im_pad = np.concatenate((white, im), axis=1)
    elif rad < 0:
        white = 255 * np.ones((height, extra_cols), dtype=int)
        im_pad = np.concatenate((im, white), axis=1)
    else:
        im_pad = im

    shear_matrix = np.array([[1, 0],
                             [np.tan(rad), 1]])
    return affine_transform(im_pad, shear_matrix, cval=255.0)


def _read_allowed_lengths(path):
    """Read one integer per non-blank line of `path`.

    Returns None when `path` is None or is not an existing file.
    """
    if path is None or not os.path.isfile(path):
        return None
    print("Found 'allowed_lengths.txt' file...", file=sys.stderr)
    with open(path) as f:
        allowed_lengths = [int(line) for line in f if line.strip()]
    print("Read {} allowed lengths and will apply them to the "
          "features.".format(len(allowed_lengths)), file=sys.stderr)
    return allowed_lengths


def _image_to_features(im, allowed_lengths):
    """Turn one loaded image into a (frames x feat-dim) matrix in [0, 1].

    Returns None when the image is too long for `allowed_lengths`.
    """
    aug_setting = ['normal', 'scaled']
    if args.fliplr:
        im = np.fliplr(im)
    im_aug = get_scaled_image_aug(im, aug_setting[0])
    if args.augment:
        im_contrast = contrast_normalization(im_aug, 0.05, 0.2)
        slant_degree = find_slant_project(im_contrast)
        im_aug = horizontal_shear(im_contrast, slant_degree)
    im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths)
    if im_horizontal_padded is None:
        return None
    # Kaldi expects one row per frame, i.e. one row per image column.
    data = np.transpose(im_horizontal_padded, (1, 0))
    return np.divide(data, 255.0)


def main(argv=None):
    """Convert every image listed in images.scp and write the features.

    Args:
        argv: optional list of command-line arguments (defaults to
            sys.argv[1:]).
    """
    global args
    args = parser.parse_args(argv)
    random.seed(1)

    allowed_lengths = _read_allowed_lengths(args.allowed_len_file_path)

    num_fail = 0
    num_ok = 0
    write_to_stdout = args.out_ark == '-'
    out_fh = sys.stdout if write_to_stdout else open(args.out_ark, 'w')
    try:
        with open(args.images_scp_path) as f:
            for line in f:
                line_vect = line.split()
                if not line_vect:
                    continue  # tolerate blank lines
                if len(line_vect) < 2:
                    raise ValueError("Bad line in {0}: expected "
                                     "'<image-id> <image-path>', got "
                                     "'{1}'".format(args.images_scp_path,
                                                    line.strip()))
                image_id = line_vect[0]
                image_path = line_vect[1]
                im = _scipy_misc_function('imread')(image_path)
                data = _image_to_features(im, allowed_lengths)
                if data is None:
                    num_fail += 1
                    continue
                num_ok += 1
                write_kaldi_matrix(out_fh, data, image_id)
    finally:
        if not write_to_stdout:
            out_fh.close()

    print('Generated features for {} images. Failed for {} (image too '
          'long).'.format(num_ok, num_fail), file=sys.stderr)


if __name__ == "__main__":
    # Exit quietly instead of raising BrokenPipeError when the consumer of
    # our standard output goes away.
    signal(SIGPIPE, SIG_DFL)
    main()