make_features.py 9.85 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266


#!/usr/bin/env python3

# Copyright      2017  Chun Chieh Chang
#                2017  Ashish Arora
#                2017  Yiwen Shao
#                2018  Hossein Hadian

""" This script converts images to Kaldi-format feature matrices. The input to
    this script is the path to a data directory, e.g. "data/train". This script
    reads the images listed in images.scp and writes them to standard output
    (by default) as Kaldi-formatted matrices (in text form). It also scales the
    images so they have the same height (via --feat-dim). It can optionally pad
    the images (on left/right sides) with white pixels.
    If an 'image2num_frames' file is found in the data dir, it will be used
    to enforce the images to have the specified length in that file by padding
    white pixels (the --padding option will be ignored in this case). This relates
    to end2end chain training.
    eg. local/make_features.py data/train --feat-dim 40
"""
import random
import argparse
import os
import sys
import scipy.io as sio
import numpy as np
from scipy import misc
from scipy.ndimage.interpolation import affine_transform
import math
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)

# Command-line interface. The arguments are parsed at module level, so the
# resulting `args` namespace is a global that the functions below read
# (e.g. args.padding, args.feat_dim).
parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and
                                                writes them to standard output in text format.""")
parser.add_argument('images_scp_path', type=str,
                    help='Path of images.scp file')
parser.add_argument('--allowed_len_file_path', type=str, default=None,
                    help='If supplied, each images will be padded to reach the '
                    'target length (this overrides --padding).')
parser.add_argument('--out-ark', type=str, default='-',
                    help='Where to write the output feature file')
parser.add_argument('--feat-dim', type=int, default=40,
                    help='Size to scale the height of all images')
# The two adjacent literals are concatenated, so the first one must end with
# a space (it previously rendered as "leftand right").
parser.add_argument('--padding', type=int, default=5,
                    help='Number of white pixels to pad on the left '
                    'and right side of the image.')
# Boolean flags take an explicit value: only the string "true"
# (case-insensitive) enables them; anything else is treated as False.
parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False,
                   help="Flip the image left-right for right to left languages")
parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False,
                   help="performs image augmentation")
args = parser.parse_args()


def write_kaldi_matrix(file_handle, matrix, key):
    """Write one matrix to `file_handle` in Kaldi text format.

    The record has the form ``<key> [ r0c0 r0c1 ...\\nr1c0 r1c1 ... ]\\n``.

    Args:
        file_handle: Writable text stream.
        matrix: Sequence of equally long rows (list of lists or 2-D array).
        key: Utterance/image id written in front of the matrix.

    Raises:
        Exception: If the matrix has no rows or its rows differ in length.
            Validation happens before anything is written, so a rejected
            matrix never leaves a truncated record in the output.
    """
    num_rows = len(matrix)
    if num_rows == 0:
        raise Exception("Matrix is empty")
    num_cols = len(matrix[0])
    if any(len(row) != num_cols for row in matrix):
        raise Exception("All the rows of a matrix are expected to "
                        "have the same length")

    file_handle.write(key + " [ ")
    file_handle.write("\n".join(" ".join(map(str, row)) for row in matrix))
    file_handle.write(" ]\n")


def horizontal_pad(im, allowed_lengths=None):
    """Pad an image with white (255) columns on its left and right sides.

    Args:
        im: 2-D image array indexed as (row, column).
        allowed_lengths: Optional iterable of permitted widths. When given,
            the first entry strictly greater than the image width is used as
            the target width and the missing columns are split between both
            sides (the extra column, if any, goes to the right). When None,
            `args.padding` columns are added on each side.

    Returns:
        The padded image, or None if no entry of `allowed_lengths` is larger
        than the image width.
    """
    if allowed_lengths is not None:
        width = im.shape[1]
        # 0 doubles as the "nothing found" marker.
        target_width = next(
            (candidate for candidate in allowed_lengths if candidate > width), 0)
        if target_width == 0:
            # The image is wider than every allowed length.
            return None
        missing = target_width - width
        pad_left = int(missing // 2)
        pad_right = missing - pad_left
    else:
        pad_left = pad_right = args.padding

    height = im.shape[0]
    left_block = 255 * np.ones((height, pad_left), dtype=int)
    right_block = 255 * np.ones((height, pad_right), dtype=int)
    return np.concatenate((left_block, im, right_block), axis=1)

def get_scaled_image_aug(im, mode='normal'):
    """Rescale an image so that its height equals `args.feat_dim`.

    Args:
        im: Image array indexed as (row, column).
        mode: 'normal' resizes straight to the target size. Any other value
            first shrinks the image to a random height in [10, 30] and then
            enlarges it to the target size.

    Returns:
        The resized image as returned by `misc.imresize`.

    NOTE(review): `scipy.misc.imresize` is not available in recent SciPy
    releases - confirm the SciPy version this script is pinned to.
    """
    src_height = im.shape[0]
    src_width = im.shape[1]

    target_height = args.feat_dim
    target_size = (int(target_height),
                   int((1.0 * target_height) / src_height * src_width))

    # The random height is drawn on every call, including 'normal' mode, so
    # the random stream advances identically whichever mode is requested.
    small_height = random.randint(10, 30)
    small_size = (int(small_height),
                  int((1.0 * small_height) / src_height * src_width))

    if mode == 'normal':
        return misc.imresize(im, target_size)

    shrunk = misc.imresize(im, small_size)
    return misc.imresize(shrunk, target_size)

def contrast_normalization(im, low_pct, high_pct):
    """Stretch the contrast of an image between two intensity percentiles.

    Pixels above the `high_pct` percentile value become 255, pixels below the
    `low_pct` percentile value become 0, and the values in between are mapped
    linearly onto [0, 255].

    Args:
        im: Image array of pixel intensities.
        low_pct: Fraction (0..1) of the sorted pixels that selects the low
            threshold.
        high_pct: Fraction (0..1) of the sorted pixels that selects the high
            threshold.

    Returns:
        A float array with the same shape as `im`.
    """
    element_number = im.size
    # Clamp so that a fraction of exactly 1.0 selects the last pixel instead
    # of indexing one past the end.
    last_index = element_number - 1
    low_index = min(int(low_pct * element_number), last_index)
    high_index = min(int(high_pct * element_number), last_index)

    sorted_im = np.sort(im, axis=None)
    # Work in float: arithmetic on narrow integer scalars (e.g. uint8 * 255)
    # can wrap around depending on NumPy's promotion rules.
    low_thred = float(sorted_im[low_index])
    high_thred = float(sorted_im[high_index])
    pixels = im.astype(float)

    if high_thred == low_thred:
        # Degenerate range: there is nothing to stretch, and dividing by
        # zero would yield NaN. Pixels equal to the threshold keep their
        # original value.
        im_contrast = pixels.copy()
    else:
        # linear normalization
        im_contrast = (pixels - low_thred) * 255 / (high_thred - low_thred)

    # The high mask is applied last so that it takes precedence, matching an
    # "if above high ... elif below low" decision per pixel.
    im_contrast[pixels < low_thred] = 0  # darkest to black
    im_contrast[pixels > high_thred] = 255  # lightest to white
    return im_contrast


def geometric_moment(frame, p, q):
    """Return the raw image moment m_pq = sum_x sum_y x**p * y**q * f(x, y).

    Args:
        frame: 2-D array indexed as (row, column); x is the column index and
            y is the row index, so f(x, y) is frame[y][x].
        p: Exponent applied to the x (column) coordinate.
        q: Exponent applied to the y (row) coordinate.

    Returns:
        The moment as a float.
    """
    # Float arithmetic keeps narrow integer pixel types from overflowing.
    pixels = np.asarray(frame, dtype=float)
    x_weights = np.arange(pixels.shape[1], dtype=float) ** p
    y_weights = np.arange(pixels.shape[0], dtype=float) ** q
    # y_weights . pixels . x_weights sums x**p * y**q * frame[y][x] over all
    # pixels; rows are weighted by y and columns by x.
    return float(np.dot(y_weights, np.dot(pixels, x_weights)))


def central_moment(frame, p, q):
    """Return the central image moment u_pq taken about the intensity centroid.

    u_pq = sum_x sum_y (x - x_bar)**p * (y - y_bar)**q * f(x, y), where
    x_bar = m10 / m00 and y_bar = m01 / m00 are obtained from
    `geometric_moment`.

    Args:
        frame: 2-D array indexed as (row, column); x is the column index and
            y is the row index, so f(x, y) is frame[y][x].
        p: Exponent applied to the centred x (column) coordinate.
        q: Exponent applied to the centred y (row) coordinate.

    Returns:
        The central moment as a float.
    """
    m00 = geometric_moment(frame, 0, 0)
    x_bar = geometric_moment(frame, 1, 0) / m00  # m10/m00
    y_bar = geometric_moment(frame, 0, 1) / m00  # m01/m00

    pixels = np.asarray(frame, dtype=float)
    x_weights = (np.arange(pixels.shape[1]) - x_bar) ** p
    y_weights = (np.arange(pixels.shape[0]) - y_bar) ** q
    # Rows are weighted by the centred y term and columns by the centred x
    # term, i.e. each pixel frame[y][x] is paired with its own coordinates.
    return float(np.dot(y_weights, np.dot(pixels, x_weights)))


def height_normalization(frame, w, h):
    """Resample a frame onto an h-by-w grid centred on its intensity centroid.

    The sampling window is centred on (x_bar, y_bar) and spans
    alpha * sqrt(u20 / m00) horizontally and alpha * sqrt(u02 / m00)
    vertically, with alpha = 4.

    Args:
        frame: 2-D array indexed as (row, column).
        w: Width (number of columns) of the output.
        h: Height (number of rows) of the output.

    Returns:
        A float array of shape (h, w).

    NOTE(review): output pixels whose source position falls outside `frame`
    are left at 0 - confirm that 0 is the intended background value.
    """
    frame_normalized = np.zeros(shape=(h, w))
    alpha = 4
    m00 = geometric_moment(frame, 0, 0)
    x_bar = geometric_moment(frame, 1, 0) / m00  # m10/m00
    y_bar = geometric_moment(frame, 0, 1) / m00  # m01/m00
    sigma_x = alpha * (central_moment(frame, 2, 0) / m00) ** .5  # alpha * sqrt(u20/m00)
    sigma_y = alpha * (central_moment(frame, 0, 2) / m00) ** .5  # alpha * sqrt(u02/m00)

    src_rows = frame.shape[0]
    src_cols = frame.shape[1]
    # Source column for every output column and source row for every output
    # row; both depend on a single coordinate, so compute them once.
    source_col = [int((x / w - 0.5) * sigma_x + x_bar) for x in range(w)]
    source_row = [int((y / h - 0.5) * sigma_y + y_bar) for y in range(h)]

    for y, j in enumerate(source_row):
        if not 0 <= j < src_rows:
            continue
        for x, i in enumerate(source_col):
            # Skip positions outside the frame: a negative index would
            # silently wrap around to the opposite edge.
            if 0 <= i < src_cols:
                # Arrays are (row, column): y/j select the row, x/i the column.
                frame_normalized[y][x] = frame[j][i]
    return frame_normalized


def find_slant_project(im):
    """Estimate the slant angle of dark strokes from sheared projections.

    For every candidate angle in [-45, 44] degrees, each dark pixel
    (intensity < 100) is shifted horizontally by int(row * tan(angle)) and
    counted into a column projection. The angle whose projection values have
    the most unevenly filled 10-bin histogram (largest standard deviation of
    the bin counts) wins; ties keep the first such angle.

    Args:
        im: 2-D image array indexed as (row, column).

    Returns:
        The negated winning angle in degrees, as an int.
    """
    height = im.shape[0]
    width = im.shape[1]
    angles = list(range(-45, 45, 1))
    tangents = [math.tan(angle / 180.0 * math.pi) for angle in angles]

    # One projection row per angle; the extra `height` columns on each side
    # leave room for the largest possible shift.
    proj = np.zeros(shape=(len(angles), width + 2 * height), dtype=int)
    for r in range(height):
        dark_cols = np.flatnonzero(im[r, :] < 100)
        if dark_cols.size == 0:
            continue
        for k, tangent in enumerate(tangents):
            shift = int(r * tangent)
            proj[k, dark_cols + shift + height] += 1

    best_std = 0
    best_angle = 0
    for k, angle in enumerate(angles):
        bin_counts, _ = np.histogram(proj[k, :], bins=10)
        spread = np.std(bin_counts)
        if spread > best_std:
            best_std = spread
            best_angle = angle
    return -best_angle


def horizontal_shear(im, degree):
    """Shear an image horizontally by `degree` degrees.

    The image is first widened with white (255) columns - on the left for a
    positive angle, on the right for a negative one - by
    int(|tan(angle)| * height) columns, and then passed through
    `affine_transform` with a shear matrix, filling uncovered pixels with 255.

    Args:
        im: 2-D image array indexed as (row, column).
        degree: Shear angle in degrees.

    Returns:
        The sheared image; its width grows by the added padding.
    """
    rad = degree / 180.0 * math.pi
    slope = np.tan(rad)
    height = im.shape[0]
    white = 255 * np.ones((height, int(abs(slope) * height)), dtype=int)

    if rad > 0:
        padded = np.concatenate((white, im), axis=1)
    elif rad < 0:
        padded = np.concatenate((im, white), axis=1)
    else:
        padded = im

    shear = np.array([[1, 0],
                      [slope, 1]])
    return affine_transform(padded, shear, cval=255.0)


### main ###
# Fixed seed so that the random draws made while scaling are reproducible.
random.seed(1)
data_list_path = args.images_scp_path
if args.out_ark == '-':
    out_fh = sys.stdout
else:
    out_fh = open(args.out_ark,'w')

try:
    allowed_lengths = None
    allowed_len_handle = args.allowed_len_file_path
    # --allowed_len_file_path defaults to None, which os.path.isfile() rejects
    # with a TypeError, so test for None first.
    if allowed_len_handle is not None and os.path.isfile(allowed_len_handle):
        print("Found 'allowed_lengths.txt' file...", file=sys.stderr)
        allowed_lengths = []
        with open(allowed_len_handle) as f:
            for line in f:
                line = line.strip()
                if line:  # tolerate blank lines
                    allowed_lengths.append(int(line))
        print("Read {} allowed lengths and will apply them to the "
              "features.".format(len(allowed_lengths)), file=sys.stderr)

    num_fail = 0
    num_ok = 0
    aug_setting = ['normal', 'scaled']
    with open(data_list_path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # skip blank lines instead of failing on them
            line_vect = line.split(' ')
            if len(line_vect) < 2:
                raise Exception("Expected '<image-id> <image-path>' but got: "
                                + line)
            image_id = line_vect[0]
            image_path = line_vect[1]
            im = misc.imread(image_path)
            if args.fliplr:
                im = np.fliplr(im)
            # Height scaling is applied whether or not augmentation is on.
            im_aug = get_scaled_image_aug(im, aug_setting[0])
            if args.augment:
                im_contrast = contrast_normalization(im_aug, 0.05, 0.2)
                slant_degree = find_slant_project(im_contrast)
                im_aug = horizontal_shear(im_contrast, slant_degree)
            im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths)
            if im_horizontal_padded is None:
                # Wider than every allowed length: count it and move on.
                num_fail += 1
                continue
            # One matrix row per image column, with pixel values scaled by
            # 1/255.
            data = np.transpose(im_horizontal_padded, (1, 0))
            data = np.divide(data, 255.0)
            num_ok += 1
            write_kaldi_matrix(out_fh, data, image_id)
finally:
    # Close the output file if we opened one; leave stdout alone.
    if out_fh is not sys.stdout:
        out_fh.close()

print('Generated features for {} images. Failed for {} (image too '
      'long).'.format(num_ok, num_fail), file=sys.stderr)