Yannick Estève / ONTRAC-Kaldi

Blame view

egs/cifar/v1/image/ocr/make_features.py 8.27 KB
  #!/usr/bin/env python3
  
  # Copyright      2017  Chun Chieh Chang
  #                2017  Ashish Arora
  #                2017  Yiwen Shao
  #                2018  Hossein Hadian
  #                2018  Desh Raj
  
  """ This script converts images to Kaldi-format feature matrices. The input to
      this script is the path to a data directory, e.g. "data/train". This script
      reads the images listed in images.scp and writes them to standard output
      (by default) as Kaldi-formatted matrices (in text form). It also scales the
      images so they have the same height (via --feat-dim). It can optionally pad
      the images (on left/right sides) with white pixels. It by default performs 
      augmentation, (directly scaling down and scaling up). It will double the 
      data but we can turn augmentation off (via --no-augment).
      If an 'image2num_frames' file is found in the data dir, it will be used
      to enforce the images to have the specified length in that file by padding
      white pixels (the --padding option will be ignored in this case). This relates
      to end2end chain training.
      eg. local/make_features.py data/train --feat-dim 40
  """
  import random
  import argparse
  import os
  import sys
  import numpy as np
  from scipy import misc
  import math
  from signal import signal, SIGPIPE, SIG_DFL
  signal(SIGPIPE, SIG_DFL)
  
  # Command-line interface. Arguments are parsed at import time and the result
  # is stored in the module-level `args`, which the functions below read.
  parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and
                                                  writes them to standard output in text format.""")
  parser.add_argument('images_scp_path', type=str,
                      help='Path of images.scp file')
  parser.add_argument('--allowed_len_file_path', type=str, default=None,
                      help='If supplied, each image will be padded to reach the '
                      'target length (this overrides --padding).')
  parser.add_argument('--out-ark', type=str, default='-',
                      help='Where to write the output feature file')
  parser.add_argument('--feat-dim', type=int, default=40,
                      help='Size to scale the height of all images')
  # Adjacent string literals are concatenated verbatim, so the separating
  # space has to be part of one of the pieces.
  parser.add_argument('--padding', type=int, default=5,
                      help='Number of white pixels to pad on the left '
                      'and right side of the image.')
  parser.add_argument('--num-channels', type=int, default=1,
                      help='Number of color channels')
  parser.add_argument('--vertical-shift', type=int, default=0,
                      help='Total number of padding pixels added per column '
                      '(split between top and bottom)')
  # Only the literal string "true" (case-insensitive) enables flipping.
  parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False,
                      help="Flip the image left-right for right to left languages")
  parser.add_argument('--augment_type', type=str, default='no_aug',
                      choices=['no_aug', 'random_scale','random_shift'],
                      help='Type of augmentation to apply.')
  args = parser.parse_args()
  
  
  def write_kaldi_matrix(file_handle, matrix, key):
      """Write `matrix` to `file_handle` as a Kaldi text-format matrix.

      The output is the key, then " [ ", then one line per row with the
      values separated by single spaces, then " ]" and a final newline.

      Args:
          file_handle: object with a `write(str)` method.
          matrix: sequence of equally long rows (e.g. a list of lists or a
              2-D numpy array).
          key: string written in front of the matrix.

      Raises:
          Exception: if the matrix has no rows, or the rows differ in length.
              The header (and any rows before the offending one) has already
              been written when this is raised.
      """
      file_handle.write(key + " [ ")
      num_rows = len(matrix)
      if num_rows == 0:
          raise Exception("Matrix is empty")
      num_cols = len(matrix[0])

      for row_index, row in enumerate(matrix):
          if len(row) != num_cols:
              raise Exception("All the rows of a matrix are expected to "
                              "have the same length")
          file_handle.write(" ".join(map(str, row)))
          # Rows are newline-separated; the last row is followed directly by
          # the closing bracket instead.
          if row_index != num_rows - 1:
              file_handle.write("\n")
      file_handle.write(" ]\n")
  
  def horizontal_pad(im, allowed_lengths = None):
      """Pad an image with white (255) columns on its left and right sides.

      Without `allowed_lengths`, `args.padding` columns are added on each
      side. Otherwise the image is widened to the first entry of
      `allowed_lengths` that is strictly greater than its current width; the
      extra columns are split between both sides, the right side receiving
      the odd one.

      Returns:
          The padded array, or None when no entry of `allowed_lengths` is
          greater than the image width (the image is too long).
      """
      if allowed_lengths is None:
          pad_left = pad_right = args.padding
      else:
          width = im.shape[1]
          # First listed length strictly larger than the image width, if any.
          target_width = next(
              (length for length in allowed_lengths if length > width), None)
          if target_width is None:
              return None
          extra = target_width - width
          pad_left = int(extra // 2)
          pad_right = extra - pad_left

      height = im.shape[0]
      # For 1 or 4 configured channels the padding is 2-D; otherwise it gets
      # a trailing axis of size args.num_channels.
      if args.num_channels in [1, 4]:
          trailing = ()
      else:
          trailing = (args.num_channels,)

      white_left = 255 * np.ones((height, pad_left) + trailing, dtype=int)
      left_padded = np.concatenate((white_left, im), axis=1)
      white_right = 255 * np.ones((height, pad_right) + trailing, dtype=int)
      return np.concatenate((left_padded, white_right), axis=1)
  
  def get_scaled_image_aug(im, mode='normal'):
      """Resize an image so that its height becomes `args.feat_dim`.

      The width is scaled by the same factor as the height. In 'normal' mode
      the image is resized once; in any other mode it is first resized to a
      random height between 10 and 30 (inclusive) and then resized to the
      target size.

      NOTE(review): `scipy.misc.imresize` is not available in recent SciPy
      releases; this function needs a SciPy version that still ships it.
      """
      src_height, src_width = im.shape[0], im.shape[1]

      target_height = int(args.feat_dim)
      target_width = int(((1.0 * args.feat_dim) / src_height) * src_width)

      # The random height is drawn in every mode, including 'normal'; moving
      # the draw would change the values later calls to `random` return.
      small_height = random.randint(10, 30)
      small_width = int(((1.0 * small_height) / src_height) * src_width)

      if mode != 'normal':
          im = misc.imresize(im, (int(small_height), small_width))
      return misc.imresize(im, (target_height, target_width))
  
  def vertical_shift(im, mode='normal'):
      """Pad an image on top and bottom with noisy near-white rows.

      A total of `args.vertical_shift` rows is added. Each padding value is
      255 minus an integer-truncated sample from a normal distribution with
      mean 2 and standard deviation 1.

      Args:
          im: image array whose first axis is the row axis; any trailing axes
              (width, optional channels) are preserved.
          mode: how the rows are split between top and bottom:
              'normal' - half on top (rounded down), the rest at the bottom;
              'top'    - a random count between half and all rows on top;
              'bottom' - a random count between zero and half the rows on top;
              'notmid' - 'top' or 'bottom', chosen at random.

      Returns:
          The padded array, or `im` itself when `args.vertical_shift` is 0.

      Raises:
          ValueError: if `mode` is not one of the values listed above.
      """
      total = args.vertical_shift
      if total == 0:
          return im

      if mode == 'notmid':
          mode = 'top' if random.randint(0, 1) == 0 else 'bottom'

      # Integer division: random.randint() needs integer bounds, and true
      # division would hand it a float under Python 3.
      half = total // 2
      if mode == 'normal':
          top = half
      elif mode == 'top':  # more padding on top
          top = random.randint(half, total)
      elif mode == 'bottom':  # more padding on bottom
          top = random.randint(0, half)
      else:
          raise ValueError("Unknown vertical shift mode: {}".format(mode))
      bottom = total - top

      def _noisy_white_rows(num_rows):
          # Same trailing shape as the image so that 2-D and multi-channel
          # inputs can both be concatenated along the row axis.
          shape = (num_rows,) + tuple(im.shape[1:])
          return (255 * np.ones(shape, dtype=int) -
                  np.random.normal(2, 1, shape).astype(int))

      # Top padding is generated before bottom padding, which keeps the order
      # of the numpy random draws stable.
      top_rows = _noisy_white_rows(top)
      bottom_rows = _noisy_white_rows(bottom)
      return np.concatenate((top_rows, im, bottom_rows), axis=0)
  
  ### main ###
  # Reads every "<image-id> <image-path>" line of the scp file, turns the image
  # into a feature matrix (one row per image column, values divided by 255)
  # and writes it to the output in Kaldi text format.
  # NOTE(review): `scipy.misc.imread` is not available in recent SciPy
  # releases; this script needs a SciPy version that still ships it.
  random.seed(1)  # fixed seed: the `random`-module draws repeat across runs
  data_list_path = args.images_scp_path
  if args.out_ark == '-':
      out_fh = sys.stdout
  else:
      out_fh = open(args.out_ark,'w')

  num_fail = 0
  num_ok = 0
  try:
      allowed_lengths = None
      allowed_len_handle = args.allowed_len_file_path
      # The option defaults to None, which os.path.isfile() rejects with a
      # TypeError, so it has to be ruled out explicitly.
      if allowed_len_handle is not None and os.path.isfile(allowed_len_handle):
          print("Found 'allowed_lengths.txt' file...", file=sys.stderr)
          with open(allowed_len_handle) as f:
              # One integer per line; blank lines are ignored.
              allowed_lengths = [int(line) for line in f if line.strip()]
          print("Read {} allowed lengths and will apply them to the "
                "features.".format(len(allowed_lengths)), file=sys.stderr)

      with open(data_list_path) as f:
          for line in f:
              line_vect = line.split()
              if not line_vect:
                  continue  # skip blank lines
              image_id = line_vect[0]
              image_path = line_vect[1]
              if args.num_channels == 4:
                  # presumably mode='L' yields a single-channel grayscale
                  # image -- confirm against the SciPy version in use
                  im = misc.imread(image_path, mode='L')
              else:
                  im = misc.imread(image_path)
              if args.fliplr:
                  im = np.fliplr(im)

              # Only 'random_scale' uses the scale-down/scale-up augmentation.
              if args.augment_type == 'random_scale':
                  im = get_scaled_image_aug(im, 'scaled')
              else:
                  im = get_scaled_image_aug(im, 'normal')

              im = horizontal_pad(im, allowed_lengths)
              if im is None:
                  # No allowed length fits this image (it is too long).
                  num_fail += 1
                  continue

              # Only 'random_shift' uses the off-center vertical padding.
              if args.augment_type == 'random_shift':
                  im = vertical_shift(im, 'notmid')
              else:
                  im = vertical_shift(im, 'normal')

              if args.num_channels in [1,4]:
                  data = np.transpose(im, (1, 0))
              elif args.num_channels == 3:
                  H = im.shape[0]
                  W = im.shape[1]
                  C = im.shape[2]
                  # One row per image column, holding that column's H * C
                  # values.
                  data = np.reshape(np.transpose(im, (1, 0, 2)), (W, H * C))
              else:
                  raise ValueError("Unsupported number of channels: "
                                   "{}".format(args.num_channels))
              data = np.divide(data, 255.0)
              num_ok += 1
              write_kaldi_matrix(out_fh, data, image_id)
  finally:
      # Close the output file if we opened one; leave sys.stdout alone.
      if out_fh is not sys.stdout:
          out_fh.close()

  print('Generated features for {} images. Failed for {} (image too '
        'long).'.format(num_ok, num_fail), file=sys.stderr)