Parcollet Titouan / Quaternion Convolutional Neural Networks for End-to-End Automatic Speech Recognition

Blame view

datasets/transformers.py 13.3 KB
  from collections import OrderedDict
  
  import numpy
  
  from theano import config
  
  from fuel.transformers import Transformer
  from picklable_itertools.extras import equizip
  
  class MaximumFrameCache(Transformer):
      """Cache examples, and create batches of maximum number of frames.

      Given a data stream which reads large chunks of data, this data
      stream caches these chunks and returns batches with a maximum number
      of acoustic frames.

      Parameters
      ----------
      data_stream : data stream
          The wrapped stream. Each item it yields is treated as a chunk
          holding one indexable collection of examples per source.
      max_frames : int
          maximum number of frames per batch
      rng : object
          Random state providing an in-place ``shuffle`` method; used to
          shuffle the examples of every chunk before caching them.

      Attributes
      ----------
      cache : OrderedDict of lists of objects
          This attribute holds the cache at any given point. It has one
          entry per source name. Each value is a list of examples that are
          currently in the cache. The cache gets emptied at the start of
          each epoch, and gets refilled when needed through the
          :meth:`get_data` method.
      num_frames : list of int
          ``shape[0]`` of every cached example of the first source, in
          cache order.

      """
      def __init__(self, data_stream, max_frames, rng):
          super(MaximumFrameCache, self).__init__(
              data_stream)
          self.max_frames = max_frames
          self.cache = OrderedDict([(name, []) for name in self.sources])
          self.num_frames = []
          self.rng = rng
          self.produces_examples = False

      def next_request(self):
          """Return how many cached examples go into the next batch.

          Examples are counted in cache order until the padded size of the
          batch (longest example so far times number of examples) reaches
          ``max_frames``. The example that crosses the threshold is
          included, so a batch may exceed the budget. If the threshold is
          never reached the whole cache is requested.
          """
          curr_max = 0
          for i, n_frames in enumerate(self.num_frames):
              # Select max number of frames because of future padding
              curr_max = max(n_frames, curr_max)
              total = curr_max * (i + 1)
              if total >= self.max_frames:
                  return i + 1
          return len(self.num_frames)

      def get_data(self, request=None):
          """Return the next batch as a tuple with one array per source.

          The ``request`` argument is ignored; the batch size is always
          computed by :meth:`next_request`. The cache is refilled from the
          child stream only when it is completely empty.
          """
          # ``list(...)[0]`` instead of ``keys()[0]``: dict views are not
          # indexable on Python 3, while this form works on both versions.
          first_source = list(self.cache)[0]
          if not self.cache[first_source]:
              self._cache()
          batch_size = self.next_request()
          # NOTE(review): numpy.asarray on examples of differing lengths
          # relies on NumPy building an object array -- confirm with the
          # NumPy version in use.
          batch = tuple(numpy.asarray(examples[:batch_size])
                        for examples in self.cache.values())
          # Drop the examples that were just served from every source.
          self.cache = OrderedDict([(name, examples[batch_size:])
                                    for name, examples in self.cache.items()])
          self.num_frames = self.num_frames[batch_size:]

          return batch

      def get_epoch_iterator(self, **kwargs):
          """Empty the cache and start a new epoch on the wrapped stream."""
          self.cache = OrderedDict([(name, []) for name in self.sources])
          self.num_frames = []
          return super(MaximumFrameCache, self).get_epoch_iterator(**kwargs)

      def _cache(self):
          """Pull one chunk from the child stream, shuffle it and cache it."""
          chunk = next(self.child_epoch_iterator)
          # A concrete list is required: ``range`` objects cannot be
          # shuffled in place on Python 3.
          order = list(range(len(chunk[0])))
          self.rng.shuffle(order)
          # Apply the same permutation to every source to keep them aligned.
          shuffled = [[source_data[i] for i in order] for source_data in chunk]
          self.cache = OrderedDict(
              [(name, self.cache[name] + examples) for name, examples
               in equizip(self.data_stream.sources, shuffled)])
          self.num_frames.extend([x.shape[0] for x in shuffled[0]])
  
  
  class Transpose(Transformer):
      """Permute the axes of each source in a batch.

      Parameters
      ----------
      datastream : data stream
          The wrapped stream.
      axes_list : sequence
          One axes specification per source, paired positionally with the
          items of the incoming batch and handed to ``numpy.transpose``.

      """
      def __init__(self, datastream, axes_list):
          super(Transpose, self).__init__(datastream)
          self.produces_examples = False
          self.axes_list = axes_list

      def get_data(self, request=None):
          """Return a list with every source of the next batch transposed."""
          batch = next(self.child_epoch_iterator)
          return [numpy.transpose(source, axes)
                  for axes, source in zip(self.axes_list, batch)]
  
  
  class AddUniformAlignmentMask(Transformer):
      """Adds an uniform alignment mask to the incoming batch.

      The wrapped stream must provide the sources ``x``, ``y``, ``x_mask``
      and ``y_mask``. A new source named ``alignment`` is appended; it has
      shape ``(max_len_output, batch_size, max_len_input)`` where the
      lengths are the first axes of ``y`` and ``x`` and the batch size is
      the second axis of ``x``.

      For every example the valid input length is split into equally sized
      consecutive windows, one per valid output step; row ``i`` of the
      alignment is one inside the ``i``-th window and zero elsewhere.

      Parameters
      ----------
      data_stream : data stream
          The wrapped stream. Its batches must be tuples, since the
          alignment is appended with tuple concatenation.

      """
      def __init__(self, data_stream):
          super(AddUniformAlignmentMask, self).__init__(data_stream)
          self.sources = self.data_stream.sources + ('alignment',)

      def get_data(self, request=None):
          """Return the next batch with the alignment mask appended."""
          data = next(self.child_epoch_iterator)
          sources = self.data_stream.sources

          x_idx = sources.index('x')
          y_idx = sources.index('y')
          x_mask_idx = sources.index('x_mask')
          y_mask_idx = sources.index('y_mask')

          batch_size = data[x_idx].shape[1]
          max_len_output = data[y_idx].shape[0]
          max_len_input = data[x_idx].shape[0]
          mask_shape = (max_len_output, batch_size, max_len_input)
          alignment = numpy.zeros(mask_shape, dtype=config.floatX)

          for k in range(batch_size):
              in_size = numpy.count_nonzero(data[x_mask_idx][:, k])
              out_size = numpy.count_nonzero(data[y_mask_idx][:, k])
              if out_size == 0:
                  # Nothing to align for an empty target; keep the all-zero
                  # slice instead of dividing by zero below.
                  continue
              # Width of the input window assigned to each output step.
              n = in_size // out_size
              v = numpy.hstack([numpy.ones(n, dtype=config.floatX),
                                numpy.zeros(max_len_input - n,
                                            dtype=config.floatX)])
              alignment[0, k] = v
              for i in range(1, out_size):
                  # Slide the window of ones to the i-th position.
                  alignment[i, k] = numpy.roll(v, i * n)

          return data + (alignment,)
  
  
  class AlignmentPadding(Transformer):
      """Pad per-example alignments and expand the targets along them.

      Each alignment is a 2-D array of shape ``(out_size, inp_size)``. The
      alignments of a batch are zero-padded into one array of shape
      ``(max_out_size, batch_size, max_inp_size)``. The ``phonemes`` and
      ``phonemes_mask`` sources are replaced by arrays of shape
      ``(batch_size, max_out_size)`` in which every phoneme (resp. mask
      value) is repeated once per alignment row pointing at it.

      Parameters
      ----------
      data_stream : data stream
          The wrapped stream; must provide ``phonemes``, ``phonemes_mask``
          and the alignment source.
      alignment_source : str
          Name of the source holding the alignments.

      """
      def __init__(self, data_stream, alignment_source):
          super(AlignmentPadding, self).__init__(data_stream)
          self.alignment_source = alignment_source

      def get_data(self, request=None):
          """Return the next batch with padded alignments and targets."""
          data = next(self.child_epoch_iterator)
          data = OrderedDict(equizip(self.sources, data))

          alignments = data[self.alignment_source]

          input_lengths = [alignment.shape[1] for alignment in alignments]
          output_lengths = [alignment.shape[0] for alignment in alignments]
          max_input_length = max(input_lengths)
          max_output_length = max(output_lengths)

          batch_size = len(alignments)

          padded_alignments = numpy.zeros((max_output_length, batch_size,
                                           max_input_length))
          padded_targets = numpy.zeros((batch_size, max_output_length))
          padded_targets_mask = numpy.zeros((batch_size, max_output_length))
          for i, alignment in enumerate(alignments):
              out_size, inp_size = alignment.shape
              padded_alignments[:out_size, i, :inp_size] = alignment
              # Column of the first 1 in every row, i.e. the phoneme index
              # each alignment row points at.
              alignment_index = [list(row).index(1) for row in alignment]
              # Number of rows per distinct phoneme index. Iterate in sorted
              # order: the counts are paired positionally with the mask
              # below, and plain set iteration order is unspecified.
              occurrences = [alignment_index.count(j)
                             for j in sorted(set(alignment_index))]
              padded_targets[i, :out_size] = \
                  data['phonemes'][i][alignment_index]
              # get rid of start label
              padded_targets[i, 0] = data['phonemes'][i, 1]
              # Repeat every mask value as many times as its phoneme occurs.
              alignment_target_mask = []
              for mask_value, count in zip(data['phonemes_mask'][i],
                                           occurrences):
                  alignment_target_mask.extend([mask_value] * count)
              padded_targets_mask[i, :out_size] = alignment_target_mask
          data[self.alignment_source] = padded_alignments
          data['phonemes'] = padded_targets.astype('int')
          data['phonemes_mask'] = padded_targets_mask
          return list(data.values())
  
  
  class Reshape(Transformer):
      """Reshape every example of one source using shapes from another.

      The shape source is consumed: it is removed from the sources exposed
      by this transformer and from the returned batch.

      Parameters
      ----------
      data_source : str
          Name of the source whose examples are reshaped.
      shape_source : str
          Name of the source providing one target shape per example.

      """
      def __init__(self, data_source, shape_source, **kwargs):
          super(Reshape, self).__init__(**kwargs)
          self.data_source = data_source
          self.shape_source = shape_source
          self.sources = tuple(name for name in self.data_stream.sources
                               if name != shape_source)

      def get_data(self, request=None):
          """Return the next batch without the shape source."""
          raw_batch = next(self.child_epoch_iterator)
          batch = OrderedDict(zip(self.data_stream.sources, raw_batch))
          target_shapes = batch.pop(self.shape_source)
          batch[self.data_source] = [
              example.reshape(shape)
              for example, shape in zip(batch[self.data_source],
                                        target_shapes)]
          return batch.values()
  
  class ConvReshape(Transformer):
      """Reshape a batch for a convolutional model and fix up the targets.

      When ``data_source`` is ``'features'`` or ``'X'`` every example of
      shape ``(a, b)`` becomes ``(1, a, 3, b // 3)``; with ``quaternion``
      set, a fourth all-zero channel is appended on axis 2. For any other
      source every example is flattened to a column. The reshaped examples
      are then stacked with ``numpy.vstack``.

      Independently of ``data_source`` the sources ``phonemes``,
      ``features_mask`` and ``phonemes_mask`` are rewritten, so the wrapped
      stream must provide them.

      Parameters
      ----------
      data_source : str
          Name of the source to reshape.
      quaternion : bool
          Whether to add the extra zero channel to feature examples.

      """
      def __init__(self, data_source, quaternion, **kwargs):
          super(ConvReshape, self).__init__(**kwargs)
          self.data_source = data_source
          self.sources = tuple(source for source in self.data_stream.sources)
          self.quaternion = quaternion

      def get_data(self, request=None):
          """Return the next batch with reshaped data and adjusted targets."""
          data = next(self.child_epoch_iterator)
          data = OrderedDict(zip(self.data_stream.sources, data))
          reshaped_data = []
          if self.data_source in ['features', 'X']:
              for dt in data[self.data_source]:
                  # Floor division: reshape needs integer dimensions, and
                  # "/" yields a float under true division.
                  shape = (1, dt.shape[0], 3, dt.shape[1] // 3)
                  reshaped = dt.reshape(shape)
                  if self.quaternion:
                      # Fourth, all-zero channel next to the three real ones.
                      empty_channel = numpy.zeros((1, shape[1], 1, shape[3]))
                      reshaped = numpy.concatenate([reshaped, empty_channel],
                                                   axis=2)
                  reshaped_data.append(reshaped)
          else:
              for dt in data[self.data_source]:
                  shape = (numpy.prod(dt.shape), 1)
                  reshaped_data.append(dt.reshape(shape))
          if len(reshaped_data) == 1:
              # A single-example batch is duplicated so the batch size is 2.
              # NOTE(review): presumably works around a downstream problem
              # with batches of size one -- confirm.
              reshaped_data = reshaped_data * 2
              data['phonemes'] = numpy.repeat(data['phonemes'], 2, axis=0)
              data['features_mask'] = numpy.repeat(data['features_mask'],
                                                   2, axis=1)
              data['phonemes_mask'] = numpy.repeat(data['phonemes_mask'],
                                                   2, axis=1)
          data[self.data_source] = numpy.vstack(reshaped_data)
          # remove the start label
          data['phonemes'] = data['phonemes'][:, 1:]
          # get the indice starts at 0
          data['phonemes'] = data['phonemes'] - 1.
          # remove the end label (61 after the shift above)
          data['phonemes'][data['phonemes'] == 61] = 0
          # recover original padding
          data['phonemes'][data['phonemes'] == -1] = 0
          # Collapse the masks into per-example lengths (column vectors).
          data['features_mask'] = numpy.sum(data['features_mask'],
                                            axis=0)[:, None]
          # NOTE(review): the "- 2" presumably discounts the start and end
          # labels -- confirm.
          data['phonemes_mask'] = numpy.sum(data['phonemes_mask'],
                                            axis=0)[:, None] - 2
          return list(data.values())
  
  
  class DictRep(Transformer):
      """Pair every batch with a dummy all-zero vector.

      Parameters
      ----------
      data_source : data stream
          The wrapped stream (also kept as the ``data_source`` attribute).
          It must provide a ``phonemes_mask`` source.

      """
      def __init__(self, data_source):
          super(DictRep, self).__init__(data_source)
          self.data_source = data_source

      def get_data(self, request=None):
          """Return ``(batch_values, zeros)`` for the next batch.

          The zero vector has as many entries as the first axis of
          ``phonemes_mask``.
          """
          raw_batch = next(self.child_epoch_iterator)
          batch = OrderedDict(zip(self.data_stream.sources, raw_batch))
          values = batch.values()
          dummy = numpy.zeros([batch['phonemes_mask'].shape[0]])
          return (values, dummy)
  
  
  class Subsample(Transformer):
      """Keep every ``step``-th entry along the first axis of one source.

      Parameters
      ----------
      data_stream : data stream
          The wrapped stream.
      source : str
          Name of the source to subsample.
      step : int
          Stride applied to the first axis.

      """
      def __init__(self, data_stream, source, step):
          super(Subsample, self).__init__(data_stream)
          self.source = source
          self.step = step

      def get_data(self, request=None):
          """Return the next batch with the chosen source subsampled."""
          raw_batch = next(self.child_epoch_iterator)
          batch = OrderedDict(equizip(self.sources, raw_batch))
          array = batch[self.source]

          # Stride on the first axis, full slices on all remaining axes.
          trailing_axes = len(array.shape) - 1
          first_axis = (slice(None, None, self.step),)
          selector = first_axis + (slice(None),) * trailing_axes
          batch[self.source] = array[selector]
          return batch.values()
  
  
  class WindowFeatures(Transformer):
      """Concatenate each frame with its shifted neighbours.

      For every example of the chosen source (a 2-D array indexed by frame
      on axis 0), copies shifted by ``1 .. window_size // 2`` frames in both
      directions are concatenated to the original along axis 1. Positions
      that a shift would fill by wrapping around are set to zero.

      Parameters
      ----------
      data_stream : data stream
          The wrapped stream.
      source : str
          Name of the source to window.
      window_size : int
          Total window width; ``window_size // 2`` shifts are applied in
          each direction.

      """
      def __init__(self, data_stream, source, window_size):
          super(WindowFeatures, self).__init__(data_stream)
          self.source = source
          self.window_size = window_size

      def get_data(self, request=None):
          """Return the next batch with the chosen source windowed."""
          data = next(self.child_epoch_iterator)
          data = OrderedDict(equizip(self.sources, data))
          feature_batch = data[self.source]

          half_window = self.window_size // 2
          windowed_features = []
          for features in feature_batch:
              # numpy.roll returns a new array, so zeroing the wrapped rows
              # never touches the original features.
              features_shifted = [features]
              # shift forward
              for offset in range(1, half_window + 1):
                  feats = numpy.roll(features, offset, axis=0)
                  feats[:offset, :] = 0
                  features_shifted.append(feats)

              # shift backward
              for offset in range(1, half_window + 1):
                  feats = numpy.roll(features, -offset, axis=0)
                  feats[-offset:, :] = 0
                  # Append the zero-padded copy, mirroring the forward
                  # shift, so no wrapped-around frames leak into the window.
                  features_shifted.append(feats)
              windowed_features.append(numpy.concatenate(
                  features_shifted, axis=1))
          data[self.source] = windowed_features
          return list(data.values())
  
  
  class Normalize(Transformer):
      """Standardize one source in place: ``x = (x - means) / stds``.

      Parameters
      ----------
      data_stream : data stream
          The wrapped stream.
      means : array-like or scalar
          Value subtracted from every example.
      stds : array-like or scalar
          Value every example is divided by afterwards.
      over : str
          Name of the source to normalize (default ``'features'``).

      """
      def __init__(self, data_stream, means, stds, over='features'):
          super(Normalize, self).__init__(data_stream)
          self.produces_examples = False

          self.over = over
          self.means = means
          self.stds = stds

      def get_data(self, request=None):
          """Return the next batch with the chosen source normalized."""
          raw_batch = next(self.child_epoch_iterator)
          batch = OrderedDict(zip(self.data_stream.sources, raw_batch))
          examples = batch[self.over]
          # Index-based loop: the in-place operators must write back into
          # the container holding the examples.
          for position in range(len(examples)):
              examples[position] -= self.means
              examples[position] /= self.stds
          return batch.values()
  
  
  def length_getter(dt):
      """Return a function mapping an index ``k`` to ``dt[k].shape[0]``."""
      def first_axis_length(index):
          # Looked up lazily, on every call, against the captured sequence.
          example = dt[index]
          return example.shape[0]
      return first_axis_length
  
  
  class SortByLegth(Transformer):
      """Reorder the examples of a batch by increasing length.

      The length of an example is the size of its first axis in the
      reference source. Every source is reordered with the same permutation
      and ends up as a plain list.

      Parameters
      ----------
      data_stream : data stream
          The wrapped stream.
      source : str
          Name of the reference source (default ``'features'``).

      """
      def __init__(self, data_stream, source='features'):
          super(SortByLegth, self).__init__(data_stream)
          self.source = source

      def get_data(self, request=None):
          """Return the next batch sorted by example length."""
          raw_batch = next(self.child_epoch_iterator)
          batch = OrderedDict(zip(self.data_stream.sources, raw_batch))
          reference = batch[self.source]
          # sorted() is stable, so equally long examples keep their order.
          order = sorted(range(len(reference)),
                         key=lambda k: reference[k].shape[0])
          for name in self.sources:
              column = batch[name]
              batch[name] = [column[k] for k in order]
          return batch.values()