Yannick Estève / ONTRAC-Kaldi

Blame view

egs/sprakbanken/s5/local/writenumbers.py 7.85 KB
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
  '''
  # Copyright 2014 Author: Andreas Kirkedal
  
  # Licensed under the Apache License, Version 2.0 (the "License");                                                    
  # you may not use this file except in compliance with the License.                                                  
  # You may obtain a copy of the License at                                                                          
  #                                                                                                                 
  #  http://www.apache.org/licenses/LICENSE-2.0                                                                    
  #                                                                                                               
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY                                 
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED                                   
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,                                       
  # MERCHANTABILITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and                                       
  # limitations under the License.
  
  
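  Writes out numbers in their spoken form: table entries between 0 and 100
  (ordinals and cardinals), numbers in the hundreds and thousands composed from
  those entries, and CPR numbers (Danish personal identification numbers).
  Currently there is only a table for Danish.

  The output is written to a file to avoid problems with the shell's ASCII codec.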
  '''
  from __future__ import print_function
  
  import sys
  import os
  import codecs
  from re import split as charsplit
  
  
  def list2string(wordlist, lim=" ", newline=False):
      '''Joins a list of strings into a single string.

      wordlist -- iterable of strings to join.
      lim      -- delimiter placed between the items. Any characters of $lim
                  found at either end of the result are stripped as well.
      newline  -- if True, a newline character is appended to the result.

      Returns the joined string.
      '''
      # str.join avoids the quadratic cost of repeated concatenation; the
      # strip removes delimiter characters from both ends of the result.
      strout = lim.join(wordlist).strip(lim)
      if newline:
          return strout + "\n"
      return strout
  
  
  def loadNumTable(filename):
      '''Loads a tab-separated table of numbers into a dictionary.

      Each line of $filename (read as UTF-8) holds a number and its written
      form separated by a tab. The text before the first tab becomes the key;
      the rest of the line, stripped of surrounding whitespace, the value.
      Blank lines are skipped.

      Returns a dict mapping number strings to their written form.
      Raises ValueError if a non-blank line contains no tab.
      '''
      numdict = {}
      # The context manager closes the table even if a line is malformed.
      with codecs.open(filename, "r", "utf8") as tabSepTable:
          for line in tabSepTable:
              if not line.strip():
                  # Tolerate empty lines, e.g. at the end of the file.
                  continue
              num, txt = line.split("\t", 1)
              numdict[num] = txt.strip()
      return numdict
  
  
  def get_birth_date(number):
      """Split the date parts from the number and return the birth date.

      number -- string of digits laid out as DDMMYY followed by further
                digits; the digit at index 6, together with the two-digit
                year, selects the century.

      Returns a datetime.date, or False if the parts do not form a valid
      calendar date.
      """
      # Imported locally because the module-level imports do not provide
      # datetime; without it this function fails with a NameError.
      import datetime

      day = int(number[0:2])
      month = int(number[2:4])
      year = int(number[4:6])
      if number[6] in '5678' and year >= 58:
          year += 1800
      elif number[6] in '0123' or (number[6] in '49' and year >= 37):
          year += 1900
      else:
          year += 2000
      try:
          return datetime.date(year, month, day)
      except ValueError:
          # Day/month combination does not exist in the calendar.
          return False
  
  
  def leadingZero(string):
      '''Returns True if the first character is 0.

      An empty string has no first character and yields False rather than
      raising an IndexError.
      '''
      return string.startswith('0')
  
  
  def onlydigits(string):
      '''Returns the digit characters of the string, in order, as one string.

      Every character for which isdigit() is false (separators, letters,
      whitespace) is dropped.
      '''
      # Joining directly avoids building the result by repeated concatenation.
      return "".join([x for x in string if x.isdigit()])
  
  
  def isDKCPR(string):
      '''Tells whether a string looks like a DKCPR number.

      The only criterion is that the string contains exactly ten digits;
      non-digit characters are ignored and nothing else is validated.
      '''
      digit_count = len(onlydigits(string))
      return digit_count == 10
  
  
  def writeZeroDigit(s, k, t):
      '''Spells out a numeric string that starts with "0" digit by digit.

      s -- the token to convert.
      k -- collection of the keys available in the table.
      t -- table mapping number strings to their spoken form.

      Returns the spoken digits joined by spaces, or $s unchanged when not
      every character could be converted.
      '''
      spoken = []
      for char in s:
          if char in k:
              spoken.append(t[char])
      # Only convert when the number of looked-up characters matches the
      # length of the token without its leading and trailing dots.
      if len(spoken) != len(s.strip(".")):
          return s
      return list2string(spoken)
  
  
  def writeOutCPR(s, k, t):
      '''Writes out a DKCPR number to its spoken form.

      s -- the token holding the CPR number; non-digit characters are ignored.
      k -- collection of the keys available in the table.
      t -- table mapping number strings to their spoken form.

      The digits are read as five groups: four pairs followed by whatever
      remains. A group with a leading zero is spelled out digit by digit; any
      other group is looked up in $t as a whole.

      Returns the spoken groups joined by spaces. Exits the program if a group
      is empty, i.e. $s holds too few digits. Raises KeyError if a group
      without a leading zero is missing from $t.
      '''
      digits = onlydigits(s)
      parts = [digits[0:2], digits[2:4], digits[4:6], digits[6:8], digits[8:]]
      numbers = []
      for n in parts:
          if n == '':
              # Report the empty group together with the offending token.
              sys.exit('n: ' + n + '\n' + s)
          if leadingZero(n):
              numbers.append(writeZeroDigit(n, k, t))
          else:
              numbers.append(t[n])
      return list2string(numbers)
  
  
  def hundreds(string):
      '''Tells whether the string is numeric and three characters long.'''
      return len(string) == 3 and string.isnumeric()
  
  
  def writeHundreds(s, t):
      '''Spells out a Danish number in the hundreds.

      s -- three-digit numeric string.
      t -- table mapping number strings to their spoken form.

      Returns the spoken hundreds, followed by "OG" and the spoken remainder
      unless the last two digits are "00".
      '''
      head = s[0]
      remainder = s[1:3]
      if remainder == '00':
          lookup = None
      elif leadingZero(remainder):
          # A remainder below ten is looked up by its single digit.
          lookup = remainder[1]
      else:
          lookup = remainder
      words = [t[head], t['100']]
      if lookup is not None:
          words.extend(["OG", t[lookup]])
      return list2string(words)
      
  
  def thousands(string):
      '''Tells whether the string is numeric and four characters long.'''
      return len(string) == 4 and string.isnumeric()
  
  
  def writeThousands(s, t):
      '''Spells out a Danish number in the thousands.

      s -- four-digit numeric string.
      t -- table mapping number strings to their spoken form.

      Returns the spoken thousands followed by the spoken rest of the number;
      a rest below one hundred is introduced by "OG", a rest of "000" adds
      nothing.
      '''
      rest = s[1:4]
      words = [t[s[0]], t['1000']]
      if rest != '000':
          if rest[0] == '0':
              # No hundreds digit: say "OG" and the last one or two digits.
              tail = rest[1:3]
              key = rest[2:3] if leadingZero(tail) else tail
              words.append("OG")
              words.append(t[key])
          else:
              words.append(writeHundreds(rest, t))
      return list2string(words)
  
  
  def writeNumber(tok, table, keys):
      '''Converts a token to its Danish spoken form where possible.

      tok   -- the token to convert.
      table -- table mapping number strings to their spoken form.
      keys  -- collection of the keys available in $table.

      The first matching rule wins: a direct table entry, a DKCPR number, a
      string with a leading zero, a number in the hundreds, a number in the
      thousands. Returns $tok unchanged if no rule matches or a table lookup
      fails.
      '''
      try:
          if tok in keys:
              return table[tok]
          if isDKCPR(tok):
              return writeOutCPR(tok, keys, table)
          if leadingZero(tok):
              return writeZeroDigit(tok, keys, table)
          if hundreds(tok):
              return writeHundreds(tok, table)
          if thousands(tok):
              return writeThousands(tok, table)
      except KeyError:
          # A part of the number has no table entry; keep the token as it is.
          pass
      return tok
  
  
  def splitNumeric(s, splitchar="([-,/])"):
      '''Splits a date, decimal or other token containing numbers.

      s         -- the token to split.
      splitchar -- regular expression matching the separators; its capturing
                   group keeps the separators in the result.

      Returns the list of parts when the token splits into more than two
      parts and the first part is numeric, otherwise False.
      '''
      pieces = charsplit(splitchar, s)
      if len(pieces) <= 2 or not pieces[0].isnumeric():
          return False
      return pieces
  
  
  def writeOutSplits(s):
      '''Writes common separators to their spoken form. Context-sensitive.

      s -- list of parts as produced by splitNumeric: the pieces of a token
           with the separators that divided them. The list is modified in
           place.

      In a three-part list whose last part is numeric, "-" is read as a range
      and becomes 'TIL', while "/" is replaced by an empty string. In lists of
      any other length "-" becomes 'STREG' and "/" becomes 'SKRÅSTREG'.
      Everything else is left untouched.

      Returns the modified list $s.
      '''
      # "-" has two readings and a dict cannot hold both under one key, so
      # the range word lives outside the separator table.
      range_word = 'TIL'
      spoken = {"/": 'SKRÅSTREG',
                "-": 'STREG'}

      if len(s) == 3:
          if s[2].isnumeric():
              if s[1] == "-":
                  s[1] = range_word
              elif s[1] == "/":
                  s[1] = ""
      else:
          for num, part in enumerate(s):
              if part in spoken:
                  s[num] = spoken[part]
      return s
      
  def rmPvAnnotation(string):
      '''Removes the underscores from a token wrapped in underscores.

      A token that both starts and ends with "_" is returned with all leading
      and trailing underscores stripped; any other token, including the empty
      string, is returned unchanged.
      '''
      if string.startswith("_") and string.endswith("_"):
          return string.strip("_")
      return string
  
  def normNumber(line, table):
      '''Writes out the numbers in one line of text.

      line  -- the text to normalise; it is split on whitespace.
      table -- table mapping number strings to their spoken form.

      A token that splitNumeric can split is reduced to its numeric parts,
      each of which is written out separately; every other token is passed to
      writeNumber as it is.

      Returns the converted tokens joined by spaces and ending in a newline.
      '''
      tokens = line.split()
      keys = list(table.keys())
      normalised = []
      for tok in tokens:
          parts = splitNumeric(tok)
          if parts is False:
              normalised.append(writeNumber(tok, table, keys))
              continue
          # Only the numeric parts survive; separators are dropped here.
          numerals = [p for p in writeOutSplits(parts) if p.isnumeric()]
          spoken = [writeNumber(p, table, keys) for p in numerals]
          normalised.append(list2string(spoken))
      return list2string(normalised, newline=True)
  
  
  def writeOutNumbers(infile, outfile, tablefile="numbers.tbl"):
      '''Uses a table of numbers-text to write out numbers in the infile.

      infile    -- path of the UTF-8 text file to read.
      outfile   -- path of the UTF-8 file to write; it is overwritten.
      tablefile -- path of the tab-separated number table.

      Every line of $infile is normalised with normNumber and written to
      $outfile.
      '''
      # Load the table before touching the output so that a missing or broken
      # table does not leave a truncated output file behind.
      table = loadNumTable(tablefile)
      # The context managers close both files even if normalisation fails.
      with codecs.open(infile, "r", "utf8") as text:
          with codecs.open(outfile, "w", "utf8") as fout:
              for line in text:
                  fout.write(normNumber(line, table))
  
  
  if __name__ == '__main__':

      # Three positional arguments are required; print the usage otherwise.
      if len(sys.argv) < 4:
          print("python3 writenumbers.py <tablefile> <infile> <outfile>")
          sys.exit("Terminate")

      tablefile, infile, outfile = sys.argv[1:4]

      writeOutNumbers(infile, outfile, tablefile)