Blame view

egs/sprakbanken/s5/local/writenumbers.py 7.85 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
  #!/usr/bin/env python
  # -*- coding: utf-8 -*-
  '''
  # Copyright 2014 Author: Andreas Kirkedal
  
  # Licensed under the Apache License, Version 2.0 (the "License");                                                    
  # you may not use this file except in compliance with the License.                                                  
  # You may obtain a copy of the License at                                                                          
  #                                                                                                                 
  #  http://www.apache.org/licenses/LICENSE-2.0                                                                    
  #                                                                                                               
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY                                 
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED                                   
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,                                       
  # MERCHANTABLITY OR NON-INFRINGEMENT.                                                                                 
  # See the Apache 2 License for the specific language governing permissions and                                       
  # limitations under the License.
  
  
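  Writes out numbers (ordinals and cardinals) and CPR numbers in their spoken form.
  Numbers found in the lookup table are written out directly; three- and four-digit
  numbers are composed from table entries. Currently only has a table for Danish.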
  
  Changed to write output to file to prevent problems with shell ascii codec.
  '''
  from __future__ import print_function
  
  import sys
  import os
  import codecs
  from re import split as charsplit
  
  
  def list2string(wordlist, lim=" ", newline=False):
      '''Joins the items of a list into a single string.

      wordlist: iterable of strings to join.
      lim: delimiter placed between the items; any characters contained in
           lim are also stripped from both ends of the result.
      newline: if True, a newline character is appended to the result.

      Returns the joined string.
      '''
      # join() followed by strip(lim) yields the same text as appending lim
      # after every word and stripping afterwards, without the repeated
      # string concatenation.
      joined = lim.join(wordlist).strip(lim)
      if newline:
          return joined + "\n"
      return joined
  
  
  def loadNumTable(filename):
      '''Loads a tab-separated table of numbers into a dictionary.

      Each non-blank line of the UTF-8 encoded file is split at its first
      tab into a number and its written form. Blank lines are skipped.

      filename: path to the table file.

      Returns a dict mapping the number string to its written form with
      surrounding whitespace stripped.
      Raises ValueError if a non-blank line contains no tab.
      '''
      numdict = {}
      # The context manager closes the file even if a line fails to parse.
      with codecs.open(filename, "r", "utf8") as tabSepTable:
          for line in tabSepTable:
              if not line.strip():
                  continue
              num, txt = line.split("\t", 1)
              numdict[num] = txt.strip()
      return numdict
  
  
  def get_birth_date(number):
      """Split the date parts from the number and return the birth date.

      number: digit string laid out as DDMMYY followed by at least one more
              digit; the digit at index 6 together with the two-digit year
              selects the century (1800, 1900 or 2000).

      Returns a datetime.date, or False when the parts do not form a valid
      calendar date.
      """
      # Imported locally because the module-level imports do not provide
      # datetime.
      import datetime
      day = int(number[0:2])
      month = int(number[2:4])
      year = int(number[4:6])
      century_digit = number[6]
      if century_digit in '5678' and year >= 58:
          year += 1800
      elif century_digit in '0123' or (century_digit in '49' and year >= 37):
          year += 1900
      else:
          year += 2000
      try:
          return datetime.date(year, month, day)
      except ValueError:
          return False
  
  
  def leadingZero(string):
      '''Returns True if the first character is "0".

      An empty string has no leading zero, so it yields False instead of
      raising IndexError.
      '''
      return string.startswith('0')
  
  
  def onlydigits(string):
      '''Returns a string holding only the digit characters of the input,
      in their original order.'''
      return "".join(char for char in string if char.isdigit())
  
  
  def isDKCPR(string):
      '''Tells whether a token could be a DKCPR number.

      The only criterion is that the token holds exactly ten digits once
      every non-digit character has been dropped.
      '''
      digit_count = len(onlydigits(string))
      return digit_count == 10
  
  
  def writeZeroDigit(s, k, t):
      '''Spells out a numeric string starting with "0" character by character.

      s: the token to convert.
      k: collection of known table keys.
      t: table mapping keys to their spoken form.

      Returns the spoken forms joined by spaces when the number of characters
      of s found in k equals the length of s without leading and trailing
      ".", otherwise s unchanged.
      '''
      spoken = []
      for char in s:
          if char in k:
              spoken.append(t[char])
      # Any character without a table entry means the token is left alone.
      if len(spoken) != len(s.strip(".")):
          return s
      return list2string(spoken)
  
  
  def writeOutCPR(s, k, t):
      '''Writes out a DKCPR number in its spoken form.

      s: token containing the CPR number; non-digit characters are ignored.
      k: collection of known table keys.
      t: table mapping keys to their spoken form.

      The digits are read as four two-digit groups followed by the remaining
      digits. A group starting with "0" is spelled digit by digit, any other
      group is looked up in t as a whole.

      Returns the spoken groups joined by spaces.
      Raises SystemExit if a group is empty (fewer digits than expected) and
      KeyError if a group without leading zero is missing from t.
      '''
      digits = onlydigits(s)
      parts = [digits[0:2], digits[2:4], digits[4:6], digits[6:8], digits[8:]]
      numbers = []
      for n in parts:
          if n == '':
              sys.exit('n: ' + n + '\n' + s)
          if leadingZero(n):
              numbers.append(writeZeroDigit(n, k, t))
          else:
              numbers.append(t[n])
      return list2string(numbers)
  
  
  def hundreds(string):
      '''Tells whether the string is a numeric string of exactly three characters.'''
      return len(string) == 3 and string.isnumeric()
  
  
  def writeHundreds(s, t):
      '''Builds the spoken Danish form of a three-digit number string.

      s: the three-digit string.
      t: table mapping number strings to their spoken form; the entry for
         '100' supplies the word for "hundred".

      Returns the words joined by spaces: the hundreds digit, the word for
      hundred and, unless the last two digits are "00", "OG" plus the rest.
      '''
      head = s[0]
      rest = s[1:3]
      if rest == '00':
          return list2string([t[head], t['100']])
      # With a leading zero only the last digit is spoken after "OG".
      single_digit = leadingZero(rest)
      words = [t[head], t['100'], "OG"]
      words.append(t[rest[1]] if single_digit else t[rest])
      return list2string(words)
      
  
  def thousands(string):
      '''Tells whether the string is a numeric string of exactly four characters.'''
      return len(string) == 4 and string.isnumeric()
  
  
  def writeThousands(s, t):
      '''Builds the spoken Danish form of a four-digit number string.

      s: the four-digit string.
      t: table mapping number strings to their spoken form; the entry for
         '1000' supplies the word for "thousand".

      Returns the words joined by spaces. The thousands digit and the word
      for thousand come first; a non-zero hundreds part is delegated to
      writeHundreds, otherwise "OG" and the remaining one or two digits follow.
      '''
      lead, remainder = s[0], s[1:4]
      words = [t[lead], t['1000']]
      if remainder != '000':
          if remainder[0] == '0':
              tens = remainder[1:3]
              words.append("OG")
              # A zero in the tens place leaves only the final digit to speak.
              words.append(t[remainder[2:3]] if leadingZero(tens) else t[tens])
          else:
              words.append(writeHundreds(remainder, t))
      return list2string(words)
  
  
  def writeNumber(tok, table, keys):
      '''Converts a token to its Danish spoken form where a rule applies.

      tok: the token to convert.
      table: mapping from number strings to their spoken form.
      keys: collection of the keys of table.

      The first matching rule wins: direct table entry, CPR number, string
      with a leading zero, three-digit number, four-digit number.

      Returns the spoken form, or tok unchanged when no rule matches or a
      required table entry is missing.
      '''
      try:
          if tok in keys:
              return table[tok]
          if isDKCPR(tok):
              return writeOutCPR(tok, keys, table)
          if leadingZero(tok):
              return writeZeroDigit(tok, keys, table)
          if hundreds(tok):
              return writeHundreds(tok, table)
          if thousands(tok):
              return writeThousands(tok, table)
      except KeyError:
          # A missing table entry leaves the token as it was.
          pass
      return tok
  
  
  def splitNumeric(s, splitchar="([-,/])"):
      '''Splits a date, decimal or other token containing numbers.

      s: the token to split.
      splitchar: regular expression used for splitting; with the default
                 pattern the separators are captured and kept in the result.

      Returns the list of pieces when there are more than two of them and the
      first one is numeric, otherwise False.
      '''
      pieces = charsplit(splitchar, s)
      if len(pieces) <= 2 or not pieces[0].isnumeric():
          return False
      return pieces
  
  
  def writeOutSplits(s):
      '''Writes common separators in a split token to their spoken form.

      s: list of strings, the pieces of a token including its separators;
         the list is modified in place.

      A list of exactly three items is returned unchanged. In any other list
      every "-" becomes 'STREG' and every "/" becomes 'SKRÅSTREG'.

      Returns the same list object.
      '''
      spoken = {"/": 'SKRÅSTREG',
                "-": 'STREG'
                }
      # NOTE(review): three-item lists (number, separator, number) are left
      # untouched to keep the existing behaviour. Presumably a single "-"
      # between two numbers was meant to be read as 'TIL' and a single "/"
      # to be dropped -- TODO confirm before implementing that.
      if len(s) == 3:
          return s
      for pos, item in enumerate(s):
          if item in spoken:
              s[pos] = spoken[item]
      return s
      
  def rmPvAnnotation(string):
      '''Removes the underscores around a token wrapped in "_" on both sides.

      A token that starts and ends with "_" is returned with all leading and
      trailing underscores stripped. Any other token, including the empty
      string, is returned unchanged.
      '''
      if string.startswith("_") and string.endswith("_"):
          return string.strip("_")
      return string
  
  def normNumber(line, table):
      '''Writes out the numbers of one line of text.

      line: the text line; it is split on whitespace into tokens.
      table: mapping from number strings to their spoken form.

      A token that splitNumeric can split is reduced to its numeric pieces,
      each of which is written out; every other token is passed to
      writeNumber as a whole.

      Returns the converted tokens joined by spaces and ended by a newline.
      '''
      tokens = line.split()
      keys = list(table.keys())
      converted = []
      for tok in tokens:
          pieces = splitNumeric(tok)
          if pieces is False:
              converted.append(writeNumber(tok, table, keys))
              continue
          # Only the numeric pieces survive; separators are left out.
          spoken = [writeNumber(piece, table, keys)
                    for piece in writeOutSplits(pieces)
                    if piece.isnumeric()]
          converted.append(list2string(spoken))
      return list2string(converted, newline=True)
  
  
  def writeOutNumbers(infile, outfile, tablefile="numbers.tbl"):
      '''Uses a table of numbers-text to write out numbers in the infile.

      infile: path of the UTF-8 text file to read.
      outfile: path of the UTF-8 text file to write; it is overwritten.
      tablefile: path of the tab-separated number table.

      Every line of infile is normalised with normNumber and written to
      outfile.
      '''
      # Load the table first so that a missing or malformed table does not
      # leave behind a truncated output file.
      table = loadNumTable(tablefile)
      # The context managers close both files even if a line fails to convert.
      with codecs.open(infile, "r", "utf8") as text, \
              codecs.open(outfile, "w", "utf8") as fout:
          for line in text:
              fout.write(normNumber(line, table))
  
  
  if __name__ == '__main__':
      # Three positional arguments are required; anything beyond them is
      # ignored.
      if len(sys.argv) < 4:
          print("python3 writenumbers.py <tablefile> <infile> <outfile>")
          sys.exit("Terminate")
      tablefile, infile, outfile = sys.argv[1:4]
      writeOutNumbers(infile, outfile, tablefile)