Blame view

egs/sprakbanken/s5/local/sprak2parallel.py 5.35 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
  #!/usr/bin/env python
  '''
  # Copyright 2013-2014 Mirsk Digital ApS  (Author: Andreas Kirkedal)
  
  # Licensed under the Apache License, Version 2.0 (the "License");                                                    
  # you may not use this file except in compliance with the License.                                                  
  # You may obtain a copy of the License at                                                                          
  #                                                                                                                 
  #  http://www.apache.org/licenses/LICENSE-2.0                                                                    
  #                                                                                                               
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY                                 
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED                                   
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,                                       
  # MERCHANTABLITY OR NON-INFRINGEMENT.                                                                                 
  # See the Apache 2 License for the specific language governing permissions and                                       
  # limitations under the License.
  '''
  
  import subprocess                                                                                                     
  import sys                                                                                                            
  import codecs                                                                                                         
  import os                                                                                                             
  from sprakparser import Session                                                                                       
  import shutil
  
  # Running count of output-folder name clashes. make_speech_corpus increments it
  # whenever a session folder already exists and appends the new value to the
  # folder name and speaker id so the clashing session gets a distinct name.
  n = 0
  
  ### Utility functions                                                                                                 
  def find_ext_folders(topfolder, extfolderlist, file_ext):
      '''Recursively collect every folder under topfolder that holds a file_ext file.

      Matching folders are appended to extfolderlist in place; nothing is
      returned.

      topfolder     -- directory where the search starts
      extfolderlist -- list that receives the path of each matching folder
      file_ext      -- extension to look for, including the dot (e.g. ".spl")

      A folder is appended at most once, before any of its subfolders, and
      every subfolder is searched even when the folder itself already
      contains a matching file. os.listdir raises OSError if a folder cannot
      be listed.
      '''
      has_match = False
      subfolders = []
      # os.listdir returns entries in arbitrary order; sorting keeps the result
      # independent of the file system.
      for path in sorted(os.listdir(topfolder)):
          curpath = os.path.join(topfolder, path)
          if os.path.isdir(curpath):
              subfolders.append(curpath)
          elif os.path.isfile(curpath) and os.path.splitext(path)[1] == file_ext:
              has_match = True

      if has_match:
          extfolderlist.append(topfolder)
      # Descend only after the whole folder has been scanned, so a matching file
      # never cuts the search of sibling subfolders short.
      for subfolder in subfolders:
          find_ext_folders(subfolder, extfolderlist, file_ext)
  
  
  def create_parallel_files(session):
      '''Write a transcript file and a sound file for each usable recording.

      session must provide:
        record_states   -- sequence of recordings; item [0] is written out as
                           the transcript text and item [1] is the sound file
                           name inside wavdir
        wavdir          -- folder holding the source sound files
        sessiondir      -- existing folder that receives the output files
        create_filename -- callable (number, extension) -> output file name

      The first recording is skipped (it is a recording of silence), as is
      any recording whose sound file does not exist. Nothing is returned.
      '''
      for recnum, recording in enumerate(session.record_states):
          if recnum == 0:  # skip the first recording of silence
              continue
          oldsound = os.path.join(session.wavdir, recording[1])
          if not os.path.exists(oldsound):
              continue
          # Output files are numbered with the 1-based position of the recording.
          txtout = session.create_filename(recnum + 1, "txt")
          sndout = session.create_filename(recnum + 1, "wav")

          # The with-block closes the transcript even if the write fails.
          with codecs.open(os.path.join(session.sessiondir, txtout), "w", "utf8") as fout:
              fout.write(recording[0] + "\n")
          shutil.copyfile(oldsound, os.path.join(session.sessiondir, sndout))
          
  
  def make_speech_corpus(top, dest, srcfolder):
      '''Create an output session folder for each usable .spl file in srcfolder.

      The .spl files are handled in sorted name order. A Session is built for
      each one and skipped when its speaker_id is empty, it has no wavdir, or
      it has fewer than two record_states. Otherwise a folder named
      <filestem>.<speaker_id> is created under dest and filled by
      create_parallel_files. When that folder already exists, the
      module-level counter n is incremented and appended to both the folder
      name and the speaker id.

      top is accepted but not used in this function.
      '''
      global n
      spl_names = [name for name in sorted(os.listdir(srcfolder))
                   if os.path.splitext(name)[1] == ".spl"]
      for spl_name in spl_names:
          session = Session(os.path.abspath(srcfolder), spl_name)
          # No speaker, no matching wav directory, or too few recordings:
          # nothing worth exporting for this session.
          unusable = (session.speaker_id == ""
                      or not session.wavdir
                      or len(session.record_states) < 2)
          if unusable:
              continue

          outdir = os.path.join(dest, session.filestem) + "." + session.speaker_id
          if os.path.exists(outdir):
              # Name clash with an earlier session: tag both the folder and
              # the speaker id with the next counter value.
              n += 1
              outdir = "{}_{}".format(outdir, n)
              session.speaker_id = "{}_{}".format(session.speaker_id, n)
          session.sessiondir = outdir
          os.mkdir(outdir)

          create_parallel_files(session)
  
  
  if __name__ == '__main__':
      # Command line: <topfolder> <dest>
      #   topfolder -- root of the tree that is searched for .spl files
      #   dest      -- output folder (e.g. "/mnt/sprakkaldi"); created if missing
      if len(sys.argv) < 3:
          # Exit with a usage message instead of an IndexError traceback.
          sys.exit("Usage: {} <topfolder> <dest>".format(sys.argv[0]))

      topfolder = sys.argv[1]
      dest = sys.argv[2]
      if not os.path.exists(dest):
          os.mkdir(dest)

      spldirs = []
      find_ext_folders(topfolder, spldirs, ".spl")

      for folder in spldirs:
          make_speech_corpus(topfolder, dest, folder)