Blame view

egs/sprakbanken/s5/local/parallel2kaldi.py 1.3 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
  #!/usr/bin/env python
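  '''
  Build the Kaldi data files "text", "wav.scp" and "utt2spk" from a directory of parallel sound and text files.

  Usage: parallel2kaldi.py <srcdir> <dest> <snd_ext> <txt_ext>

  This script assumes that each sound file and its transcript share the same filename and differ only in their
  extension (pass the extensions without the leading dot). You must specify <srcdir> as an absolute path, because
  it is written as-is into wav.scp. Each text file may only contain a single line of text.
  '''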
  
  import sys, os, codecs
  
  def find_parallel_stems(srcdir, snd_ext, txt_ext):
      '''
      Return the sorted filename stems in srcdir that have both a sound file
      (stem.snd_ext) and a text file (stem.txt_ext).

      Extensions are given without the leading dot. Entries without any
      extension, or with only one of the two files present, are ignored.
      '''
      snd_stems = set()
      txt_stems = set()
      for name in os.listdir(srcdir):
          stem_and_ext = name.strip().rsplit(".", 1)
          if len(stem_and_ext) != 2:
              continue
          stem, ext = stem_and_ext
          if ext == snd_ext:
              snd_stems.add(stem)
          elif ext == txt_ext:
              txt_stems.add(stem)
      return sorted(snd_stems & txt_stems)


  def write_kaldi_files(srcdir, dest, snd_ext, txt_ext):
      '''
      Write the files "text", "wav.scp" and "utt2spk" into dest, one line per
      utterance found by find_parallel_stems(srcdir, snd_ext, txt_ext).

      The filename stem is used as the utterance id, and the part of the stem
      before the first underscore is used as the speaker id. All files are read
      and written as UTF-8. Raises IOError/OSError if srcdir cannot be listed or
      a file cannot be opened.
      '''
      stems = find_parallel_stems(srcdir, snd_ext, txt_ext)

      text_path = os.path.join(dest, "text")
      wavscp_path = os.path.join(dest, "wav.scp")
      utt2spk_path = os.path.join(dest, "utt2spk")

      # "with" closes the output files even if reading a transcript fails.
      with codecs.open(text_path, "w", "utf8") as text, \
              codecs.open(wavscp_path, "w", "utf8") as wavscp, \
              codecs.open(utt2spk_path, "w", "utf8") as utt2spk:
          for uttid in stems:
              txt_path = os.path.join(srcdir, uttid + "." + txt_ext)
              with codecs.open(txt_path, "r", "utf8") as fin:
                  utt = fin.read()
              # Drop the transcript's own line ending(s) and add exactly one,
              # so a file without a trailing newline (or with CRLF / extra
              # blank lines) still yields one well-formed line in "text".
              text.write(uttid + " " + utt.rstrip("\r\n") + "\n")

              spkid = uttid.split("_", 1)[0]
              snd_path = os.path.join(srcdir, uttid + "." + snd_ext)
              wavscp.write(uttid + " " + snd_path + "\n")
              utt2spk.write(uttid + " " + spkid + "\n")


  def main(argv):
      '''
      Command-line entry point; argv is the full argument vector (sys.argv).

      Returns the process exit status: 0 on success, 1 if arguments are missing.
      '''
      if len(argv) < 5:
          sys.stderr.write("Usage: %s <srcdir> <dest> <snd_ext> <txt_ext>\n" % argv[0])
          return 1
      write_kaldi_files(argv[1], argv[2], argv[3], argv[4])
      return 0


  if __name__ == "__main__":
      sys.exit(main(sys.argv))