Blame view
egs/sprakbanken/s5/local/parallel2kaldi.py
1.3 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
#!/usr/bin/env python
'''
Create the Kaldi data files "text", "wav.scp" and "utt2spk" from a directory
of parallel sound/transcript files.

This script assumes that the parallel files have the same filename with
different extensions and you must specify the absolute path to the corpus
from the root. The text files may only contain a single line of text.

Usage: parallel2kaldi.py <srcdir> <dest> <snd_ext> <txt_ext>
'''

import codecs
import os
import sys


def find_stems(filenames, snd_ext, txt_ext):
    '''
    Return the sorted list of filename stems present with both extensions.

    filenames -- iterable of bare file names (e.g. the result of os.listdir)
    snd_ext   -- extension of the sound files, without the leading dot
    txt_ext   -- extension of the transcript files, without the leading dot

    Names without a "." are ignored. Only the last "." separates the stem
    from the extension.
    '''
    snd_stems = set()
    txt_stems = set()
    for name in filenames:
        stem, dot, ext = name.rpartition(".")
        if not dot:
            continue
        if ext == snd_ext:
            snd_stems.add(stem)
        elif ext == txt_ext:
            txt_stems.add(stem)
    # Only utterances that have both a sound file and a transcript are usable.
    return sorted(snd_stems & txt_stems)


def convert(srcdir, dest, snd_ext, txt_ext):
    '''
    Write "text", "wav.scp" and "utt2spk" into dest for the corpus in srcdir.

    srcdir  -- directory holding the parallel sound and transcript files
    dest    -- existing directory that receives the three output files
    snd_ext -- extension of the sound files, without the leading dot
    txt_ext -- extension of the transcript files, without the leading dot

    The filename stem is used as utterance id and the part of the stem before
    the first "_" as speaker id. Returns the number of utterances written.
    Errors from listing, opening or decoding files propagate to the caller.
    '''
    # List the corpus before creating the output files so that they are not
    # picked up when dest and srcdir are the same directory.
    stems = find_stems(os.listdir(srcdir), snd_ext, txt_ext)

    with codecs.open(os.path.join(dest, "text"), "w", "utf8") as text, \
            codecs.open(os.path.join(dest, "wav.scp"), "w", "utf8") as wavscp, \
            codecs.open(os.path.join(dest, "utt2spk"), "w", "utf8") as utt2spk:
        for uttid in stems:
            txt_path = os.path.join(srcdir, uttid + "." + txt_ext)
            with codecs.open(txt_path, "r", "utf8") as fin:
                # Drop the transcript's own line ending (if any) so that each
                # record is terminated by exactly one newline below.
                transcript = fin.read().rstrip("\r\n")
            spkid = uttid.split("_", 1)[0]
            snd_path = os.path.join(srcdir, uttid + "." + snd_ext)

            # One record per line in every output file.
            text.write(uttid + " " + transcript + "\n")
            wavscp.write(uttid + " " + snd_path + "\n")
            utt2spk.write(uttid + " " + spkid + "\n")
    return len(stems)


def main(argv=None):
    '''
    Command-line entry point; returns the process exit status.

    argv -- argument vector including the program name; defaults to sys.argv
    '''
    if argv is None:
        argv = sys.argv
    if len(argv) < 5:
        sys.stderr.write(
            "Usage: " + os.path.basename(argv[0] if argv else "parallel2kaldi.py")
            + " <srcdir> <dest> <snd_ext> <txt_ext>\n")
        return 1
    convert(argv[1], argv[2], argv[3], argv[4])
    return 0


if __name__ == "__main__":
    sys.exit(main())