split_data_enroll_eval.py 842 Bytes
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37


#!/usr/bin/env python3

# Copyright 2017 Bengu Wu
# Apache 2.0.

# This script splits the test set utt2spk into enroll set and eval set
# For each speaker, 3 utterances are randomly selected as enroll samples,
# and the others are used as eval samples for evaluation
# input: test utt2spk
# output: enroll utt2spk, eval utt2spk

import sys,random

dictutt = {}

for line in open(sys.argv[1]):
  line = line.rstrip('\r\t\n ')
  utt, spk = line.split(' ')
  if spk not in dictutt:
    dictutt[spk] = []
  dictutt[spk].append(utt)

fenroll = open(sys.argv[2], 'w')
feval = open(sys.argv[3], 'w')

for key in dictutt:
  utts = dictutt[key]
  random.shuffle(utts)
  for i in range(0, len(utts)):
    line = utts[i] + ' ' + key
    if(i < 3):
      fenroll.write(line + '\n')
    else:
      feval.write(line + '\n')

fenroll.close()
feval.close()