gen_UBM_list.pl
2.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/perl
# Perl program to generate data/data.lst and lst/UBM.lst
# using all speech segments in spkDiarization/data/sph/
#
# Author: Xavier Bost
# email: xavier.bost@univ-avignon.fr
#
# Synopsis:
#
# Source files: spkDiarization/data/sph/*.sph
#
# Retained information is written to:
# spkDiarization/data/data.lst
# spkDiarization/lst/UBM.lst
use strict;
use List::Util qw(min max);
# Build the list of speech segments used for UBM training.
#
# Every file in $dir named <episode>_<start>_<end>.sph is one segment.
# Segments are then reduced in two passes:
#   1. among segments of an episode sharing the same start, only the one
#      with the largest end is kept;
#   2. among the remaining segments sharing the same end, only the one
#      with the smallest start is kept.
# The surviving "<episode>_<start>_<end>" names (without extension) are
# written, one per line, to both output lists.

my $dir      = "spkDiarization/data/sph/";       # directory containing the segments
my $data_lst = "spkDiarization/data/data.lst";   # first output list
my $ubm_lst  = "spkDiarization/lst/UBM.lst";     # second output list (same content)

# Segment file name: episode name, segment beginning, segment end.
# The same anchored pattern is used for filtering and for parsing, so a
# name can never pass the filter and then fail to parse.
my $segment_re = qr/(.+)_(\d+)_(\d+)\.sph$/;

my %ends   = ();   # $ends{episode}{start}{end}   : ends seen for each start
my %starts = ();   # $starts{episode}{end}{start} : starts seen for each end
my %index  = ();   # $index{episode}{start}{end}  : segments still retained

# Fail loudly rather than silently producing empty lists.
opendir(my $dh, $dir) or die "Cannot open directory $dir: $!\n";
my @files = grep { $_ =~ $segment_re } readdir $dh;
closedir($dh);

# looping over files, i.e. speech segments
foreach my $file (sort @files) {
    # Never index a segment from stale capture variables.
    next unless $file =~ $segment_re;
    my ($episode, $start, $end) = ($1, $2, $3);

    $ends{$episode}{$start}{$end}   = 1;
    $starts{$episode}{$end}{$start} = 1;
    $index{$episode}{$start}{$end}  = 1;
}

# reducing segments with multiple ends
foreach my $episode (sort keys %ends) {
    foreach my $start (sort { $a <=> $b } keys %{ $ends{$episode} }) {
        my @seg_ends = keys %{ $ends{$episode}{$start} };
        my $max_end  = max(@seg_ends);

        foreach my $end (@seg_ends) {
            next if $end == $max_end;
            delete $index{$episode}{$start}{$end};
            # %starts is keyed {episode}{end}{start}: drop the discarded
            # segment there too, so that it no longer takes part in the
            # minimum-start computation of the next pass.
            delete $starts{$episode}{$end}{$start};
        }
    }
}

# reducing segments with multiple starts
foreach my $episode (sort keys %starts) {
    foreach my $end (sort { $a <=> $b } keys %{ $starts{$episode} }) {
        my @seg_starts = keys %{ $starts{$episode}{$end} };
        # Every segment ending here may have been discarded by the previous pass.
        next unless @seg_starts;
        my $min_start = min(@seg_starts);

        foreach my $start (@seg_starts) {
            next if $start == $min_start;
            delete $index{$episode}{$start}{$end};
        }
    }
}

# writing out lists of speech segments
open(my $out1, '>', $data_lst) or die "Cannot write $data_lst: $!\n";
open(my $out2, '>', $ubm_lst)  or die "Cannot write $ubm_lst: $!\n";

foreach my $episode (sort keys %index) {
    foreach my $start (sort { $a <=> $b } keys %{ $index{$episode} }) {
        foreach my $end (sort { $a <=> $b } keys %{ $index{$episode}{$start} }) {
            my $output = $episode."_".$start."_".$end."\n";
            print {$out1} $output;
            print {$out2} $output;
        }
    }
}

# Buffered write errors only surface at close time.
close($out2) or die "Cannot close $ubm_lst: $!\n";
close($out1) or die "Cannot close $data_lst: $!\n";