Blame view

spkDiarization/scripts/gen_UBM_list.pl 2.37 KB
3f2992b2c   bostx   V1.0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  #!/usr/bin/perl
  
  # Perl program to generate data/data.lst and lst/UBM.lst
  # using all speech segments in spkDiarization/data/sph/
  #
  # Author: Xavier Bost
  # email: xavier.bost@univ-avignon.fr
  #
  # Synopsis:
  #
  # Source files: spkDiarization/data/sph/*.sph
  #
  # Retained information is written to:
  #   spkDiarization/data/data.lst
  #   spkDiarization/lst/UBM.lst
  
  use strict;
  use warnings;
  use List::Util qw(min max);
  
  # File-scope state shared by the processing stages below.
  my $dir;                   # directory scanned for *.sph segment files
  my $file;                  # current file
  my $episode;               # episode name (file-name prefix)
  my $start;                 # current segment beginning
  my $end;                   # current segment end
  my $maxEnd;                # largest end among segments sharing a start
  my $minStart;              # smallest start among segments sharing an end
  my %starts = ();           # $starts{episode}{end}{start}: segment beginnings per end
  my %ends = ();             # $ends{episode}{start}{end}: segment ends per start
  my %index = ();            # $index{episode}{start}{end}: segments still retained
  my $output;                # line written to the output lists
  
  # Collect every speech segment found in the corpus directory.
  # Segment files are named <episode>_<start>_<end>.sph. Each one is recorded
  # in three lookup tables: %ends (ends per start), %starts (starts per end)
  # and %index (the segments that the reduction passes prune afterwards).
  $dir = "spkDiarization/data/sph/";
  opendir(my $sph_dh, $dir) or die "Cannot open directory $dir: $!";
  # Anchor the pattern so that names such as "x_1_2.sph.bak" are not selected.
  my @files = grep { /\A.+_\d+_\d+\.sph\z/ } readdir $sph_dh;
  closedir($sph_dh);
  
  # looping over files, i.e. speech segments
  foreach my $segment_file (sort @files) {
      # Skip names that do not parse, so that values captured from a previous
      # file are never recorded a second time under the wrong name.
      next unless $segment_file =~ /\A(.+)_(\d+)_(\d+)\.sph\z/;
      my ($seg_episode, $seg_start, $seg_end) = ($1, $2, $3);
  
      $ends{$seg_episode}{$seg_start}{$seg_end} = 1;
      $starts{$seg_episode}{$seg_end}{$seg_start} = 1;
      $index{$seg_episode}{$seg_start}{$seg_end} = 1;
  }
  
  # reducing segments with multiple ends:
  # when several segments of an episode share the same start, only the one
  # with the largest end is kept.
  foreach my $ep (sort keys %ends) {
      foreach my $seg_start (sort { $a <=> $b } keys %{ $ends{$ep} }) {
          my $longest_end = max(keys %{ $ends{$ep}{$seg_start} });
          foreach my $seg_end (sort { $a <=> $b } keys %{ $ends{$ep}{$seg_start} }) {
              next if $seg_end == $longest_end;
              delete $index{$ep}{$seg_start}{$seg_end};
              # %starts is keyed {episode}{end}{start}. Using that key order
              # really removes the dropped segment, so it no longer takes
              # part in the minimum-start computation of the next pass.
              delete $starts{$ep}{$seg_end}{$seg_start};
          }
      }
  }
  
  # reducing segments with multiple starts:
  # when several segments of an episode share the same end, every segment
  # except the one with the smallest start is removed from %index.
  for my $ep (sort keys %starts) {
      for my $seg_end (sort { $a <=> $b } keys %{ $starts{$ep} }) {
          my @candidate_starts = sort { $a <=> $b } keys %{ $starts{$ep}{$seg_end} };
          next unless @candidate_starts;    # no start recorded for this end
          my $earliest = min(@candidate_starts);
          delete $index{$ep}{$_}{$seg_end}
              for grep { $_ != $earliest } @candidate_starts;
      }
  }
  
  # writing out lists of speech segments:
  # every segment left in %index is written as "<episode>_<start>_<end>",
  # one per line, to both list files. Episodes are sorted as strings and the
  # start/end values numerically.
  my $data_list_path = "spkDiarization/data/data.lst";
  my $ubm_list_path  = "spkDiarization/lst/UBM.lst";
  open(my $data_fh, '>', $data_list_path)
      or die "Cannot open $data_list_path for writing: $!";
  open(my $ubm_fh, '>', $ubm_list_path)
      or die "Cannot open $ubm_list_path for writing: $!";
  
  foreach my $ep (sort keys %index) {
      foreach my $seg_start (sort { $a <=> $b } keys %{ $index{$ep} }) {
          foreach my $seg_end (sort { $a <=> $b } keys %{ $index{$ep}{$seg_start} }) {
              my $entry = join("_", $ep, $seg_start, $seg_end) . "\n";
              print {$data_fh} $entry;
              print {$ubm_fh} $entry;
          }
      }
  }
  
  # Buffered write errors only surface at close time, so check it.
  close($ubm_fh)  or die "Cannot close $ubm_list_path: $!";
  close($data_fh) or die "Cannot close $data_list_path: $!";