gen_UBM_list.pl 2.37 KB
#!/usr/bin/perl

# Perl program to generate data/data.lst and lst/UBM.lst
# using all speech segments in spkDiarization/data/sph/
#
# Author: Xavier Bost
# email: xavier.bost@univ-avignon.fr
#
# Synopsis:
#
# Source files: spkDiarization/data/sph/*.sph
#
# Retained information is written to:
#   spkDiarization/data/data.lst
#   spkDiarization/lst/UBM.lst

use strict;
use List::Util qw(min max);

# File-level state shared by the processing stages below.
my $line;                  # current line (not referenced in the visible script)
my $dir;                   # directory containing the .sph speech segments
my $file;                  # current file
my $episode;               # episode name
my $base_name;             # file base name (not referenced in the visible script)
my $start;                 # current segment beginning
my $end;                   # current segment end
my $maxEnd;                # largest end recorded for a given start
my $minStart;              # smallest start recorded for a given end
my %starts = ();           # $starts{episode}{end}{start}: starts seen for each end
my %ends = ();             # $ends{episode}{start}{end}: ends seen for each start
my %index = ();            # $index{episode}{start}{end}: segments kept for output
my $output;                # output line being written

$dir = "spkDiarization/data/sph/";

# Abort if the directory cannot be read: otherwise readdir would yield
# nothing and the script would go on to overwrite both output lists with
# empty files.
opendir(my $dh, $dir) or die "Cannot open directory $dir: $!\n";

# Keep only names of the form <episode>_<start>_<end>.sph.  The pattern is
# anchored at the end so that it accepts exactly the names the parsing loop
# below is able to split.
my @files = grep { /.+_\d+_\d+\.sph$/ } readdir $dh;
closedir($dh);

# looping over files, i.e. speech segments
#
# Each file name <episode>_<start>_<end>.sph is split into its three parts
# and recorded in three lookup tables:
#   %ends   {episode}{start}{end}  - every end observed for a given start
#   %starts {episode}{end}{start}  - every start observed for a given end
#   %index  {episode}{start}{end}  - segments still candidates for output
foreach my $file (sort @files) {

    # Skip any name that cannot be parsed instead of recording a segment
    # built from the previous file's (or undefined) episode/start/end.
    my ($episode, $start, $end) = $file =~ /(.+)_(\d+)_(\d+)\.sph$/
        or next;

    $ends{$episode}{$start}{$end} = 1;
    $starts{$episode}{$end}{$start} = 1;
    $index{$episode}{$start}{$end} = 1;
}

# reducing segments with multiple ends
#
# When several segments of an episode share the same start, only the one
# with the largest end is kept; the others are removed from %index and
# from %starts.
foreach my $episode (sort keys %ends) {
    foreach my $start (sort {$a<=>$b} keys %{$ends{$episode}}) {
        my @seg_ends = keys %{$ends{$episode}{$start}};
        my $maxEnd = max(@seg_ends);

        foreach my $end (@seg_ends) {
            next if $end == $maxEnd;

            delete $index{$episode}{$start}{$end};

            # %starts is keyed {episode}{end}{start}.  Deleting with the
            # keys in {start}{end} order would leave the discarded segment
            # in place (and autovivify a spurious entry), so that it would
            # still be considered when the smallest start per end is chosen.
            delete $starts{$episode}{$end}{$start};

            # Drop the end entry altogether once no start refers to it.
            delete $starts{$episode}{$end}
                unless %{$starts{$episode}{$end}};
        }
    }
}

# reducing segments with multiple starts
#
# For every end recorded in %starts, the numerically smallest start is
# retained; every other start sharing that end is removed from %index.
for my $ep (sort keys %starts) {
    my $starts_by_end = $starts{$ep};

    for my $seg_end (sort { $a <=> $b } keys %$starts_by_end) {
        my @candidates = keys %{ $starts_by_end->{$seg_end} };
        my $earliest   = min(@candidates);

        # An end with no recorded start yields an empty candidate list,
        # so nothing is compared or deleted in that case.
        for my $seg_start (grep { $_ != $earliest } @candidates) {
            delete $index{$ep}{$seg_start}{$seg_end};
        }
    }
}

# writing out lists of speech segments
#
# Every segment remaining in %index is written as "<episode>_<start>_<end>",
# one per line, to both output lists.  Episodes are visited in string order,
# starts and ends in numeric order.
my $data_lst = "spkDiarization/data/data.lst";
my $ubm_lst  = "spkDiarization/lst/UBM.lst";

# Three-argument open with lexical handles; stop immediately if either list
# cannot be created rather than printing to an unopened handle.
open(my $out1, '>', $data_lst) or die "Cannot open $data_lst for writing: $!\n";
open(my $out2, '>', $ubm_lst)  or die "Cannot open $ubm_lst for writing: $!\n";

foreach my $episode (sort keys %index) {
    foreach my $start (sort {$a<=>$b} keys %{$index{$episode}}) {
        foreach my $end (sort {$a<=>$b} keys %{$index{$episode}{$start}}) {
            my $output = $episode."_".$start."_".$end."\n";
            print {$out1} $output;
            print {$out2} $output;
        }
    }
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close($out2) or die "Error closing $ubm_lst: $!\n";
close($out1) or die "Error closing $data_lst: $!\n";