csjconnect.pl
1.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env perl
use warnings;
# Copyright 2015 Tokyo Institute of Technology (Authors: Takafumi Moriya and Takahiro Shinozaki)
# 2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
# Apache 2.0
# Acknowledgement This work was supported by JSPS KAKENHI Grant Number 26280055.
# Connects CSJ segments to each moderate length utterance units by csj2kaldi4m.pl.
use utf8;
use open IO => ":utf8";
use open ":std";
if (@ARGV != 4) {
die "$0 gap maxlen file spk_id\n";
}
$gap = $ARGV[0];
$maxlen = $ARGV[1];
$file = $ARGV[2];
$spk = $ARGV[3];
if ($file eq '-') {
$in = STDIN;
} else {
open($in, $file) || die "$! : $file\n";
}
$psgid = -1;
$pspk_id = "";
$pend = 0;
$line = "";
while (<$in>) {
chomp;
if (! /(\d+) ([\d\.]+)-([\d\.]+) (.):-\d+-\d+ (.+)/) {
die "Unexpected format: $_\n";
}
$sgid = $1;
$start = $2;
$end = $3;
$pch = $4;
$wpp = $5;
if ( $spk =~ /^D/ ){
$ch = "\-$pch";
} else {
$ch = "";
}
$spk_id = "$spk$ch";
if ($psgid == -1) {
$ostart = $start;
$osgid = $sgid;
$ospk_id = $spk_id;
$line = "$wpp ";
} elsif ($psgid eq $sgid && $pspk_id eq $spk_id) {
$line .= "$wpp ";
} else {
if ($gap < $start - $pend || $maxlen < $pend - $ostart || $ospk_id ne $spk_id ) {
if ($opt_t) {
print "$osgid $ostart $pend\n";
} else {
unless($line=~ /\×/){
print "$ospk_id\_$osgid $ostart $pend <s> $line</s>\n";
}
}
$ostart = $start;
$osgid = $sgid;
$ospk_id = $spk_id;
$line = "$wpp ";
} else {
$line .= "<sp> $wpp ";
}
}
$psgid = $sgid;
$pspk_id = $spk_id;
$pend = $end;
}
if ($line ne "") {
if ($opt_t) {
print "$osgid $ostart $end\n";
} else {
unless($line =~ /\×/){
print "$ospk_id\_$osgid $ostart $end <s> $line</s>\n";
}
}
}