Blame view

egs/fisher_callhome_spanish/s5/local/fsp_make_trans.pl 2.56 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
  #!/usr/bin/env perl
  # Copyright 2014  Gaurav Kumar.   Apache 2.0
  
  use utf8;
  use File::Basename;
  # Convert Fisher Spanish .tdf transcripts into Kaldi-style data files.
  #
  # Usage: fsp_make_trans.pl <tmpdir>
  #
  # Reads:
  #   <tmpdir>/train_transcripts.flist  - one .tdf transcript path per line
  # Writes:
  #   <tmpdir>/reco2file_and_channel    - "<call>-A <call> A" / "<call>-B <call> B",
  #                                       piped through sort(1)
  #   <tmpdir>/text.1                   - "<utt_id> <normalized words>" per utterance
  #   <tmpdir>/spk2gendertmp            - "<speaker_id> m|f" per utterance
  #
  # Dies on any unreadable input, malformed file name, over-long end time,
  # or failed write/close.
  use strict;
  use warnings;

  # Normalize the transcript text of one utterance.
  #
  # Strips punctuation while keeping the "<", ">" and "</" markup delimiters
  # (they are swapped for placeholder words during the strip), removes a few
  # stray diacritic/spacing marks and lowercases the result.
  #
  # Takes the raw words string (column 8 of a .tdf row) and returns the
  # normalized string.
  sub _normalize_words {
    my ($words) = @_;

    # Hide markup delimiters from the punctuation strip; "</" must be
    # replaced first so it is not consumed by the plain "<" rule.
    $words =~ s:</:lendarrow:g;
    $words =~ s/</larrow/g;
    $words =~ s/>/rarrow/g;
    $words =~ s/[[:punct:]]//g;
    $words =~ s/larrow/</g;
    $words =~ s/rarrow/>/g;
    $words =~ s:lendarrow:</:g;

    $words =~ s/Á/á/g;
    $words =~ s/Í/í/g;
    $words =~ s/Ó/ó/g;
    $words =~ s/Ú/ú/g;
  #      $words =~ s/ì/í/g;
  #      $words =~ s/è/é/g;
  #      $words =~ s/¡/i/g;
  #      $words =~ s/J/J/g;
  #      $words =~ s/S/S/g;
  #      $words =~ s/à/á/g;
    $words =~ s/¨//g;
    $words =~ s/·//g;
    $words =~ s/´//g;
    # NOTE(review): as written this is subsumed by lc() below; it may have
    # been an accented capital originally - confirm against upstream.
    $words =~ s/N/n/g;
  #      $words =~ s/2//g;
    $words = lc($words);
  #      $words =~ s:ü([eiéí]):w\1:g;
  #      $words =~ s:ü:u:g;
  #      $words =~ s:ñ:N:g;

    return $words;
  }

  my ($tmpdir) = @ARGV;
  defined $tmpdir or die "Usage: $0 <tmpdir>\n";
  #$tmpdir='../data/local/tmp';
  my $trans = "$tmpdir/train_transcripts.flist";
  my $reco  = "$tmpdir/reco2file_and_channel";

  open(my $flist_fh, '<', $trans)
    or die "Can't open transcripts file: $!";
  # List-form pipe open: no shell is involved, so $reco is passed verbatim.
  open(my $reco_fh, '|-', 'sort', '-o', $reco)
    or die "Can't open reco2file_and_channel file $!";
  open(my $text_fh, '>:utf8', "$tmpdir/text.1")
    or die "Can't open text file for writing: $!";
  open(my $gender_fh, '>', "$tmpdir/spk2gendertmp")
    or die "Can't open the speaker to gender map file: $!";

  while (my $file = <$flist_fh>) {
    # Three-arg open takes the name literally, so drop the line terminator
    # and surrounding whitespace that two-arg open used to strip implicitly.
    $file =~ s/^\s+|\s+$//g;
    $file =~ m:([^/]+)\.tdf: or die "Bad filename $file";
    my $call_id = $1;

    print {$reco_fh} "$call_id-A $call_id A\n"
      or die "Error writing to reco2file_and_channel file";
    print {$reco_fh} "$call_id-B $call_id B\n"
      or die "Error writing to reco2file_and_channel file";

    open(my $tdf_fh, '<:utf8', $file) or die "Opening file $file: $!";

    # Get rid of header sections first (the first three lines).
    for (1 .. 3) {
      my $discarded = <$tdf_fh>;
    }

    # Now read each line and extract information
    while (my $line = <$tdf_fh>) {
      #20051017_215732_274_fsp.sph     1       0.0     0.909856781803  Audrey  female  native   <foreign lang="English"> Audrey </foreign>     0       0       -1
      chomp $line;
      my @fields = split /\t/, $line;

      # Rows with fewer than 11 tab-separated columns are skipped.
      next if @fields < 11;

      # Times are seconds; ids carry them as zero-padded centiseconds.
      my $start = sprintf("%06d", $fields[2] * 100);
      my $end   = sprintf("%06d", $fields[3] * 100);
      length($end) > 6 and die "Time too long $end in $file";

      my $side       = $fields[1] ? "B" : "A";
      my $utt_id     = "${call_id}-$side-$start-$end";
      my $speaker_id = "${call_id}-$side";

      # String comparison: "==" would numify both sides to 0 and always match.
      my $gender = $fields[5] eq "female" ? "f" : "m";

      # "or" (not "||") so the check applies to print's return value.
      print {$gender_fh} "$speaker_id $gender\n"
        or die "Error writing to speaker2gender file";

      my $words = _normalize_words($fields[7]);
      print {$text_fh} "$utt_id $words\n"
        or die "Error writing to text file";
    }
    close($tdf_fh);
  }

  close($flist_fh);
  # Closing the pipe waits for sort(1); a failure here means the sorted
  # output is missing or incomplete.
  close($reco_fh)   or die "sort to $reco failed (status $?): $!";
  close($text_fh)   or die "Error closing text file: $!";
  close($gender_fh) or die "Error closing speaker2gender file: $!";