Blame view

egs/lre07/v1/local/make_lre05.pl 3.62 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
  #!/usr/bin/env perl
  #
  # Copyright 2014  David Snyder
  
  # Expect exactly two positional arguments: the path to LDC2008S05 and the
  # base directory under which the output data directories are created.
  if (@ARGV != 2) {
    # Use "\n" escapes rather than raw line breaks inside the literals, so
    # the indentation of this source file does not leak into the message.
    print STDERR "Usage: $0 <path-to-LDC2008S05> <output-dir>\n";
    print STDERR "e.g. $0 /export/corpora5/LDC/LDC2008S05 data\n";
    exit(1);
  }

  ($db_base, $out_base_dir) = @ARGV;
  
  
  # This is the Indian English part of the corpus; we prepare it first.
  $db_ie = $db_base . "/data/lid05d1/";
  $key = $db_ie . "key.txt";
  $out_dir = $out_base_dir . "/lid05d1/";
  # List-form system() bypasses the shell, so an output path containing
  # spaces or shell metacharacters reaches mkdir as a single argument.
  if (system("mkdir", "-p", $out_dir) != 0) {
    die "Error making directory $out_dir";
  }

  # Three-argument open() keeps the mode separate from the path, and $!
  # reports why the operating system refused the request.
  open(WAV, ">", $out_dir . '/wav.scp')
    || die "Failed opening output file $out_dir/wav.scp: $!";
  open(UTT2LANG, ">", $out_dir . '/utt2lang')
    || die "Failed opening output file $out_dir/utt2lang: $!";
  open(UTT2SPK, ">", $out_dir . '/utt2spk')
    || die "Failed opening output file $out_dir/utt2spk: $!";

  open(KEY, "<", $key)
    || die "Failed opening input file $key: $!";
  
  # Read the key file line by line and write one wav.scp / utt2spk / utt2lang
  # record for every listed audio file that exists on disk.
  while (my $line = <KEY>) {
    chomp($line);
    # Skip comments (any line containing "#") and blank lines.  A blank line
    # would otherwise leave the language field undefined and trip the
    # Indian-English check below with a misleading message.
    next if index($line, "#") != -1;
    next if $line !~ /\S/;

    # Columns 1, 2 and 4 (file path, language, channel) are the only ones used.
    my ($fi, $lang, undef, $channel) = split(" ", $line);
    # Verify that we have only Indian English.
    if ($lang ne "IE") {
      die "$db_ie contains non-Indian English utterances.";
    }
    # The third "/"-separated component of $fi is the file name; the
    # utterance name is the part of it before the first dot.
    my (undef, undef, $utt_fi) = split("/", $fi);
    my ($utt) = split("[.]", $utt_fi);
    # This part of the corpus is only english.indian.
    my $uttId = "lid05d1_$utt";
    my $wav = $db_ie . $fi;
    if (! -f $wav) {
      print STDERR "No such file $wav (skipping)\n";
      next;
    }
    # Translate channel letters A/B into the 1/2 given to "sph2pipe -c".
    $channel =~ tr/AB/12/;
    # Records end with a "\n" escape: a raw line break inside the literal
    # would also emit this file's indentation in front of the next record.
    print WAV "$uttId sph2pipe -f wav -p -c $channel $wav |\n";
    print UTT2SPK "$uttId $uttId\n";
    print UTT2LANG "$uttId english.indian\n";
  }
  # Buffered write errors can surface at close(), so each output handle is
  # checked and the failure is reported with the file name and OS reason.
  close(WAV) || die "Failed closing $out_dir/wav.scp: $!";
  close(UTT2SPK) || die "Failed closing $out_dir/utt2spk: $!";
  close(UTT2LANG) || die "Failed closing $out_dir/utt2lang: $!";
  # The key file has been fully read; release its handle.
  close(KEY);

  # The exit status of fix_data_dir.sh used to be discarded silently.  Report
  # a failure but keep going: the validation step below decides success.
  # List-form system() passes $out_dir as one argument without a shell.
  if (system("utils/fix_data_dir.sh", $out_dir) != 0) {
    print STDERR "Warning: utils/fix_data_dir.sh failed on $out_dir\n";
  }
  (system("utils/validate_data_dir.sh", "--no-text", "--no-feats", $out_dir) == 0)
    || die "Error validating data dir.";
  
  # Second part of the corpus: lid05e1, described by lid05e_key_v2.txt.
  $out_dir = $out_base_dir . "/lid05e1/";
  $db_dir = $db_base . "/data/lid05e1/";

  $key = $db_dir . "lid05e_key_v2.txt";
  # Three-argument open() keeps the mode separate from the path, and $!
  # reports why the operating system refused the request.
  open(KEY, "<", $key)
    || die "Failed opening input file $key: $!";

  # List-form system() bypasses the shell, so an output path containing
  # spaces or shell metacharacters reaches mkdir as a single argument.
  if (system("mkdir", "-p", $out_dir) != 0) {
    die "Error making directory $out_dir";
  }

  open(WAV, ">", $out_dir . '/wav.scp')
    || die "Failed opening output file $out_dir/wav.scp: $!";
  open(UTT2LANG, ">", $out_dir . '/utt2lang')
    || die "Failed opening output file $out_dir/utt2lang: $!";
  open(UTT2SPK, ">", $out_dir . '/utt2spk')
    || die "Failed opening output file $out_dir/utt2spk: $!";
  open(SPK2GEN, ">", $out_dir . '/spk2gender')
    || die "Failed opening output file $out_dir/spk2gender: $!";
  
  # Read the key file line by line and write one wav.scp / utt2spk /
  # spk2gender / utt2lang record for every segment whose audio file exists.
  while (my $line = <KEY>) {
    chomp($line);
    # Skip comments (any line containing "#") and blank lines; a blank line
    # would otherwise be reported as a missing file with an empty name.
    next if index($line, "#") != -1;
    next if $line !~ /\S/;

    # Columns used: 1 segment id, 2 language, 3 dialect, 5 channel,
    # 7 duration, 9 gender.  The remaining columns are ignored.
    my ($seg_id, $lang, $dialect, undef, $channel,
        undef, $dur, undef, $gender) = split(" ", $line);
    my $wav = "$db_dir/test/${dur}/${seg_id}.sph";
    if (! -f $wav) {
      print STDERR "No such file $wav (skipping this utterance)\n";
      next;
    }

    # The language label is lower-cased; a dialect other than "na" is
    # appended after a dot.
    $lang = lc $lang;
    $dialect = lc $dialect;
    my $full_lang = ($dialect ne "na") ? "$lang.$dialect" : $lang;

    $gender = lc $gender;
    # Defaulting to male if the gender info is missing.
    if ($gender ne 'm' && $gender ne 'f') {
      $gender = 'm';
    }

    my $uttId = "lid05e1_" . $seg_id;
    # Translate channel letters A/B into the 1/2 given to "sph2pipe -c".
    $channel =~ tr/AB/12/;

    # Records end with a "\n" escape: a raw line break inside the literal
    # would also emit this file's indentation in front of the next record.
    print WAV "$uttId sph2pipe -f wav -p -c ${channel} $wav |\n";
    print UTT2SPK "$uttId $uttId\n";
    print SPK2GEN "$uttId $gender\n";
    print UTT2LANG "$uttId $full_lang\n";
  }
  
  # Buffered write errors can surface at close(), so each output handle is
  # checked and the failure is reported with the file name and OS reason.
  close(WAV) || die "Failed closing $out_dir/wav.scp: $!";
  close(UTT2SPK) || die "Failed closing $out_dir/utt2spk: $!";
  close(UTT2LANG) || die "Failed closing $out_dir/utt2lang: $!";
  close(SPK2GEN) || die "Failed closing $out_dir/spk2gender: $!";
  # The key file has been fully read; release its handle.
  close(KEY);

  # The exit status of fix_data_dir.sh used to be discarded silently.  Report
  # a failure but keep going: the validation step below decides success.
  # List-form system() passes $out_dir as one argument without a shell.
  if (system("utils/fix_data_dir.sh", $out_dir) != 0) {
    print STDERR "Warning: utils/fix_data_dir.sh failed on $out_dir\n";
  }
  (system("utils/validate_data_dir.sh", "--no-text", "--no-feats", $out_dir) == 0)
    || die "Error validating data dir.";