Blame view

egs/farsdat/s5/local/farsdat_data_prep.sh 6.29 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
  #!/bin/bash
  
  #
  # Copyright 2014 University of Tehran (Author: Bagher BabaAli)
  #           2014 Brno University of Technology (Karel Vesely)
  #           2014 Johns Hopkins University (Daniel Povey)
  #
  # farsdat, description of the database:
  # http://www.assta.org/sst/SST-94-Vol-ll/cache/SST-94-VOL2-Chapter15-p20.pdf
  
  # Exactly one argument is expected: the root directory of the farsdat corpus.
  case $# in
    1) ;;
    *)
      echo "Argument should be the farsdat directory, see ../run.sh for example."
      exit 1
      ;;
  esac
  
  # Working directories, all anchored at the directory the recipe is run from.
  dir=$(pwd)/data/local/data
  lmdir=$(pwd)/data/local/nist_lm
  mkdir -p "$dir" "$lmdir" || exit 1
  local=$(pwd)/local
  utils=$(pwd)/utils
  conf=$(pwd)/conf

  . ./path.sh # Needed for KALDI_ROOT
  export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin

  # All three speaker lists must be present before any data is touched.
  # Failures are reported inline because neither an error_exit helper nor a
  # PROG variable is defined in this file; $0 names the script in the message.
  [ -f "$conf/test_spk.list" ] ||
    { echo "$0: Eval-set speaker list not found." >&2; exit 1; }
  [ -f "$conf/dev_spk.list" ] ||
    { echo "$0: dev-set speaker list not found." >&2; exit 1; }
  [ -f "$conf/train_spk.list" ] ||
    { echo "$0: train-set speaker list not found." >&2; exit 1; }
  
  # Spot-check the command line argument: the corpus root must contain both CD
  # directories, spelled either upper-case (CD1, CD2) or lower-case (cd1, cd2).
  # "$1" is quoted so that a corpus path containing spaces is tested as one word.
  if { [ ! -d "$1/CD1" ] || [ ! -d "$1/CD2" ]; } &&
     { [ ! -d "$1/cd1" ] || [ ! -d "$1/cd2" ]; }; then
    echo "farsdat_data_prep.sh: Spot check of command line argument failed"
    echo "Command line argument must be absolute pathname to Farsdat directory"
    echo "with name like /export/corpora5/ELRA/farsdat"
    exit 1;
  fi

  # Remember which spelling this copy of the corpus uses; lower-case is the
  # default and is overridden when an upper-case CD1 directory is present.
  uppercased=false
  cd1_dir=cd1
  cd2_dir=cd2
  if [ -d "$1/CD1" ]; then
    uppercased=true
    cd1_dir=CD1
    cd2_dir=CD2
  fi
  
  # Scratch directory for intermediate lists; removed again when the script exits.
  tmpdir=$(mktemp -d /tmp/kaldi.XXXX) || exit 1
  trap 'rm -rf "$tmpdir"' EXIT

  # Build $dir/segments from the sentence (.SNT) files of both CDs.
  # A file named S<d><nnn>.SNT (d is 1 or 2) yields the recording id
  # "<nnn zero-padded to 3 digits>_<d>". Each line of the file then becomes
  #   <rec_id>_<field1> <rec_id> <field2/44100> <field3/44100>
  # NOTE(review): the divisor 2*22050 presumably converts byte offsets of
  # 16-bit, 22050 Hz audio into seconds -- confirm against the corpus docs.
  find "$1/$cd1_dir/SENTENCE/" "$1/$cd2_dir/SENTENCE/" -iname '*.SNT' -print |
    while IFS= read -r snt_file; do
      rec_id=$(echo "$snt_file" | sed -e 's:.*/S\([1-2]\)\(.*\)\.SNT$:\2 \1:i' |
        awk '{printf("%03d_%d\n",$1,$2);}') || exit 1
      awk -v rec_id="$rec_id" \
        '{printf "%s_%s %s %f %f\n",rec_id,$1,rec_id,$2/(2*22050),$3/(2*22050)}' \
        "$snt_file"
    done > "$dir/segments" || exit 1
  
  # List every wave file of both CDs, then derive one recording id per file:
  # S<d><nnn>.WAV (d is 1 or 2) becomes "<nnn zero-padded to 3 digits>_<d>".
  find "$1/$cd1_dir/wave" "$1/$cd2_dir/wave" -iname '*.WAV' -print \
    > "$tmpdir/wav.flist" || exit 1
  sed -e 's:.*/S\([1-2]\)\(.*\)\.WAV$:\2 \1:i' "$tmpdir/wav.flist" |
    awk '{printf("%03d_%d\n",$1,$2);}' > "$tmpdir/wav.uttids" || exit 1

  # wav.scp: "<rec_id> sox <file> -t wav -r 16000 -c 1 - |", i.e. a piped
  # command that converts the recording to 16 kHz mono wav, sorted by id.
  paste "$tmpdir/wav.uttids" "$tmpdir/wav.flist" |
    awk '{printf("%s sox %s -t wav -r 16000 -c 1 - |\n", $1, $2);}' |
    sort -k1,1 > "$dir/wav.scp" || exit 1
  
    # Now, Convert the transcripts into our format (no normalization yet)
    # Get the transcripts: each line of the output contains an utterance 
    # ID followed by the transcript.
  
  find $*/{$cd1_dir/PHONEME,$cd2_dir/PHONEME} -iname 'PH*.*' -print > $tmpdir/phn.flist
  sed -e 's:.*/PH\([1-2]\)\(.*\)\.\(.*\)$:\2 \1 \3:i' $tmpdir/phn.flist |\
    awk '{printf("%03d_%d_%d
  ",$1,$2,$3);}' > $tmpdir/phn.uttids || exit 1;
  
  while read line; do
    [ -f $line ] || error_exit "Cannot find transcription file '$line'";
    cut -c1 "$line" | tr '
  ' ' ' | perl -ape 's: *$:
  :;' || exit 1;
  done < $tmpdir/phn.flist > $tmpdir/phn.trans || exit 1;
  
  paste $tmpdir/phn.uttids $tmpdir/phn.trans | sort -k1,1 > $dir/trans || exit 1;
  
  # Do normalization steps: the recipe-local script rewrites the raw
  # transcripts, and its sorted output becomes $dir/text.
  "$local/farsdat_norm_trans.sh" "$dir/trans" | sort > "$dir/text" || exit 1

  # Prepare gender mapping from the speaker tables of both CDs: lines containing
  # "Code" are dropped, and from every remaining line column 1 (printed as a
  # 3-digit zero-padded number) and column 3 are kept.
  # NOTE(review): column 3 is assumed to be the gender field -- confirm.
  cat "$1/$cd1_dir/Information/Speaker.txt" "$1/$cd2_dir/Information/Speaker.txt" |
    sed '/Code/d' |
    awk '{printf("%03d %s\n",$1,$3)}' > "$dir/spk2gender" || exit 1
  
  # For the dev and test sets, write the zero-padded speaker list and derive the
  # list of sentence numbers ($tmpdir/<set>.sent) used to pick that set's data.
  #
  # The selection program below is left exactly as it was. It keeps the segments
  # of the listed speakers, collects per speaker the third '_'-separated field
  # of each line, sorts the collected words (asort is a gawk extension) and
  # prints the first 8 per speaker; the result is sorted numerically and
  # de-duplicated.
  # NOTE(review): sent[] is filled using the zero-padded text of field 1 while
  # the END loop indexes it with the plain integers 1..304 -- confirm that these
  # subscripts match for speaker numbers below 100.
  for x in dev test; do
    awk '{printf("%03d\n",$1);}' "$conf/${x}_spk.list" > \
      "$tmpdir/${x}_spk.list" || exit 1;
    awk -F'_' 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/${x}_spk.list" "$dir/segments" |\
      sort -k1 | awk -F'_'  '{sent[$1]=sent[$1] " " $3 } 
                            END { 
                                  for(i=1; i<=304; ++i)
                                  { split(sent[i],sent_split," "); 
                                    asort(sent_split,sent_sort); 
                                    for(j=1; j<=8;j++)
                                    { 
                                      print sent_sort[j];
                                    }
                                  }
                                }' | sort -n | uniq > "$tmpdir/${x}.sent" || exit 1; 
  done 
  
  # Training speakers, zero-padded to 3 digits like the ids in segments.
  awk '{printf("%03d\n",$1);}' "$conf/train_spk.list" > \
    "$tmpdir/train_spk.list" || exit 1

  # Concatenate the dev and test sentence lists; uniq -u keeps only the lines
  # that are not immediately repeated.
  cat "$tmpdir/dev.sent" "$tmpdir/test.sent" | uniq -u > "$tmpdir/dev+test.sent" || exit 1

  # Training sentences: the numbers 1..404, minus any line containing "400",
  # minus every number that appears as a whole line in the dev+test list.
  seq 1 404 | sed '/400/d' | grep -F -x -v -f "$tmpdir/dev+test.sent" - > \
    "$tmpdir/train.sent" || exit 1
  
  # Create the data/train, data/dev and data/test directories. Every file is
  # filtered twice: first by the set's speaker list, then by its sentence list.
  for x in train dev test; do
    set=data/$x
    mkdir -p "$set" || exit 1

    # Segments of this set's speakers.
    awk -F'_' 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/${x}_spk.list" "$dir/segments" |
      sort -k1 > "$tmpdir/segments" || exit 1

    # Keep only this set's sentences. With '_' as separator the third field
    # starts with the sentence number, which ends at the first space.
    awk -F'_' 'NR==FNR{a[$1]++;next} (a[substr($3,1,index($3," ")-1)])' \
      "$tmpdir/${x}.sent" "$tmpdir/segments" | sort -k1 > "$set/segments" || exit 1

    # Same two filters for the transcripts.
    awk -F'_' 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/${x}_spk.list" "$dir/text" |
      sort -k1 > "$tmpdir/text" || exit 1

    awk -F'_' 'NR==FNR{a[$1]++;next} (a[substr($3,1,index($3," ")-1)])' \
      "$tmpdir/${x}.sent" "$tmpdir/text" | sort -k1 > "$set/text" || exit 1

    # Recordings of this set's speakers ...
    awk -F'_' 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/${x}_spk.list" "$dir/wav.scp" > \
      "$tmpdir/wav.scp" || exit 1

    # ... restricted to the recording ids that the kept segments refer to
    # (first two '_'-separated fields of every segment line).
    awk -F'_' '{printf("%03d_%d\n",$1,$2)}' "$set/segments" > \
      "$tmpdir/spk_session" || exit 1

    awk -F' ' 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/spk_session" "$tmpdir/wav.scp" |
      sort -k1 > "$set/wav.scp" || exit 1

    # Gender map of this set's speakers, lower-cased.
    awk 'NR==FNR{a[$1]++;next} (a[$1])' "$tmpdir/${x}_spk.list" "$dir/spk2gender" |
      tr '[:upper:]' '[:lower:]' > "$set/spk2gender" || exit 1

    # Make the utt2spk and spk2utt files; the speaker is the part of the
    # utterance id before the first '_'.
    cut -d' ' -f1 "$set/segments" | awk -F'_' '{print $0,$1}' > "$set/utt2spk" || exit 1
    utils/utt2spk_to_spk2utt.pl < "$set/utt2spk" > "$set/spk2utt" || exit 1

    # Prepare STM file for sclite: three label header lines, then one line per
    # segment with recording id, speaker id (first 3 characters of the
    # recording id), start, end, gender label and reference transcript.
    # getline is compared with 0 because it returns -1 on a read error, which
    # would otherwise keep the while loops running forever.
    awk -v txt="$set/text" -v sex="$set/spk2gender" \
    'BEGIN{
       while((getline < txt) > 0) { ref[$1]=substr($0, index($0,$2)); }
       while((getline < sex) > 0) { gender[$1]=$2; }
       print ";; LABEL \"O\" \"Overall\" \"Overall\"";
       print ";; LABEL \"F\" \"Female\" \"Female speakers\"";
       print ";; LABEL \"M\" \"Male\" \"Male speakers\"";
     }
     { spk_id=substr($2,1,3);
       printf("%s 1 %s %s %s <O,%s> %s\n", $1, spk_id, $3, $4, toupper(gender[spk_id]), ref[$1]);
     }' "$set/segments" > "$set/stm" || exit 1

    # Create dummy GLM file for sclite:
    echo ';; empty.glm
    [FAKE]     =>  %HESITATION     / [ ] __ [ ] ;; hesitation token
    ' > "$set/glm"

    # Check that data dirs are okay!
    utils/validate_data_dir.sh --no-feats "$set" || exit 1

  done
  
  # Reached only when none of the steps above exited early.
  printf '%s\n' "Data preparation succeeded"