Blame view

egs/csj/s5/local/csj_make_trans/csj_autorun.sh 6.84 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
  #! /bin/bash
  
  # Copyright  2015 Tokyo Institute of Technology (Authors: Takafumi Moriya and Takahiro Shinozaki)
  #            2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
  # Apache 2.0
  # Acknowledgement  This work was supported by JSPS KAKENHI Grant Number 26280055.
  
  # Exactly three positional arguments are required. With any other count the
  # usage text is printed on stdout and the script stops with status 1.
  [ $# -eq 3 ] || {
    self=$(basename $0)
    printf '%s\n' \
      "Usage: $self <speech-dir> <transcription-dir> <csj-version>" \
      "e.g., $self /database/NINJAL/CSJ data/csj-data usb (or dvd)" \
      "See comments in the script for more details"
    exit 1
  }
  
  # Give the positional arguments names (order as in the usage text):
  #   $1 <speech-dir>         -> resource
  #   $2 <transcription-dir>  -> outd
  #   $3 <csj-version>        -> csjv
  csjv="$3"
  outd="$2"
  resource="$1"

  # From here on, stop the script as soon as a command fails.
  set -o errexit
  
  # Choose the corpus layout that belongs to the requested CSJ version.
  #   SDB  - sub-directory that holds the .sdb files (left empty for dvd)
  #   WAV  - sub-directory that holds the wav files (left empty for dvd)
  #   disc - blank-separated list of volume names processed later on
  # Any other version string prints a message and ends the script with status 1.
  if [ "$csjv" = "merl" ]; then
      SDB=sdb/
      WAV=WAV/
      disc=CSJ2004
  elif [ "$csjv" = "usb" ]; then
      SDB=MORPH/SDB/
      WAV=WAV/
      disc="core noncore"
  elif [ "$csjv" = "dvd" ]; then
      num=dvd   # prefix shared by all volume names
      SDB=
      WAV=
      # Builds "dvd3 dvd4 ... dvd17": seq joins 3..17 with " dvd" and the
      # leading prefix is added by hand; sed strips a trailing " dvd" if any.
      disc=${num}$(seq -s " ${num}" 3 17 | sed "s/ ${num}\$//")
  else
      echo "Input variable is usb or dvd only. $csjv is UNAVAILABLE VERSION." && exit 1
  fi
  
  # Stop early when the corpus location given as <speech-dir> does not exist.
  # "$resource" must be quoted: unquoted, an empty value reduces the test to
  # `[ ! -e ]` (false), and a path containing blanks makes `[` fail with a
  # syntax error, so in both cases the check used to be skipped silently.
  if [ ! -e "$resource" ]; then
    echo "Not exist CSJ or incorrect PATH."
    exit 1
  fi
  
  # Stage 1: for every lecture <id> of every volume <vol> listed in $disc, write
  #   $outd/<vol>/<id>/<id>.4lex, <id>-trans.text and <id>-wav.list.
  # Volumes are processed in parallel. The stage is skipped when the marker
  # $outd/.done_make_trans already exists, and the marker is only written when
  # every volume finished successfully. Any failure ends the stage with a
  # non-zero status.
  if [ ! -e "$outd/.done_make_trans" ]; then
  (
      echo "Make Transcription and PATH of WAV file."
      mkdir -p "$outd"
      rm -f "$outd/README.txt"
      echo "Contents about generated directory and file
            ## About each directory
            {dvd3(dvd) or core(usb)}                         :Contain training data
            eval/                                            :Official evaluation data set ( *** Extract from dvd *** )
            excluded/                                        :Same speaker data including evaluation data (e.g. A01M0056) ( *** Extract from dvd *** )
  
            ## About each file
            {dvd3(dvd) or core(usb)}/A01F0055
                                     A01F0055-trans.text     :Transcriptions (utterances with R-tags are removed)
                                     A01F0055-wav.list       :Path about existing wav file
                                     A01F0055.4lex           :File for making lexicon" >"$outd/README.txt"

      # Make transcription file for each dvd and each lecture.
      # The "nkf" command has to be installed before this stage can run.
      if ! command -v nkf >/dev/null 2>&1; then
          echo "This processing is need to prepare \"nkf\" command. Please retry after installing command \"nkf\"."
          exit 1
      fi

      # One background subshell per volume; remember the PIDs so that the
      # exit status of each job can be collected afterwards.
      pids=""
      for vol in $disc; do
          mkdir -p "$outd/$vol"
          # A marker left by an earlier run must not hide a failure of this run.
          rm -f "$outd/$vol/.done_$vol"
          (
              # Lecture ids are taken from the directory listing, with the
              # ".sdb" suffix and the 00README.txt entry stripped.
              if [ "$csjv" = "merl" ]; then
                  ids=$(ls "$resource/$vol/$SDB" | sed 's:.sdb::g' | sed 's/00README.txt//g')
              else
                  ids=$(ls "$resource/${SDB}$vol" | sed 's:.sdb::g' | sed 's/00README.txt//g')
              fi

              for id in $ids; do
                  lec_dir="$outd/$vol/$id"
                  mkdir -p "$lec_dir"

                  # TPATH: where <id>.sdb lives, WPATH: where the wav files live.
                  case "$csjv" in
                      "usb" ) TPATH="$resource/${SDB}$vol" ; WPATH="$resource/${WAV}$vol" ;;
                      "dvd" ) TPATH="$resource/$vol/$id"   ; WPATH="$resource/$vol/$id" ;;
                      "merl" ) TPATH="$resource/$vol/$SDB" ; WPATH="$resource/$vol/$WAV" ;;
                  esac

                  # <id>.4trn.t is an intermediate file consumed by csjconnect.pl.
                  local/csj_make_trans/csj2kaldi4m.pl "$TPATH/${id}.sdb" "$lec_dir/${id}.4lex" "$lec_dir/${id}.4trn.t" || exit 1
                  local/csj_make_trans/csjconnect.pl 0.5 10 "$lec_dir/${id}.4trn.t" "$id" >"$lec_dir/${id}-trans.text" || exit 1
                  rm "$lec_dir/${id}.4trn.t"

                  # List <id>-L.wav / <id>-R.wav when a "-L" file exists,
                  # otherwise the single <id>.wav.
                  if [ -e "$WPATH/${id}-L.wav" ]; then
                      find "$WPATH" -iname "${id}-[LR].wav" >"$lec_dir/${id}-wav.list"
                  else
                      find "$WPATH" -iname "${id}.wav" >"$lec_dir/${id}-wav.list" || exit 1
                  fi

              done

              # Sanity check on the last lecture processed: its transcription
              # must be non-empty (this also fails when no lecture was found).
              if [ -s "$outd/$vol/$id/${id}-trans.text" ]; then
                  echo -n >"$outd/$vol/.done_$vol"
                  echo "Complete processing transcription data in $vol"
              else
                  echo "Bad processing of making transcriptions part"
                  exit 1
              fi
          ) &
          pids="$pids $!"
      done

      # Collect the status of every job (a bare `wait` would discard them) and
      # require the marker of every volume, not only of the last one.
      status=0
      for pid in $pids; do
          wait "$pid" || status=1
      done
      for vol in $disc; do
          [ -e "$outd/$vol/.done_$vol" ] || status=1
      done

      if [ "$status" -eq 0 ]; then
          echo -n >"$outd/.done_make_trans"
          echo "Done!"
      else
          echo "Bad processing of making transcriptions part"
          exit 1
      fi
  )
  fi
  
  ## Exclude speech data given by test set speakers.
  # Stage 2: move the lecture directories of the three evaluation sets to
  # $outd/eval/eval{1,2,3} and the listed lectures of test-set speakers to
  # $outd/excluded. Skipped when $outd/.done_mv_eval_dup already exists.
  # The marker is written only when each evalN directory holds exactly as many
  # entries as its ID list; otherwise the stage ends with a non-zero status.
  if [ ! -e "$outd/.done_mv_eval_dup" ]; then
  (
      echo "Make evaluation set 1, 2, 3. And exclude speech data given by test set speakers."
      mkdir -p "$outd/eval" "$outd/excluded"
      mkdir -p "$outd"/eval/eval{1,2,3}

      # Exclude speaker ID
      excluded_ids="S05M0613 R00M0187 D01M0019 D04M0056 D02M0028 D03M0017"

      # Evaluation set ID
      eval1="A01M0110 A01M0137 A01M0097 A04M0123 A04M0121 A04M0051 A03M0156 A03M0112 A03M0106 A05M0011"
      eval2="A01M0056 A03F0072 A02M0012 A03M0016 A06M0064 A06F0135 A01F0034 A01F0063 A01F0001 A01M0141"
      eval3="S00M0112 S00F0066 S00M0213 S00F0019 S00M0079 S01F0105 S00F0152 S00M0070 S00M0008 S00F0148"

      # move_lectures <dest-dir> <id>...
      # Moves every directory named <id> found below $outd into <dest-dir>.
      # The search is limited to $outd so unrelated directories of the same
      # name elsewhere are never touched. A directory that already sits in
      # <dest-dir> is left alone, which keeps the stage re-runnable.
      move_lectures() {
          local dest=$1 id src
          shift
          for id in "$@"; do
              # -prune: do not descend into a directory that is about to move.
              find "$outd" -type d -name "$id" -prune -print |
              while IFS= read -r src; do
                  if [ "$src" -ef "$dest/$id" ]; then
                      continue
                  fi
                  mv "$src" "$dest"
              done
          done
      }

      # Speech data given by test set speakers (e.g. eval2 : A01M0056)
      move_lectures "$outd/excluded" $excluded_ids

      # Evaluation data
      all_sets_ok=1
      for name in eval1 eval2 eval3; do
          # Load the ID list of this set into "$@" so that $# is its size.
          set -- ${!name}
          move_lectures "$outd/eval/$name" "$@"
          if [ "$(ls "$outd/eval/$name" | wc -l)" -eq "$#" ]; then
              echo -n >"$outd/eval/.done_$name"
          else
              rm -f "$outd/eval/.done_$name"
              all_sets_ok=0
          fi
      done

      if [ "$all_sets_ok" -eq 1 ]; then
          echo -n >"$outd/.done_mv_eval_dup"
          echo "Done!"
      else
          echo "Bad processing of making evaluation set part"
          exit 1
      fi
      )
  fi
  
  ## make lexicon.txt
  # Stage 3: build $outd/lexicon/lexicon.txt from the per-lecture .4lex files
  # found at $outd/*/*/*.4lex. Skipped when $outd/.done_make_lexicon exists.
  # An empty result ends the stage with a non-zero status.
  if [ ! -e "$outd/.done_make_lexicon" ]; then
    echo "Make lexicon file."
    (
      lexicon="$outd/lexicon"
      rm -f "$lexicon/lexicon.txt"
      mkdir -p "$lexicon"
      # Gather all entries, dropping lines that contain "+ー", "++" or "×".
      cat "$outd"/*/*/*.4lex | grep -v "+ー" | grep -v "++" | grep -v "×" > "$lexicon/lexicon.txt"
      # lexicon.txt and lexicon_htk.txt serve alternately as input and output
      # of the following steps; only their final contents are kept.
      sort -u "$lexicon/lexicon.txt" > "$lexicon/lexicon_htk.txt"
      # NOTE(review): vocab2dic.pl presumably adds pronunciations using the
      # kana2phone table and logs problems to ERROR_v2d -- confirm in the helper.
      local/csj_make_trans/vocab2dic.pl -p local/csj_make_trans/kana2phone -e "$lexicon/ERROR_v2d" -o "$lexicon/lexicon.txt" "$lexicon/lexicon_htk.txt"
      # Drop the second '+'-separated field, then the second tab-separated
      # field, and finally turn the remaining tabs into blanks.
      cut -d'+' -f1,3- "$lexicon/lexicon.txt" >"$lexicon/lexicon_htk.txt"
      cut -f1,3- "$lexicon/lexicon_htk.txt" | perl -ape 's:\t: :g' >"$lexicon/lexicon.txt"

      if [ -s "$lexicon/lexicon.txt" ]; then
        echo -n >"$outd/.done_make_lexicon"
        echo "Done!"
      else
        echo "Bad processing of making lexicon file"
        exit 1
      fi
    )
  fi
  
  # Final check: all three stage markers must exist in $outd. They are tested
  # by name, so unrelated entries containing "done" (for example a stale
  # .done_make_all) cannot satisfy the check. A missing marker ends the script
  # with status 1 so that callers can detect the failure.
  for marker in .done_make_trans .done_mv_eval_dup .done_make_lexicon; do
    if [ ! -e "$outd/$marker" ]; then
      echo "ERROR : Processing is incorrect."
      exit 1
    fi
  done

  echo "Finish processing original CSJ data" && echo -n >"$outd/.done_make_all"