Blame view

egs/csj/s5/local/csj_data_prep.sh 4.22 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
  #!/bin/bash
  
  # Copyright  2015 Tokyo Institute of Technology (Authors: Takafumi Moriya and Takahiro Shinozaki)
  #            2015 Mitsubishi Electric Research Laboratories (Author: Shinji Watanabe)
  # Apache 2.0
  # Acknowledgement  This work was supported by JSPS KAKENHI Grant Number 26280055.
  
  # CSJ 291 hours training data preparation.
  # "Academic lecture" 270 hours (Exclude the speakers in the evaluation set from all speech data)
  # + "Other" 21 hours
  # The actual amount of training data is 240 hours, because R-tag utterances and silence sections are excluded.
  
  # To be run from one directory above this script.
  
  ## The input is a directory that contains the CSJ corpus.
  ## Note: If necessary, rewrite the "cat" commands used below
  ## so that they locate the .wav file paths.
  
  # Load the recipe environment from path.sh (its contents are not visible
  # here; presumably it sets up PATH and locale for the recipe).
  . ./path.sh
  set -e # exit on error

  # Validate the argument count: one mandatory corpus directory, optionally
  # followed by a mode number.
  case $# in
    1|2)
      ;;
    *)
      echo "Usage: csj_data_prep.sh <csj-data dir> [<mode_number>]"
      echo " mode_number can be 0, 1, 2, 3, (0=default using academic lecture and other data, 1=using academic lecture data,"
      echo "                                 2=using all data except for dialog data, 3=using all data)"
      exit 1
      ;;
  esac

  # Root directory of the CSJ data.
  CSJ=$1
  # Data selection mode; falls back to 0 only when no second argument was given.
  mode=${2-0}
  
  
  # Working directory for all intermediate training-data files.
  dir=data/local/train
  mkdir -p $dir

  # Audio data directory check.
  # "$CSJ" is quoted on purpose: unquoted, an empty argument collapses the test
  # to `[ ! -d ]`, which is false, so the check would silently pass.
  if [ ! -d "$CSJ" ]; then
    echo "Error: csj_data_prep.sh requires a CSJ data directory argument, but '$CSJ' is not a directory" >&2
    exit 1;
  fi

  # CSJ dictionary file check: copy the corpus lexicon into the working
  # directory unless a lexicon.txt is already there.
  if [ ! -f $dir/lexicon.txt ]; then
    cp "$CSJ/lexicon/lexicon.txt" $dir || exit 1;
  fi
  
  ### Choose which wav lists feed acoustic model training, depending on $mode ###
  # Every branch concatenates the matching *-wav.list files (errors from cat,
  # e.g. for patterns that match nothing, are discarded) and writes the sorted
  # result to $dir/wav.flist.
  if [ $mode -eq 3 ]; then
    # mode 3: all data
    cat $CSJ/*/*/*-wav.list 2>/dev/null | sort > $dir/wav.flist
  elif [ $mode -eq 2 ]; then
    # mode 2: all data except for "dialog" data
    cat $CSJ/*/{A*,M*,R*,S*}/*-wav.list 2>/dev/null | sort > $dir/wav.flist
  elif [ $mode -eq 1 ]; then
    # mode 1: "Academic lecture" data only
    cat $CSJ/*/A*/*-wav.list 2>/dev/null | sort > $dir/wav.flist
  else
    # default: "Academic lecture" and "other" data
    # ({A,M}* expands to the same directories as the older {A*,M*} spelling)
    cat $CSJ/*/{A,M}*/*-wav.list 2>/dev/null | sort > $dir/wav.flist
  fi


  # Count the selected wav files; 986 is the number expected for the default
  # selection. A different count only produces a warning.
  n=$(cat $dir/wav.flist | wc -l)

  if [ $n -ne 986 ]; then
    echo "Warning: expected 986 data files (Case : Using 'Academic lecture' and 'Other' data), found $n."
  fi
  
  
  # (1a) Transcriptions preparation
  # Build the basic transcription file, folding the segment times into the
  # utterance id.
  #
  # Each input line is read as: <id> <start-sec> <end-sec> <word> ...
  # Only the part of <id> before the first "_" is kept; the times are written
  # as integer milliseconds, zero-padded to 7 digits, and the words are
  # lower-cased.
  ## e.g. A01F0055_0172 00380.213 00385.951 => A01F0055_0380213_0385951
  ## for CSJ
  #
  # Original note: this data is for training language models
  # (except evaluation set, 30 speakers).
  awk '{
        split($1, T, "[_ ]");
        name = T[1]; stime = $2; etime = $3;
        # NOTE(review): int() truncates, so a product such as 1000*0.29 that
        # lands just below an integer yields an id one millisecond low.
        printf("%s_%07.0f_%07.0f", name, int(1000*stime), int(1000*etime));
        for (i = 4; i <= NF; i++) printf(" %s", tolower($i));
        # The newline must be the escape sequence: a raw line break inside an
        # awk string literal is a syntax error.
        printf("\n");
  }' $CSJ/*/*/*-trans.text | LC_ALL=C sort > $dir/transcripts1.txt
  # The sort is forced to byte-wise (C locale) order so that it agrees with the
  # `sort -c` check run on this file afterwards under LC_ALL=C.
  # Except evaluation set (30 speakers)
  
  # Verify that the transcription file is sorted.
  # LC_ALL=C is exported here, so this check and every command run after it
  # use byte-wise collation.
  export LC_ALL=C
  if ! sort -c $dir/transcripts1.txt; then
    exit 1
  fi

  # Strip the <s> and </s> markers, then drop lines left with only an id.
  # **NOTE: the marker patterns are matched case-insensitively.
  perl -ane 's:\<s\>::gi;
             s:\<\/s\>::gi;
             print;' < $dir/transcripts1.txt \
    | awk 'NF > 1' \
    | sort > $dir/text
  
  
  # (1c) Make the segments file from the transcript.
  # Each output line is: <utt-id> <recording-id> <start-sec> <end-sec>, e.g.:
  # A01F0055_0380213_0385951 => A01F0055_0380213_0385951 A01F0055 380.213 385.951
  awk '{
         # The utterance id is <recording-id>_<start-ms>_<end-ms>.
         split($1, S, "_");
         # Format the times explicitly. Concatenating the quotient into a
         # string would convert it with CONVFMT (%.6g by default), which keeps
         # only six significant digits: 1234567/1000 would come out as 1234.57.
         printf("%s %s %.3f %.3f\n", $1, S[1], S[2]/1000, S[3]/1000);
     }' < $dir/text > $dir/segments
  
  # Derive a recording id for every path in wav.flist and pair it with the
  # path (tab-separated, via paste). The id is the path with the directory
  # part removed, then the first ".wav"-like match removed (the dot is an
  # unescaped regex dot), then the first "-" followed by R, L or "," removed
  # (the bracket expression is literally [R,L]).
  sed -e 's?.*/??' -e 's?.wav??' -e 's?\-[R,L]??' $dir/wav.flist | paste - $dir/wav.flist \
    > $dir/wavflist.scp

  # Turn each "<id> <path>" pair into a wav.scp entry of the form
  # "<id> cat <path> |". Fields are split on whitespace, so paths containing
  # blanks are not supported.
  awk '{
   # The newline must be the escape sequence: a raw line break inside an awk
   # string literal is a syntax error.
   printf("%s cat %s |\n", $1, $2);
  }' < $dir/wavflist.scp | sort > $dir/wav.scp || exit 1;
  
  
  # utt2spk: map every utterance id in the segments file to its speaker id,
  # i.e. the part of the utterance id before the first "_".
  awk '{ split($1, parts, "_"); print $1, parts[1] }' $dir/segments > $dir/utt2spk || exit 1

  # spk2utt: invert the mapping, feeding the helper script input ordered by
  # speaker.
  sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1

  # Copy stuff into its final locations [this has been moved from the format_data script]
  mkdir -p data/train
  for name in spk2utt utt2spk wav.scp text segments
  do
    if ! cp data/local/train/$name data/train/; then
      exit 1
    fi
  done

  echo "CSJ data preparation succeeded."

  # Final pass of the Kaldi data-directory fixer over the result.
  utils/fix_data_dir.sh data/train