Blame view

egs/fisher_swbd/s5/local/swbd1_data_prep.sh 4.7 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
  #!/bin/bash
  
  # Switchboard-1 training data preparation customized for Edinburgh
  # Author:  Arnab Ghoshal (Jan 2013)
  
  # To be run from one directory above this script.
  
  ## The input is some directory containing the switchboard-1 release 2
  ## corpus (LDC97S62).  Note: we don't make many assumptions about how
  ## you unpacked this.  We are just doing a "find" command to locate
  ## the .sph files.
  
  . ./path.sh
  
  #check existing directories
  if [ $# != 1 ]; then
    echo "Usage: swbd1_data_prep.sh /path/to/SWBD"
    exit 1; 
  fi 
  
  SWBD_DIR=$1
  
  dir=data/local/train_swbd
  mkdir -p $dir
  
  # Audio data directory check
  if [ ! -d $SWBD_DIR ]; then
    echo "Error: run.sh requires a directory argument"
    exit 1; 
  fi  
  
  sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
  [ ! -x $sph2pipe ] \
    && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1;
  
  # Option A: SWBD dictionary file check
  [ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \
    echo  "SWBD dictionary file does not exist" &&  exit 1;
  
  # find sph audio files
  find $SWBD_DIR -iname '*.sph' | sort > $dir/sph.flist
  
  n=`cat $dir/sph.flist | wc -l`
  [ $n -ne 2435 ] && \
    echo Warning: expected 2435 data data files, found $n
  
  
  # (1a) Transcriptions preparation
  # make basic transcription file (add segments info)
  # **NOTE: In the default Kaldi recipe, everything is made uppercase, while we 
  # make everything lowercase here. This is because we will be using SRILM which
  # can optionally make everything lowercase (but not uppercase) when mapping 
  # LM vocabs.
  awk '{ 
         name=substr($1,1,6); gsub("^sw","sw0",name); side=substr($1,7,1); 
         stime=$2; etime=$3;
         printf("%s-%s_%06.0f-%06.0f", 
                name, side, int(100*stime+0.5), int(100*etime+0.5));
         for(i=4;i<=NF;i++) printf(" %s", $i); printf "
  "
  }' $dir/swb_ms98_transcriptions/*/*/*-trans.text  > $dir/transcripts1.txt
  
  # test if trans. file is sorted
  export LC_ALL=C;
  sort -c $dir/transcripts1.txt || exit 1; # check it's sorted.
  
  # Remove SILENCE, <B_ASIDE> and <E_ASIDE>.
  
  # Note: we have [NOISE], [VOCALIZED-NOISE], [LAUGHTER], [SILENCE].
  # removing [SILENCE], and the <B_ASIDE> and <E_ASIDE> markers that mark
  # speech to somone; we will give phones to the other three (NSN, SPN, LAU). 
  # There will also be a silence phone, SIL.
  # **NOTE: modified the pattern matches to make them case insensitive
  cat $dir/transcripts1.txt \
    | perl -ane 's:\s\[SILENCE\](\s|$):$1:gi; 
                 s/<B_ASIDE>//gi; 
                 s/<E_ASIDE>//gi; 
                 print;' \
    | awk '{if(NF > 1) { print; } } ' > $dir/transcripts2.txt
  
  
  # **NOTE: swbd1_map_words.pl has been modified to make the pattern matches 
  # case insensitive
  local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt  > $dir/text  # final transcripts
  
  # format acronyms in text
  python local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \
    -M data/local/dict_nosp/acronyms_swbd.map
  cp $dir/text $dir/text_bk
  mv $dir/text_map $dir/text
  
  
  
  # (1c) Make segment files from transcript
  #segments file format is: utt-id side-id start-time end-time, e.g.:
  #sw02001-A_000098-001156 sw02001-A 0.98 11.56
  awk '{ 
         segment=$1;
         split(segment,S,"[_-]");
         side=S[2]; audioname=S[1]; startf=S[3]; endf=S[4];
         print segment " " audioname "-" side " " startf/100 " " endf/100
  }' < $dir/text > $dir/segments
  
  sed -e 's?.*/??' -e 's?.sph??' $dir/sph.flist | paste - $dir/sph.flist \
    > $dir/sph.scp
  
  awk -v sph2pipe=$sph2pipe '{
    printf("%s-A %s -f wav -p -c 1 %s |
  ", $1, sph2pipe, $2); 
    printf("%s-B %s -f wav -p -c 2 %s |
  ", $1, sph2pipe, $2);
  }' < $dir/sph.scp | sort > $dir/wav.scp || exit 1;
  #side A - channel 1, side B - channel 2
  
  # this file reco2file_and_channel maps recording-id (e.g. sw02001-A)
  # to the file name sw02001 and the A, e.g.
  # sw02001-A  sw02001 A
  # In this case it's trivial, but in other corpora the information might
  # be less obvious.  Later it will be needed for ctm scoring.
  awk '{print $1}' $dir/wav.scp \
    | perl -ane '$_ =~ m:^(\S+)-([AB])$: || die "bad label $_"; 
                 print "$1-$2 $1 $2
  "; ' \
    > $dir/reco2file_and_channel || exit 1;
  
  awk '{spk=substr($1,1,9); print $1 " " spk}' $dir/segments > $dir/utt2spk \
    || exit 1;
  sort -k 2 $dir/utt2spk | utils/utt2spk_to_spk2utt.pl > $dir/spk2utt || exit 1;
  
  # We assume each conversation side is a separate speaker. This is a very 
  # reasonable assumption for Switchboard. The actual speaker info file is at:
  # http://www.ldc.upenn.edu/Catalog/desc/addenda/swb-multi-annot.summary
  
  # Copy stuff into its final locations [this has been moved from the format_data
  # script]
  mkdir -p data/train_swbd
  for f in spk2utt utt2spk wav.scp text segments reco2file_and_channel; do
    cp $dir/$f data/train_swbd/$f || exit 1;
  done
  
  
  echo Switchboard-1 data preparation succeeded.