Blame view

egs/tedlium/s5_r2/local/prepare_data_iwslt.sh 2.72 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
  #!/bin/bash
  #
  # Copyright  2014 Nickolay V. Shmyrev 
  #            2014 Brno University of Technology (Author: Karel Vesely)
  # Apache 2.0
  
  # Invoke this script from the directory one level above it; every relative
  # path used below (path.sh, data/, db/, local/, utils/) is resolved against
  # the current working directory.

  # Pull in the environment definitions from path.sh in the current directory
  # (presumably PATH entries for the tools called below — confirm in path.sh).
  source ./path.sh

  # Export the C locale so that every child process started by this script
  # (sort, awk, sed, ...) runs with LC_ALL=C.
  LC_ALL=C
  export LC_ALL
  
  # Prepare: test set 2014 iwslt
  # Prepare: test set 2014 iwslt
  #
  # For each set name, build the data directory data/<set> from the STM
  # transcripts found under db/TEDLIUM_release2/<set>/ and write these files:
  #   stm, text, segments, utt2spk, spk2utt, wav.scp, reco2file_and_channel, glm
  # The directory is then checked with utils/validate_data_dir.sh.
  # Every step is followed by '|| exit 1', so the script stops with status 1
  # as soon as the final command of a step reports failure.
  for set in tst2014; do
    dir=data/$set
    mkdir -p "$dir" || exit 1

    # Merge transcripts into a single 'stm' file, do some mappings:
    # - <F0_M> -> <o,f0,male> : map dev stm labels to be coherent with train + test,
    # - <F0_F> -> <o,f0,female> : --||--
    # - (2) -> null : remove pronunciation variants in transcripts, keep in dictionary
    # - <sil> -> null : remove marked <sil>, it is modelled implicitly (in kaldi)
    # - (...) -> null : remove utterance names from end-lines of train
    # - it 's -> it's : merge words that contain apostrophe (if compound in dictionary, local/join_suffix.py)
    # In addition, the awk stage overwrites the second field of every STM line with "A".
    { # Add STM header, so sclite can prepare the '.lur' file
      echo ';;
  ;; LABEL "o" "Overall" "Overall results"
  ;; LABEL "f0" "f0" "Wideband channel"
  ;; LABEL "f2" "f2" "Telephone channel"
  ;; LABEL "male" "Male" "Male Talkers"
  ;; LABEL "female" "Female" "Female Talkers"
  ;; LABEL "unknown" "unknown" "Unknown Talkers"
  ;;'
      # Process the STMs: concatenate all of them, sort by field 1, field 2 and
      # numerically by field 4, then apply the mappings listed above.
      cat "db/TEDLIUM_release2/$set"/*.stm | sort -k1,1 -k2,2 -k4,4n | \
        sed -e 's:<F0_M>:<o,f0,male>:' \
            -e 's:<F0_F>:<o,f0,female>:' \
            -e 's:([0-9])::g' \
            -e 's:<sil>::g' \
            -e 's:([^ ]*)$::' | \
        awk '{ $2 = "A"; print $0; }'
    } | local/join_suffix.py db/cantab-TEDLIUM/cantab-TEDLIUM.dct > "$dir/stm" || exit 1

    # Prepare 'text' file
    # - lines containing 'ignore_time_segment_in_scoring' or ';;' are dropped,
    # - the utterance id is "<field1>-<field4*100>-<field5*100>", each number
    #   zero-padded to 7 digits; the words are fields 7..NF,
    # - {NOISE} -> [NOISE] : map the tags to match symbols in dictionary
    # NOTE(review): %07d keeps only the integer part of $4*100 / $5*100, so
    # floating-point rounding may move a boundary by one hundredth — confirm
    # whether that matters for this data.
    grep -v -e 'ignore_time_segment_in_scoring' -e ';;' "$dir/stm" | \
      awk '{ printf ("%s-%07d-%07d", $1, $4*100, $5*100);
             for (i=7;i<=NF;i++) { printf(" %s", $i); }
             printf("\n");
           }' | tr '{}' '[]' | sort -k1,1 > "$dir/text" || exit 1

    # Prepare 'segments', 'utt2spk', 'spk2utt'
    # The utterance id is split on "-": the first two parts re-joined with "-"
    # form the second column of 'segments', the last two parts divided by 100
    # form the two time columns.
    cut -d" " -f 1 "$dir/text" | \
      awk -F"-" '{ printf("%s %s-%s %07.2f %07.2f\n", $0, $1, $2, $3/100.0, $4/100.0) }' \
      > "$dir/segments" || exit 1
    # 'utt2spk' is the first two columns of 'segments'.
    awk '{ print $1, $2 }' "$dir/segments" > "$dir/utt2spk" || exit 1
    utils/utt2spk_to_spk2utt.pl < "$dir/utt2spk" > "$dir/spk2utt" || exit 1

    # Prepare 'wav.scp', 'reco2file_and_channel'
    # Take the first column of 'spk2utt', drop everything up to the first ".",
    # and emit one sph2pipe command line per entry, keyed "ted.<rest>".
    cut -d " " -f1 "$dir/spk2utt" | cut -d "." -f 2- | \
      awk -v set="$set" -v pwd="$PWD" \
        '{ printf("ted.%s sph2pipe -f wav -p %s/db/TEDLIUM_release2/%s/sph/IWSLT14.ASR.%s.sph |\n", $1, pwd, set, $1); }' \
      > "$dir/wav.scp" || exit 1
    # Each recording maps to itself as file name, with channel "A".
    awk '{ print $1, $1, "A"; }' "$dir/wav.scp" > "$dir/reco2file_and_channel" || exit 1

    # Create placeholder 'glm' file: a comment line plus one [FAKE] rule
    echo ';; empty.glm
    [FAKE]     =>  %HESITATION     / [ ] __ [ ] ;; hesitation token
    ' > "$dir/glm" || exit 1

    # Check that data dirs are okay!
    utils/validate_data_dir.sh --no-feats "$dir" || exit 1
  done