Blame view

egs/gale_arabic/s5/local/gale_data_prep_split.sh 1.06 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
  #!/bin/bash 
  
  # Copyright 2014 QCRI (author: Ahmed Ali)
  # Apache 2.0
  
  # Require exactly one positional argument: the GALE folder that is read
  # (and partly written) by the rest of this script.
  if [ $# -ne 1 ]; then
    # Usage errors are diagnostics, so they go to stderr rather than stdout.
    echo "Arguments should be the <gale folder>" >&2
    exit 1
  fi
  
  
  # Output data is written under data/local.
  #
  # galeData: the GALE folder given as the first argument.
  # dir:      the output root (data/local), created if it does not exist.
  # Both are passed through utils/make_absolute.sh (presumably to turn them
  # into absolute paths -- confirm against that helper). The argument is
  # quoted so a folder name containing whitespace is not word-split, and a
  # failure here aborts instead of continuing with an empty path.
  galeData=$(utils/make_absolute.sh "$1") || { echo "$0: cannot resolve path '$1'" >&2; exit 1; }
  mkdir -p data/local || { echo "$0: cannot create data/local" >&2; exit 1; }
  dir=$(utils/make_absolute.sh data/local) || { echo "$0: cannot resolve path 'data/local'" >&2; exit 1; }
  
  
  # Split $galeData/all into a test list and a train list.
  #   all.test : lines matching any pattern in local/test_list
  #   all.train: lines matching none of the patterns in local/test_list
  # In both cases, lines matching any pattern in local/bad_segments are dropped.
  # Note that the two result files are written into the GALE folder itself.
  # Paths are quoted so that a GALE folder containing whitespace still works.
  grep -f local/test_list "$galeData/all" \
    | grep -v -f local/bad_segments > "$galeData/all.test"
  grep -v -f local/test_list "$galeData/all" \
    | grep -v -f local/bad_segments > "$galeData/all.train"
  
  # For each split, derive utt2spk, spk2utt, segments and text in $dir/<split>
  # from $galeData/all.<split>. Input fields as used below:
  #   $2      -> utterance id (first column of every output file)
  #   $1      -> second column of segments (presumably the recording id)
  #   $3, $4  -> third/fourth columns of segments (presumably start/end times)
  #   $5..$NF -> transcript words written to text
  for x in test train; do
    outdir="$dir/$x"
    file="$galeData/all.$x"
    mkdir -p "$outdir"

    # Each utterance is listed as its own speaker, so spk2utt is simply an
    # identical copy of utt2spk.
    awk '{print $2 " " $2}' "$file" | sort -u > "$outdir/utt2spk"
    cp -p "$outdir/utt2spk" "$outdir/spk2utt"

    awk '{print $2 " " $1 " " $3 " " $4}' "$file" | sort -u > "$outdir/segments"

    # Fields are printed through an explicit "%s " format: using the data
    # itself as the printf format would misinterpret any '%' in a transcript.
    # The line terminator is the escape sequence \n; a raw newline inside an
    # awk string literal is a syntax error. Every field, including the last,
    # is followed by a single space.
    awk '{
      printf "%s ", $2
      for (i = 5; i <= NF; i++) printf "%s ", $i
      printf "\n"
    }' "$file" | sort -u > "$outdir/text"
  done
  
  
  # Test wav.scp: every line of $galeData/wav.scp that matches a pattern in
  # local/test_list (the match is against the whole line, not just the id).
  grep -f local/test_list "$galeData/wav.scp" > "$dir/test/wav.scp"

  # Train wav.scp: keep the lines of $galeData/wav.scp whose first field occurs
  # as the second column of the train segments file. The segments file is read
  # once in BEGIN; membership is tested with "in" so that lookups do not add
  # entries to the array.
  awk -v seg="$dir/train/segments" '
    BEGIN { while ((getline < seg) > 0) seen[$2] = 1 }
    ($1 in seen) { print $0 }
  ' "$galeData/wav.scp" > "$dir/train/wav.scp"

  echo "data prep split succeeded"