Blame view

Scripts/utils/subset_data_dir_tr90_cv10.sh 1.06 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
  #!/bin/bash
  # Copyright 2010-2012  Brno University of Technology (Author: Karel Vesely)
  # Apache 2.0
  
  # This script splits a dataset into two parts:
  # a 90% training set and a 10% held-out (cross-validation) set,
  # which will later be used for neural network training.
  #
  # It is useful when the database is not pre-split, or when
  # we cannot get alignments on the dev set.
  
  # Split <srcdir> into a leading ~90% part and a trailing ~10% part, moving
  # the cut so that it falls on a change of speaker in utt2spk.
  #
  # Arguments:
  #   $1  srcdir  - source data directory; must contain the file utt2spk
  #   $2  trndir  - destination for the leading (training) part
  #   $3  cvdir   - destination for the trailing (cross-validation) part
  #
  # Exit status is 1 on wrong usage, on a missing or empty utt2spk, when no
  # speaker boundary exists at or below the 90% mark, or when either call to
  # subset_data_dir.sh fails.
  #
  # NOTE(review): the boundary search only merges *adjacent* utt2spk lines with
  # the same speaker, so it presumably expects utt2spk to be grouped by
  # speaker -- confirm against how the data directory is prepared.
  if [ "$#" -ne 3 ]; then
    echo "Usage: $0 <srcdir> <traindir> <crossvaldir>" >&2
    exit 1
  fi

  srcdir=$1
  trndir=$2
  cvdir=$3

  if [ ! -f "$srcdir/utt2spk" ]; then
    echo "$0: no such file $srcdir/utt2spk" >&2
    exit 1
  fi

  # Total number of lines (utterances) in utt2spk.
  N=$(wc -l <"$srcdir/utt2spk") || exit 1
  # Some wc implementations pad the count with blanks; normalise to a plain
  # integer (an empty value becomes 0).
  N=$((N + 0))
  if [ "$N" -eq 0 ]; then
    echo "$0: $srcdir/utt2spk is empty" >&2
    exit 1
  fi

  # Line number below which 90% of the data lies (integer division).
  N_head=$((N * 9 / 10))

  # Move the boundary down so that it is located on a speaker change:
  #  - 'uniq -f1 -c' ignores the first field (utterance id) and prints one line
  #    per run of consecutive identical speakers, prefixed by the run length;
  #  - awk accumulates run lengths while the total stays within the limit and
  #    stops at the first run that would exceed it.
  # 'exit' (rather than the non-portable 'nextfile') ends input processing and
  # still runs END; 'n + 0' guarantees a number is printed even when no run fits.
  N_head=$(uniq -f1 -c "$srcdir/utt2spk" \
    | awk -v limit="$N_head" '
        { if (n + $1 <= limit) { n += $1 } else { exit } }
        END { print n + 0 }') || exit 1

  if [ "$N_head" -le 0 ]; then
    echo "$0: cannot place a 90/10 split on a speaker boundary in $srcdir/utt2spk" >&2
    exit 1
  fi

  # The rest of the data goes to the held-out part.
  N_tail=$((N - N_head))

  # Now call subset_data_dir.sh to create both directories; stop on the first
  # failure instead of carrying on with a half-built split.
  subset_data_dir.sh --first "$srcdir" "$N_head" "$trndir" || exit 1
  subset_data_dir.sh --last "$srcdir" "$N_tail" "$cvdir" || exit 1