Blame view

Scripts/01_init_TRAIN_data_features_LIA.sh 1.9 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
  #!/bin/bash
  
  #================#
  # For train data #
  #================#
  # usage   : $0 <EXPE_DIR> <TRS_DIR> <WAV_DIR>
  # example : $0 /local_disk/hera2/PERCOL/bigot/KALDI/EXPE_REPERE_P1/ /local_disk/hera2/REPERE/Database/Phase1/train/trs_corrige/ /local_disk/hera2/REPERE/Database/Phase1/train/wav/
  
   
  # Number of parallel jobs handed to make_mfcc.sh through --nj.
  FORK=8

  # The three positional arguments are mandatory. Without this guard an empty
  # EXPE_DIR would make the mkdir below create /ling_Data/, /ac_Data/ and /MFCC
  # at the filesystem root.
  if [ "$#" -lt 3 ] || [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
    echo "usage: $0 EXPE_DIR TRS_DIR WAV_DIR" >&2
    exit 1
  fi

  EXPE_DIR=$1   # root directory of the experiment (all outputs go below it)
  TRS_DIR=$2    # directory searched recursively for *.trs transcripts
  WAV_DIR=$3    # directory searched recursively for the audio files

  # Both files are resolved relative to the current working directory.
  # NOTE(review): presumably they provide PATH entries for the tools used below
  # as well as CONF_DIR and train_cmd -- confirm against LIA_kaldiUtils.
  . ../LIA_kaldiUtils/path.sh
  . ../LIA_kaldiUtils/cmd.sh


  # Directory layout of the experiment.
  LM_DIR=$EXPE_DIR/LANGUAGE_MODEL
  AC_DIR=$EXPE_DIR/ACOUSTIC_MODEL
  AC_DATA=$EXPE_DIR/ac_Data/
  LM_DATA=$EXPE_DIR/ling_Data/
  MFCC_DIR=$EXPE_DIR/MFCC
  MFCC_DIR_LOG=$EXPE_DIR/MFCC/log/

  # Quoted so that an experiment path containing whitespace stays one argument.
  mkdir -p "$LM_DATA" "$AC_DATA" "$MFCC_DIR" "$MFCC_DIR_LOG"
  
  TEXT=$LM_DATA/text
  # Start from scratch: the loop below appends to $TEXT. -f keeps a first run
  # (no previous file) from printing an error.
  rm -f -- "$TEXT"
  # preparing speech turns file  data and features
  # Every *.trs file found under $TRS_DIR is run through the conversion
  # pipeline and the result is appended to $TEXT. File names are read
  # NUL-delimited on fd 3 so that paths with whitespace or glob characters stay
  # intact and the stdin of the pipeline commands is left untouched.
  while IFS= read -r -d '' file <&3
  do
    # Earlier variant of the first stage (same options without -l), kept for reference:
    #trs2stm.pl $file  -rmt "noise,hes,pi,pibe,pers" -e "pronounce" -k  |\
    trs2stm.pl "$file"  -rmt "noise,hes,pi,pibe,pers" -e "pronounce" -l -k |\
    reacc_win2bdlex |\
    stm_2_kaldi_txt.pl  |\
    lia_map_words.pl  >> "$TEXT"
  done 3< <(find "$TRS_DIR" -name "*.trs" -print0)
  # Sort the transcript file in place.
  sort -o "$TEXT" "$TEXT"
  
  # Build $AC_DATA/segments from the first column of $TEXT.
  # That column is split on "#": field 1 is used as the audio name, fields 3
  # and 4 as start and end. Each output line is
  #   <segment> <audioname> <start/100> <end/100>
  # NOTE(review): the division by 100 presumably converts centiseconds to
  # seconds -- confirm against the output of stm_2_kaldi_txt.pl.
  awk '{
    segment = $1
    split(segment, S, "[#]")
    audioname = S[1]
    startf = S[3]
    endf = S[4]
    print segment " " audioname " " startf/100 " " endf/100
  }' < "$TEXT" > "$AC_DATA/segments"
  
  # __ preparing audio file list ____ #
  # For every distinct audio name (2nd column of the segments file) write
  # "<audioname> <path> [<path> ...]" to $AC_DATA/wav.scp, where the paths are
  # all files under $WAV_DIR whose name contains the audio name.
  cut -d" " -f2 "$AC_DATA/segments" | sort -u |
  while IFS= read -r rec; do
    # An empty name would turn the pattern into "**" and match every file.
    [ -n "$rec" ] || continue
    # Join all matches on one line, separated by single spaces.
    wavs=$(find "$WAV_DIR" -name "*${rec}*" | sort -u | paste -s -d ' ' -)
    if [ -z "$wavs" ]; then
      # The entry is still written (without a path); only the warning is new.
      echo "WARNING: no audio file matching '$rec' found under $WAV_DIR" >&2
    fi
    printf '%s\n' "$rec${wavs:+ $wavs}"
  done | sort -u > "$AC_DATA/wav.scp"
  
  # utt2spk: one "<segment> <speaker>" line per segment, the speaker being the
  # second "#"-separated field of the segment id. The input is redirected
  # directly so that a missing segments file makes the command fail and
  # triggers the exit.
  awk '{ split($1, id, "#"); print $1 " " id[2] }' < "$AC_DATA/segments" > "$AC_DATA/utt2spk" || exit 1

  # spk2utt: utt2spk sorted on the speaker column and converted by
  # utt2spk_to_spk2utt.pl.
  sort -k 2 "$AC_DATA/utt2spk" | utt2spk_to_spk2utt.pl > "$AC_DATA/spk2utt" || exit 1
  
  # --- feature extraction ---- #
  # Each command is echoed before it runs. A failing step aborts the script,
  # so CMVN statistics are never computed after a failed MFCC extraction.
  # NOTE(review): CONF_DIR and train_cmd are not set in this script; presumably
  # they come from the sourced path.sh / cmd.sh -- confirm.
  echo "===> make_mfcc.sh --nj $FORK  --mfcc-config $CONF_DIR/mfcc.conf  --cmd $train_cmd $AC_DATA $MFCC_DIR_LOG $MFCC_DIR"
  make_mfcc.sh --nj "$FORK"  --mfcc-config "$CONF_DIR/mfcc.conf"  --cmd "$train_cmd" "$AC_DATA" "$MFCC_DIR_LOG" "$MFCC_DIR" || exit 1
  echo "===> compute_cmvn_stats.sh $AC_DATA $MFCC_DIR_LOG $MFCC_DIR"
  compute_cmvn_stats.sh "$AC_DATA" "$MFCC_DIR_LOG" "$MFCC_DIR" || exit 1