#!/bin/bash
# 01_init_TRAIN_data_features_LIA.sh

#================#
# for train data #
# usage   : $0 <EXPE_DIR> <TRS_DIR> <WAV_DIR>
# example : $0 /local_disk/hera2/PERCOL/bigot/KALDI/EXPE_REPERE_P1/ /local_disk/hera2/REPERE/Database/Phase1/train/trs_corrige/ /local_disk/hera2/REPERE/Database/Phase1/train/wav/

 
# Number of parallel jobs passed to make_mfcc.sh through --nj.
FORK=8

# All three positional arguments are required: with an empty EXPE_DIR the
# directories below would be created at the filesystem root.
if [ "$#" -lt 3 ]; then
	echo "usage: $0 EXPE_DIR TRS_DIR WAV_DIR" >&2
	exit 1
fi

EXPE_DIR=$1   # root directory of the experiment (outputs are written below it)
TRS_DIR=$2    # directory searched for *.trs transcription files
WAV_DIR=$3    # directory searched for the audio files

# The environment scripts are addressed relative to the current working
# directory; stop early when they cannot be found instead of running every
# later step without them.
for env_script in ../LIA_kaldiUtils/path.sh ../LIA_kaldiUtils/cmd.sh
do
	if [ ! -f "$env_script" ]; then
		echo "$0: cannot find $env_script from $(pwd)" >&2
		exit 1
	fi
done
. ../LIA_kaldiUtils/path.sh
. ../LIA_kaldiUtils/cmd.sh


# Layout of the experiment directory.
LM_DIR=$EXPE_DIR/LANGUAGE_MODEL
AC_DIR=$EXPE_DIR/ACOUSTIC_MODEL
AC_DATA=$EXPE_DIR/ac_Data/
LM_DATA=$EXPE_DIR/ling_Data/
MFCC_DIR=$EXPE_DIR/MFCC
MFCC_DIR_LOG=$EXPE_DIR/MFCC/log/

mkdir -p "$LM_DATA" "$AC_DATA" "$MFCC_DIR" "$MFCC_DIR_LOG" || exit 1

# Transcript file assembled from every .trs file; the first field of each
# line is later parsed as a '#'-separated segment id.
TEXT=$LM_DATA/text

# preparing speech turns file  data and features
# Every *.trs file found under $TRS_DIR goes through the conversion pipeline
# and the combined output of all files becomes $TEXT. Redirecting the whole
# loop truncates a previous $TEXT, so no separate rm (which complains on a
# first run, when the file does not exist yet) is needed.
# find's NUL-delimited output is read on fd 3 so that paths containing
# whitespace stay intact and the commands in the loop keep the script's stdin.
while IFS= read -r -d '' trs_file <&3
do
	# Variant without -l, kept for reference:
	#trs2stm.pl "$trs_file"  -rmt "noise,hes,pi,pibe,pers" -e "pronounce" -k  |\
	trs2stm.pl "$trs_file"  -rmt "noise,hes,pi,pibe,pers" -e "pronounce" -l -k |
	reacc_win2bdlex |
	stm_2_kaldi_txt.pl |
	lia_map_words.pl
done 3< <(find "$TRS_DIR" -name "*.trs" -print0) > "$TEXT"

# NOTE(review): Kaldi tools expect C-locale ordering; presumably path.sh
# exports LC_ALL=C -- confirm.
sort -o "$TEXT" "$TEXT"

# Derive the segments table from the transcript file.
# The first field of every input line is a segment id of the form
#   <audioname>#<speaker>#<start>#<end>
# and every output line is
#   <segment-id> <audioname> <start/100> <end/100>
# (start and end are presumably centiseconds converted to seconds -- confirm).
# Arguments: $1 - transcript file to read
#            $2 - segments file to write
make_segments() {
	awk '{
		split($1, part, "#");
		print $1 " " part[1] " " part[3]/100 " " part[4]/100
	}' < "$1" > "$2"
}

make_segments "$TEXT" "$AC_DATA/segments"

# __ preparing audio file list ____ #
# Write one line per distinct recording name (2nd column of the segments
# file): the name followed by the sorted path(s) found under the audio
# directory whose file name contains that name.
# Exactly one match per recording is the useful case; zero or several
# matches are reported on stderr because they yield a line that is not a
# plain "<name> <path>" pair. The line itself is still written as before.
# Arguments: $1 - segments file to read
#            $2 - directory searched for audio files
#            $3 - wav.scp file to write
make_wav_scp() {
	cut -d" " -f2 "$1" | sort -u |
	while IFS= read -r rec_id
	do
		# An empty name would make the pattern match every file.
		[ -n "$rec_id" ] || continue
		found=$(find "$2" -name "*${rec_id}*" | sort -u)
		n_found=$(printf '%s' "$found" | grep -c '')
		if [ "$n_found" -ne 1 ]; then
			echo "WARNING: $n_found audio files match '*${rec_id}*' under $2 (expected 1)" >&2
		fi
		# Join several matches with single spaces on one line.
		found=$(printf '%s' "$found" | tr '\n' ' ')
		printf '%s\n' "$rec_id${found:+ $found}"
	done | sort -u > "$3"
}

make_wav_scp "$AC_DATA/segments" "$WAV_DIR" "$AC_DATA/wav.scp"

# utt2spk: one "<segment-id> <speaker>" line per segment, the speaker being
# the 2nd '#'-separated part of the segment id.
# awk reads the file directly (no cat), so a missing or unreadable segments
# file makes the command fail and triggers the exit below.
awk '{ split($1, id_part, "#"); print $1 " " id_part[2] }' "$AC_DATA/segments" > "$AC_DATA/utt2spk" || exit 1

# spk2utt: utt2spk sorted on the speaker column, then converted by
# utt2spk_to_spk2utt.pl.
sort -k 2 "$AC_DATA/utt2spk" | utt2spk_to_spk2utt.pl > "$AC_DATA/spk2utt" || exit 1

# --- feature extraction ---- #
# Run make_mfcc.sh and then compute_cmvn_stats.sh on the data directory,
# echoing each command line first. Either step failing stops the script, so
# CMVN statistics are never computed after a failed feature extraction.
# CONF_DIR and train_cmd are not set in this file; presumably they come from
# the sourced path.sh / cmd.sh -- confirm.
echo "===> make_mfcc.sh --nj $FORK  --mfcc-config $CONF_DIR/mfcc.conf  --cmd $train_cmd $AC_DATA $MFCC_DIR_LOG $MFCC_DIR"
make_mfcc.sh --nj "$FORK" --mfcc-config "$CONF_DIR/mfcc.conf" --cmd "$train_cmd" "$AC_DATA" "$MFCC_DIR_LOG" "$MFCC_DIR" || exit 1
echo "===> compute_cmvn_stats.sh $AC_DATA $MFCC_DIR_LOG $MFCC_DIR"
compute_cmvn_stats.sh "$AC_DATA" "$MFCC_DIR_LOG" "$MFCC_DIR" || exit 1