#!/bin/bash
# 01_init_TEST_data_features_LIA.sh

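#=====================================================================#
# Data and feature preparation for the TEST set.
#
# Usage (the LIA_kaldiUtils helpers are sourced through the relative
# path ../LIA_kaldiUtils/, so run this from a directory whose parent
# contains LIA_kaldiUtils/):
#
#   bash 01_init_TEST_data_features_LIA.sh <EXPE_DIR> <TRS_DIR> <WAV_DIR>
#
# Example:
#   bash 01_init_TEST_data_features_LIA.sh  /local2/PERCOL/bigot/KALDI/EXP_GRNBL/ /local2/PERCOL/bigot/KALDI/MICRO_TEST/TRS/ /local2/PERCOL/bigot/KALDI/MICRO_TEST/WAV/
#=====================================================================#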

 
# Number of parallel jobs handed to make_mfcc.sh through --nj.
FORK=8

# All three positional arguments are mandatory. Without this check an
# omitted EXPE_DIR would make every path below resolve under "/"
# (e.g. /TEST/ling_Data/) and the script would carry on regardless.
if [ "$#" -lt 3 ] || [ -z "$1" ]; then
	echo "Usage: bash $0 <EXPE_DIR> <TRS_DIR> <WAV_DIR>" >&2
	exit 1
fi

EXPE_DIR=$1   # root directory of the experiment
TRS_DIR=$2    # directory searched for *.trs transcription files
WAV_DIR=$3    # directory searched for the matching audio files

# Environment helpers, resolved relative to the current working directory.
# NOTE(review): they are expected to provide the tool PATH, CONF_DIR and
# train_cmd used further down; their contents are not visible here.
# A missing helper is fatal: bash would otherwise only print an error and
# continue with an incomplete environment.
for helper in ../LIA_kaldiUtils/path.sh ../LIA_kaldiUtils/cmd.sh; do
	if [ ! -r "$helper" ]; then
		echo "ERROR: cannot read $helper (run this script from its own directory)" >&2
		exit 1
	fi
	. "$helper"
done

# Directory layout under <EXPE_DIR>/TEST.
# LM_DIR and AC_DIR are defined but not used in the visible part of this script.
LM_DIR=$EXPE_DIR/TEST/LANGUAGE_MODEL
AC_DIR=$EXPE_DIR/TEST/ACOUSTIC_MODEL
AC_DATA=$EXPE_DIR/TEST/ac_Data/
LM_DATA=$EXPE_DIR/TEST/ling_Data/
MFCC_DIR=$EXPE_DIR/TEST/MFCC
MFCC_DIR_LOG=$EXPE_DIR/TEST/MFCC/log/

# Quoted so that an EXPE_DIR containing whitespace yields four directories,
# not a handful of unrelated ones.
mkdir -p -- "$LM_DATA" "$AC_DATA" "$MFCC_DIR" "$MFCC_DIR_LOG" || exit 1

# Transcript file built from the .trs files; its first field on each line is
# later parsed as the utterance id.
TEXT=$LM_DATA/text
# Start from scratch. -f keeps a first run (no previous file) from printing
# an error.
rm -f -- "$TEXT"

# preparing speech turns file  data and features
# Every *.trs file found under TRS_DIR goes through the conversion chain and
# its output is appended to TEXT.
# File names are read NUL-delimited on fd 3: paths containing whitespace
# survive intact, and the converters keep the script's own stdin.
# NOTE(review): trs2stm.pl, reacc_win2bdlex, stm_2_kaldi_txt.pl and
# lia_map_words.pl are external tools found through PATH; what each stage
# does exactly cannot be seen from here.
while IFS= read -r -d '' trs_file <&3
do
	#trs2stm.pl $file  -rmt "noise,hes,pi,pibe,pers" -e "pronounce" -k | \
	trs2stm.pl "$trs_file"  -rmt "noise,hes,pi,pibe,pers" -e "pronounce" -l -k |
	reacc_win2bdlex |
	stm_2_kaldi_txt.pl |
	lia_map_words.pl >> "$TEXT"
done 3< <(find "$TRS_DIR" -name "*.trs" -print0)

# Nothing converted (no .trs file, or the tool chain failed): stop here
# instead of letting sort/cp and every later step fail one after another.
if [ ! -s "$TEXT" ]; then
	echo "ERROR: no transcript was produced from $TRS_DIR" >&2
	exit 1
fi

# Sort in place, then give the acoustic data directory its own copy.
sort -o "$TEXT" "$TEXT"
cp -- "$TEXT" "$AC_DATA/text"


# Build the segments file from the transcript.
# The first field of every TEXT line is the utterance id, laid out as
#   <audioname>#<speaker>#<start>#<end>
# and each output line is
#   <utterance-id> <audioname> <start/100> <end/100>
# NOTE(review): the division by 100 presumably converts 10 ms units into
# seconds; confirm against the tool that generates the ids.
# Both redirection targets are quoted so that paths containing whitespace do
# not trigger an "ambiguous redirect".
awk '{
	split($1, part, "#");
	print $1 " " part[1] " " part[3]/100 " " part[4]/100
}' < "$TEXT" > "$AC_DATA/segments"

# __ preparing audio file list ____ #
# wav.scp receives one line per distinct recording named in the second
# column of segments:
#   <recording> <audio path>
# The audio file is looked up below WAV_DIR by substring match on its name
# (*<recording>*). A recording that matches no file, or several files
# (e.g. an id that is a substring of another file name), still gets its line
# written exactly as before -- the id followed by every match -- but a
# warning is now printed on stderr, because such a line is not a valid
# single-path entry.
cut -d" " -f2 "$AC_DATA/segments" | sort -u |
while IFS= read -r rec_id; do
	# An empty id would turn the pattern into "**" and match every file.
	[ -n "$rec_id" ] || continue
	scp_line=$rec_id
	n_found=0
	while IFS= read -r wav_path; do
		scp_line+=" $wav_path"
		n_found=$((n_found + 1))
	done < <(find "$WAV_DIR" -name "*${rec_id}*" | sort -u)
	if [ "$n_found" -ne 1 ]; then
		echo "WARNING: $n_found audio files match '$rec_id' under $WAV_DIR (expected exactly 1)" >&2
	fi
	printf '%s\n' "$scp_line"
done | sort -u > "$AC_DATA/wav.scp"

# utt2spk: "<utterance-id> <speaker>", where the speaker is the second
# '#'-separated part of the utterance id (first column of segments).
# The file is read through a redirection rather than `cat |` so that a
# missing segments file makes the command fail and `|| exit 1` really fires.
awk '{ split($1, part, "#"); print $1 " " part[2] }' < "$AC_DATA/segments" > "$AC_DATA/utt2spk" || exit 1

# spk2utt: utt2spk ordered by speaker and handed to utt2spk_to_spk2utt.pl.
# pipefail is enabled inside a subshell only, so a failing sort is reported
# too without changing the options of the rest of the script.
( set -o pipefail; sort -k 2 "$AC_DATA/utt2spk" | utt2spk_to_spk2utt.pl > "$AC_DATA/spk2utt" ) || exit 1

# --- feature extraction ---- #
# NOTE(review): CONF_DIR and train_cmd are not defined in this script; they
# are expected to come from the sourced path.sh / cmd.sh.
# The inner quotes are escaped so the logged command shows --cmd "<value>"
# as it is really passed (the unescaped quotes used to end the string and
# leave $train_cmd unquoted).
echo "====> make_mfcc.sh --nj $FORK --mfcc-config $CONF_DIR/mfcc.conf   --cmd \"$train_cmd\" $AC_DATA $MFCC_DIR_LOG $MFCC_DIR"
# Stop if extraction fails: computing CMVN statistics without features
# would only add a second, more confusing error.
make_mfcc.sh --nj "$FORK" --mfcc-config "$CONF_DIR/mfcc.conf" --cmd "$train_cmd" "$AC_DATA" "$MFCC_DIR_LOG" "$MFCC_DIR" || exit 1

echo "======> compute_cmvn_stats.sh $AC_DATA $MFCC_DIR_LOG $MFCC_DIR"
# Last command of the script: its exit status becomes the script's status.
compute_cmvn_stats.sh "$AC_DATA" "$MFCC_DIR_LOG" "$MFCC_DIR"