Blame view
Scripts/01_init_TRAIN_data_features_LIA.sh
1.9 KB
ec85f8892
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
#!/bin/bash
#==============#
# Prepare the training data (transcripts, segments, wav list, speaker maps)
# and extract MFCC + CMVN features for it.
#
# Usage:
#   $0 <EXPE_DIR> <TRS_DIR> <WAV_DIR>
# Example:
#   $0 /local_disk/hera2/PERCOL/bigot/KALDI/EXPE_REPERE_P1/ \
#      /local_disk/hera2/REPERE/Database/Phase1/train/trs_corrige/ \
#      /local_disk/hera2/REPERE/Database/Phase1/train/wav/
#
# Arguments:
#   $1  EXPE_DIR  experiment root; all outputs are written below it
#   $2  TRS_DIR   directory searched recursively for *.trs transcripts
#   $3  WAV_DIR   directory searched recursively for the audio files
#
# Outputs (under EXPE_DIR):
#   ling_Data/text, ac_Data/{segments,wav.scp,utt2spk,spk2utt}, MFCC/
#
# Must be run from a directory where ../LIA_kaldiUtils/ is reachable, since
# path.sh and cmd.sh are sourced through that relative path.

if [[ $# -lt 3 ]]; then
  echo "usage: $0 <EXPE_DIR> <TRS_DIR> <WAV_DIR>" >&2
  exit 1
fi

# Number of parallel jobs handed to make_mfcc.sh (--nj).
FORK=8

EXPE_DIR=$1
TRS_DIR=$2
WAV_DIR=$3

# NOTE(review): CONF_DIR and train_cmd are used below but never set in this
# script; presumably they come from these two sourced files -- confirm.
. ../LIA_kaldiUtils/path.sh
. ../LIA_kaldiUtils/cmd.sh

# LM_DIR and AC_DIR are not used in this script; kept in case the sourced
# helpers or later steps rely on them.
LM_DIR=$EXPE_DIR/LANGUAGE_MODEL
AC_DIR=$EXPE_DIR/ACOUSTIC_MODEL
AC_DATA=$EXPE_DIR/ac_Data/
LM_DATA=$EXPE_DIR/ling_Data/
MFCC_DIR=$EXPE_DIR/MFCC
MFCC_DIR_LOG=$EXPE_DIR/MFCC/log/

mkdir -p -- "$LM_DATA" "$AC_DATA" "$MFCC_DIR" "$MFCC_DIR_LOG" || exit 1

TEXT=$LM_DATA/text
# -f: the file does not exist on a first run; that is not an error.
rm -f -- "$TEXT"

# preparing speech turns file data and features
# The file list is read NUL-delimited on fd 3 so that paths containing
# whitespace survive and the pipeline's own stdin is left untouched.
while IFS= read -r -d '' trs_file <&3; do
  # Earlier variant, without -l:
  #trs2stm.pl $file -rmt "noise,hes,pi,pibe,pers" -e "pronounce" -k |\
  trs2stm.pl "$trs_file" -rmt "noise,hes,pi,pibe,pers" -e "pronounce" -l -k \
    | reacc_win2bdlex \
    | stm_2_kaldi_txt.pl \
    | lia_map_words.pl >> "$TEXT"
done 3< <(find "$TRS_DIR" -name "*.trs" -print0)

# Nothing below can work without transcripts, so stop early with a message.
if [[ ! -s "$TEXT" ]]; then
  echo "$0: no transcript text produced from $TRS_DIR" >&2
  exit 1
fi

# NOTE(review): Kaldi tools expect C-locale ordering; presumably path.sh
# exports LC_ALL=C -- confirm.
sort -o "$TEXT" "$TEXT"

# The first field of each text line is the segment id, laid out as
#   <audioname>#<speaker>#<start>#<end>
# start/end are divided by 100 (presumably centiseconds -> seconds; confirm).
awk '{
  segment = $1
  split(segment, S, "[#]")
  audioname = S[1]
  startf = S[3]
  endf = S[4]
  print segment " " audioname " " startf/100 " " endf/100
}' < "$TEXT" > "$AC_DATA/segments"

# __ preparing audio file list ____ #
# One line per distinct audio name: "<audioname> <matching path(s)>".
cut -d" " -f2 "$AC_DATA/segments" | sort -u \
  | while IFS= read -r audio; do
      [[ -n "$audio" ]] || continue
      # Unquoted on purpose: joins every match onto the same output line.
      # shellcheck disable=SC2046
      echo "$audio" $(find "$WAV_DIR" -name "*${audio}*" | sort -u)
    done \
  | sort -u > "$AC_DATA/wav.scp"

# utt2spk: "<segment id> <speaker>", the speaker being the 2nd '#' field.
awk '{ split($1, b, "#"); print $1 " " b[2] }' "$AC_DATA/segments" \
  > "$AC_DATA/utt2spk" || exit 1
sort -k 2 "$AC_DATA/utt2spk" | utt2spk_to_spk2utt.pl \
  > "$AC_DATA/spk2utt" || exit 1

# --- feature extraction ---- #
echo "===> make_mfcc.sh --nj $FORK --mfcc-config $CONF_DIR/mfcc.conf --cmd \"$train_cmd\" $AC_DATA $MFCC_DIR_LOG $MFCC_DIR"
make_mfcc.sh --nj "$FORK" --mfcc-config "$CONF_DIR/mfcc.conf" \
  --cmd "$train_cmd" "$AC_DATA" "$MFCC_DIR_LOG" "$MFCC_DIR" || exit 1

echo "===> compute_cmvn_stats.sh $AC_DATA $MFCC_DIR_LOG $MFCC_DIR"
compute_cmvn_stats.sh "$AC_DATA" "$MFCC_DIR_LOG" "$MFCC_DIR"