01_init_TRAIN_data_features_LIA.sh
1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/bin/bash
#==============#
#for train data #
# Prepares the training data of an experiment: sets up the output directories
# below the experiment root and resets the transcription text file that the
# rest of this script fills in.
#
# usage : $0 <EXPE_DIR> <TRS_DIR> <WAV_DIR>
#   EXPE_DIR : experiment root; output directories are created below it
#   TRS_DIR  : directory searched for *.trs transcription files
#   WAV_DIR  : directory searched for the audio files
# example : $0 /local_disk/hera2/PERCOL/bigot/KALDI/EXPE_REPERE_P1/ /local_disk/hera2/REPERE/Database/Phase1/train/trs_corrige/ /local_disk/hera2/REPERE/Database/Phase1/train/wav/

# Refuse to run without the three arguments: with empty values the paths
# below would resolve to /ac_Data, /ling_Data, ... at the filesystem root.
if [ "$#" -lt 3 ]; then
  echo "usage : $0 <EXPE_DIR> <TRS_DIR> <WAV_DIR>" >&2
  exit 1
fi

# Number of parallel jobs handed to make_mfcc.sh (--nj).
FORK=8
EXPE_DIR=$1
TRS_DIR=$2
WAV_DIR=$3

# The environment files are looked up relative to the current directory.
# Their readability is checked up front (rather than testing the status of
# '.', which is the status of the last command inside the sourced file).
for env_file in ../LIA_kaldiUtils/path.sh ../LIA_kaldiUtils/cmd.sh; do
  if [ ! -r "$env_file" ]; then
    echo "$0: cannot read $env_file (path is relative to the current directory)" >&2
    exit 1
  fi
done
# NOTE(review): CONF_DIR and train_cmd, used further down, are presumably
# defined by these two files - confirm.
. ../LIA_kaldiUtils/path.sh
. ../LIA_kaldiUtils/cmd.sh

LM_DIR=$EXPE_DIR/LANGUAGE_MODEL
AC_DIR=$EXPE_DIR/ACOUSTIC_MODEL
AC_DATA=$EXPE_DIR/ac_Data/
LM_DATA=$EXPE_DIR/ling_Data/
MFCC_DIR=$EXPE_DIR/MFCC
MFCC_DIR_LOG=$EXPE_DIR/MFCC/log/
mkdir -p "$LM_DATA" "$AC_DATA" "$MFCC_DIR" "$MFCC_DIR_LOG" || exit 1

TEXT=$LM_DATA/text
# Start from an empty text file; -f so that a first run, where the file does
# not exist yet, is not reported as an error.
rm -f -- "$TEXT"
# preparing speech turns file data and features
# Every *.trs file found under $TRS_DIR is pushed through the conversion
# pipeline below and the result is appended to $TEXT.
# NOTE(review): judging by the tool names the pipeline goes TRS -> STM ->
# Kaldi text -> word mapping; the tools themselves are not visible here.
#
# File names are read NUL-delimited so that paths containing blanks or glob
# characters are passed intact (requires bash). The list arrives on fd 3 so
# the commands inside the loop keep the script's own stdin.
while IFS= read -r -d '' trs_file <&3
do
  #trs2stm.pl $file -rmt "noise,hes,pi,pibe,pers" -e "pronounce" -k |\
  trs2stm.pl "$trs_file" -rmt "noise,hes,pi,pibe,pers" -e "pronounce" -l -k |
    reacc_win2bdlex |
    stm_2_kaldi_txt.pl |
    lia_map_words.pl >> "$TEXT"
done 3< <(find "$TRS_DIR" -name "*.trs" -print0)

# Without any text the following steps can only fail in confusing ways.
if [ ! -s "$TEXT" ]; then
  echo "$0: no transcription text was produced from $TRS_DIR" >&2
  exit 1
fi
sort -o "$TEXT" -- "$TEXT" || exit 1
# Turn text lines (stdin) into segment lines (stdout).
# The first field of each input line is the segment id, laid out as
#   <audioname>#<speaker>#<start>#<end>
# and the output line is
#   <segment id> <audioname> <start/100> <end/100>
# The two values are formatted explicitly with %.12g: plain awk string
# concatenation goes through CONVFMT (%.6g by default), which silently drops
# the hundredths once a value reaches 10000 (12345.67 would become 12345.7).
text_to_segments() {
  awk '{
    split($1, id_parts, "[#]");
    printf "%s %s %.12g %.12g\n", $1, id_parts[1], id_parts[3] / 100, id_parts[4] / 100
  }'
}
text_to_segments < "$TEXT" > "$AC_DATA/segments"
# __ preparing audio file list ____ #
# Reads recording names on stdin (one per line) and prints, for each of them,
# "<name> <path ...>" where the paths are the files below directory $1 whose
# base name contains <name>. When no file matches, the bare name is printed.
# A recording that does not resolve to exactly one file is reported on stderr,
# since such a line is not a usable "<name> <path>" entry.
build_wav_scp() {
  local wav_dir=$1
  local rec_id matches match_count joined
  while IFS= read -r rec_id; do
    [ -n "$rec_id" ] || continue
    matches=$(find "$wav_dir" -name "*${rec_id}*" | sort -u)
    if [ -z "$matches" ]; then
      match_count=0
    else
      match_count=$(printf '%s\n' "$matches" | grep -c '')
    fi
    if [ "$match_count" -ne 1 ]; then
      echo "warning: $match_count audio files match '$rec_id' under $wav_dir" >&2
    fi
    if [ -n "$matches" ]; then
      # Several matches are kept on one line, separated by single blanks.
      joined=$(printf '%s' "$matches" | tr '\n' ' ')
      printf '%s %s\n' "$rec_id" "$joined"
    else
      printf '%s\n' "$rec_id"
    fi
  done
}
# The recording names are the second column of the segments file.
cut -d" " -f2 "$AC_DATA/segments" | sort -u |
  build_wav_scp "$WAV_DIR" |
  sort -u > "$AC_DATA/wav.scp"
# utt2spk: one "<segment id> <speaker>" line per segment, the speaker being
# the second '#'-separated part of the segment id.
# awk reads the file itself: behind "cat file | awk", a missing or unreadable
# segments file would go unnoticed because only the status of the last
# pipeline stage reaches "|| exit 1".
awk '{ split($1, id_parts, "#"); print $1 " " id_parts[2] }' "$AC_DATA/segments" > "$AC_DATA/utt2spk" || exit 1;
# spk2utt is derived from utt2spk after ordering it on the speaker column.
sort -k 2 "$AC_DATA/utt2spk" | utt2spk_to_spk2utt.pl > "$AC_DATA/spk2utt" || exit 1;
# --- feature extraction ---- #
# Runs make_mfcc.sh and then compute_cmvn_stats.sh on $AC_DATA, with logs in
# $MFCC_DIR_LOG and output in $MFCC_DIR. Each command line is echoed first.
# The script stops as soon as one step fails, so that the CMVN step is never
# attempted after a failed MFCC step.
# NOTE(review): CONF_DIR and train_cmd are not set in this script; they
# presumably come from the sourced path.sh / cmd.sh - confirm.
echo "===> make_mfcc.sh --nj $FORK --mfcc-config $CONF_DIR/mfcc.conf --cmd $train_cmd $AC_DATA $MFCC_DIR_LOG $MFCC_DIR"
make_mfcc.sh --nj "$FORK" --mfcc-config "$CONF_DIR/mfcc.conf" --cmd "$train_cmd" "$AC_DATA" "$MFCC_DIR_LOG" "$MFCC_DIR" || exit 1
echo "===> compute_cmvn_stats.sh $AC_DATA $MFCC_DIR_LOG $MFCC_DIR"
compute_cmvn_stats.sh "$AC_DATA" "$MFCC_DIR_LOG" "$MFCC_DIR" || exit 1