Blame view
egs/aishell2/s5/local/prepare_data.sh
2.16 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
#!/bin/bash
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0

# Transform raw AISHELL-2 data to kaldi format.
#
# Usage: prepare_data.sh <corpus-data-dir> <dict-dir> <tmp-dir> <output-dir>
#   <corpus-data-dir>  directory that must contain wav.scp and trans.txt
#   <dict-dir>         directory whose lexicon.txt supplies the segmentation vocab
#   <tmp-dir>          working directory for intermediate files (created if missing)
#   <output-dir>       receives wav.scp, text, utt2spk and spk2utt (created if missing)
#
# Exits 1 on a usage error, a missing corpus file, a missing jieba module,
# a failed word segmentation or a failed copy; exits 0 otherwise.

. ./path.sh || exit 1;

if [ $# -ne 4 ]; then
  echo "Usage: $0 <corpus-data-dir> <dict-dir> <tmp-dir> <output-dir>"
  echo " $0 /export/AISHELL-2/iOS/train data/local/dict data/local/train data/train"
  exit 1;
fi

corpus=$1
dict_dir=$2
tmp=$3
dir=$4

echo "prepare_data.sh: Preparing data in $corpus"

mkdir -p "$tmp"
mkdir -p "$dir"

# corpus check
if [ ! -d "$corpus" ] || [ ! -f "$corpus/wav.scp" ] || [ ! -f "$corpus/trans.txt" ]; then
  echo "Error: $0 requires wav.scp and trans.txt under $corpus directory."
  exit 1;
fi

# validate utt-key list: keep only the keys of trans.txt that also occur in wav.scp
awk '{print $1}' "$corpus/wav.scp" > "$tmp/wav_utt.list"
awk '{print $1}' "$corpus/trans.txt" > "$tmp/trans_utt.list"
utils/filter_scp.pl -f 1 "$tmp/wav_utt.list" "$tmp/trans_utt.list" > "$tmp/utt.list"

# wav.scp: prefix the second (tab-separated) field with the corpus directory.
# Each record must end in a newline so that the line-oriented filter/sort/uniq
# steps below see one utterance per line.
awk -F'\t' -v path_prefix="$corpus" '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' \
  "$corpus/wav.scp" > "$tmp/tmp_wav.scp"
utils/filter_scp.pl -f 1 "$tmp/utt.list" "$tmp/tmp_wav.scp" | sort -k 1 | uniq > "$tmp/wav.scp"

# text
# The check is an `if` in the main shell on purpose: an `exit` placed inside a
# ( ... ) subshell would only leave the subshell and the script would continue.
if ! python -c "import jieba" 2>/dev/null; then
  echo "jieba is not found. Use tools/extra/install_jieba.sh to install it."
  exit 1;
fi
utils/filter_scp.pl -f 1 "$tmp/utt.list" "$corpus/trans.txt" | sort -k 1 | uniq > "$tmp/trans.txt"
# jieba's vocab format requires word count(frequency), set to 99
awk '{print $1}' "$dict_dir/lexicon.txt" | sort | uniq | awk '{print $1,99}' > "$tmp/word_seg_vocab.txt"
# Stop here if segmentation fails; otherwise a truncated text file would be
# copied to the output directory below.
python local/word_segmentation.py "$tmp/word_seg_vocab.txt" "$tmp/trans.txt" > "$tmp/text" || exit 1;

# utt2spk & spk2utt
# After removing every ".wav" from the path, the last path component is printed
# as the utterance key and the directory just above it as the speaker.
awk -F'\t' '{print $2}' "$tmp/wav.scp" > "$tmp/wav.list"
sed -e 's:\.wav::g' "$tmp/wav.list" | \
  awk -F'/' '{i=NF-1;printf("%s\t%s\n",$NF,$i)}' > "$tmp/tmp_utt2spk"
utils/filter_scp.pl -f 1 "$tmp/utt.list" "$tmp/tmp_utt2spk" | sort -k 1 | uniq > "$tmp/utt2spk"
utils/utt2spk_to_spk2utt.pl "$tmp/utt2spk" | sort -k 1 | uniq > "$tmp/spk2utt"

# copy prepared resources from tmp_dir to target dir
mkdir -p "$dir"
for f in wav.scp text spk2utt utt2spk; do
  cp "$tmp/$f" "$dir/$f" || exit 1;
done

echo "local/prepare_data.sh succeeded"
exit 0;