Blame view

egs/aishell2/s5/local/prepare_data.sh 2.16 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
  #!/bin/bash
  # Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
  #           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
  # Apache 2.0
  
  # transform raw AISHELL-2 data to kaldi format
  
  # Source the recipe's path.sh from the current directory (its contents are not
  # visible here); abort if it is missing or fails.
  . ./path.sh || exit 1;

  # Exactly four positional arguments are required.
  if [ $# -ne 4 ]; then
    echo "Usage: $0 <corpus-data-dir> <dict-dir> <tmp-dir> <output-dir>"
    echo " $0 /export/AISHELL-2/iOS/train data/local/dict data/local/train data/train"
    exit 1;
  fi

  corpus=$1      # directory holding the raw wav.scp and trans.txt
  dict_dir=$2    # directory holding lexicon.txt
  tmp=$3         # scratch directory for intermediate files
  dir=$4         # output directory for the final data files

  echo "prepare_data.sh: Preparing data in $corpus"

  # Quoted so that paths containing whitespace or glob characters stay intact;
  # stop early if either directory cannot be created.
  mkdir -p "$tmp" "$dir" || exit 1
  
  # corpus check: the corpus directory must exist and contain both wav.scp and
  # trans.txt. Expansions are quoted so an empty or whitespace-containing path
  # is tested as a single operand instead of being word-split by the shell.
  if [ ! -d "$corpus" ] || [ ! -f "$corpus/wav.scp" ] || [ ! -f "$corpus/trans.txt" ]; then
    echo "Error: $0 requires wav.scp and trans.txt under $corpus directory."
    exit 1;
  fi
  
  # validate utt-key list
  # Extract the first whitespace-separated field (the utterance key) of each
  # line of wav.scp and trans.txt.
  awk '{print $1}' "$corpus/wav.scp"   > "$tmp/wav_utt.list"
  awk '{print $1}' "$corpus/trans.txt" > "$tmp/trans_utt.list"
  # NOTE(review): utils/filter_scp.pl is not shown here; it is expected to keep
  # the lines of its second file whose field 1 appears in the first file, so
  # utt.list should hold keys present in both inputs -- confirm against the utility.
  # Every later step filters against utt.list, so stop if it cannot be built.
  utils/filter_scp.pl -f 1 "$tmp/wav_utt.list" "$tmp/trans_utt.list" > "$tmp/utt.list" || exit 1
  
  # wav.scp
  # Rewrite each tab-separated "<key>\t<relative-path>" line of the corpus
  # wav.scp as "<key>\t<corpus>/<relative-path>". The record terminator must be
  # the escape sequence \n: a raw line break inside an awk string literal is an
  # unterminated string and makes awk reject the program.
  awk -F'\t' -v path_prefix="$corpus" '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' \
    "$corpus/wav.scp" > "$tmp/tmp_wav.scp"
  # Keep only the validated keys, then sort and de-duplicate.
  utils/filter_scp.pl -f 1 "$tmp/utt.list" "$tmp/tmp_wav.scp" | sort -k 1 | uniq > "$tmp/wav.scp"
  
  # text
  # jieba is required by the word segmentation step below. The check runs in
  # the current shell: an `exit` inside a ( ... ) subshell would only leave the
  # subshell and let the script carry on without jieba.
  if ! python -c "import jieba" 2>/dev/null; then
    echo "jieba is not found. Use tools/extra/install_jieba.sh to install it."
    exit 1;
  fi
  # Transcripts restricted to the validated keys, sorted and de-duplicated.
  utils/filter_scp.pl -f 1 "$tmp/utt.list" "$corpus/trans.txt" | sort -k 1 | uniq > "$tmp/trans.txt"
  # jieba's vocab format requires word count(frequency), set to 99
  awk '{print $1}' "$dict_dir/lexicon.txt" | sort | uniq | awk '{print $1,99}'> "$tmp/word_seg_vocab.txt"
  # Segment the transcripts with the lexicon-derived vocabulary; abort if the
  # segmenter fails so that a truncated text file is not carried forward.
  python local/word_segmentation.py "$tmp/word_seg_vocab.txt" "$tmp/trans.txt" > "$tmp/text" || exit 1
  
  # utt2spk & spk2utt
  # Take the wav path (second tab-separated field) of every wav.scp entry.
  awk -F'\t' '{print $2}' "$tmp/wav.scp" > "$tmp/wav.list"
  # Strip ".wav", then emit "<last path component>\t<second-to-last component>",
  # i.e. the file stem as utterance key and its parent directory name as speaker.
  # The record terminator must be the escape sequence \n: a raw line break
  # inside an awk string literal is an unterminated string.
  sed -e 's:\.wav::g' "$tmp/wav.list" | \
    awk -F'/' '{i=NF-1;printf("%s\t%s\n",$NF,$i)}' > "$tmp/tmp_utt2spk"
  # Keep only the validated keys, then sort and de-duplicate.
  utils/filter_scp.pl -f 1 "$tmp/utt.list" "$tmp/tmp_utt2spk" | sort -k 1 | uniq > "$tmp/utt2spk"
  utils/utt2spk_to_spk2utt.pl "$tmp/utt2spk" | sort -k 1 | uniq > "$tmp/spk2utt"
  
  # copy prepared resources from tmp_dir to target dir
  # Paths are quoted so directories containing whitespace or glob characters are
  # passed to mkdir/cp as single arguments; any failure aborts the script.
  mkdir -p "$dir" || exit 1
  for f in wav.scp text spk2utt utt2spk; do
    cp "$tmp/$f" "$dir/$f" || exit 1;
  done

  echo "local/prepare_data.sh succeeded"
  exit 0;