Blame view
egs/tunisian_msa/s5/local/prepare_data.sh
3.98 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
#!/bin/bash

# Copyright 2018 John Morgan
# Apache 2.0.

# Builds the devtest, train and test data directories for the Tunisian MSA
# recipe from the corpus found under ./Tunisian_MSA/data.
#
# Usage: local/prepare_data.sh
#   Takes no arguments. Run it from the recipe directory: the helper scripts
#   are invoked through the relative paths local/ and utils/, and the corpus
#   is looked up under the current working directory.
#
# Writes:
#   data/devtest, data/train, data/test
#       each with wav.scp, utt2spk, text and spk2utt
#   data/local/tmp/{tunis,libyan}
#       intermediate wav file lists and per-source lists

# configuration variables
tmpdir=data/local/tmp
download_dir=$(pwd)
tmp_tunis=$tmpdir/tunis
tmp_libyan=$tmpdir/libyan
data_dir=$download_dir/Tunisian_MSA/data
# location of test data
libyan_src=$data_dir/speech/test/Libyan_MSA
# end of configuration variable settings

# process the Tunisian MSA devtest data
mkdir -p data/devtest
# Start from empty lists: the loop below appends, so without this a second
# run of the script would add every entry again on top of the previous run.
for x in wav.scp utt2spk text; do
  : > "data/devtest/$x"
done

# get list of wav files
for s in devtest/CTELLONE/Recordings_Arabic/6 devtest/CTELLTHREE/Recordings_Arabic/10; do
  echo "$0: looking for wav files for $s."
  mkdir -p "$tmp_tunis/$s"
  find "$data_dir/speech/$s" -type f -name "*.wav" \
    | grep Recordings_Arabic > "$tmp_tunis/$s/wav.txt"

  local/devtest_recordings_make_lists.pl \
    "$data_dir/transcripts/devtest/recordings.tsv" "$s" tunis

  for x in wav.scp utt2spk text; do
    # Convert tabs to spaces. The script as received had `tr " " " "`, which
    # is a no-op; the tab appears to have been lost when whitespace was
    # flattened. Assumes the *_make_lists.pl helpers emit tab-separated
    # fields -- TODO confirm.
    tr '\t' ' ' < "$tmp_tunis/$s/$x" >> "data/devtest/$x"
  done
done

utils/utt2spk_to_spk2utt.pl data/devtest/utt2spk | sort > data/devtest/spk2utt

utils/fix_data_dir.sh data/devtest

# training data consists of 2 parts: answers and recordings (recited)
answers_transcripts=$data_dir/transcripts/train/answers.tsv
recordings_transcripts=$data_dir/transcripts/train/recordings.tsv

# location of test data
# The three Libyan paths are not read below; the Libyan loop builds the same
# paths from its loop variable. They are kept as a reference of the layout.
cls_rec_tr=$libyan_src/cls/data/transcripts/recordings/cls_recordings.tsv
lfi_rec_tr=$libyan_src/lfi/data/transcripts/recordings/lfi_recordings.tsv
srj_rec_tr=$libyan_src/srj/data/transcripts/recordings/srj_recordings.tsv
mbt_rec_tr=$data_dir/transcripts/test/mbt/recordings/mbt_recordings.tsv

# make acoustic model training lists
mkdir -p "$tmp_tunis"

# get wav file names

# for recited speech
# the data collection laptops had names like CTELLONE CTELLTWO ...
# The whole loop is redirected with '>' so the list is rebuilt from scratch
# on every run instead of growing with each invocation.
for machine in CTELLONE CTELLTWO CTELLTHREE CTELLFOUR CTELLFIVE; do
  find "$data_dir/speech/train/$machine" -type f -name "*.wav" \
    | grep Recordings
done > "$tmp_tunis/recordings_wav.txt"

# get file names for Answers
for machine in CTELLONE CTELLTWO CTELLTHREE CTELLFOUR CTELLFIVE; do
  find "$data_dir/speech/train/$machine" -type f -name "*.wav" \
    | grep Answers
done > "$tmp_tunis/answers_wav.txt"

# make separate transcription lists for answers and recordings
# LC_ALL stays exported for every command that follows in this script.
export LC_ALL=en_US.UTF-8
local/answers_make_lists.pl "$answers_transcripts"

utils/fix_data_dir.sh "$tmp_tunis/answers"

local/recordings_make_lists.pl "$recordings_transcripts"

utils/fix_data_dir.sh "$tmp_tunis/recordings"

# consolidate lists
# acoustic models will be trained on both recited and prompted speech
mkdir -p "$tmp_tunis/lists"

for x in wav.scp utt2spk text; do
  cat "$tmp_tunis/answers/$x" "$tmp_tunis/recordings/$x" > "$tmp_tunis/lists/$x"
done

utils/fix_data_dir.sh "$tmp_tunis/lists"

# get training lists
mkdir -p data/train
for x in wav.scp utt2spk text; do
  # tab -> space, see the note in the devtest loop above
  sort "$tmp_tunis/lists/$x" | tr '\t' ' ' > "data/train/$x"
done

utils/utt2spk_to_spk2utt.pl data/train/utt2spk | sort > data/train/spk2utt

utils/fix_data_dir.sh data/train

# process the Libyan MSA data
mkdir -p "$tmp_libyan"

for s in cls lfi srj; do
  mkdir -p "$tmp_libyan/$s"

  # get list of wav files
  find "$libyan_src/$s" -type f -name "*.wav" \
    | grep recordings > "$tmp_libyan/$s/recordings_wav.txt"

  echo "$0: making recordings list for $s"
  local/test_recordings_make_lists.pl \
    "$libyan_src/$s/data/transcripts/recordings/${s}_recordings.tsv" "$s" libyan
done

# process the Tunisian MSA test data

mkdir -p "$tmp_tunis/mbt"

# get list of wav files
find "$data_dir/speech/test/mbt" -type f -name "*.wav" \
  | grep recordings > "$tmp_tunis/mbt/recordings_wav.txt"

echo "$0: making recordings list for mbt"
local/test_recordings_make_lists.pl "$mbt_rec_tr" mbt tunis

mkdir -p data/test
# Reset the test lists before appending, for the same reason as devtest.
for x in wav.scp utt2spk text; do
  : > "data/test/$x"
done

# get the Libyan files
for s in cls lfi srj; do
  for x in wav.scp utt2spk text; do
    tr '\t' ' ' < "$tmp_libyan/$s/recordings/$x" >> "data/test/$x"
  done
done

# add the Tunisian mbt files
for x in wav.scp utt2spk text; do
  tr '\t' ' ' < "$tmp_tunis/mbt/recordings/$x" >> "data/test/$x"
done

utils/utt2spk_to_spk2utt.pl data/test/utt2spk | sort > data/test/spk2utt

utils/fix_data_dir.sh data/test