Blame view
egs/gale_mandarin/s5/local/gale_data_prep_txt.sh
3.36 KB
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
#!/bin/bash

# Copyright 2014 (author: Ahmed Ali, Hainan Xu)
# Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal)
# Apache 2.0

# Prepares the transcript side of the GALE Mandarin data.
#
# Usage: local/gale_data_prep_txt.sh <corpus.tgz|corpus-dir>... <gale-data-dir>
#
# Every argument except the last one is a transcript source: either a *.tgz
# archive (extracted) or a directory (symlinked). The last argument is the
# output directory. Intermediate files go to <gale-data-dir>/txt; the final
# files "all", "utt2spk", "map" and "spk2utt" are written to <gale-data-dir>.
#
# Must be run from the recipe directory: it calls utils/ and local/ helpers
# and installs mmseg under tools/ using relative paths.

echo $0 "$@"
export LC_ALL=C

if [ $# -lt 1 ]; then
  echo "Usage: $0 <corpus.tgz|corpus-dir>... <gale-data-dir>" >&2
  exit 1
fi

# The last positional argument is the output directory.
galeData=$(utils/make_absolute.sh "${@: -1}")

# All preceding arguments are the transcript sources. Kept as a real array so
# that paths containing whitespace survive intact.
length=$(($# - 1))
args=("${@:1:$length}")

top_pwd=$(pwd)
txtdir=$galeData/txt
mkdir -p "$txtdir"

cd "$txtdir" || exit 1

# NOTE: this loop runs inside $txtdir, so relative source paths are resolved
# against $txtdir, not against the directory the script was started from.
for cdx in "${args[@]}"; do
  echo "Preparing $cdx"
  if [[ $cdx == *.tgz ]]; then
    tar -xvf "$cdx"
  elif [ -d "$cdx" ]; then
    tgt=$(basename "$cdx")
    # Link the directory in unless something with that name is already here.
    test -x "$tgt" || ln -s "$cdx" "$tgt"
  else
    echo "I don't really know what I shall do with $cdx " >&2
  fi
done

# Concatenate all .tdf files, dropping the first three lines of each.
# The pattern is quoted so that find, not the shell, expands it.
find -L . -type f -name '*.tdf' | while IFS= read -r file; do
  sed '1,3d' "$file"
done > all.tmp

# Split the tab-separated records into parallel, line-aligned files:
#   allid.tmp       "<col 11> <col 0> <utt-id> <start> <end>"
#   contentall.tmp  column 7 (the transcript text)
#   utt2spk.tmp     "<utt-id> <padded speaker id>"
#   map.tmp         "<col 0> <padded speaker id>_<col 0>"
# Every record is newline-terminated because paste/sort/awk below operate
# line by line.
perl -e '
    ($inFile,$idFile,$txtFile,$spk,$mapf)= split /\s+/, $ARGV[0];
    open(IN, "$inFile") or die "Cannot open $inFile: $!\n";
    open(ID, ">$idFile") or die "Cannot open $idFile: $!\n";
    open(TXT, ">$txtFile") or die "Cannot open $txtFile: $!\n";
    open(SPK, ">$spk") or die "Cannot open $spk: $!\n";
    open(MAP, ">$mapf") or die "Cannot open $mapf: $!\n";
    while (<IN>) {
      @arr= split /\t/,$_;
      # Speaker field: strip spaces, pad to a fixed width.
      $arr[4] =~ s/ //g;
      $arr[4] = sprintf("%020s", $arr[4]);
      $spkid = "$arr[0]_$arr[4]";
      $spkfix = sprintf("%080s", $spkid);

      # Start time: keep the 3-decimal value, and build a fixed-width
      # form without the decimal point for use in the utterance id.
      $start=sprintf ("%0.3f",$arr[2]);
      $rStart=$start;
      $start=~s/\.//;
      $start=~s/^0+$/0/;
      $start=~s/^0+([^0])/$1/; # remove zeros at the beginning
      $start = sprintf("%09s", $start);

      # End time, same treatment.
      $end=sprintf ("%0.3f",$arr[3]);
      $rEnd=$end;
      $end=~s/^0+([^0])/$1/;
      $end=~s/\.//;
      $end = sprintf("%09s", $end);

      $id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n";
      # Skip zero-length segments.
      next if ($rStart == $rEnd);
      # NOTE(review): the dot is unescaped, so this removes any character
      # followed by "sph"; left as is to keep the generated ids unchanged.
      $id =~ s/.sph//g;
      print ID $id;
      print TXT "$arr[7]\n";
      print SPK "${spkfix}_$arr[0]_${start}_${end} ${spkfix}\n";
      print MAP "$arr[0] ${spkfix}_$arr[0]\n";
    }' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp"

# Drop a trailing "/<char>" from the end of each transcript line.
perl -p -i -e 's=/.$==g' contentall.tmp

cd "$top_pwd" || exit 1

# Install the mmseg segmenter under tools/ if it is not there yet.
pyver=$(python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g')
export PYTHONPATH=$PYTHONPATH:$(pwd)/tools/mmseg-1.3.0/lib/python${pyver}/site-packages
if [ ! -d "tools/mmseg-1.3.0/lib/python${pyver}/site-packages" ]; then
  echo "--- Downloading mmseg-1.3.0 ..."
  echo "NOTE: it assumes that you have Python, Setuptools installed on your system!"
  wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz
  tar xf tools/mmseg-1.3.0.tar.gz -C tools
  # Abort if the sources are missing; otherwise the commands below would run
  # in the wrong directory.
  cd tools/mmseg-1.3.0 || exit 1
  mkdir -p "lib/python${pyver}/site-packages"
  CC=gcc CXX=g++ python setup.py build
  python setup.py install --prefix=.
  cd "$top_pwd" || exit 1
  if [ ! -d "tools/mmseg-1.3.0/lib/python${pyver}/site-packages" ]; then
    echo "mmseg is not found - installation failed?"
    exit 1
  fi
fi

# Clean the transcripts (commas, <foreign>/<Event> tags, noise markers,
# double parentheses), normalize them and run word segmentation.
cat "$txtdir/contentall.tmp" |\
  sed -e 's/,//g' |\
  sed -e 's/<foreign language=\"[a-zA-Z]\+\">/ /g' |\
  sed -e 's/<\/foreign>/ /g' |\
  perl -pe 's/<Event.*?>/ /g' |\
  sed -e 's/\[NS\]//g' |\
  sed -e 's/\[ns\]//g' |\
  sed -e 's/<noise>\(.\+\)<\/noise>/\1/g' |\
  sed -e 's/((\([^)]\{0,\}\)))/\1/g' |\
  local/gale_normalize.pl | \
  python local/gale_segment.py \
  > "$txtdir/text"

# Re-attach the ids to the segmented text; keep only lines with more than
# five fields (presumably: at least one token of text after the five id
# fields).
paste "$txtdir/allid.tmp" "$txtdir/text" | sed 's: $::' | awk '{if (NF>5) {print $0}}' > "$txtdir/all_1.tmp"

# Drop the first column before writing the final table.
awk '{$1="";print $0}' "$txtdir/all_1.tmp" | sed 's:^ ::' > "$txtdir/../all"
sort -u "$txtdir/utt2spk.tmp" > "$txtdir/../utt2spk"
sort -u "$txtdir/map.tmp" > "$txtdir/../map"

# Verify the ordering before deriving spk2utt from it.
if ! sort -c "$txtdir/../utt2spk"; then
  echo "$0: $txtdir/../utt2spk is not sorted" >&2
  exit 1
fi

utils/utt2spk_to_spk2utt.pl "$txtdir/../utt2spk" | sort -u > "$txtdir/../spk2utt"

echo data prep text succeeded