Blame view

egs/gale_mandarin/s5/local/gale_data_prep_txt.sh 3.36 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
  #!/bin/bash
  
  # Copyright 2014 (author: Ahmed Ali, Hainan Xu)
  # Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal)
  # Apache 2.0
  
  # Usage: <this script> [<source> ...] <output-dir>
  #
  # Each <source> is either a *.tgz archive, which is unpacked into
  # <output-dir>/txt, or a directory, which is symlinked there. The last
  # argument is always the output directory.

  # Log the invocation.
  echo "$0" "$@"
  export LC_ALL=C

  if (( $# < 1 )); then
    echo "Usage: $0 [<source.tgz|source-dir> ...] <output-dir>" >&2
    exit 1
  fi

  # An empty result here would make everything below operate on /txt.
  galeData=$(utils/make_absolute.sh "${@: -1}")
  if [[ -z "$galeData" ]]; then
    echo "$0: cannot resolve the output directory" >&2
    exit 1
  fi

  # Everything before the last argument is a source. Keep the list as an array
  # so that paths containing whitespace stay in one piece.
  length=$(($#-1))
  args=("${@:1:$length}")

  top_pwd=$(pwd)
  txtdir=$galeData/txt
  mkdir -p "$txtdir" || exit 1

  # Archives are unpacked and links are created relative to the working
  # directory, so do not continue if it cannot be entered.
  cd "$txtdir" || exit 1

  for cdx in "${args[@]}"; do
    echo "Preparing $cdx"
    # The working directory has changed, so resolve relative sources against
    # the directory the script was started from.
    src=$cdx
    [[ "$src" == /* ]] || src=$top_pwd/$src
    if [[ "$src" == *.tgz ]]; then
      tar -xvf "$src"
    elif [[ -d "$src" ]]; then
      tgt=$(basename -- "$src")
      # Link the directory in unless an entry of that name is already there.
      [[ -e "$tgt" ]] || ln -s "$src" "$tgt"
    else
      echo "I don't really know what I shall do with $cdx " >&2
    fi
  done
  
  # Print the contents of every *.tdf file found under directory $1 (default:
  # the current directory; symlinks are followed), skipping the first three
  # lines of each file (presumably the TDF header - confirm against the data).
  #
  # The pattern is quoted so that the shell cannot expand it against files in
  # the current directory, and sed is run once per file so that the three-line
  # skip applies to each file rather than to the concatenated stream.
  concat_tdf_rows() {
    find -L "${1:-.}" -type f -name '*.tdf' -exec sed '1,3d' {} \;
  }

  concat_tdf_rows . > all.tmp
  
  perl -e '
      ($inFile,$idFile,$txtFile,$spk,$mapf)= split /\s+/, $ARGV[0];
      open(IN, "$inFile");
      open(ID, ">$idFile");
      open(TXT, ">$txtFile");
      open(SPK, ">$spk");
      open(MAP, ">$mapf");
      while (<IN>) {
        @arr= split /\t/,$_;
        $arr[4] =~ s/ //g;
        $arr[4] = sprintf("%020s", $arr[4]);
        $spkid = "$arr[0]_$arr[4]";
        $spkfix = sprintf("%080s", $spkid);
  
        $start=sprintf ("%0.3f",$arr[2]);
        $rStart=$start;
        $start=~s/\.//;
        $start=~s/^0+$/0/;
        $start=~s/^0+([^0])/$1/; # remove zeros at the beginning
        $start = sprintf("%09s", $start);
  
        $end=sprintf ("%0.3f",$arr[3]);
        $rEnd=$end;
        $end=~s/^0+([^0])/$1/;
        $end=~s/\.//;
        $end = sprintf("%09s", $end);
  
        $id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd
  ";
        next if ($rStart == $rEnd);
        $id =~ s/.sph//g;
        print ID $id;
        print TXT "$arr[7]
  ";
        print SPK "${spkfix}_$arr[0]_${start}_${end} ${spkfix}
  ";
        print MAP "$arr[0] ${spkfix}_$arr[0]
  ";
   }' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp"
  
  perl -p -i -e 's=/.$==g' contentall.tmp
  
  # Return to the directory the script was started from; the tools/, local/
  # and utils/ paths used from here on are relative to it.
  cd "$top_pwd" || exit 1

  # mmseg is installed under tools/mmseg-1.3.0/lib/python<MAJOR.MINOR>/site-packages.
  # Ask the interpreter for its version: scraping the "python --version"
  # banner with a greedy regex mis-parses releases such as 3.12.3.
  pyver=$(python -c 'import sys; print("%d.%d" % sys.version_info[:2])')
  if [[ -z "$pyver" ]]; then
    echo "$0: cannot determine the version of 'python'" >&2
    exit 1
  fi

  mmseg_site=tools/mmseg-1.3.0/lib/python${pyver}/site-packages
  # Append to an existing PYTHONPATH without leaving an empty leading entry.
  export PYTHONPATH=${PYTHONPATH:+${PYTHONPATH}:}$(pwd)/${mmseg_site}
  if [ ! -d "$mmseg_site" ]; then
    echo "--- Downloading mmseg-1.3.0 ..."
    echo "NOTE: it assumes that you have Python, Setuptools installed on your system!"
    if ! wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz; then
      echo "$0: downloading mmseg-1.3.0 failed" >&2
      exit 1
    fi
    if ! tar xf tools/mmseg-1.3.0.tar.gz -C tools; then
      echo "$0: unpacking tools/mmseg-1.3.0.tar.gz failed" >&2
      exit 1
    fi
    # Build in a subshell so the working directory is restored on every path.
    (
      cd tools/mmseg-1.3.0 || exit 1
      # The install target must exist (and be on PYTHONPATH) beforehand.
      mkdir -p "lib/python${pyver}/site-packages" || exit 1
      CC=gcc CXX=g++ python setup.py build || exit 1
      python setup.py install --prefix=.
    )
    build_status=$?
    # The directory is created above, so its presence alone does not prove
    # the install worked; the build/install status is checked as well.
    if [ "$build_status" -ne 0 ] || [ ! -d "$mmseg_site" ]; then
      echo "mmseg is not found - installation failed?"
      exit 1
    fi
  fi
  
  # Filter stdin to stdout, applying these edits to every line, in order:
  #   - delete every ","
  #   - replace <foreign language="..."> and </foreign> tags with a space
  #   - replace every <Event ...> tag (up to the first ">") with a space
  #   - delete the literal markers [NS] and [ns]
  #   - unwrap <noise>...</noise>, keeping the enclosed text
  #   - unwrap ((...)), keeping the enclosed text
  # A single sed process applies the expressions sequentially to each line.
  # Bytes are matched in the C locale so that non-ASCII text cannot stop a
  # pattern from matching.
  strip_gale_markup() {
    LC_ALL=C sed \
      -e 's/,//g' \
      -e 's/<foreign language="[a-zA-Z]\{1,\}">/ /g' \
      -e 's/<\/foreign>/ /g' \
      -e 's/<Event[^>]*>/ /g' \
      -e 's/\[NS\]//g' \
      -e 's/\[ns\]//g' \
      -e 's/<noise>\(.\{1,\}\)<\/noise>/\1/g' \
      -e 's/((\([^)]\{0,\}\)))/\1/g'
  }

  # Clean the raw text, normalize it, then segment it with mmseg; the result
  # has to stay line-aligned with allid.tmp, which is pasted next to it later.
  strip_gale_markup < "$txtdir/contentall.tmp" \
    | local/gale_normalize.pl \
    | python local/gale_segment.py \
    > "$txtdir/text"
  
  # From here on a failure in any pipeline stage counts as a failure of the
  # whole pipeline, so that the success message is only printed when every
  # output was actually written.
  set -o pipefail

  # Put each id row next to its processed text line, drop a trailing space
  # and keep only rows with more than five fields.
  paste "$txtdir/allid.tmp" "$txtdir/text" \
    | sed 's: $::' \
    | awk 'NF > 5' > "$txtdir/all_1.tmp" || exit 1

  # Blank the first field; awk then rebuilds the row with single spaces and
  # sed removes the leading separator that is left behind.
  awk '{$1="";print $0}' "$txtdir/all_1.tmp" \
    | sed 's:^ ::' > "$txtdir/../all" || exit 1

  sort -u "$txtdir/utt2spk.tmp" > "$txtdir/../utt2spk" || exit 1
  sort -u "$txtdir/map.tmp" > "$txtdir/../map" || exit 1

  # Sanity check: the utt2spk file must be in sorted order.
  if ! sort -c "$txtdir/../utt2spk"; then
    echo "$0: $txtdir/../utt2spk is not sorted" >&2
    exit 1
  fi

  utils/utt2spk_to_spk2utt.pl "$txtdir/../utt2spk" \
    | sort -u > "$txtdir/../spk2utt" || exit 1

  echo data prep text succeeded