Blame view

egs/gale_arabic/s5/local/gale_data_prep_txt.sh 1.79 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
  #!/bin/bash
  
  # Copyright 2014 QCRI (author: Ahmed Ali)
  # Apache 2.0
  
  # Usage: gale_data_prep_txt.sh [<txt.tgz | txt-dir> ...] <gale-data-dir>
  #
  # The LAST argument is the output data directory; every argument before it
  # is an input that is either extracted (*.tgz) or symlinked (directory)
  # into <gale-data-dir>/txt.
  echo "$0" "$@"

  if [ $# -lt 1 ]; then
    echo "Usage: $0 [<txt.tgz | txt-dir> ...] <gale-data-dir>" >&2
    exit 1
  fi

  galeData=$(utils/make_absolute.sh "${@: -1}")
  # An empty result would make txtdir "/txt"; refuse to continue in that case.
  if [ -z "$galeData" ]; then
    echo "$0: could not resolve output directory '${@: -1}'" >&2
    exit 1
  fi

  # Keep the inputs as a real array so that paths containing whitespace or
  # glob characters survive as single words.
  args=("${@:1:$#-1}")

  top_pwd=$(pwd)
  txtdir=$galeData/txt
  # Everything below writes relative to txtdir, so failing to enter it must
  # stop the script instead of polluting the caller's directory.
  mkdir -p "$txtdir" || exit 1
  cd "$txtdir" || exit 1

  # NOTE(review): inputs are dereferenced after the cd above, so relative
  # input paths are resolved against txtdir -- callers should pass absolute paths.
  for cdx in "${args[@]}"; do
    echo "Preparing $cdx"
    if [[ $cdx == *.tgz ]]; then
      tar -xvf "$cdx"
    elif [ -d "$cdx" ]; then
      # -f/-n: on a rerun replace the existing link itself; a plain "ln -s"
      # would follow it and create a self-referencing link inside the source
      # directory.
      ln -sfn "$cdx" "$(basename "$cdx")"
    else
      echo "I don't really know what I shall do with $cdx " >&2
    fi
  done
  
  # Concatenate every *.tdf file below the current directory (symlinks are
  # followed) into all.tmp$$, dropping the first three lines of each file
  # (presumably the .tdf header -- confirm against the data).
  # The pattern is quoted so the shell cannot expand it before find sees it,
  # and sed is run once per file so the "1,3d" applies to each file separately.
  find -L . -type f -name '*.tdf' -exec sed '1,3d' {} \; > all.tmp$$
  
  # Split the concatenated .tdf rows (all.tmp$$) into two line-aligned files:
  #   allid.tmp$$      - "<type> <file> <file>_<start>_<end> <start> <end>"
  #   contentall.tmp$$ - column 8 of the same row (presumably the transcript
  #                      text -- confirm against the .tdf layout)
  # <type> is column 12 when it contains "report" or "conversational",
  # otherwise "UNK".  Rows whose formatted start and end are equal are skipped
  # in both outputs, which keeps the two files aligned for the later paste.
  perl -e '
      my ($inFile, $idFile, $txtFile) = @ARGV;
      open(IN,  "<", $inFile)  or die "Cannot open $inFile for reading: $!\n";
      open(ID,  ">", $idFile)  or die "Cannot open $idFile for writing: $!\n";
      open(TXT, ">", $txtFile) or die "Cannot open $txtFile for writing: $!\n";
      while (<IN>) {
        @arr = split /\t/, $_;

        # Columns 3 and 4 are formatted with three decimals; the variants
        # without the dot and without leading zeros go into the segment id.
        $start = sprintf("%0.3f", $arr[2]); $rStart = $start;
        $start =~ s/\.//; $start =~ s/^0+$/0/; $start =~ s/^0+([^0])/$1/;
        $end = sprintf("%0.3f", $arr[3]); $rEnd = $end;
        $end =~ s/^0+([^0])/$1/; $end =~ s/\.//;

        # Zero-length segment: emit nothing for this row.
        next if ($rStart == $rEnd);

        if ( ($arr[11] !~ m/report/) && ($arr[11] !~ m/conversational/) ) { $arr[11] = "UNK"; }

        # Records end with the \n escape, not a literal line break, so that
        # source indentation can never leak into the output files.
        $id = "$arr[11] $arr[0] $arr[0]_${start}_${end} $rStart $rEnd\n";
        # Strip the literal ".sph" extension only (the dot is escaped).
        $id =~ s/\.sph//g;
        print ID $id;
        print TXT "$arr[7]\n";
      }
      close(IN);
      close(ID)  or die "Cannot close $idFile: $!\n";
      close(TXT) or die "Cannot close $txtFile: $!\n";
    ' "all.tmp$$" "allid.tmp$$" "contentall.tmp$$" || exit 1
  
  
  # Run the project's transcript normalizer on the text column
  # (contentall.tmp$$ -> contentall.buck.tmp$$).  Stop on failure so the script
  # never reports success on top of missing or partial transcripts.
  perl "${top_pwd}/local/normalize_transcript_BW.pl" contentall.tmp$$ contentall.buck.tmp$$ || {
    echo "$0: local/normalize_transcript_BW.pl failed" >&2
    exit 1
  }

  # Re-join ids and normalized text line by line, strip a trailing blank, and
  # keep only rows with more than 5 fields, i.e. rows whose transcript is not
  # empty (the id part alone contributes 5 fields).
  paste allid.tmp$$ contentall.buck.tmp$$ | sed 's: $::' | awk '{if (NF>5) {print $0}}'  > all_1.tmp$$

  # Write the final lists with the leading <type> column removed:
  #   all            - every row
  #   report         - rows whose type is exactly "report"
  #   conversational - rows whose type is exactly "conversational"
  awk '{$1="";print $0}' all_1.tmp$$ | sed 's:^ ::' > "$galeData/all"
  awk '{if ($1 == "report") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' >  "$galeData/report"
  awk '{if ($1 == "conversational") {$1="";print $0}}' all_1.tmp$$ | sed 's:^ ::' > "$galeData/conversational"

  #cd ..;
  #rm -fr $txtdir
  cd "$top_pwd" || exit 1
  echo data prep text succeeded

  exit 0