Blame view

Scripts/utils/nnet/.svn/text-base/copy_feats.sh.svn-base 1.89 KB
ec85f8892   bigot benjamin   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
  #!/bin/bash
  # Copyright 2012 Karel Vesely
  
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.
  # You may obtain a copy of the License at
  #
  #  http://www.apache.org/licenses/LICENSE-2.0
  #
  # THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  # KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
  # WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
  # MERCHANTABLITY OR NON-INFRINGEMENT.
  # See the Apache 2 License for the specific language governing permissions and
  # limitations under the License.
  
  # This script re-saves features to a specified directory.
  # This is done so that the randomized data is stored consecutively,
  # which improves speed and reduces the load on disks.
  #
  # To make sure the temporary dir gets deleted upon exit of the calling script
  # you can use something like:
  #
  # trap "echo \"Removing features tmpdir $tmpdir @ $(hostname)\"; rm -r $tmpdir" EXIT
  
  
  # Log the full invocation (script name followed by all arguments).
  echo "$0 $*"

  # Source the local environment setup when a path.sh exists in the
  # current working directory.
  if [ -f path.sh ]; then
    . ./path.sh
  fi

  # parse_options.sh has no slash in its name, so the "." builtin looks it up
  # via PATH; abort when it cannot be sourced or returns a failure status.
  # NOTE(review): presumably it consumes leading --option arguments from "$@"
  # before the positional-argument check further down -- confirm.
  . parse_options.sh || exit 1
  
  
  # Exactly three positional arguments are required; otherwise print the
  # usage text (to stdout) and stop with status 1.
  if [[ $# -ne 3 ]]; then
    printf '%s\n' \
      "Usage: $0 <input.scp> <tmpdir> <output.scp>" \
      " e.g.: $0 train_remote.scp /tmp/324nkjl train_local.scp"
    exit 1
  fi

  # Positional arguments, in the order given in the usage line.
  scp_in="$1"    # scp file listing the features to re-save
  tmpdir="$2"    # directory that receives the feats.<n>.ark / feats.<n>.scp files
  scp_out="$3"   # scp file to write, assembled from the per-chunk scp files
  
  # Re-save the features listed in $scp_in as ark/scp pairs inside $tmpdir and
  # write the concatenated scp index to $scp_out.
  #   Reads : $scp_in, $tmpdir, $scp_out (set from the positional arguments).
  #   Writes: $tmpdir/feats.<n>.ark, $tmpdir/feats.<n>.scp and $scp_out.
  #   Exits : 0 on success; 1 when the input or tmpdir is unusable, a copy
  #           step fails, or the input and output line counts differ.
  echo "Re-saving the features to tmpdir $tmpdir @ $(hostname)"

  # Fail early with a clear message instead of an obscure downstream error.
  if [[ ! -r "$scp_in" ]]; then
    echo "ERROR: cannot read input scp '$scp_in'" >&2
    exit 1
  fi
  if [[ ! -d "$tmpdir" ]]; then
    echo "ERROR: tmpdir '$tmpdir' is not a directory" >&2
    exit 1
  fi

  # Count the input entries once; the same number sizes the split below and
  # validates the assembled output afterwards.
  num_in=$(wc -l < "$scp_in") || exit 1
  num_in=$((num_in))  # normalise: some wc implementations pad the count

  # Divide the arks per 10k files.
  nj=$((1 + num_in / 10000))

  # NOTE(review): the trailing "|" makes the rspecifier a command pipe, which
  # copy-feats presumably hands to a shell -- confirm. The input path is
  # escaped so it survives that second round of parsing; ordinary paths are
  # left unchanged by %q.
  printf -v scp_in_q '%q' "$scp_in"
  for ((n = 0; n < nj; n++)); do
    copy-feats "scp:utils/split_scp.pl -j $nj $n $scp_in_q - |" \
      "ark,scp:$tmpdir/feats.$n.ark,$tmpdir/feats.$n.scp" || exit 1
  done

  # Assemble the scp file from the per-chunk scp files, in chunk order.
  for ((n = 0; n < nj; n++)); do
    cat -- "$tmpdir/feats.$n.scp" || exit 1
  done > "$scp_out" || exit 1

  # Test we have all the data: output must have as many lines as the input.
  num_out=$(wc -l < "$scp_out") || exit 1
  num_out=$((num_out))
  if ((num_in != num_out)); then
    echo "ERROR in data re-saving $num_in != $num_out" >&2
    exit 1
  fi

  # Notify it was copied ok.
  wc -l -- "$scp_in" "$scp_out"
  echo Copied ok!

  exit 0