Blame view

egs/mini_librispeech/s5/local/download_lm.sh 2.63 KB
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
  #!/bin/bash
  
  # Copyright 2014  Vassil Panayotov
  #           2017  Daniel Povey
  # Apache 2.0
  
  # Require exactly three positional arguments; otherwise print usage and stop.
  if [[ $# -ne 3 ]]; then
    echo "Usage: $0 <base-url> <download_dir> <local_dir>"
    echo "e.g.: $0 http://www.openslr.org/resources/11 ./corpus/ data/local/lm"
    exit 1
  fi
  #
  # base_url:  URL prefix the files are fetched from ("$base_url/<file>")
  # dst_dir:   directory the downloaded files are written to
  # local_dir: directory that receives symlinks to the downloaded files
  base_url=$1
  dst_dir=$2
  local_dir=$3
  
  # Prints the expected size in bytes of the resource file named by $1.
  # For a name that is not in the table an empty line is printed instead.
  # The table arms can be regenerated by entering the data directory and running:
  # for f in *; do echo "\"$f\") bytes=\"$(du -b $f | awk '{print $1}')\" ;;"; done
  filesize() {
    local bytes=""
    case $1 in
      "3-gram.arpa.gz") bytes="759636181" ;;
      "3-gram.pruned.1e-7.arpa.gz") bytes="34094057" ;;
      "3-gram.pruned.3e-7.arpa.gz") bytes="13654242" ;;
      "librispeech-lexicon.txt") bytes="5627653" ;;
      "librispeech-vocab.txt") bytes="1737588" ;;
    esac
    # Unknown names leave 'bytes' empty, so callers see an empty string.
    echo "$bytes"
  }
  
  # Prints the size in bytes of the file given as $1 (nothing if both probes fail).
  # 'du -b' is the GNU coreutils form; where it fails (e.g. a du without -b, as
  # on BSD/macOS) the BSD form 'stat -f %z' is tried instead.
  _local_file_size() {
    local path=$1
    # pipefail is enabled only inside this subshell: it makes a failing 'du'
    # fail the whole pipeline so that the '||' fallback actually runs.
    (
      set -o pipefail
      du -b "$path" 2>/dev/null | awk '{print $1}' || stat -f '%z' "$path"
    )
  }
  #
  # Makes sure "$dst_dir/<name>" exists with the size reported by filesize(),
  # downloading it from "$base_url/<name>" when it is missing, empty, or has
  # the wrong size.
  # Globals:   dst_dir, base_url (read)
  # Arguments: $1 - base name of the file to fetch
  # Outputs:   progress and error messages on stdout
  # Returns:   0 if the file is present with the expected size, 1 otherwise
  function check_and_download () {
    [[ $# -eq 1 ]] || { echo "check_and_download() expects exactly one argument!"; return 1; }
    # All working variables are local so the caller's variables (notably its
    # loop variable 'f') are not overwritten.
    local fname=$1
    local target="$dst_dir/$fname"
    local expect_size fsize
    echo "Downloading file '$fname' into '$dst_dir'..."
    expect_size="$(filesize "$fname")"
    [[ -n "$expect_size" ]] || { echo "Unknown file size for '$fname'"; return 1; }
    # A non-empty local copy with the right size is accepted without downloading.
    if [[ -s "$target" ]]; then
      fsize=$(_local_file_size "$target")
      if [[ "$fsize" -eq "$expect_size" ]]; then
        echo "'$fname' already exists and appears to be complete"
        return 0
      else
        echo "WARNING: '$fname' exists, but the size is wrong - re-downloading ..."
      fi
    fi
    # NOTE(review): --no-check-certificate turns off TLS certificate
    # verification, so the size comparison below is the only check performed
    # on the downloaded data.
    wget --no-check-certificate -O "$target" "$base_url/$fname" || {
      echo "Error while trying to download $fname!"
      return 1
    }
    # Verify the freshly downloaded file against the expected size.
    fsize=$(_local_file_size "$target")
    [[ "$fsize" -eq "$expect_size" ]] || { echo "$fname: file size mismatch!"; return 1; }
    return 0
  }
  
  # Create the download and link directories; nothing below can work without them.
  mkdir -p "$dst_dir" "$local_dir" || exit 1
  #
  # Fetch each resource (or accept an already complete copy); stop at the first failure.
  for f in 3-gram.arpa.gz 3-gram.pruned.1e-7.arpa.gz 3-gram.pruned.3e-7.arpa.gz \
           librispeech-vocab.txt librispeech-lexicon.txt; do
    check_and_download "$f" || exit 1
  done
  #
  # Link targets are written as absolute paths: a relative symlink target would
  # be interpreted relative to the link's own directory, not the current one.
  # 'readlink -f' is not supported everywhere, so fall back to cd + 'pwd -P'
  # and refuse to create links from an empty path.
  abs_dst_dir=$(readlink -f "$dst_dir" 2>/dev/null)
  if [[ -z "$abs_dst_dir" ]]; then
    abs_dst_dir=$(CDPATH= cd -- "$dst_dir" && pwd -P)
  fi
  [[ -n "$abs_dst_dir" ]] || { echo "Cannot determine the absolute path of '$dst_dir'"; exit 1; }
  dst_dir=$abs_dst_dir
  #
  # Expose the downloaded files in $local_dir; the three language models get
  # the names lm_tgmed / lm_tgsmall / lm_tglarge, the text files keep theirs.
  ln -sf "$dst_dir/3-gram.pruned.1e-7.arpa.gz" "$local_dir/lm_tgmed.arpa.gz" || exit 1
  ln -sf "$dst_dir/3-gram.pruned.3e-7.arpa.gz" "$local_dir/lm_tgsmall.arpa.gz" || exit 1
  ln -sf "$dst_dir/3-gram.arpa.gz" "$local_dir/lm_tglarge.arpa.gz" || exit 1
  ln -sf "$dst_dir/librispeech-lexicon.txt" "$local_dir/librispeech-lexicon.txt" || exit 1
  ln -sf "$dst_dir/librispeech-vocab.txt" "$local_dir/librispeech-vocab.txt" || exit 1
  exit 0