make_priors.sh
3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/bin/bash
# Copyright 2012-2015 Brno University of Technology (author: Karel Vesely)
# Apache 2.0
#
# Accumulates prior statistics by forwarding <data-dir> through the nnet in
# <nnet-dir>, and stores the result as '<nnet-dir>/prior_counts'.
#
# To be run from .. (one directory up from here)
# see ../run.sh for example

# Begin configuration section.
# (these defaults are presumably overridden by parse_options.sh from
#  '--name value' command-line options, see the usage text below),
nj=4       # number of parallel jobs,
cmd=run.pl # command used to dispatch the jobs,
use_gpu=no # forwarded to 'nnet-forward --use-gpu',
ivector=   # i-vector input, mandatory when <nnet-dir>/ivector_dim exists,
# End configuration section.

echo "$0 $*" # Print the command line for logging

# path.sh is sourced exactly once, and before strict mode is switched on:
# a second sourcing after 'set -u' adds nothing and would abort the script
# if path.sh (or a file it includes) expands an unset variable.
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1

set -euo pipefail

# Exactly two positional arguments must be left after option parsing,
if [ $# != 2 ]; then
  echo "usage: $0 [options] <data-dir> <nnet-dir>"
  echo "options: "
  echo " --cmd 'queue.pl <queue opts>' # how to run jobs."
  echo " --nj <nj> # number of parallel jobs"
  echo " --use-gpu (no|yes|optional) # forwarding on GPU"
  echo " --ivector <rspecifier> # i-vectors to append (needed if they were used in training)"
  exit 1
fi
# Positional arguments: <data-dir> <nnet-dir>,
data=$1
nndir=$2

######## CONFIGURATION

# 'nj' is used below as a divisor and as the number of data splits, so a
# non-numeric or zero value is rejected here with a readable message,
if ! [[ $nj =~ ^[1-9][0-9]*$ ]]; then
  echo "$0: --nj must be a positive integer, got '$nj'"
  exit 1
fi

# Files that have to exist before anything is forwarded,
for f in "$data/feats.scp" "$nndir/final.nnet" "$nndir/final.feature_transform"; do
  if [ ! -f "$f" ]; then
    echo "$0: Missing $f"
    exit 1
  fi
done

# The per-job split of the data dir is (re)created unless the split dir
# already exists and feats.scp is older than it,
sdata=$data/split$nj
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh "$data" "$nj" || exit 1

echo "Accumulating prior stats by forwarding '$data' with '$nndir'"

# We estimate priors on about 10k utterances, selected randomly from the split data,
# N is the per-job share; it is kept >= 1 so that 'head -n$N' never selects
# an empty list when nj > 10000,
N=$((10000 / nj))
((N >= 1)) || N=1
# PREPARE FEATURE EXTRACTION PIPELINE
# The pipeline is assembled as one string in 'feats'. The literal token 'JOB'
# is left in the string on purpose; it is substituted with a job index later.
# import config,
cmvn_opts=
delta_opts=
D=$nndir
# 'norm_vars' and 'delta_order' are the older config files; when the newer
# 'cmvn_opts' / 'delta_opts' exist as well, the newer ones take precedence,
if [ -e "$D/norm_vars" ]; then # Bwd-compatibility,
  cmvn_opts="--norm-means=true --norm-vars=$(cat "$D/norm_vars")"
fi
if [ -e "$D/cmvn_opts" ]; then
  cmvn_opts=$(cat "$D/cmvn_opts")
fi
if [ -e "$D/delta_order" ]; then # Bwd-compatibility,
  delta_opts="--delta-order=$(cat "$D/delta_order")"
fi
if [ -e "$D/delta_opts" ]; then
  delta_opts=$(cat "$D/delta_opts")
fi
#
# Create the feature stream,
# (per job: shuffle the feature list with a fixed seed, keep the first N entries),
feats="ark:cat $sdata/JOB/feats.scp | utils/shuffle_list.pl --srand 777 | head -n$N | copy-feats scp:- ark:- |"
# apply-cmvn (optional),
if [ -n "$cmvn_opts" ]; then
  # Only the first split is probed for the cmvn stats,
  if [ ! -f "$sdata/1/cmvn.scp" ]; then
    echo "$0: Missing $sdata/1/cmvn.scp"
    exit 1
  fi
  feats="$feats apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp ark:- ark:- |"
fi
# add-deltas (optional),
if [ -n "$delta_opts" ]; then
  feats="$feats add-deltas $delta_opts ark:- ark:- |"
fi
# add-pytel transform (optional),
# 'env' is looked up in /usr/bin, its conventional location; /bin/env does
# not exist on systems without a merged /usr,
if [ -e "$D/pytel_transform.py" ]; then
  feats="$feats /usr/bin/env python $D/pytel_transform.py |"
fi
# add-ivector (optional),
# Active only when the nnet dir contains 'ivector_dim'; --ivector is then mandatory,
if [ -e "$D/ivector_dim" ]; then
  # Quoted on purpose: the --ivector value may contain spaces,
  if [ -z "$ivector" ]; then
    echo "Missing --ivector, they were used in training!"
    exit 1
  fi
  # Get the tool,
  ivector_append_tool=append-vector-to-feats # default,
  if [ -e "$D/ivector_append_tool" ]; then
    ivector_append_tool=$(cat "$D/ivector_append_tool")
  fi
  # Check dims,
  # The pipeline of job 1 is probed twice, without and with the i-vectors
  # appended; the difference of the two dims is taken as the i-vector dim
  # and compared with the value stored in 'ivector_dim',
  dim_expected=$(cat "$D/ivector_dim")
  feats_job_1=${feats//JOB/1}
  dim_raw=$(feat-to-dim "$feats_job_1" -)
  dim_raw_and_ivec=$(feat-to-dim "$feats_job_1 $ivector_append_tool ark:- '$ivector' ark:- |" -)
  dim_ivec=$((dim_raw_and_ivec - dim_raw))
  if [ "$dim_ivec" != "$dim_expected" ]; then
    echo "Error, i-vector dim. mismatch (expected $dim_expected, got $dim_ivec in '$ivector')"
    exit 1
  fi
  # Append to feats,
  feats="$feats $ivector_append_tool ark:- '$ivector' ark:- |"
fi
# Run the forward pass,
# Per-job stats left behind by an earlier, interrupted run (e.g. one with a
# larger --nj) are removed first; the '*.prior_cmvn_stats' glob used for the
# summation below would otherwise silently include them,
rm -f "$nndir"/*.prior_cmvn_stats
# Each job forwards its part of the data through the nnet and pipes the
# output into compute-cmvn-stats, which writes '<nnet-dir>/JOB.prior_cmvn_stats'
# in text format. '$cmd' stays unquoted: it may carry its own options
# (e.g. 'queue.pl <queue opts>'),
$cmd JOB=1:$nj $nndir/log/prior_stats.JOB.log \
  nnet-forward --use-gpu=$use_gpu --feature-transform=$nndir/final.feature_transform $nndir/final.nnet "$feats" ark:- \| \
  compute-cmvn-stats --binary=false ark:- $nndir/JOB.prior_cmvn_stats || exit 1
# Sum the per-job stats into a single matrix, then drop the partial files,
sum-matrices --binary=false "$nndir/prior_cmvn_stats" "$nndir"/*.prior_cmvn_stats 2>"$nndir/log/prior_sum_matrices.log" || exit 1
rm "$nndir"/*.prior_cmvn_stats
# The prior counts are line 2 of the summed text matrix with its last field
# blanked, wrapped in brackets,
# NOTE(review): the dropped last field is presumably the frame count - confirm
# against the compute-cmvn-stats output format,
awk 'NR==2{ $NF=""; print "[",$0,"]"; }' "$nndir/prior_cmvn_stats" >"$nndir/prior_counts" || exit 1
echo "Succeeded creating prior counts '$nndir/prior_counts' from '$data'"