#!/bin/bash
# Copyright     2017  David Snyder
#               2017  Johns Hopkins University (Author: Daniel Povey)
#               2017  Johns Hopkins University (Author: Daniel Garcia-Romero)
# Apache 2.0.

# This script extracts embeddings (called "xvectors" here) from a set of
# utterances, given features and a trained DNN. The purpose of this script
# is analogous to sid/extract_ivectors.sh: it creates archives of
# vectors that are used in speaker recognition. Like ivectors, xvectors can
# be used in PLDA or a similar backend for scoring.
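
# Expected inputs: <nnet-dir> must contain final.raw plus the min_chunk_size
# and max_chunk_size files, and <data> must contain feats.scp and vad.scp.
# Outputs written to <xvector-dir>: per-utterance xvector.scp/ark, per-speaker
# spk_xvector.scp/ark, and num_utts.ark with the utterance count per speaker.
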
# Begin configuration section.
nj=30
cmd="run.pl"
cache_capacity=64 # Cache capacity for x-vector extractor
chunk_size=-1     # The chunk size over which the embedding is extracted.
                  # If left unspecified, it uses the max_chunk_size in the nnet
                  # directory.
use_gpu=false
stage=0
echo "$0 $@" # Print the command line for logging
if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;
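
# parse_options.sh (from Kaldi's utils/, on the PATH via path.sh) consumes any
# --name value pairs on the command line and overwrites the matching variables
# above (e.g. --nj 8 sets nj=8) before the positional arguments are checked.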

if [ $# != 3 ]; then
  echo "Usage: $0 <nnet-dir> <data> <xvector-dir>"
  echo " e.g.: $0 exp/xvector_nnet data/train exp/xvectors_train"
  echo "main options (for others, see top of script file)"
  echo "  --config <config-file>                           # config containing options"
  echo "  --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo "  --use-gpu <bool|false>                           # If true, use GPU."
  echo "  --nj <n|30>                                      # Number of jobs"
  echo "  --stage <stage|0>                                # To control partial reruns"
  echo "  --cache-capacity <n|64>                          # To speed-up xvector extraction"
  echo "  --chunk-size <n|-1>                              # If provided, extracts embeddings with specified"
  echo "                                                   # chunk size, and averages to produce final embedding"
  exit 1;
fi

srcdir=$1
data=$2
dir=$3

for f in $srcdir/final.raw $srcdir/min_chunk_size $srcdir/max_chunk_size $data/feats.scp $data/vad.scp ; do
  [ ! -f $f ] && echo "No such file $f" && exit 1;
done

min_chunk_size=$(cat $srcdir/min_chunk_size 2>/dev/null)
max_chunk_size=$(cat $srcdir/max_chunk_size 2>/dev/null)
nnet=$srcdir/final.raw
if [ -f $srcdir/extract.config ] ; then
  echo "$0: using $srcdir/extract.config to extract xvectors"
  nnet="nnet3-copy --nnet-config=$srcdir/extract.config $srcdir/final.raw - |"
fi
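# Note: when extract.config is present, the trailing "|" in $nnet above makes
# Kaldi treat it as a command pipe (an rxfilename): nnet3-xvector-compute reads
# the config-modified network from nnet3-copy's stdout rather than from disk.
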
if [ $chunk_size -le 0 ]; then
  chunk_size=$max_chunk_size
fi

if [ $max_chunk_size -lt $chunk_size ]; then
  echo "$0: specified chunk size of $chunk_size is larger than the maximum chunk size, $max_chunk_size" && exit 1;
fi
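
# min_chunk_size and max_chunk_size were recorded when the nnet was trained.
# Utterances shorter than min_chunk_size cannot be processed by the extractor;
# longer utterances are split into chunks of at most $chunk_size frames, an
# embedding is computed per chunk, and the chunk embeddings are averaged to
# produce the final xvector (see the --chunk-size option above).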
mkdir -p $dir/log
utils/split_data.sh $data $nj
echo "$0: extracting xvectors for $data"
sdata=$data/split$nj/JOB
# Set up the features
feat="ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:${sdata}/feats.scp ark:- | select-voiced-frames ark:- scp,s,cs:${sdata}/vad.scp ark:- |"
if [ $stage -le 0 ]; then
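  # In the GPU branch below, one background process is launched per job so
  # each can claim its own GPU, and "wait" blocks until they all finish; the
  # JOB placeholder in $feat is filled in manually with sed. On CPU, all $nj
  # jobs are dispatched as a single $cmd array job instead.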
echo "$0: extracting xvectors from nnet"
if $use_gpu; then
for g in $(seq $nj); do
$cmd --gpu 1 ${dir}/log/extract.$g.log \
nnet3-xvector-compute --use-gpu=yes --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size --cache-capacity=${cache_capacity} \
"$nnet" "`echo $feat | sed s/JOB/$g/g`" ark,scp:${dir}/xvector.$g.ark,${dir}/xvector.$g.scp || exit 1 &
done
wait
else
$cmd JOB=1:$nj ${dir}/log/extract.JOB.log \
nnet3-xvector-compute --use-gpu=no --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size --cache-capacity=${cache_capacity} \
"$nnet" "$feat" ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp || exit 1;
fi
fi

if [ $stage -le 1 ]; then
  echo "$0: combining xvectors across jobs"
  for j in $(seq $nj); do cat $dir/xvector.$j.scp; done >$dir/xvector.scp || exit 1;
fi

if [ $stage -le 2 ]; then
  # Average the utterance-level xvectors to get speaker-level xvectors.
  echo "$0: computing mean of xvectors for each speaker"
  $cmd $dir/log/speaker_mean.log \
    ivector-mean ark:$data/spk2utt scp:$dir/xvector.scp \
    ark,scp:$dir/spk_xvector.ark,$dir/spk_xvector.scp ark,t:$dir/num_utts.ark || exit 1;
fi
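
# A typical next step (an illustrative sketch, not part of this script: the
# binaries are standard Kaldi tools, but the paths and the choice of a PLDA
# backend are assumptions) is to train PLDA on the extracted xvectors, often
# after mean subtraction and length normalization:
#
#   ivector-compute-plda ark:$data/spk2utt \
#     "ark:ivector-normalize-length scp:$dir/xvector.scp ark:- |" \
#     $dir/plda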