get_oracle.sh
1.02 KB
#!/usr/bin/env bash
# Gets lattice oracles
# Usage: get_oracle.sh <lattice-dir> <symbol-table> <text-file>
#   $1  lattice directory (must be an existing directory)
#   $2  symbol table      (must be an existing regular file)
#   $3  text file for the partition (must be an existing regular file)
# Sets latticeDir, symTable, textFile and oracleDir (= <lattice-dir>/oracle)
# for the rest of the script. Exits with status 1 when fewer than three
# arguments are given or when any of the three paths does not exist.
if [[ $# -lt 3 ]]; then
  echo "Specify lattice dir, symbol table and text file for partition" >&2
  exit 1
fi

latticeDir=$1
symTable=$2
textFile=$3
oracleDir=$latticeDir/oracle

# Quoted so the paths are printed verbatim, without word splitting or
# glob expansion.
printf '%s\n' "$latticeDir"
printf '%s\n' "$oracleDir"

# Sourced relative to the current working directory, so the script has to be
# started from the directory that holds path.sh.
# NOTE(review): presumably this is what defines KALDI_ROOT, which is used
# further down but never assigned in this script -- confirm.
. ./path.sh

# [[ ]] with quoted operands: with the old unquoted `[ ... -o ... ]` form a
# path containing whitespace made the test itself error out, which `if`
# treated as "false" and so skipped this validation entirely.
if [[ ! -f "$textFile" || ! -f "$symTable" || ! -d "$latticeDir" ]]; then
  echo "Required files not found" >&2
  exit 1
fi
# Computes the lattice oracle for the partition and writes
#   $oracleDir/oracle.tra  (text-mode archive produced by lattice-oracle,
#                           then sorted and de-duplicated on its first field)
#   $oracleDir/oracle.log  (stderr of lattice-oracle)
# Exits with status 1 if the output directory cannot be created, if
# KALDI_ROOT is empty, or if any stage of the pipeline fails.
mkdir -p "$oracleDir" || exit 1

# Without this check an empty KALDI_ROOT would silently turn the binary path
# below into /src/latbin/lattice-oracle.
if [[ -z "${KALDI_ROOT:-}" ]]; then
  echo "KALDI_ROOT is not set" >&2
  exit 1
fi

# Since the lexicon is built from the LDC lexicon, there are words in the
# dataset that do not appear in the lexicon. These have to be marked as OOV.
# The [laughter], [noise] and [hes] symbols are removed as well. This is not
# consistent with the scoring scheme used while scoring 1-best.
#
# '[oov]' is quoted: unquoted it is a glob pattern and would be replaced by a
# file named "o" or "v" if one exists in the working directory.
#
# The lat.*.gz pattern sits inside double quotes, so this shell hands it to
# lattice-oracle unexpanded as part of the "ark:...|" argument.
# NOTE(review): presumably the expansion is done by the shell that
# lattice-oracle spawns for the piped input; a $latticeDir containing
# whitespace would still break there -- confirm.
#
# The pipeline runs in a subshell so that pipefail stays local to it. With
# pipefail a failure of any stage is reported, instead of going unnoticed and
# letting a stale oracle.tra from an earlier run be sorted as if it were new.
if ! (
  set -o pipefail
  sed -e 's:\[laughter\]::g' -e 's:\[noise\]::g' -e 's:\[hes\]::g' "$textFile" \
    | utils/sym2int.pl --map-oov '[oov]' -f 2- "$symTable" \
    | "$KALDI_ROOT/src/latbin/lattice-oracle" \
        --word-symbol-table="$symTable" \
        "ark:gunzip -c $latticeDir/lat.*.gz|" ark:- \
        "ark,t:$oracleDir/oracle.tra" 2>"$oracleDir/oracle.log"
); then
  echo "Oracle computation failed, see $oracleDir/oracle.log" >&2
  exit 1
fi

# Sort in place on the first field and keep one line per distinct value of
# that field.
sort -k1,1 -u "$oracleDir/oracle.tra" -o "$oracleDir/oracle.tra"