make_arpa_sri.csh
1008 Bytes
#!/bin/csh
#
# make_arpa_sri.csh - build an ARPA n-gram language model with SRILM ngram-count.
#
# Usage: make_arpa_sri.csh <lexicon> <corpus> <size n>
#   <lexicon>  vocabulary file given to ngram-count via -vocab; a trailing
#              ".lex" is stripped when building the output file names
#   <corpus>   training text; a trailing ".txt" is stripped when building
#              the output file names
#   <size n>   n-gram order given to ngram-count via -order
#
# Required environment:
#   SRILM_BIN  directory that contains the ngram-count binary
#   LIA_TAGG   directory whose bin/ holds unephraseparligne and sort_arpa
#
# Files written to the current directory (any previous copies are removed):
#   <corpus>.<lexicon>.sri.n<n>.ngrams     n-gram counts
#   <corpus>.<lexicon>.sri.n<n>.arpa       language model from ngram-count -lm
#   <corpus>.<lexicon>.sri.n<n>.sort.arpa  output of sort_arpa on the model
#
# Exit status: 0 on success, 1 on a usage error, a missing prerequisite or
# the failure of any processing stage.

if ( $#argv != 3 ) then
    echo 'Syntax: make_arpa_sri.csh <lexicon> <corpus> <size n>'
    # A bare exit would return the status of the echo above, i.e. success.
    exit 1
endif

# Fail with a clear message instead of the generic "Undefined variable" abort.
if ( ! $?SRILM_BIN ) then
    echo 'make_arpa_sri.csh: environment variable SRILM_BIN is not set'
    exit 1
endif
if ( ! $?LIA_TAGG ) then
    echo 'make_arpa_sri.csh: environment variable LIA_TAGG is not set'
    exit 1
endif

# Without these checks an unreadable input silently yields an empty model.
if ( ! -r "$1" ) then
    echo "make_arpa_sri.csh: cannot read lexicon file: $1"
    exit 1
endif
if ( ! -r "$2" ) then
    echo "make_arpa_sri.csh: cannot read corpus file: $2"
    exit 1
endif

# Location of the SRILM binaries (formerly a hard-coded SRI-LM 1.4.2 path).
set sribin = "$SRILM_BIN"

echo "=> make ARPA LM with: LEXICON=$1 CORPUS=$2"

set CORPUS = `basename "$2" ".txt"`
set LEXICON = `basename "$1" ".lex"`

# Common stem shared by every output file.
set prefix = "${CORPUS}.${LEXICON}.sri.n$3"

rm -f "${prefix}.ngrams"
rm -f "${prefix}.arpa"
rm -f "${prefix}.sort.arpa"

# Stage 1: preprocess the corpus and count n-grams restricted to the lexicon.
# NOTE(review): unephraseparligne presumably emits one sentence per line,
# judging by its name only; confirm against the LIA_TAGG documentation.
"$LIA_TAGG/bin/unephraseparligne" -remove_cc -cut 80 < "$2" | \
"$sribin/ngram-count" -order $3 \
-text - \
-sort \
-vocab "$1" \
-unk -map-unk "<UNK>" \
-write "${prefix}.ngrams"
if ( $status != 0 ) then
    echo "make_arpa_sri.csh: n-gram counting failed for $2"
    exit 1
endif

# Stage 2: estimate the language model from the counts.
# NOTE(review): the 0 arguments after -kndiscount1/2/3 are kept exactly as in
# the original command line; confirm them against the installed SRILM version.
"$sribin/ngram-count" -order $3 \
-read "${prefix}.ngrams" \
-lm "${prefix}.arpa" \
-kndiscount -kndiscount1 0 -kndiscount2 0 -kndiscount3 0 \
-unk -vocab "$1" -map-unk "<UNK>"
if ( $status != 0 ) then
    echo "make_arpa_sri.csh: language model estimation failed for ${prefix}.ngrams"
    exit 1
endif

# Stage 3: post-process the ARPA file with the LIA_TAGG sort_arpa tool.
"$LIA_TAGG/bin/sort_arpa" -n $3 -unk < "${prefix}.arpa" > "${prefix}.sort.arpa"
if ( $status != 0 ) then
    echo "make_arpa_sri.csh: sort_arpa failed for ${prefix}.arpa"
    exit 1
endif

echo ' -> done'