make_arpa_sri.csh 1008 Bytes
#!/bin/csh
#
# make_arpa_sri.csh - build an ARPA language model from a text corpus.
#
# Usage: make_arpa_sri.csh <lexicon> <corpus> <size n>
#   $1  lexicon file, passed to ngram-count as -vocab
#   $2  corpus text file, fed through unephraseparligne into ngram-count
#   $3  n-gram order, passed as -order to ngram-count and -n to sort_arpa
#
# Required environment:
#   SRILM_BIN  directory containing ngram-count
#   LIA_TAGG   directory whose bin/ holds unephraseparligne and sort_arpa
#
# Files written in the current directory, where <C> is `basename $2 .txt`
# and <L> is `basename $1 .lex`:
#   <C>.<L>.sri.n<n>.ngrams     n-gram counts
#   <C>.<L>.sri.n<n>.arpa       language model written by ngram-count
#   <C>.<L>.sri.n<n>.sort.arpa  output of sort_arpa run on the .arpa file
#
# Exit status: 0 on success, 1 on a usage error, a missing environment
# variable, or a failing stage.

if ( $#argv != 3 ) then
 echo 'Syntax: make_arpa_sri.csh <lexicon> <corpus> <size n>'
 # A bare "exit" would return the status of the echo above (0).
 exit 1
endif

# Check the environment before anything is deleted, so that a missing
# variable cannot abort the run after the previous outputs were removed.
if ( ! $?SRILM_BIN ) then
	echo 'make_arpa_sri.csh: environment variable SRILM_BIN is not set'
	exit 1
endif
if ( ! $?LIA_TAGG ) then
	echo 'make_arpa_sri.csh: environment variable LIA_TAGG is not set'
	exit 1
endif

# set sribin = /laboinfo/TAP/TOOLS/SRI-LM-1.4.2/bin/i686
set sribin = "$SRILM_BIN"

echo "=> make ARPA LM with: LEXICON=$1  CORPUS=$2"

set CORPUS = `basename "$2" ".txt"`
set LEXICON = `basename "$1" ".lex"`

# Common prefix shared by the three output files.
set prefix = "${CORPUS}.${LEXICON}.sri.n$3"

# Drop the results of any previous run so stale files are never mistaken
# for fresh output.
rm -f "${prefix}.ngrams" "${prefix}.arpa" "${prefix}.sort.arpa"

# Stage 1: preprocess the corpus and write the n-gram counts.
"$LIA_TAGG/bin/unephraseparligne" -remove_cc -cut 80 < "$2" | \
	"$sribin/ngram-count" -order "$3" \
		-text - \
		-sort \
		-vocab "$1" \
		-unk -map-unk "<UNK>" \
		-write "${prefix}.ngrams"
if ( $status != 0 ) then
	echo "make_arpa_sri.csh: counting n-grams failed"
	exit 1
endif

# Stage 2: estimate the model from the counts. The discounting flags are
# kept exactly as they were.
"$sribin/ngram-count" -order "$3" \
	-read "${prefix}.ngrams" \
	-lm "${prefix}.arpa" \
	-kndiscount -kndiscount1 0 -kndiscount2 0 -kndiscount3 0 \
	-unk -vocab "$1" -map-unk "<UNK>"
if ( $status != 0 ) then
	echo "make_arpa_sri.csh: building ${prefix}.arpa failed"
	exit 1
endif

# Stage 3: run sort_arpa on the model to produce the final file.
"$LIA_TAGG/bin/sort_arpa" -n "$3" -unk < "${prefix}.arpa" > "${prefix}.sort.arpa"
if ( $status != 0 ) then
	echo "make_arpa_sri.csh: sorting ${prefix}.arpa failed"
	exit 1
endif

echo '    -> done'