Blame view

egs/vystadial_cz/s5/local/prepare_cs_transcription.sh 566 Bytes
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
  #!/bin/bash
  #
  # Split the full vocabulary into a pronunciation lexicon and a list of
  # out-of-vocabulary (OOV) words, based on the output of the Czech phonetic
  # transcription script.
  #
  # Usage: local/prepare_cs_transcription.sh <locdata> <locdict>
  #   <locdata>  directory containing vocab-full.txt
  #   <locdict>  output directory (created if missing); receives
  #              cs_transcription.txt, vocab-oov.txt and lexicon.txt
  #
  # The transcription script is referenced by the relative path
  # local/phonetic_transcription_cs.pl, so run this from the directory that
  # contains local/.

  # Stop at the first failing step instead of producing a lexicon from
  # partial or missing data.
  set -euo pipefail

  if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <locdata> <locdict>" >&2
    exit 1
  fi

  locdata=$1; shift
  locdict=$1; shift

  vocab=$locdata/vocab-full.txt
  transcription=$locdict/cs_transcription.txt

  if [ ! -f "$vocab" ]; then
    echo "$0: vocabulary file not found: $vocab" >&2
    exit 1
  fi

  mkdir -p -- "$locdict"

  # NOTE(review): presumably reads the vocabulary (1st arg) and writes the
  # transcription (2nd arg); the steps below read that second file.
  perl local/phonetic_transcription_cs.pl "$vocab" "$transcription"

  echo "--- Searching for OOV words ..."

  # Both filters below key on the first whitespace-separated field of each
  # line and drop any line containing a sentence-boundary style tag matching
  # <.?s> (e.g. <s>, </s>).
  #
  # FILENAME == ARGV[1] is used rather than NR == FNR so that an empty first
  # file is not confused with the second one (NR == FNR stays true for the
  # second file when the first contributes no records).

  # Vocabulary entries whose word has no transcription.
  gawk 'FILENAME == ARGV[1] { words[$1]; next }
        !($1 in words) && $0 !~ /<.?s>/' \
    "$transcription" "$vocab" > "$locdict/vocab-oov.txt"

  # Transcription entries whose word is in the vocabulary.
  gawk 'FILENAME == ARGV[1] { words[$1]; next }
        ($1 in words) && $0 !~ /<.?s>/' \
    "$vocab" "$transcription" > "$locdict/lexicon.txt"

  # Report the sizes of both outputs.
  wc -l "$locdict/vocab-oov.txt"
  wc -l "$locdict/lexicon.txt"