Blame view

egs/tunisian_msa/s5/local/qcri_buckwalter2utf8.sh 423 Bytes
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
  #!/bin/bash
  
  # Copyright 2018 John Morgan
  # Apache 2.0.
  
  # Convert the word field of qcri.txt from Buckwalter transliteration to
  # UTF-8 and print "<utf8 word><TAB><remaining fields>" lines on stdout.
  # Reads qcri.txt and runs local/buckwalter2unicode.py, both resolved
  # relative to the current working directory.

  # Abort on the first failing step instead of printing a partial result
  # and reporting success.
  set -euo pipefail

  # Keep intermediate files in a private scratch directory so concurrent runs
  # cannot clobber each other; the trap removes it on every exit path.
  tmpdir=$(mktemp -d) || exit 1
  trap 'rm -rf -- "${tmpdir:?}"' EXIT

  words_buckwalter=$tmpdir/qcri_words_buckwalter.txt
  words_utf8=$tmpdir/qcri_words_utf8.txt
  prons=$tmpdir/qcri_prons.txt

  # write separate files for word and pronunciation fields
  cut -d " " -f 1 qcri.txt > "$words_buckwalter"
  cut -d " " -f 2- qcri.txt > "$prons"

  # convert words to utf8
  local/buckwalter2unicode.py -i "$words_buckwalter" -o "$words_utf8"

  # re-join converted words with their pronunciations (paste separates with TAB)
  paste "$words_utf8" "$prons"