Blame view
egs/tunisian_msa/s5/local/qcri_buckwalter2utf8.sh
423 Bytes
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
#!/bin/bash

# Copyright 2018 John Morgan
# Apache 2.0.

# Rewrites the first field of every line of qcri.txt with the output of
# local/buckwalter2unicode.py and prints the result to stdout.
#
# Input:    ./qcri.txt, space-delimited; field 1 is the word, fields 2..N
#           are the pronunciation.
# Output:   stdout, one line per input line: converted word, a TAB (the
#           default paste delimiter), then the untouched pronunciation.
# Requires: local/buckwalter2unicode.py, resolved relative to the current
#           directory.
# Exit:     non-zero if the input is missing or any step fails.

set -euo pipefail

input=qcri.txt

if [[ ! -f "$input" ]]; then
  printf '%s: input file %s not found\n' "${0##*/}" "$input" >&2
  exit 1
fi

# Keep intermediate files in a private scratch directory so nothing in the
# working directory is clobbered, and remove it on every exit path.
tmpdir=$(mktemp -d)
trap 'rm -rf -- "$tmpdir"' EXIT

words_buckwalter="$tmpdir/qcri_words_buckwalter.txt"
words_utf8="$tmpdir/qcri_words_utf8.txt"
prons="$tmpdir/qcri_prons.txt"

# write separate files for word and pronunciation fields
cut -d " " -f 1 "$input" > "$words_buckwalter"
cut -d " " -f 2- "$input" > "$prons"

# convert words to utf8
local/buckwalter2unicode.py -i "$words_buckwalter" -o "$words_utf8"

# Re-join converted words with their pronunciations, line by line.
paste "$words_utf8" "$prons"