Blame view
egs/aishell2/s5/local/word_segmentation.py
725 Bytes
8dcb6dfcb first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
#!/usr/bin/env python
# encoding=utf-8
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0
"""Word-segment a transcript file with jieba and print the result to stdout.

Usage:
    word_segmentation.py <vocab> <trans> > <word-segmented-trans>

<vocab> is handed to jieba as its dictionary.  Each non-blank line of
<trans> must have the form "<key><TAB><transcript>"; for every such line
the script prints "<key><TAB><word1> <word2> ...".
"""

from __future__ import print_function

import io
import sys

if sys.version_info[0] < 3:
    # Python 2 only: make implicit str <-> unicode conversions use UTF-8
    # instead of ASCII.  Neither `reload` nor `sys.setdefaultencoding`
    # exists on Python 3, where text is Unicode already.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

USAGE = "word_segmentation.py <vocab> <trans> > <word-segmented-trans>\n"


def segment_line(line, cut):
    """Return the word-segmented version of one transcript line.

    Args:
        line: A transcript line of the form "<key><TAB><transcript>";
            surrounding whitespace (including the newline) is stripped.
        cut: Callable invoked as ``cut(transcript, HMM=False)`` that
            returns an iterable of words; ``main`` passes ``jieba.cut``.

    Returns:
        "<key><TAB>" followed by the words joined with single spaces.

    Raises:
        ValueError: If the stripped line does not contain a tab.
    """
    fields = line.strip().split('\t', 1)
    if len(fields) != 2:
        raise ValueError(
            "expected '<key><TAB><transcript>', got: %r" % (line,))
    key, trans = fields
    # HMM=False turns off jieba's HMM-based new word discovery, so the
    # output only contains words from the supplied vocabulary.
    words = cut(trans, HMM=False)
    return key + '\t' + " ".join(words)


def main(argv=None):
    """Run the command-line tool.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 if arguments are missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.stderr.write(USAGE)
        return 1

    # Imported here rather than at module level so that importing this
    # module (e.g. to reuse segment_line) does not require jieba.
    import jieba

    vocab_file = argv[1]
    trans_file = argv[2]

    jieba.set_dictionary(vocab_file)
    # Decode explicitly as UTF-8 (independent of the locale) and close the
    # file deterministically.
    with io.open(trans_file, encoding='utf-8') as trans:
        for line in trans:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing empty line.
                continue
            print(segment_line(line, jieba.cut))
    return 0


if __name__ == "__main__":
    sys.exit(main())