# word_segmentation.py (725 bytes)
#!/usr/bin/env python
# encoding=utf-8
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0

from __future__ import print_function
import sys
import jieba
# --- Python 2 compatibility shim -------------------------------------------
# On Python 2 the default str encoding is ASCII, which breaks when the UTF-8
# transcripts are mixed with str objects; force UTF-8 process-wide.
# reload() and sys.setdefaultencoding() do not exist on Python 3 (str is
# already Unicode there), so the shim must be skipped or the script crashes
# at startup on Python 3.
if sys.version_info[0] == 2:
  reload(sys)  # re-exposes setdefaultencoding (removed from sys by site.py)
  sys.setdefaultencoding('utf-8')

# Require a vocabulary file and a transcript file on the command line.
if len(sys.argv) < 3:
  sys.stderr.write("word_segmentation.py <vocab> <trans> > <word-segmented-trans>\n")
  sys.exit(1)  # sys.exit is always defined; the bare exit() helper is site-injected

vocab_file = sys.argv[1]
trans_file = sys.argv[2]

# Use the supplied vocabulary as jieba's segmentation dictionary so the
# segmenter only produces in-vocabulary words.
jieba.set_dictionary(vocab_file)

# Each transcript line is "<utt-key>\t<text>"; emit
# "<utt-key>\t<space-separated words>" on stdout.
with open(trans_file) as f:  # close the file deterministically
  for line in f:
    key, trans = line.strip().split('\t', 1)
    # HMM=False disables jieba's HMM-based new-word discovery so output
    # stays restricted to the given vocabulary.
    words = jieba.cut(trans, HMM=False)
    print(key + '\t' + " ".join(words))