Blame view

egs/aishell2/s5/local/word_segmentation.py 725 Bytes
8dcb6dfcb   Yannick Estève   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
  #!/usr/bin/env python
  # encoding=utf-8
  # Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
  #           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
  # Apache 2.0
  
  from __future__ import print_function
  import sys
  import jieba
  # Python 2 only: make UTF-8 the implicit str<->unicode codec so that mixing
  # byte strings read from files with the unicode tokens jieba yields does not
  # raise UnicodeDecodeError. reload(sys) is needed because site.py removes
  # sys.setdefaultencoding at interpreter start-up. Python 3 has no reload
  # builtin and already defaults to UTF-8, so the hack is skipped there.
  if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')
  
  # Command-line entry point: word-segment every transcription in a file.
  #
  # Usage: word_segmentation.py <vocab> <trans> > <word-segmented-trans>
  #   <vocab>  dictionary file handed to jieba.set_dictionary()
  #   <trans>  text file with one "<key>\t<transcription>" entry per line
  # Prints "<key>\t<space-joined words>" to stdout for every entry.
  # Exits with status 1 when fewer than two arguments are given.
  if len(sys.argv) < 3:
    sys.stderr.write("word_segmentation.py <vocab> <trans> > <word-segmented-trans>\n")
    # sys.exit rather than the bare exit() helper, which only exists when the
    # site module has been loaded.
    sys.exit(1)

  vocab_file = sys.argv[1]
  trans_file = sys.argv[2]

  jieba.set_dictionary(vocab_file)
  # The context manager closes the transcript even if segmentation raises.
  with open(trans_file) as trans_fh:
    for line in trans_fh:
      line = line.strip()
      if not line:
        # Blank lines carry no utterance; skip them instead of crashing.
        continue
      key, sep, trans = line.partition('\t')
      if not sep:
        # No tab left on the line. strip() also removes a trailing tab, so a
        # key whose transcription is empty ends up here as well.
        sys.stderr.write("word_segmentation.py: skipping line without transcription: " + line + "\n")
        continue
      words = jieba.cut(trans, HMM=False)  # turn off new word discovery (HMM-based)
      print(key + '\t' + " ".join(words))