Blame view

utils.py 295 Bytes
b6d0165d1   Killian   Initial commit
1
2
3
4
5
6
7
8
9
10
  def yield_corpus(df_list):
      """Lazily yield the tokenized text of every row of every corpus.

      For each element of ``df_list`` the rows are walked with
      ``iterrows()``; the value stored under key ``2`` of each row is
      decoded as UTF-8 and passed to ``tok2.tokenize``.

      NOTE(review): ``tok2`` is not defined in the visible part of this
      file; it must be available as a module-level name exposing a
      ``tokenize`` method before this generator is consumed.

      Args:
          df_list: iterable of objects providing ``iterrows()``
              (presumably pandas DataFrames -- confirm against callers).

      Yields:
          Whatever ``tok2.tokenize`` returns for each decoded row value.

      Raises:
          Any exception raised while decoding or tokenizing a row; the
          offending raw value is printed before the error propagates.
      """
      for corpus in df_list:
          for _, doc in corpus.iterrows():
              try:
                  tokens = tok2.tokenize(doc[2].decode("utf-8"))
              except Exception:
                  # Show the raw value that could not be processed, then
                  # let the original error propagate unchanged.
                  print(doc[2])
                  raise
              # Yield outside the try block so that closing the generator
              # (GeneratorExit) or an exception thrown in by the consumer
              # is not mistaken for a tokenization failure.
              yield tokens
  def select(elm):
      """Return the integer that follows the last underscore in ``elm``.

      When ``elm`` contains no underscore, the whole value is converted.
      Conversion is done by ``int``, so a non-numeric suffix raises
      ``ValueError``.
      """
      _head, _sep, suffix = elm.rpartition("_")
      number = int(suffix)
      return number