Blame view

tools/scripts/ProcessSOLRQueries.py 2.29 KB
e6be5137b   Jean-François Rey   reinitialized pro...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
  #!/usr/bin/python
  # -*- coding: utf-8 -*-
  
  from urllib2 import *
  import json
  from pprint import pprint
  import sys
  from solrinfo import *
  
  def _extractKeywords(jsonDoc):
      """Return the UTF-8 encoded entity-tag values of one decoded document.

      Walks jsonDoc['extractedData']['entityTags'], a mapping whose values are
      lists of mappings; each inner entry contributes its 'value' field.
      Missing or empty 'extractedData' / 'entityTags' yield an empty list.
      """
      extracted = jsonDoc.get('extractedData') or {}
      entityTags = extracted.get('entityTags') or {}
      values = []
      for items in entityTags.values():
          for item in items:
              for entry in item.values():
                  values.append(entry['value'].encode("utf-8"))
      return values

  def _extractTexts(jsonDoc):
      """Return the UTF-8 encoded text fragments of one decoded document.

      Fragments are collected first from
      content.body.textBlocks.textBlock[*].parag[*].textOrMultimediaRef[*].text
      and then from content.text[*]; absent levels are skipped.
      """
      content = jsonDoc.get('content') or {}
      fragments = []
      body = content.get('body') or {}
      # 'textBlocks' may be absent from 'body'; treat that as "no blocks"
      # instead of raising KeyError.
      textBlocks = body.get('textBlocks') or {}
      for block in textBlocks.get('textBlock') or []:
          for parag in block.get('parag') or []:
              for textOrMedia in parag['textOrMultimediaRef']:
                  # Entries without a 'text' key are ignored.
                  if 'text' in textOrMedia:
                      fragments.append(textOrMedia['text']['value'].encode("utf-8"))
      for txt in content.get('text') or []:
          fragments.append(txt['value'].encode("utf-8"))
      return fragments

  def obtainSOLRInfos(webName, query):
      """Run one SOLR select query and gather keywords and text from the hits.

      Args:
          webName: path component naming the SOLR web application to query.
          query: value placed verbatim after 'q=' in the request URL. It is
              not URL-escaped here, so it must already be URL-safe.

      Returns:
          A (keywords, txts) tuple of strings: keywords holds every entity-tag
          value followed by a newline, txts holds all text fragments
          concatenated without separator.

      The request URL is printed before the request is sent. `machine` and
      `port` are module-level names (presumably provided by the `solrinfo`
      star import -- confirm); both must be strings.
      """
      url = 'http://'+machine+':'+port+'/' + webName + '/select?q='+ query + '&wt=json'
      print(url)
      conn = urlopen(url)
      try:
          rsp = json.load(conn)
      finally:
          # Release the HTTP connection even if the response is not valid JSON.
          conn.close()
      keywords = []
      txts = []
      for doc in rsp['response']['docs']:
          # Each hit stores its payload as a JSON string in 'jsonDocument'.
          jsonDoc = json.loads(doc['jsonDocument'].encode("utf-8"))
          keywords.extend(_extractKeywords(jsonDoc))
          txts.extend(_extractTexts(jsonDoc))
      return "".join(keyword + "\n" for keyword in keywords), "".join(txts)
  
  # Command-line driver: read one query per line from the input file, run each
  # query against both SOLR web applications, and append the collected keywords
  # and texts to the two output files.
  if len(sys.argv) != 4:
      print("BAD USAGE: <(i) input queries> <(o) keywords out file> <(o) text out file>\n")
      sys.exit(-1)

  fileName = sys.argv[1]
  keywordsFile = sys.argv[2]
  txtsFile = sys.argv[3]

  # 'with' closes all three files even when a request raises part-way through.
  with open(fileName, "r") as inFile:
      with open(keywordsFile, "w") as outKeywords:
          with open(txtsFile, "w") as outTxts:
              for query in inFile:
                  query = query.rstrip()
                  if not query:
                      # A blank line would produce an empty 'q=' request; skip it.
                      continue
                  # Same query against the document core, then the multimedia core.
                  for webName in ('solr-otmedia-document', 'solr-otmedia-multimedia'):
                      keywords, txt = obtainSOLRInfos(webName, query)
                      outKeywords.write(keywords)
                      outTxts.write(txt)