Blame view

tools/scripts/ProcessSOLRQueries.py 2.35 KB
e6be5137b   Jean-François Rey   reinitialized pro...
1
2
3
4
5
6
7
8
9
10
11
12
  #!/usr/bin/python
  # -*- coding: utf-8 -*-
  
  from urllib2 import *
  import json
  from pprint import pprint
  import sys
  from solrinfo import *
  
  def _collectKeywords(jsonDoc):
      """Return the encoded entity-tag values of one parsed document.

      Walks ``jsonDoc['extractedData']['entityTags']`` (when both keys are
      present) and returns one UTF-8 encoded string per tag value, each
      terminated by a newline. Returns an empty list otherwise.
      """
      if 'extractedData' not in jsonDoc:
          return []
      extracted = jsonDoc['extractedData']
      if 'entityTags' not in extracted:
          return []
      entityTags = extracted['entityTags']
      found = []
      for keyword in entityTags:
          for item in entityTags[keyword]:
              for access in item:
                  found.append(item[access]['value'].encode("utf-8") + "\n")
      return found


  def _collectTexts(jsonDoc):
      """Return the encoded text fragments of one parsed document.

      Fragments are taken first from the paragraphs under
      ``content.body.textBlocks.textBlock`` and then from ``content.text``,
      in document order. Each fragment is UTF-8 encoded; no separator is
      added between fragments.
      """
      if 'content' not in jsonDoc:
          return []
      content = jsonDoc['content']
      found = []
      if 'body' in content:
          body = content['body']
          # 'textBlocks' is checked as well so that a body without it is
          # skipped like every other missing level instead of raising KeyError.
          if 'textBlocks' in body and 'textBlock' in body['textBlocks']:
              for block in body['textBlocks']['textBlock']:
                  if 'parag' not in block:
                      continue
                  for parag in block['parag']:
                      for textOrMedia in parag['textOrMultimediaRef']:
                          if 'text' in textOrMedia:
                              found.append(
                                  textOrMedia['text']['value'].encode("utf-8"))
      if 'text' in content:
          for txt in content['text']:
              found.append(txt['value'].encode("utf-8"))
      return found


  def obtainSOLRInfos(webName, query, timeout=30):
      """Query a SOLR core and gather keywords and texts from the results.

      Requests ``/<webName>/select?q=<query>&wt=json`` on the server
      identified by the ``machine`` and ``port`` names, which are not defined
      in this block (presumably provided by ``solrinfo`` -- confirm). The
      ``jsonDocument`` field of every returned document is parsed as JSON and
      scanned for entity tags and text content.

      Args:
          webName: First path segment of the request URL.
          query: Text placed verbatim after ``q=``.
          timeout: Timeout in seconds handed to ``urlopen`` (default 30).

      Returns:
          A ``(keywords, txts)`` tuple of UTF-8 encoded strings: the
          newline-terminated tag values and the concatenated text fragments
          of all returned documents.

      Raises:
          KeyError: If the response lacks ``response``/``docs`` or a document
              lacks ``jsonDocument``.
      """
      # NOTE(review): ``query`` is inserted without URL-escaping, so the
      # caller has to supply a string that is already safe to put in a URL.
      url = ('http://' + machine + ':' + port + '/' + webName
             + '/select?q=' + query + '&wt=json')
      print(url + ' timeout = ' + str(timeout) + ' ')
      conn = urlopen(url, timeout=timeout)
      try:
          rsp = json.load(conn)
      finally:
          # Release the connection even when the body is not valid JSON.
          conn.close()

      keywords = []
      txts = []
      for doc in rsp['response']['docs']:
          jsonDoc = json.loads(doc['jsonDocument'].encode("utf-8"))
          keywords.extend(_collectKeywords(jsonDoc))
          txts.extend(_collectTexts(jsonDoc))
      return "".join(keywords), "".join(txts)
  
  # Script body: for every query read from the input file, ask both SOLR cores
  # for matching documents and append the extracted keywords and texts to the
  # two output files.
  if len(sys.argv) != 4:
      print("BAD USAGE: <(i) input queries> <(o) keywords out file> <(o) text out file>\n")
      sys.exit(-1)

  fileName = sys.argv[1]
  keywordsFile = sys.argv[2]
  txtsFile = sys.argv[3]

  # The ``with`` blocks close all three files even when a request raises.
  with open(fileName, "r") as inFile:
      with open(keywordsFile, "w") as outKeywords:
          with open(txtsFile, "w") as outTxts:
              for query in inFile:
                  query = query.rstrip()
                  if not query:
                      # A blank line carries no query; do not send an empty
                      # ``q=`` request for it.
                      continue
                  for webName in ('solr-otmedia-document',
                                  'solr-otmedia-multimedia'):
                      keywords, txt = obtainSOLRInfos(webName, query)
                      outKeywords.write(keywords)
                      outTxts.write(txt)