ProcessSOLRQueries.py
2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/python
# -*- coding: utf-8 -*-
from urllib2 import *
import json
from pprint import pprint
import sys
from solrinfo import *
def _entityTagValues(jsonDoc):
    """Return the UTF-8 encoded entity-tag values of one parsed document."""
    values = []
    if 'extractedData' not in jsonDoc:
        return values
    if 'entityTags' not in jsonDoc['extractedData']:
        return values
    entityTags = jsonDoc['extractedData']['entityTags']
    for tagName in entityTags:
        for item in entityTags[tagName]:
            for access in item:
                values.append(item[access]['value'].encode("utf-8"))
    return values


def _textFragments(jsonDoc):
    """Return the UTF-8 encoded text fragments of one parsed document."""
    fragments = []
    if 'content' not in jsonDoc:
        return fragments
    content = jsonDoc['content']
    if 'body' in content:
        # A body without 'textBlocks' simply contributes no text instead of
        # raising KeyError.
        textBlocks = content['body'].get('textBlocks') or {}
        if 'textBlock' in textBlocks:
            for block in textBlocks['textBlock']:
                if 'parag' not in block:
                    continue
                for parag in block['parag']:
                    for textOrMedia in parag['textOrMultimediaRef']:
                        if 'text' in textOrMedia:
                            fragments.append(
                                textOrMedia['text']['value'].encode("utf-8"))
    if 'text' in content:
        for txt in content['text']:
            fragments.append(txt['value'].encode("utf-8"))
    return fragments


def obtainSOLRInfos(webName, query):
    """Run `query` against the SOLR core `webName`; collect keywords and text.

    The request goes to http://<machine>:<port>/<webName>/select with
    wt=json and a 30 second timeout. `machine` and `port` are module-level
    names not defined in this function (presumably supplied by
    `from solrinfo import *` -- confirm).

    Parameters:
        webName -- name of the SOLR web application / core to query.
        query   -- the value for the `q` parameter. Characters that are not
                   legal in a URL (spaces, quotes, non-ASCII bytes, ...) are
                   percent-encoded before the request is sent.

    Returns:
        A (keywords, txts) tuple of byte strings. `keywords` holds every
        entity-tag value followed by a newline; `txts` holds every text
        fragment concatenated without a separator.

    Raises:
        Whatever urlopen / json raise on network or decoding failures, and
        KeyError when the response lacks the expected structure.
    """
    # Local import keeps this function self-contained (Python 2 stdlib).
    from urllib import quote

    # '%', '&', '=' and '+' stay untouched so that queries which are already
    # percent-encoded, or which carry extra parameters, are sent unchanged.
    encodedQuery = quote(query, safe="!#$%&'()*+,/:;=?@~")
    url = 'http://%s:%s/%s/select?q=%s&wt=json' % (
        machine, port, webName, encodedQuery)
    print(url + ' timeout = 30 ')

    conn = urlopen(url, timeout=30)
    try:
        rsp = json.load(conn)
    finally:
        # Release the connection even when the body is not valid JSON.
        conn.close()

    keywordParts = []
    textParts = []
    for doc in rsp['response']['docs']:
        # Each hit stores its payload as a JSON string in 'jsonDocument'.
        jsonDoc = json.loads(doc['jsonDocument'].encode("utf-8"))
        for value in _entityTagValues(jsonDoc):
            keywordParts.append(value + "\n")
        textParts.extend(_textFragments(jsonDoc))
    return "".join(keywordParts), "".join(textParts)
# Command-line driver: read one SOLR query per line from the input file and
# append the keywords / text returned for each query to the two output files.
if len(sys.argv) != 4:
    print("BAD USAGE: <(i) input queries> <(o) keywords out file> <(o) text out file>\n")
    sys.exit(-1)

fileName = sys.argv[1]
keywordsFile = sys.argv[2]
txtsFile = sys.argv[3]

# `with` guarantees all three files are closed even if a request fails.
with open(fileName, "r") as inFile, \
        open(keywordsFile, "w") as outKeywords, \
        open(txtsFile, "w") as outTxts:
    # Iterate the file lazily instead of loading every line up front.
    for query in inFile:
        query = query.rstrip()
        if not query:
            # A blank line is not a query; skip it rather than send `q=`.
            continue
        # Each query is run against both cores, in this order.
        for webName in ('solr-otmedia-document', 'solr-otmedia-multimedia'):
            keywords, txt = obtainSOLRInfos(webName, query)
            outKeywords.write(keywords)
            outTxts.write(txt)