ProcessSOLRQueries.py 2.29 KB
#!/usr/bin/python
# -*- coding: utf-8 -*-

from urllib2 import *
import json
from pprint import pprint
import sys
from solrinfo import *

def _collectKeywords(jsonDoc):
    """Return the UTF-8 encoded entity-tag values of one parsed document.

    Walks jsonDoc['extractedData']['entityTags'], which is accessed as a
    mapping of tag name -> list of items, each item being a mapping whose
    values carry a 'value' field.  Every value gets a trailing newline.
    Returns an empty list when either of the two outer keys is absent.
    """
    if 'extractedData' not in jsonDoc:
        return []
    extracted = jsonDoc['extractedData']
    if 'entityTags' not in extracted:
        return []
    entityTags = extracted['entityTags']
    found = []
    for tagName in entityTags:
        for item in entityTags[tagName]:
            for access in item:
                found.append(item[access]['value'].encode("utf-8") + "\n")
    return found


def _collectTexts(jsonDoc):
    """Return the UTF-8 encoded text fragments of one parsed document.

    Two locations under jsonDoc['content'] are read, in this order:
      * body -> textBlocks -> textBlock[*] -> parag[*]
        -> textOrMultimediaRef[*] -> text -> value
      * text[*] -> value
    Fragments are returned as-is (no separator is added between them).
    """
    if 'content' not in jsonDoc:
        return []
    content = jsonDoc['content']
    pieces = []
    if 'body' in content:
        body = content['body']
        # Guard 'textBlocks' like every other level; a body without it used
        # to raise KeyError and abort the whole run.
        if 'textBlocks' in body and 'textBlock' in body['textBlocks']:
            for block in body['textBlocks']['textBlock']:
                if 'parag' not in block:
                    continue
                for parag in block['parag']:
                    for textOrMedia in parag['textOrMultimediaRef']:
                        if 'text' in textOrMedia:
                            pieces.append(
                                textOrMedia['text']['value'].encode("utf-8"))
    if 'text' in content:
        for txt in content['text']:
            pieces.append(txt['value'].encode("utf-8"))
    return pieces


def obtainSOLRInfos(webName, query):
    """Run one Solr select query and extract keywords and text from the hits.

    Parameters:
        webName -- path component placed between the host and '/select'.
        query   -- value placed after 'q=' in the request URL.

    Returns a tuple (keywords, txts) of UTF-8 encoded strings: keywords holds
    one entity-tag value per line, txts is the concatenation of all text
    fragments found in the returned documents.

    The host and port are read from the module-level names `machine` and
    `port` (both must be strings; presumably supplied by
    `from solrinfo import *` -- confirm).  The request URL is echoed to
    stdout before the request is made.

    Raises whatever urlopen / json raise on network or parse failure, and
    KeyError if the response lacks 'response'/'docs' or a hit lacks
    'jsonDocument'.
    """
    # NOTE(review): `query` is inserted verbatim, so it must already be
    # URL-safe; it is deliberately not quoted here because the input may be
    # pre-encoded and quoting would double-encode it.
    url = ('http://' + machine + ':' + port + '/' + webName
           + '/select?q=' + query + '&wt=json')
    # Parenthesised single-argument print behaves identically on Python 2.
    print(url)

    conn = urlopen(url)
    try:
        rsp = json.load(conn)
    finally:
        # Always release the HTTP connection, even if parsing fails.
        conn.close()

    keywords = []
    txts = []
    for doc in rsp['response']['docs']:
        # Each hit carries the full document as a JSON string in 'jsonDocument'.
        jsonDoc = json.loads(doc['jsonDocument'].encode("utf-8"))
        keywords.extend(_collectKeywords(jsonDoc))
        txts.extend(_collectTexts(jsonDoc))
    return "".join(keywords), "".join(txts)

# Script driver: for every query line of the input file, query both Solr
# cores and append the extracted keywords and text to the two output files.
if len(sys.argv) != 4:
    print("BAD USAGE: <(i) input queries> <(o) keywords out file> <(o) text out file>\n")
    sys.exit(-1)

fileName = sys.argv[1]
keywordsFile = sys.argv[2]
txtsFile = sys.argv[3]

# Nested `with` blocks guarantee all three files are closed, including when a
# Solr request raises part-way through the run (the input file was previously
# never closed at all).
with open(fileName, "r") as inFile:
    with open(keywordsFile, "w") as outKeywords:
        with open(txtsFile, "w") as outTxts:
            # Stream the input line by line instead of loading it whole.
            for query in inFile:
                query = query.rstrip()
                # Same order as before: document core first, then multimedia.
                for webName in ('solr-otmedia-document',
                                'solr-otmedia-multimedia'):
                    keywords, txt = obtainSOLRInfos(webName, query)
                    outKeywords.write(keywords)
                    outTxts.write(txt)