Blame view
tools/scripts/ProcessSOLRQueries.py
2.29 KB
e6be5137b reinitialized pro... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Run SOLR queries and dump the keywords and text of the matching documents.

Usage:
    ProcessSOLRQueries.py <input queries> <keywords out file> <text out file>

Each non-blank line of the input file is sent as a query to the
'solr-otmedia-document' and 'solr-otmedia-multimedia' cores; the entity-tag
values and the text found in the returned documents are appended to the two
output files.
"""

from urllib2 import *
import json
from pprint import pprint
import sys
from contextlib import closing

# NOTE(review): `machine` and `port` used in obtainSOLRInfos are not defined in
# this file; presumably this wildcard import provides them -- confirm. Both
# must be strings because they are concatenated into the request URL.
from solrinfo import *


def _extractKeywords(jsonDoc):
    """Return the entity-tag values of one document as a list of strings.

    Every value is UTF-8 encoded and followed by a single space, so the
    pieces can be joined directly.
    """
    if 'extractedData' not in jsonDoc:
        return []
    extractedData = jsonDoc['extractedData']
    if 'entityTags' not in extractedData:
        return []

    entityTags = extractedData['entityTags']
    pieces = []
    for keyword in entityTags:
        for item in entityTags[keyword]:
            for access in item:
                pieces.append(item[access]['value'].encode("utf-8") + " ")
    return pieces


def _extractTexts(jsonDoc):
    """Return the UTF-8 encoded text fragments of one document as a list.

    Fragments come first from content/body/textBlocks/textBlock paragraphs,
    then from content/text, in document order.
    """
    if 'content' not in jsonDoc:
        return []
    content = jsonDoc['content']
    pieces = []

    if 'body' in content:
        body = content['body']
        # A body without 'textBlocks' used to raise KeyError; it is now
        # skipped like every other missing level.
        if 'textBlocks' in body and 'textBlock' in body['textBlocks']:
            for block in body['textBlocks']['textBlock']:
                if 'parag' not in block:
                    continue
                for parag in block['parag']:
                    for textOrMedia in parag['textOrMultimediaRef']:
                        if 'text' in textOrMedia:
                            pieces.append(
                                textOrMedia['text']['value'].encode("utf-8"))

    if 'text' in content:
        for txt in content['text']:
            pieces.append(txt['value'].encode("utf-8"))

    return pieces


def obtainSOLRInfos(webName, query):
    """Query one SOLR core and collect keywords and text from the results.

    The request URL is printed before it is fetched.

    Args:
        webName: path component of the SOLR core, placed between the host
            and '/select'.
        query: value of the 'q' parameter, inserted into the URL as-is.

    Returns:
        A (keywords, txts) tuple of strings: the space-terminated entity-tag
        values and the concatenated text fragments of every returned document.
    """
    # NOTE(review): `query` is not URL-encoded here. Existing query files may
    # already be encoded, so quoting is left to the caller -- confirm.
    url = ('http://' + machine + ':' + port + '/' + webName
           + '/select?q=' + query + '&wt=json')
    print(url)

    # The urllib2 response is not a context manager in Python 2, so closing()
    # is what guarantees the connection is released.
    with closing(urlopen(url)) as conn:
        rsp = json.load(conn)

    keywords = []
    txts = []
    for doc in rsp['response']['docs']:
        # 'jsonDocument' holds a JSON document serialized as a string.
        jsonDoc = json.loads(doc['jsonDocument'].encode("utf-8"))
        keywords.extend(_extractKeywords(jsonDoc))
        txts.extend(_extractTexts(jsonDoc))

    return "".join(keywords), "".join(txts)


def main():
    """Read the queries file and write keywords and text to the output files."""
    if len(sys.argv) != 4:
        print("BAD USAGE: <(i) input queries> <(o) keywords out file> <(o) text out file> ")
        sys.exit(-1)

    fileName = sys.argv[1]
    keywordsFile = sys.argv[2]
    txtsFile = sys.argv[3]

    # Nested `with` blocks close all three files even if a request fails.
    with open(fileName, "r") as inFile:
        with open(keywordsFile, "w") as outKeywords:
            with open(txtsFile, "w") as outTxts:
                for query in inFile:
                    query = query.rstrip()
                    if not query:
                        # A blank line would send an empty 'q=' request.
                        continue
                    for core in ('solr-otmedia-document',
                                 'solr-otmedia-multimedia'):
                        keywords, txt = obtainSOLRInfos(core, query)
                        outKeywords.write(keywords)
                        outTxts.write(txt)


if __name__ == "__main__":
    main()