'''
Created on 1 August 2012
@author: gerard
'''
from xml.dom.minidom import parseString
import logging
# Module-level logger; the name 'document' is kept so existing logging
# configuration keeps applying to this module.
logger = logging.getLogger('document')


class WebLabParser(object):
    '''Extract concepts and their documents from a WebLab XML result set.

    The XML is parsed with ``xml.dom.minidom`` and elements are looked up by
    their qualified tag name (``prefix:local``).  The prefix bound to the
    WebLab retrieval ontology is discovered by ``get_xmlns`` and stored in
    ``self._ns``; the ``rdf``, ``dc`` and ``wlr`` prefixes are hard-coded.

    Elements that are absent or have no text yield a default value ('' or,
    for scores, '0') instead of raising.
    '''

    # Namespace URI whose prefix is used for hasScore, hasRank,
    # hasDescription and isLinkedTo look-ups.
    _RETRIEVAL_NS_URI = 'http://weblab.ow2.org/core/1.2/ontology/retrieval#'

    # Default prefix, so the get_* methods can be called before get_xmlns().
    _ns = ''

    def parse(self, xml):
        '''Parse *xml* and return ``(concepts, documents_per_concept)``.

        :param xml: the XML document, as accepted by ``minidom.parseString``.
        :returns: a 2-tuple made of the result of ``get_concepts`` and the
            result of ``get_documents_concepts_`` for the parsed document.
        '''
        dom = parseString(xml)
        self.get_xmlns(dom)
        list_concepts = self.get_concepts(dom)
        documents_concepts_list = self.get_documents_concepts_(dom)
        return (list_concepts, documents_concepts_list)

    def get_xmlns(self, dom):
        '''Store in ``self._ns`` the prefix bound to the retrieval ontology.

        Every ``rdf:RDF`` element is scanned for an ``xmlns:<prefix>``
        declaration whose value is the retrieval ontology URI; the first
        match wins.  ``self._ns`` is reset to '' when none is found.

        :param dom: a minidom document (or node) to search.
        '''
        self._ns = ''
        for rdf in dom.getElementsByTagName('rdf:RDF'):
            for key, value in rdf.attributes.items():
                if value != self._RETRIEVAL_NS_URI:
                    continue
                parts = key.split(':')
                # A default namespace declaration ("xmlns") has no prefix,
                # and a non-xmlns attribute is not a declaration at all.
                if len(parts) < 2 or parts[0] != 'xmlns':
                    continue
                self._ns = parts[1]
                return

    def get_concepts(self, dom):
        '''Return the concepts described in the result set annotations.

        Only ``annotation`` elements that are direct children of a
        ``resultSet`` element are considered, and only the
        ``rdf:Description`` elements that carry a non-empty ``dc:title``.

        :param dom: a minidom document (or node) to search.
        :returns: a list of dicts with the keys ``about``, ``title``,
            ``abstract``, ``url_image`` and ``score`` ('0' when absent).
        '''
        concept_list = []
        for annotation in dom.getElementsByTagName('annotation'):
            if annotation.parentNode.localName != 'resultSet':
                continue
            descriptions = annotation.getElementsByTagName('rdf:Description')
            for description in descriptions:
                title = self._last_text(description, 'dc:title')
                if title == '':
                    # Descriptions without a title are not concepts.
                    continue
                # Computed afresh for every description so that a concept
                # without image never inherits the previous concept's one.
                url_image = self._native_content(description)
                score = self._last_text(
                    description, self._qname('hasScore'), '0')
                logger.debug('a_url_image: %s', url_image)
                logger.debug('a_score: %s', score)
                concept_list.append({
                    'about': description.getAttribute('rdf:about'),
                    'title': title,
                    'abstract': self._last_text(
                        description, self._qname('hasDescription')),
                    'url_image': url_image,
                    'score': score,
                })
        logger.info('concept_list')
        logger.info(concept_list)
        return concept_list

    def get_documents_concepts_(self, dom):
        '''Return, for every concept, the documents (hits) attached to it.

        Each ``rdf:Bag`` found in a ``resultSet`` stands for one concept; its
        ``rdf:li`` children reference, through ``rdf:resource``, the
        ``rdf:about`` of the ``rdf:Description`` elements of its documents.

        :param dom: a minidom document (or node) to search.
        :returns: a list with one entry per ``rdf:Bag``, each entry being a
            list of dicts with the keys ``id``, ``score``, ``rank``,
            ``isLinkedTo``, ``image_path`` and ``abstract``.
        '''
        concepts_with_documents_list = []
        for result_set in dom.getElementsByTagName('resultSet'):
            # Does not depend on the bag: collected once per result set.
            descriptions = result_set.getElementsByTagName('rdf:Description')
            for rdf_bag in result_set.getElementsByTagName('rdf:Bag'):
                hits = set(
                    rdf_li.getAttribute('rdf:resource')
                    for rdf_li in rdf_bag.getElementsByTagName('rdf:li'))
                concept_documents_list = [
                    self._document_entry(description)
                    for description in descriptions
                    if description.getAttribute('rdf:about') in hits
                ]
                concepts_with_documents_list.append(concept_documents_list)
        logger.info('concepts_with_documents_list')
        logger.info(concepts_with_documents_list)
        return concepts_with_documents_list

    def get_document_ids_concepts_(self, dom, list_links_concepts):
        '''Return the document identifiers of the resources of each concept.

        :param dom: a minidom document (or node) to search.
        :param list_links_concepts: one collection of resource URIs per
            concept.
        :returns: a list parallel to *list_links_concepts*; entry ``i`` holds
            the ``dc:identifier`` texts of every ``resource`` element whose
            ``uri`` attribute belongs to ``list_links_concepts[i]``.
        '''
        document_ids_concepts_list = [[] for _ in list_links_concepts]
        for resource in dom.getElementsByTagName('resource'):
            uri = resource.getAttribute('uri')
            matching = [
                index
                for index, links in enumerate(list_links_concepts)
                if uri in links
            ]
            if not matching:
                continue
            document_ids = self._texts(resource, 'dc:identifier')
            for index in matching:
                document_ids_concepts_list[index].extend(document_ids)
        return document_ids_concepts_list

    def _qname(self, local_name):
        '''Return *local_name* qualified with the retrieval prefix.'''
        return self._ns + ':' + local_name

    def _document_entry(self, description):
        '''Build the dict describing the document held by *description*.'''
        # NOTE: a disabled variant of this code resolved the image through
        # wls:hasRelevantMediaUnit / wlp:hasExposedContent instead.
        image_path = self._native_content(description)
        logger.debug('img_internal_path: %s', image_path)
        linked_elts = description.getElementsByTagName(
            self._qname('isLinkedTo'))
        is_linked_to = ''
        if linked_elts:
            is_linked_to = linked_elts[0].getAttribute('rdf:resource')
        return {
            'id': self._first_text(description, 'dc:identifier'),
            'score': self._first_text(
                description, self._qname('hasScore'), '0'),
            'rank': self._first_text(description, self._qname('hasRank')),
            'isLinkedTo': is_linked_to,
            'image_path': image_path,
            'abstract': self._first_text(
                description, self._qname('hasDescription')),
        }

    @classmethod
    def _native_content(cls, description):
        '''Return the native content of the first mediaUnit, '' if none.'''
        media_units = description.getElementsByTagName('mediaUnit')
        if not media_units:
            return ''
        # Only the first media unit is examined.
        return cls._first_text(media_units[0], 'wlr:hasNativeContent')

    @classmethod
    def _first_text(cls, parent, tag_name, default=''):
        '''Return the text of the first non-empty *tag_name* descendant.'''
        texts = cls._texts(parent, tag_name)
        return texts[0] if texts else default

    @classmethod
    def _last_text(cls, parent, tag_name, default=''):
        '''Return the text of the last non-empty *tag_name* descendant.'''
        texts = cls._texts(parent, tag_name)
        return texts[-1] if texts else default

    @staticmethod
    def _texts(parent, tag_name):
        '''Return the first-child data of each *tag_name* descendant.

        Elements without children, or whose first child carries no ``data``
        (e.g. a nested element), are skipped.
        '''
        texts = []
        for element in parent.getElementsByTagName(tag_name):
            data = getattr(element.firstChild, 'data', None)
            if data is not None:
                texts.append(data)
        return texts