# Extracted from the Mercurial diff 94f586daa623 -> 8ca7f2cea729
# (Thu Jan 24 16:58:55 2013 +0100), which added this file as
# alcatel/dataparser/WebLabParser.py.
"""Parser for WebLab result-set XML documents.

Created on 1 August 2012

@author: gerard
"""
from xml.dom.minidom import parseString
import logging

logger = logging.getLogger('document')

# Namespace URI whose declared prefix qualifies the hasScore, hasRank,
# hasDescription and isLinkedTo elements read by the parser.
_RETRIEVAL_NS_URI = 'http://weblab.ow2.org/core/1.2/ontology/retrieval#'


class WebLabParser(object):
    """Extract concepts and their associated documents from WebLab XML."""

    # Prefix bound to _RETRIEVAL_NS_URI; refreshed by get_xmlns().  The class
    # level default lets the getters be called before get_xmlns() has run.
    _ns = ''

    def parse(self, xml):
        """Parse *xml* and return ``(concepts, documents_per_concept)``.

        :param xml: the XML document, as accepted by ``minidom.parseString``.
        :returns: a 2-tuple made of the result of :meth:`get_concepts` and
            the result of :meth:`get_documents_concepts_`.
        :raises xml.parsers.expat.ExpatError: if *xml* is not well formed.
        """
        # NOTE(review): minidom is not hardened against maliciously
        # constructed XML; confirm that *xml* comes from a trusted source.
        dom = parseString(xml)
        self.get_xmlns(dom)
        list_concepts = self.get_concepts(dom)
        documents_concepts_list = self.get_documents_concepts_(dom)
        return (list_concepts, documents_concepts_list)

    def get_xmlns(self, dom):
        """Store in ``self._ns`` the prefix bound to the retrieval namespace.

        The ``rdf:RDF`` elements of *dom* are scanned for an attribute whose
        value is the retrieval namespace URI; the first match wins.
        ``self._ns`` is left empty when no such attribute exists or when the
        namespace is declared as the default one (plain ``xmlns``).
        """
        self._ns = ''
        for rdf in dom.getElementsByTagName('rdf:RDF'):
            for key, value in rdf.attributes.items():
                if value == _RETRIEVAL_NS_URI:
                    # 'xmlns:wls' -> 'wls'; a bare 'xmlns' has no prefix and
                    # yields '' instead of raising IndexError.
                    self._ns = key.partition(':')[2]
                    return

    def get_concepts(self, dom):
        """Return the concepts described directly under a ``resultSet``.

        Only ``rdf:Description`` elements located inside an ``annotation``
        whose parent is a ``resultSet`` are considered, and descriptions
        without a non-empty ``dc:title`` are skipped.

        :returns: a list of dicts with the keys ``about``, ``title``,
            ``abstract``, ``url_image`` and ``score``.  ``score`` defaults to
            ``'0'``; ``abstract`` and ``url_image`` default to ``''``.
        """
        concept_list = []
        for node in dom.getElementsByTagName('annotation'):
            if node.parentNode.localName != 'resultSet':
                continue
            for description in node.getElementsByTagName('rdf:Description'):
                # When an element is repeated, the last occurrence wins.
                title = self._child_text(description, 'dc:title', last=True)
                if title == '':
                    continue
                # Computed per description so that a concept without image
                # gets '' rather than the image of the previous concept.
                url_image = self._first_native_content(description)
                logger.debug('a_url_image: %s', url_image)
                score = self._child_text(
                    description, self._tag('hasScore'), '0', last=True)
                logger.debug('a_score: %s', score)
                abstract = self._child_text(
                    description, self._tag('hasDescription'), last=True)
                concept_list.append({
                    'about': description.getAttribute('rdf:about'),
                    'title': title,
                    'abstract': abstract,
                    'url_image': url_image,
                    'score': score,
                })
        logger.info('concept_list')
        logger.info(concept_list)
        return concept_list

    def get_documents_concepts_(self, dom):
        """Return, for every ``rdf:Bag``, the documents it references.

        Each ``rdf:Bag`` found under a ``resultSet`` lists resources through
        ``rdf:li`` elements.  The ``rdf:Description`` elements of the same
        ``resultSet`` whose ``rdf:about`` is one of these resources are turned
        into dicts, in document order.

        :returns: one list per bag; each item is a dict with the keys ``id``,
            ``score``, ``rank``, ``isLinkedTo``, ``image_path`` and
            ``abstract``.  Values that are absent from the XML are ``''``.
        """
        concepts_with_documents_list = []
        for node in dom.getElementsByTagName('resultSet'):
            # Identical for every bag of this result set: look it up once.
            description_elts = node.getElementsByTagName('rdf:Description')
            for rdf_bag in node.getElementsByTagName('rdf:Bag'):
                hits = {
                    rdf_li.getAttribute('rdf:resource')
                    for rdf_li in rdf_bag.getElementsByTagName('rdf:li')
                }
                concepts_with_documents_list.append([
                    self._document_from_description(description)
                    for description in description_elts
                    if description.getAttribute('rdf:about') in hits
                ])
        logger.info('concepts_with_documents_list')
        logger.info(concepts_with_documents_list)
        return concepts_with_documents_list

    def get_document_ids_concepts_(self, dom, list_links_concepts):
        """Group document identifiers by concept.

        :param list_links_concepts: one collection of resource URIs per
            concept.
        :returns: a list parallel to *list_links_concepts*; entry ``i`` holds
            the ``dc:identifier`` texts of every ``resource`` element whose
            ``uri`` attribute belongs to ``list_links_concepts[i]``.  Empty
            identifiers are ignored.
        """
        document_ids_concepts_list = [[] for _ in list_links_concepts]
        for node in dom.getElementsByTagName('resource'):
            uri = node.getAttribute('uri')
            for index, links in enumerate(list_links_concepts):
                if uri not in links:
                    continue
                for dc_identifier in node.getElementsByTagName('dc:identifier'):
                    document_id = self._node_text(dc_identifier)
                    if document_id:
                        document_ids_concepts_list[index].append(document_id)
        return document_ids_concepts_list

    def _tag(self, local_name):
        """Qualify *local_name* with the retrieval prefix, if there is one."""
        if self._ns:
            return self._ns + ':' + local_name
        return local_name

    @staticmethod
    def _node_text(element, default=''):
        """Return the data of *element*'s first child, or *default*."""
        children = element.childNodes
        if not children:
            return default
        # The first child may not be a text node; fall back to *default*.
        return getattr(children[0], 'data', default)

    def _child_text(self, parent, tag, default='', last=False):
        """Return the text of the first (or last) *tag* found under *parent*."""
        elements = parent.getElementsByTagName(tag)
        if not elements:
            return default
        return self._node_text(elements[-1 if last else 0], default)

    def _first_native_content(self, description, default=''):
        """Return the first native content of the first ``mediaUnit``."""
        mediaunit_elts = description.getElementsByTagName('mediaUnit')
        if not mediaunit_elts:
            return default
        # Only the first media unit is inspected.
        return self._child_text(
            mediaunit_elts[0], 'wlr:hasNativeContent', default)

    def _document_from_description(self, description):
        """Build the document dict for one ``rdf:Description`` element."""
        img_internal_path = self._first_native_content(description)
        logger.debug('img_internal_path: %s', img_internal_path)
        # The original author was unsure whether isLinkedTo is used
        # downstream; it is kept so that the output keeps the same keys.
        linked_elts = description.getElementsByTagName(self._tag('isLinkedTo'))
        is_linked_to = ''
        if linked_elts:
            is_linked_to = linked_elts[0].getAttribute('rdf:resource')
        return {
            'id': self._child_text(description, 'dc:identifier'),
            'score': self._child_text(description, self._tag('hasScore')),
            'rank': self._child_text(description, self._tag('hasRank')),
            'isLinkedTo': is_linked_to,
            'image_path': img_internal_path,
            'abstract': self._child_text(
                description, self._tag('hasDescription')),
        }