--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/alcatel/dataparser/WebLabParser.py Thu Jan 24 16:58:55 2013 +0100
@@ -0,0 +1,190 @@
+'''
+Created on 1 aout 2012
+
+@author: gerard
+'''
+from xml.dom.minidom import parseString
+import logging
+logger = logging.getLogger('document')
+
+
+class WebLabParser(object):
+    '''Parse a WebLab result-set XML document with xml.dom.minidom.
+
+    parse() is the entry point: it returns the concepts described in the
+    annotations of the result set and, for every rdf:Bag, the documents
+    that the bag references.
+    '''
+
+    # Namespace URI whose prefix get_xmlns() looks up on rdf:RDF elements.
+    _RETRIEVAL_NS_URI = 'http://weblab.ow2.org/core/1.2/ontology/retrieval#'
+
+    # Prefix bound to _RETRIEVAL_NS_URI; '' until get_xmlns() finds one.
+    _ns = ''
+
+    def parse(self, xml):
+        '''Parse an XML string.
+
+        Returns the tuple (concepts, documents_per_concept) built by
+        get_concepts() and get_documents_concepts_().
+        '''
+        dom = parseString(xml)
+        self.get_xmlns(dom)
+        list_concepts = self.get_concepts(dom)
+        documents_concepts_list = self.get_documents_concepts_(dom)
+        return (list_concepts, documents_concepts_list)
+
+    def get_xmlns(self, dom):
+        '''Store in self._ns the prefix declared for the retrieval namespace.
+
+        Only attributes of rdf:RDF elements are inspected and the first
+        match wins. self._ns is '' when the namespace is not declared or is
+        declared without a prefix (plain "xmlns").
+        '''
+        self._ns = ''
+        for rdf in dom.getElementsByTagName('rdf:RDF'):
+            for key, value in rdf.attributes.items():
+                if value == self._RETRIEVAL_NS_URI:
+                    # partition() yields '' for a bare "xmlns" attribute,
+                    # where split(':')[1] used to raise IndexError.
+                    self._ns = key.partition(':')[2]
+                    return
+
+    def get_concepts(self, dom):
+        '''Return one dict per titled rdf:Description of the result set.
+
+        Only annotation elements whose parent is a resultSet are read. Each
+        dict has the keys 'about', 'title', 'abstract', 'url_image' and
+        'score'; descriptions without a dc:title text are skipped. When an
+        element is repeated, the last occurrence is used.
+        '''
+        concept_list = []
+        for annotation in dom.getElementsByTagName('annotation'):
+            if annotation.parentNode.localName != 'resultSet':
+                continue
+            for description in annotation.getElementsByTagName('rdf:Description'):
+                titles = description.getElementsByTagName('dc:title')
+                title = self._text_at(titles, -1)
+                if title == '':
+                    continue
+                scores = self._retrieval_elements(description, 'hasScore')
+                abstracts = self._retrieval_elements(description, 'hasDescription')
+                concept_list.append({
+                    'about': description.getAttribute('rdf:about'),
+                    'title': title,
+                    'abstract': self._text_at(abstracts, -1),
+                    # '' when there is no native content; this value used to
+                    # be unbound, or left over from the previous description.
+                    'url_image': self._native_content(description),
+                    'score': self._text_at(scores, -1, '0'),
+                })
+        logger.info('concept_list')
+        logger.info(concept_list)
+        return concept_list
+
+    def get_documents_concepts_(self, dom):
+        '''Return, for every rdf:Bag of every resultSet, its documents.
+
+        The result has one inner list per rdf:Bag. An inner list holds one
+        dict per rdf:Description of the same resultSet whose rdf:about is
+        referenced by an rdf:li of the bag, in document order. Each dict has
+        the keys 'id', 'score', 'rank', 'isLinkedTo', 'image_path' and
+        'abstract'; a missing or empty element yields '' ('0' for score).
+        '''
+        concepts_with_documents_list = []
+        for result_set in dom.getElementsByTagName('resultSet'):
+            # Identical for every bag of this result set, so fetched once.
+            descriptions = result_set.getElementsByTagName('rdf:Description')
+            for rdf_bag in result_set.getElementsByTagName('rdf:Bag'):
+                hits = set(
+                    rdf_li.getAttribute('rdf:resource')
+                    for rdf_li in rdf_bag.getElementsByTagName('rdf:li')
+                )
+                concepts_with_documents_list.append([
+                    self._document(description)
+                    for description in descriptions
+                    if description.getAttribute('rdf:about') in hits
+                ])
+        logger.info('concepts_with_documents_list')
+        logger.info(concepts_with_documents_list)
+        return concepts_with_documents_list
+
+    def get_document_ids_concepts_(self, dom, list_links_concepts):
+        '''Group the dc:identifier texts of resource elements by concept.
+
+        list_links_concepts holds one collection of URIs per concept. The
+        result has one list per concept, in the same order, filled with the
+        dc:identifier texts of every resource element whose 'uri' attribute
+        is in that concept's collection.
+        '''
+        document_ids_concepts_list = [[] for _ in list_links_concepts]
+        for resource in dom.getElementsByTagName('resource'):
+            uri = resource.getAttribute('uri')
+            document_ids = None
+            for index, links in enumerate(list_links_concepts):
+                if uri not in links:
+                    continue
+                if document_ids is None:
+                    # Read lazily, once per resource that matches a concept.
+                    identifiers = resource.getElementsByTagName('dc:identifier')
+                    document_ids = [
+                        identifier.childNodes[0].data
+                        for identifier in identifiers
+                        if identifier.childNodes
+                    ]
+                document_ids_concepts_list[index].extend(document_ids)
+        return document_ids_concepts_list
+
+    def _retrieval_elements(self, node, local_name):
+        '''Return the descendants of node named with the prefix in self._ns.'''
+        if self._ns:
+            tag_name = self._ns + ':' + local_name
+        else:
+            # No prefix known: look the element up by its bare name.
+            tag_name = local_name
+        return node.getElementsByTagName(tag_name)
+
+    @staticmethod
+    def _text_at(elements, index, default=''):
+        '''Return the data of the first child node of elements[index].
+
+        default is returned when elements is empty or the selected element
+        has no child node.
+        '''
+        if not elements:
+            return default
+        children = elements[index].childNodes
+        if not children:
+            return default
+        return children[0].data
+
+    def _native_content(self, description):
+        '''Return the first wlr:hasNativeContent text of the first mediaUnit.
+
+        Returns '' when description has no mediaUnit or the first one has
+        no native content.
+        '''
+        mediaunits = description.getElementsByTagName('mediaUnit')
+        if not mediaunits:
+            return ''
+        contents = mediaunits[0].getElementsByTagName('wlr:hasNativeContent')
+        native_content = self._text_at(contents, 0)
+        # Replaces the former debug "print" statements (Python 2 only).
+        logger.debug('native content: %s', native_content)
+        return native_content
+
+    def _document(self, description):
+        '''Build the dict describing one document rdf:Description.'''
+        identifiers = description.getElementsByTagName('dc:identifier')
+        scores = self._retrieval_elements(description, 'hasScore')
+        ranks = self._retrieval_elements(description, 'hasRank')
+        links = self._retrieval_elements(description, 'isLinkedTo')
+        abstracts = self._retrieval_elements(description, 'hasDescription')
+        return {
+            'id': self._text_at(identifiers, 0),
+            'score': self._text_at(scores, 0, '0'),
+            'rank': self._text_at(ranks, 0),
+            'isLinkedTo': links[0].getAttribute('rdf:resource') if links else '',
+            'image_path': self._native_content(description),
+            'abstract': self._text_at(abstracts, 0),
+        }
\ No newline at end of file