alcatel/dataparser/WebLabParser.py
changeset 27 8ca7f2cea729
child 37 3848e1813a30
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alcatel/dataparser/WebLabParser.py	Thu Jan 24 16:58:55 2013 +0100
@@ -0,0 +1,141 @@
+'''
+Created on 1 August 2012
+
+@author: gerard
+'''
+from xml.dom.minidom import parseString
+import logging
+logger = logging.getLogger('document')
+
+class WebLabParser(object):
+    '''Parse a WebLab result-set XML document into plain Python structures.
+
+    ``parse`` is the entry point. The other public methods expect
+    ``get_xmlns`` to have been called first, because they read the namespace
+    prefix it stores in ``self._ns``.
+    '''
+
+    # Namespace URI whose prefix qualifies hasScore, hasRank, hasDescription
+    # and isLinkedTo in the parsed documents.
+    _RETRIEVAL_NS = 'http://weblab.ow2.org/core/1.2/ontology/retrieval#'
+
+    def parse(self, xml):
+        '''Parse ``xml`` and return ``(concepts, documents_per_concept)``.
+
+        ``xml`` is handed to ``xml.dom.minidom.parseString``. The two members
+        of the returned tuple are the results of ``get_concepts`` and
+        ``get_documents_concepts_`` respectively.
+        '''
+        dom = parseString(xml)
+        self.get_xmlns(dom)
+        list_concepts = self.get_concepts(dom)
+        documents_concepts_list = self.get_documents_concepts_(dom)
+        return (list_concepts, documents_concepts_list)
+
+    def get_xmlns(self, dom):
+        '''Store in ``self._ns`` the prefix bound to the retrieval namespace.
+
+        Only the attributes of ``rdf:RDF`` elements are inspected; the first
+        attribute whose value equals the retrieval URI wins. ``self._ns`` is
+        ``''`` when no such attribute exists, or when the URI is declared as
+        the default namespace (a bare ``xmlns`` attribute), a case that used
+        to raise IndexError.
+        '''
+        self._ns = ''
+        for rdf in dom.getElementsByTagName('rdf:RDF'):
+            for key, value in rdf.attributes.items():
+                if value == self._RETRIEVAL_NS:
+                    # 'xmlns:wls' -> 'wls'; bare 'xmlns' -> ''
+                    self._ns = key.partition(':')[2]
+                    return
+
+    def _qname(self, local_name):
+        '''Return ``local_name`` prefixed with ``self._ns`` when it is set.
+
+        With an empty prefix the bare name is returned, rather than the
+        ``':name'`` form that can never match an element.
+        '''
+        if self._ns:
+            return self._ns + ':' + local_name
+        return local_name
+
+    @staticmethod
+    def _texts(parent, tag):
+        '''Return the first-child data of every ``tag`` descendant.
+
+        Elements that are empty, or whose first child carries no ``data``
+        (e.g. a nested element), are skipped instead of raising.
+        '''
+        return [elt.firstChild.data
+                for elt in parent.getElementsByTagName(tag)
+                if getattr(elt.firstChild, 'data', None) is not None]
+
+    def _text(self, parent, tag, default='', last=False):
+        '''Return the first (or, with ``last``, the last) entry of
+        ``_texts(parent, tag)``, or ``default`` when there is none.'''
+        texts = self._texts(parent, tag)
+        if not texts:
+            return default
+        return texts[-1] if last else texts[0]
+
+    def _native_content(self, description):
+        '''Return the first ``wlr:hasNativeContent`` text of the first
+        ``mediaUnit`` under ``description``, or ``''`` when there is none.'''
+        media_units = description.getElementsByTagName('mediaUnit')
+        if not media_units:
+            return ''
+        return self._text(media_units[0], 'wlr:hasNativeContent')
+
+    def get_concepts(self, dom):
+        '''Return one dict per titled ``rdf:Description`` located under an
+        ``annotation`` element whose parent is a ``resultSet``.
+
+        Each dict has the keys ``about``, ``title``, ``abstract``,
+        ``url_image`` and ``score``. A missing abstract or image defaults to
+        ``''`` and a missing score to ``'0'``. When an element is repeated,
+        the last title, score and abstract are kept. Descriptions without a
+        title are skipped.
+        '''
+        concept_list = []
+        for node in dom.getElementsByTagName('annotation'):
+            if node.parentNode.localName != 'resultSet':
+                continue
+            for description in node.getElementsByTagName('rdf:Description'):
+                atitle = self._text(description, 'dc:title', last=True)
+                if atitle == '':
+                    continue
+                # Resolved for each description: this value used to be left
+                # unbound (NameError) or stale from the previous concept when
+                # the description had no image.
+                a_url_image = self._native_content(description)
+                a_score = self._text(
+                    description, self._qname('hasScore'), '0', last=True)
+                cluster_abstract = self._text(
+                    description, self._qname('hasDescription'), last=True)
+                logger.debug('a_url_image=%s a_score=%s', a_url_image, a_score)
+                concept_list.append({
+                    'about': description.getAttribute('rdf:about'),
+                    'title': atitle,
+                    'abstract': cluster_abstract,
+                    'url_image': a_url_image,
+                    'score': a_score,
+                })
+        logger.info('concept_list')
+        logger.info(concept_list)
+        return concept_list
+
+    def get_documents_concepts_(self, dom):
+        '''Return, for every ``rdf:Bag`` of every ``resultSet``, the list of
+        documents that bag references.
+
+        A document is an ``rdf:Description`` of the same ``resultSet`` whose
+        ``rdf:about`` equals the ``rdf:resource`` of one of the bag's
+        ``rdf:li`` items. Each document is a dict with the keys ``id``,
+        ``score``, ``rank``, ``isLinkedTo``, ``image_path`` and ``abstract``.
+        Absent values default to ``''`` (``'0'`` for the score) instead of
+        raising IndexError.
+        '''
+        concepts_with_documents_list = []
+        for node in dom.getElementsByTagName('resultSet'):
+            # Identical for every bag of this resultSet, so looked up once.
+            description_elts = node.getElementsByTagName('rdf:Description')
+            for rdf_bag in node.getElementsByTagName('rdf:Bag'):
+                hits = set(li.getAttribute('rdf:resource')
+                           for li in rdf_bag.getElementsByTagName('rdf:li'))
+                concepts_with_documents_list.append(
+                    [self._document(description)
+                     for description in description_elts
+                     if description.getAttribute('rdf:about') in hits])
+        logger.info('concepts_with_documents_list')
+        logger.info(concepts_with_documents_list)
+        return concepts_with_documents_list
+
+    def _document(self, description):
+        '''Build one document dict for ``get_documents_concepts_``.'''
+        img_internal_path = self._native_content(description)
+        logger.debug('img_internal_path=%s', img_internal_path)
+        links = description.getElementsByTagName(self._qname('isLinkedTo'))
+        return {
+            'id': self._text(description, 'dc:identifier'),
+            'score': self._text(description, self._qname('hasScore'), '0'),
+            'rank': self._text(description, self._qname('hasRank')),
+            'isLinkedTo': links[0].getAttribute('rdf:resource') if links else '',
+            'image_path': img_internal_path,
+            'abstract': self._text(description, self._qname('hasDescription')),
+        }
+
+    def get_document_ids_concepts_(self, dom, list_links_concepts):
+        '''Return, for each entry of ``list_links_concepts``, the document
+        ids of the ``resource`` elements that entry references.
+
+        ``list_links_concepts`` is a sequence of URI collections. The result
+        holds one list per entry, in the same order, filled with the
+        ``dc:identifier`` texts of every ``resource`` element whose ``uri``
+        attribute belongs to that collection.
+        '''
+        document_ids_concepts_list = [[] for _ in list_links_concepts]
+        for node in dom.getElementsByTagName('resource'):
+            uri = node.getAttribute('uri')
+            for index, links in enumerate(list_links_concepts):
+                if uri in links:
+                    document_ids_concepts_list[index].extend(
+                        self._texts(node, 'dc:identifier'))
+        return document_ids_concepts_list
\ No newline at end of file