alcatel/dataparser/WebLabParser.py
author cobled@FRVILN0H401086.emea.lucent.com
Thu, 24 Jan 2013 16:58:55 +0100
changeset 27 8ca7f2cea729
child 37 3848e1813a30
permissions -rw-r--r--
add alcatel folder

'''
Created on 1 August 2012

@author: gerard
'''
from xml.dom.minidom import parseString
import logging
logger = logging.getLogger('document')

class WebLabParser(object):
    """Extract concepts and their documents from a WebLab result-set XML.

    The entry point is :meth:`parse`; the other public methods operate on an
    already parsed ``xml.dom.minidom`` document.

    Elements of the retrieval ontology (``hasScore``, ``hasRank``,
    ``hasDescription``, ``isLinkedTo``) are looked up by namespace URI rather
    than by prefix, so they are found whatever prefix (or default namespace)
    each ``rdf:RDF`` block binds to that URI.
    """

    # Namespace URI of the retrieval ontology whose elements carry the
    # score / rank / description / link information.
    RETRIEVAL_NS = 'http://weblab.ow2.org/core/1.2/ontology/retrieval#'

    def parse(self, xml):
        """Parse *xml* and return ``(concepts, documents_per_concept)``.

        :param xml: the XML document as a string.
        :returns: a 2-tuple made of the result of :meth:`get_concepts` and
            the result of :meth:`get_documents_concepts_`.
        """
        dom = parseString(xml)
        self.get_xmlns(dom)
        list_concepts = self.get_concepts(dom)
        documents_concepts_list = self.get_documents_concepts_(dom)
        return (list_concepts, documents_concepts_list)

    def get_xmlns(self, dom):
        """Record in ``self._ns`` the prefix bound to the retrieval namespace.

        Only the attributes of ``rdf:RDF`` elements are inspected and the
        first match wins. ``self._ns`` is ``''`` when no such attribute is
        found or when the namespace is declared as the default one
        (``xmlns="..."``, which has no prefix).

        The attribute is kept for backward compatibility; element lookups in
        this class are namespace-aware and do not depend on it.
        """
        self._ns = ''
        for rdf in dom.getElementsByTagName('rdf:RDF'):
            for key, value in rdf.attributes.items():
                if value == self.RETRIEVAL_NS:
                    # 'xmlns:wls' -> 'wls'; plain 'xmlns' -> '' (no IndexError).
                    self._ns = key.partition(':')[2]
                    return

    @staticmethod
    def _text_of(element, default=''):
        """Return the data of *element*'s first child node, or *default*.

        *default* is returned when the element is empty or when its first
        child carries no ``data`` (i.e. it is not a text-like node).
        """
        first = element.firstChild
        if first is None:
            return default
        return getattr(first, 'data', default)

    @classmethod
    def _first_text(cls, elements, default=''):
        """Return the text of the first node of *elements*, or *default*."""
        if not elements:
            return default
        return cls._text_of(elements[0], default)

    def _retrieval_elements(self, node, local_name):
        """Return descendants of *node* named *local_name* in RETRIEVAL_NS."""
        return node.getElementsByTagNameNS(self.RETRIEVAL_NS, local_name)

    @classmethod
    def _native_content(cls, description):
        """Return the native-content text attached to *description*.

        Only the first ``mediaUnit`` descendant is considered, and within it
        the first ``wlr:hasNativeContent`` element. ``''`` is returned when
        either is missing or empty.
        """
        media_units = description.getElementsByTagName('mediaUnit')
        if not media_units:
            return ''
        contents = media_units[0].getElementsByTagName('wlr:hasNativeContent')
        return cls._first_text(contents)

    def get_concepts(self, dom):
        """Return the list of concepts described in *dom*.

        Every ``rdf:Description`` found under an ``annotation`` element whose
        parent is a ``resultSet`` and which has a non-empty ``dc:title`` is
        turned into a dict with the keys ``about``, ``title``, ``abstract``,
        ``url_image`` and ``score``. When several title / score / description
        elements are present the last non-empty one is used. Missing values
        default to ``''`` (``'0'`` for the score).
        """
        concept_list = []
        for node in dom.getElementsByTagName('annotation'):
            if node.parentNode.localName != 'resultSet':
                continue
            for description in node.getElementsByTagName('rdf:Description'):
                atitle = ''
                for title in description.getElementsByTagName('dc:title'):
                    atitle = self._text_of(title)
                if not atitle:
                    # Descriptions without a title are not concepts.
                    continue

                a_score = '0'
                for wls_score in self._retrieval_elements(description,
                                                          'hasScore'):
                    a_score = self._text_of(wls_score, a_score)

                cluster_abstract = ''
                for has_description in self._retrieval_elements(
                        description, 'hasDescription'):
                    cluster_abstract = self._text_of(has_description,
                                                     cluster_abstract)

                # Computed per description so that a concept without image
                # never inherits the image of a previous concept.
                a_url_image = self._native_content(description)
                logger.debug('a_url_image: %s', a_url_image)
                logger.debug('a_score: %s', a_score)

                concept_list.append({
                    'about': description.getAttribute('rdf:about'),
                    'title': atitle,
                    'abstract': cluster_abstract,
                    'url_image': a_url_image,
                    'score': a_score,
                })
        logger.info('concept_list')
        logger.info(concept_list)
        return concept_list

    def _document_record(self, description):
        """Build the dict describing one document hit.

        Missing or empty fields default to ``''``, except ``score`` which
        defaults to ``'0'`` like in :meth:`get_concepts`.
        """
        # NOTE: an earlier, disabled variant resolved the image through
        # wls:hasRelevantMediaUnit / wlp:hasExposedContent; only the embedded
        # mediaUnit lookup is used.
        linked_to = self._retrieval_elements(description, 'isLinkedTo')
        return {
            'id': self._first_text(
                description.getElementsByTagName('dc:identifier')),
            'score': self._first_text(
                self._retrieval_elements(description, 'hasScore'), '0'),
            'rank': self._first_text(
                self._retrieval_elements(description, 'hasRank')),
            'isLinkedTo': (linked_to[0].getAttribute('rdf:resource')
                           if linked_to else ''),
            'image_path': self._native_content(description),
            'abstract': self._first_text(
                self._retrieval_elements(description, 'hasDescription')),
        }

    def get_documents_concepts_(self, dom):
        """Return, for each ``rdf:Bag``, the documents it references.

        For every ``resultSet`` element and every ``rdf:Bag`` below it, the
        ``rdf:resource`` values of the bag's ``rdf:li`` items are collected;
        each ``rdf:Description`` of that result set whose ``rdf:about`` is one
        of them yields a dict with the keys ``id``, ``score``, ``rank``,
        ``isLinkedTo``, ``image_path`` and ``abstract``.

        :returns: a list with one entry per bag, each entry being the list of
            document dicts in document order of the descriptions.
        """
        concepts_with_documents_list = []
        for node in dom.getElementsByTagName('resultSet'):
            # Same for every bag of this result set: look it up only once.
            description_elts = node.getElementsByTagName('rdf:Description')
            for rdf_bag in node.getElementsByTagName('rdf:Bag'):
                hits_of_a_concept = set(
                    rdf_li.getAttribute('rdf:resource')
                    for rdf_li in rdf_bag.getElementsByTagName('rdf:li'))
                concept_documents_list = []
                for description in description_elts:
                    if description.getAttribute('rdf:about') \
                            not in hits_of_a_concept:
                        continue
                    document = self._document_record(description)
                    logger.debug('img_internal_path: %s',
                                 document['image_path'])
                    concept_documents_list.append(document)
                concepts_with_documents_list.append(concept_documents_list)
        logger.info('concepts_with_documents_list')
        logger.info(concepts_with_documents_list)
        return concepts_with_documents_list

    def get_document_ids_concepts_(self, dom, list_links_concepts):
        """Return the document identifiers matching each list of links.

        :param dom: parsed document containing ``resource`` elements.
        :param list_links_concepts: one collection of resource URIs per
            concept.
        :returns: a list parallel to *list_links_concepts*; entry ``i`` holds
            the text of every non-empty ``dc:identifier`` found under the
            ``resource`` elements whose ``uri`` attribute belongs to
            ``list_links_concepts[i]``.
        """
        document_ids_concepts_list = [[] for _ in list_links_concepts]

        for node in dom.getElementsByTagName('resource'):
            uri = node.getAttribute('uri')
            document_ids = None  # extracted lazily, at most once per resource
            for index, links_of_a_concept in enumerate(list_links_concepts):
                if uri not in links_of_a_concept:
                    continue
                if document_ids is None:
                    document_ids = [
                        self._text_of(dc_identifier)
                        for dc_identifier
                        in node.getElementsByTagName('dc:identifier')
                        if dc_identifier.hasChildNodes()]
                document_ids_concepts_list[index].extend(document_ids)
        return document_ids_concepts_list