alcatel/dataparser/WebLabParser.py
author cobled
Wed, 14 Aug 2013 16:36:41 +0200
changeset 37 3848e1813a30
parent 27 8ca7f2cea729
permissions -rw-r--r--
last version
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
27
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
     1
'''
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
     2
Created on 1 aout 2012
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
     3
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
     4
@author: gerard
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
     5
'''
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
     6
from xml.dom.minidom import parseString
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
     7
import logging
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
     8
logger = logging.getLogger('document')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
     9
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    10
class WebLabParser(object):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    11
    
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    12
    def parse(self, xml):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    13
        dom = parseString(xml)
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    14
        self.get_xmlns(dom)
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    15
        list_concepts = self.get_concepts(dom)
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    16
        documents_concepts_list = self.get_documents_concepts_(dom)
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    17
        return (list_concepts, documents_concepts_list)
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    18
    
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    19
    def get_xmlns(self, dom):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    20
        self._ns = ''
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    21
        rdf_elts = dom.getElementsByTagName('rdf:RDF')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    22
        for rdf in rdf_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    23
            for key, value in rdf.attributes.items():
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    24
                if value == 'http://weblab.ow2.org/core/1.2/ontology/retrieval#':
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    25
                    full_ns = key.split(':')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    26
                    self._ns = full_ns[1]
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    27
                    return
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    28
        
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    29
                 
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    30
    def get_concepts(self, dom):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    31
        concept_list = []
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    32
        for node in dom.getElementsByTagName('annotation'):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    33
            parent = node.parentNode
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    34
            if parent.localName == 'resultSet':
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    35
                description_elts = node.getElementsByTagName('rdf:Description')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    36
                for description in description_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    37
                    about = description.getAttribute('rdf:about')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    38
                    dc_title_elts = description.getElementsByTagName('dc:title')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    39
                    mediaunit_elts = description.getElementsByTagName('mediaUnit')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    40
                    for mediaunit in mediaunit_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    41
                        has_native_content_elts = mediaunit.getElementsByTagName('wlr:hasNativeContent')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    42
                        for has_native_content in has_native_content_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    43
                            a_url_image = has_native_content.childNodes[0].data
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    44
                            break
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    45
                        break
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    46
                        
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    47
                    wls_score_elts = description.getElementsByTagName(self._ns + ':hasScore')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    48
                    atitle = ''
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    49
                    for title in dc_title_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    50
                        atitle = title.childNodes[0].data
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    51
                    a_score = '0'
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    52
                    for wls_score in wls_score_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    53
                        a_score = wls_score.childNodes[0].data
37
3848e1813a30 last version
cobled
parents: 27
diff changeset
    54
                        
3848e1813a30 last version
cobled
parents: 27
diff changeset
    55
                        
27
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    56
                    if atitle != '':
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    57
                        has_description_elts = description.getElementsByTagName(self._ns + ':hasDescription')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    58
                        cluster_abstract = ''
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    59
                        for has_description in has_description_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    60
                            cluster_abstract = has_description.childNodes[0].data
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    61
                        concept_list.append({'about':about,'title': atitle, 'abstract':cluster_abstract, 'url_image':a_url_image, 'score':a_score})
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    62
        return concept_list
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    63
                           
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    64
    def get_documents_concepts_(self, dom):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    65
        concepts_with_documents_list = []
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    66
        for node in dom.getElementsByTagName('resultSet'):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    67
            '''parent = node.parentNode
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    68
            if parent.localName == 'resultSet':'''
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    69
            rdf_bag_elts = node.getElementsByTagName('rdf:Bag')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    70
            for rdf_bag in rdf_bag_elts: # loop with the different concepts
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    71
                list_hits_of_a_concept = []
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    72
                concept_documents_list = []
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    73
                for rdf_li_elts in rdf_bag.getElementsByTagName('rdf:li'):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    74
                    list_hits_of_a_concept.append( rdf_li_elts.getAttribute('rdf:resource'))
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    75
            
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    76
                description_elts = node.getElementsByTagName('rdf:Description')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    77
                for description in description_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    78
                    about = description.getAttribute('rdf:about')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    79
                    if about in list_hits_of_a_concept:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    80
                        img_internal_path = ''
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    81
                        mediaunit_elts = description.getElementsByTagName('mediaUnit')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    82
                        for mediaunit in mediaunit_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    83
                            has_native_content_elts = mediaunit.getElementsByTagName('wlr:hasNativeContent')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    84
                            for has_native_content in has_native_content_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    85
                                img_internal_path = has_native_content.childNodes[0].data
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    86
                                break
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    87
                            break
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    88
                    
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    89
                        dc_identifier_elts = description.getElementsByTagName('dc:identifier')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    90
                        document_id = dc_identifier_elts[0].childNodes[0].data
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    91
                        hasScore_elts = description.getElementsByTagName(self._ns + ':hasScore')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    92
                        score = hasScore_elts[0].childNodes[0].data
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    93
                        hasRank_elts = description.getElementsByTagName(self._ns + ':hasRank')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    94
                        rank = hasRank_elts[0].childNodes[0].data
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    95
                        # Not used ?
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    96
                        wlt_elts = description.getElementsByTagName(self._ns + ':isLinkedTo')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    97
                        isLinkedTo = wlt_elts[0].getAttribute('rdf:resource')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    98
                        #
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
    99
                        hasDescription_elts = description.getElementsByTagName(self._ns + ':hasDescription')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   100
                        abstract = hasDescription_elts[0].childNodes[0].data
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   101
                        # 
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   102
                        
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   103
                        '''hasRelevantMediaUnit_elts = description.getElementsByTagName('wls:hasRelevantMediaUnit')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   104
                        if hasRelevantMediaUnit_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   105
                            media_uri = hasRelevantMediaUnit_elts[0].getAttribute('rdf:resource')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   106
                            media_unit_elts = dom.getElementsByTagName('mediaUnit')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   107
                            for media_unit in media_unit_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   108
                                uri = media_unit.getAttribute('uri')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   109
                                if uri == media_uri:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   110
                                    has_exposed_content_elts = media_unit.getElementsByTagName('wlp:hasExposedContent')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   111
                                    if has_exposed_content_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   112
                                        img_internal_path = has_exposed_content_elts[0].childNodes[0].data'''
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   113
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   114
                        concept_documents_list.append({'id':document_id, 'score':score, 'rank':rank, 'isLinkedTo':isLinkedTo, 'image_path':img_internal_path, 'abstract':abstract})  
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   115
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   116
                concepts_with_documents_list.append(concept_documents_list)
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   117
        return concepts_with_documents_list
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   118
            
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   119
    def get_document_ids_concepts_(self, dom, list_links_concepts):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   120
        document_ids_concepts_list = [] 
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   121
        for list_links_of_a_concept in list_links_concepts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   122
            empty_list = []
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   123
            document_ids_concepts_list.append(empty_list) 
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   124
                    
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   125
        for node in dom.getElementsByTagName('resource'):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   126
            uri = node.getAttribute('uri')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   127
            for index, list_links_of_a_concept in enumerate(list_links_concepts):
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   128
                if uri in list_links_of_a_concept:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   129
                    dc_identifier_elts = node.getElementsByTagName('dc:identifier')
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   130
                    for dc_identifier in dc_identifier_elts:
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   131
                        document_id = dc_identifier.childNodes[0].data
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   132
                        document_ids_concepts_list[index].append(document_id)
8ca7f2cea729 add alcatel folder
cobled@FRVILN0H401086.emea.lucent.com
parents:
diff changeset
   133
        return document_ids_concepts_list