alcatel/dataparser/WebLabParser.py
changeset 27 8ca7f2cea729
child 37 3848e1813a30
equal deleted inserted replaced
26:94f586daa623 27:8ca7f2cea729
       
     1 '''
       
     2 Created on 1 aout 2012
       
     3 
       
     4 @author: gerard
       
     5 '''
       
     6 from xml.dom.minidom import parseString
       
     7 import logging
       
     8 logger = logging.getLogger('document')
       
     9 
       
    10 class WebLabParser(object):
       
    11     
       
    12     def parse(self, xml):
       
    13         dom = parseString(xml)
       
    14         self.get_xmlns(dom)
       
    15         list_concepts = self.get_concepts(dom)
       
    16         documents_concepts_list = self.get_documents_concepts_(dom)
       
    17         return (list_concepts, documents_concepts_list)
       
    18     
       
    19     def get_xmlns(self, dom):
       
    20         self._ns = ''
       
    21         rdf_elts = dom.getElementsByTagName('rdf:RDF')
       
    22         for rdf in rdf_elts:
       
    23             for key, value in rdf.attributes.items():
       
    24                 if value == 'http://weblab.ow2.org/core/1.2/ontology/retrieval#':
       
    25                     full_ns = key.split(':')
       
    26                     self._ns = full_ns[1]
       
    27                     return
       
    28         
       
    29                  
       
    30     def get_concepts(self, dom):
       
    31         concept_list = []
       
    32         for node in dom.getElementsByTagName('annotation'):
       
    33             parent = node.parentNode
       
    34             if parent.localName == 'resultSet':
       
    35                 description_elts = node.getElementsByTagName('rdf:Description')
       
    36                 for description in description_elts:
       
    37                     about = description.getAttribute('rdf:about')
       
    38                     dc_title_elts = description.getElementsByTagName('dc:title')
       
    39                     mediaunit_elts = description.getElementsByTagName('mediaUnit')
       
    40                     for mediaunit in mediaunit_elts:
       
    41                         has_native_content_elts = mediaunit.getElementsByTagName('wlr:hasNativeContent')
       
    42                         for has_native_content in has_native_content_elts:
       
    43                             a_url_image = has_native_content.childNodes[0].data
       
    44                             print 'a_url_image'
       
    45                             print a_url_image
       
    46                             break
       
    47                         break
       
    48                         
       
    49                     wls_score_elts = description.getElementsByTagName(self._ns + ':hasScore')
       
    50                     atitle = ''
       
    51                     for title in dc_title_elts:
       
    52                         atitle = title.childNodes[0].data
       
    53                     a_score = '0'
       
    54                     for wls_score in wls_score_elts:
       
    55                         a_score = wls_score.childNodes[0].data
       
    56                         print 'a_score'
       
    57                         print a_score
       
    58                     if atitle != '':
       
    59                         has_description_elts = description.getElementsByTagName(self._ns + ':hasDescription')
       
    60                         cluster_abstract = ''
       
    61                         for has_description in has_description_elts:
       
    62                             cluster_abstract = has_description.childNodes[0].data
       
    63                         concept_list.append({'about':about,'title': atitle, 'abstract':cluster_abstract, 'url_image':a_url_image, 'score':a_score})
       
    64         logger.info('concept_list')
       
    65         logger.info(concept_list)
       
    66         return concept_list
       
    67                            
       
    68     def get_documents_concepts_(self, dom):
       
    69         concepts_with_documents_list = []
       
    70         for node in dom.getElementsByTagName('resultSet'):
       
    71             '''parent = node.parentNode
       
    72             if parent.localName == 'resultSet':'''
       
    73             rdf_bag_elts = node.getElementsByTagName('rdf:Bag')
       
    74             for rdf_bag in rdf_bag_elts: # loop with the different concepts
       
    75                 list_hits_of_a_concept = []
       
    76                 concept_documents_list = []
       
    77                 for rdf_li_elts in rdf_bag.getElementsByTagName('rdf:li'):
       
    78                     list_hits_of_a_concept.append( rdf_li_elts.getAttribute('rdf:resource'))
       
    79             
       
    80                 description_elts = node.getElementsByTagName('rdf:Description')
       
    81                 for description in description_elts:
       
    82                     about = description.getAttribute('rdf:about')
       
    83                     if about in list_hits_of_a_concept:
       
    84                         img_internal_path = ''
       
    85                         mediaunit_elts = description.getElementsByTagName('mediaUnit')
       
    86                         for mediaunit in mediaunit_elts:
       
    87                             has_native_content_elts = mediaunit.getElementsByTagName('wlr:hasNativeContent')
       
    88                             for has_native_content in has_native_content_elts:
       
    89                                 img_internal_path = has_native_content.childNodes[0].data
       
    90                                 print 'img_internal_path'
       
    91                                 print img_internal_path
       
    92                                 break
       
    93                             break
       
    94                     
       
    95                         dc_identifier_elts = description.getElementsByTagName('dc:identifier')
       
    96                         document_id = dc_identifier_elts[0].childNodes[0].data
       
    97                         hasScore_elts = description.getElementsByTagName(self._ns + ':hasScore')
       
    98                         score = hasScore_elts[0].childNodes[0].data
       
    99                         hasRank_elts = description.getElementsByTagName(self._ns + ':hasRank')
       
   100                         rank = hasRank_elts[0].childNodes[0].data
       
   101                         # Not used ?
       
   102                         wlt_elts = description.getElementsByTagName(self._ns + ':isLinkedTo')
       
   103                         isLinkedTo = wlt_elts[0].getAttribute('rdf:resource')
       
   104                         #
       
   105                         hasDescription_elts = description.getElementsByTagName(self._ns + ':hasDescription')
       
   106                         abstract = hasDescription_elts[0].childNodes[0].data
       
   107                         # 
       
   108                         
       
   109                         '''hasRelevantMediaUnit_elts = description.getElementsByTagName('wls:hasRelevantMediaUnit')
       
   110                         if hasRelevantMediaUnit_elts:
       
   111                             media_uri = hasRelevantMediaUnit_elts[0].getAttribute('rdf:resource')
       
   112                             media_unit_elts = dom.getElementsByTagName('mediaUnit')
       
   113                             for media_unit in media_unit_elts:
       
   114                                 uri = media_unit.getAttribute('uri')
       
   115                                 if uri == media_uri:
       
   116                                     has_exposed_content_elts = media_unit.getElementsByTagName('wlp:hasExposedContent')
       
   117                                     if has_exposed_content_elts:
       
   118                                         img_internal_path = has_exposed_content_elts[0].childNodes[0].data'''
       
   119 
       
   120                         concept_documents_list.append({'id':document_id, 'score':score, 'rank':rank, 'isLinkedTo':isLinkedTo, 'image_path':img_internal_path, 'abstract':abstract})  
       
   121 
       
   122                 concepts_with_documents_list.append(concept_documents_list)
       
   123         logger.info('concepts_with_documents_list')
       
   124         logger.info(concepts_with_documents_list)
       
   125         return concepts_with_documents_list
       
   126             
       
   127     def get_document_ids_concepts_(self, dom, list_links_concepts):
       
   128         document_ids_concepts_list = [] 
       
   129         for list_links_of_a_concept in list_links_concepts:
       
   130             empty_list = []
       
   131             document_ids_concepts_list.append(empty_list) 
       
   132                     
       
   133         for node in dom.getElementsByTagName('resource'):
       
   134             uri = node.getAttribute('uri')
       
   135             for index, list_links_of_a_concept in enumerate(list_links_concepts):
       
   136                 if uri in list_links_of_a_concept:
       
   137                     dc_identifier_elts = node.getElementsByTagName('dc:identifier')
       
   138                     for dc_identifier in dc_identifier_elts:
       
   139                         document_id = dc_identifier.childNodes[0].data
       
   140                         document_ids_concepts_list[index].append(document_id)
       
   141         return document_ids_concepts_list