|
'''
Created on 1 August 2012

@author: gerard
'''
|
6 from xml.dom.minidom import parseString |
|
7 import logging |
|
# Module-level logger used by the parser in this file; registered under the
# 'document' logger name.
logger = logging.getLogger('document')
|
9 |
|
class WebLabParser(object):
    """Extract concepts and their documents from a WebLab result-set XML string.

    ``parse`` is the entry point. The other public methods work on an already
    parsed ``xml.dom.minidom`` document and can be called on their own; those
    that look up retrieval-ontology elements use the prefix found by
    ``get_xmlns`` (an empty prefix when it has not been called).
    """

    # Namespace URI whose prefix qualifies hasScore / hasRank / hasDescription /
    # isLinkedTo elements.
    RETRIEVAL_NS_URI = 'http://weblab.ow2.org/core/1.2/ontology/retrieval#'

    # Prefix bound to RETRIEVAL_NS_URI in the last document seen by get_xmlns.
    # The class-level default keeps the get_* methods usable before get_xmlns.
    _ns = ''

    def parse(self, xml):
        """Parse ``xml`` and return ``(concepts, documents_per_concept)``.

        ``concepts`` is the list built by ``get_concepts`` and
        ``documents_per_concept`` the list built by ``get_documents_concepts_``.
        Malformed XML raises whatever ``parseString`` raises.
        """
        dom = parseString(xml)
        self.get_xmlns(dom)
        list_concepts = self.get_concepts(dom)
        documents_concepts_list = self.get_documents_concepts_(dom)
        return (list_concepts, documents_concepts_list)

    def get_xmlns(self, dom):
        """Store in ``self._ns`` the prefix bound to the retrieval namespace.

        Only namespace declarations on ``rdf:RDF`` elements are inspected and
        the first match wins. ``self._ns`` is left empty when the namespace is
        not declared there or is declared as the default namespace.
        """
        self._ns = ''
        for rdf in dom.getElementsByTagName('rdf:RDF'):
            for key, value in rdf.attributes.items():
                if value != self.RETRIEVAL_NS_URI:
                    continue
                # 'xmlns:wls' -> ('xmlns', ':', 'wls'); a default-namespace
                # declaration is plain 'xmlns' and yields an empty prefix.
                head, _, prefix = key.partition(':')
                if head == 'xmlns':
                    self._ns = prefix
                    return

    def get_concepts(self, dom):
        """Return one dict per titled ``rdf:Description`` of a result set.

        Only ``annotation`` elements whose parent is a ``resultSet`` are
        considered. Each dict has the keys ``about``, ``title``, ``abstract``,
        ``url_image`` and ``score``. Descriptions without a ``dc:title`` text
        are skipped; a missing abstract or image is ``''`` and a missing score
        is ``'0'``. When an element occurs several times the last occurrence
        is used for title, score and abstract.
        """
        concept_list = []
        for node in dom.getElementsByTagName('annotation'):
            # getattr: the parent may be a node type without a localName.
            if getattr(node.parentNode, 'localName', None) != 'resultSet':
                continue
            for description in node.getElementsByTagName('rdf:Description'):
                title = self._child_text(description, 'dc:title', last=True)
                if title == '':
                    continue
                concept = {
                    'about': description.getAttribute('rdf:about'),
                    'title': title,
                    'abstract': self._child_text(
                        description, self._tag('hasDescription'), last=True),
                    # Defaults to '' so a concept without a media unit neither
                    # fails nor inherits the previous concept's image.
                    'url_image': self._native_content(description),
                    'score': self._child_text(
                        description, self._tag('hasScore'),
                        default='0', last=True),
                }
                logger.debug('a_url_image: %s', concept['url_image'])
                logger.debug('a_score: %s', concept['score'])
                concept_list.append(concept)
        logger.info('concept_list')
        logger.info(concept_list)
        return concept_list

    def get_documents_concepts_(self, dom):
        """Return, for every ``rdf:Bag``, the list of documents it references.

        For each ``resultSet`` element and each ``rdf:Bag`` inside it, the
        ``rdf:resource`` values of the bag's ``rdf:li`` items are matched
        against the ``rdf:about`` attribute of the ``rdf:Description``
        elements of the same result set. Every match becomes a dict with the
        keys ``id``, ``score``, ``rank``, ``isLinkedTo``, ``image_path`` and
        ``abstract``; documents appear in description order. Missing values
        are ``''`` (``'0'`` for the score).
        """
        concepts_with_documents_list = []
        for node in dom.getElementsByTagName('resultSet'):
            # The descriptions are the same for every bag of this result set.
            description_elts = node.getElementsByTagName('rdf:Description')
            for rdf_bag in node.getElementsByTagName('rdf:Bag'):
                hits = set(
                    rdf_li.getAttribute('rdf:resource')
                    for rdf_li in rdf_bag.getElementsByTagName('rdf:li'))
                concept_documents_list = [
                    self._document(description)
                    for description in description_elts
                    if description.getAttribute('rdf:about') in hits
                ]
                concepts_with_documents_list.append(concept_documents_list)
        logger.info('concepts_with_documents_list')
        logger.info(concepts_with_documents_list)
        return concepts_with_documents_list

    def get_document_ids_concepts_(self, dom, list_links_concepts):
        """Return the ``dc:identifier`` texts of the resources of each concept.

        ``list_links_concepts`` is a sequence of URI collections, one per
        concept. The result has one list per concept, in the same order,
        holding the identifiers of every ``resource`` element whose ``uri``
        attribute belongs to that concept's collection. Identifier elements
        without text are ignored.
        """
        document_ids_concepts_list = [[] for _ in list_links_concepts]
        for node in dom.getElementsByTagName('resource'):
            uri = node.getAttribute('uri')
            document_ids = None  # computed once, on the first matching concept
            for index, links in enumerate(list_links_concepts):
                if uri not in links:
                    continue
                if document_ids is None:
                    texts = [
                        self._text(identifier, None)
                        for identifier in node.getElementsByTagName('dc:identifier')
                    ]
                    document_ids = [text for text in texts if text is not None]
                document_ids_concepts_list[index].extend(document_ids)
        return document_ids_concepts_list

    def _tag(self, local_name):
        """Qualify ``local_name`` with the retrieval prefix, when there is one."""
        if self._ns:
            return self._ns + ':' + local_name
        return local_name

    @staticmethod
    def _text(element, default=''):
        """Return the data of ``element``'s first child, else ``default``."""
        # firstChild is None for an empty element; a non-text first child has
        # no ``data`` attribute. Both cases fall back to ``default``.
        return getattr(element.firstChild, 'data', default)

    def _child_text(self, parent, tag_name, default='', last=False):
        """Return the text of the first (or last) ``tag_name`` under ``parent``."""
        elements = parent.getElementsByTagName(tag_name)
        if not elements:
            return default
        return self._text(elements[-1] if last else elements[0], default)

    def _native_content(self, description):
        """Return the ``wlr:hasNativeContent`` text of the first media unit."""
        media_units = description.getElementsByTagName('mediaUnit')
        if not media_units:
            return ''
        return self._child_text(media_units[0], 'wlr:hasNativeContent')

    def _document(self, description):
        """Build the document dict for one hit ``rdf:Description``."""
        linked_elts = description.getElementsByTagName(self._tag('isLinkedTo'))
        document = {
            'id': self._child_text(description, 'dc:identifier'),
            'score': self._child_text(
                description, self._tag('hasScore'), default='0'),
            'rank': self._child_text(description, self._tag('hasRank')),
            'isLinkedTo': (
                linked_elts[0].getAttribute('rdf:resource')
                if linked_elts else ''),
            'image_path': self._native_content(description),
            'abstract': self._child_text(
                description, self._tag('hasDescription')),
        }
        logger.debug('img_internal_path: %s', document['image_path'])
        return document