|
27
|
1 |
''' |
|
|
2 |
Created on 1 August 2012 |
|
|
3 |
|
|
|
4 |
@author: gerard |
|
|
5 |
''' |
|
|
6 |
from xml.dom.minidom import parseString |
|
|
7 |
import logging |
|
|
8 |
logger = logging.getLogger('document') |
|
|
9 |
|
|
|
10 |
class WebLabParser(object):
    """Parse a WebLab ``resultSet`` XML document into plain Python structures.

    Elements are looked up by their qualified (prefixed) tag names. The
    ``rdf``, ``dc`` and ``wlr`` prefixes are hard-coded; the prefix bound to
    the retrieval ontology is discovered per document by ``get_xmlns``.
    Every extracted value is returned as the raw string found in the XML.
    """

    # Namespace URI whose prefix qualifies hasScore / hasRank / isLinkedTo /
    # hasDescription elements.
    RETRIEVAL_NS_URI = 'http://weblab.ow2.org/core/1.2/ontology/retrieval#'

    # Prefix bound to RETRIEVAL_NS_URI in the current document. Class-level
    # default so the get_* methods can be called before get_xmlns().
    _ns = ''

    def parse(self, xml):
        """Parse *xml* and return ``(concepts, documents_per_concept)``.

        :param xml: the XML document, as accepted by ``minidom.parseString``.
        :returns: a 2-tuple of the results of ``get_concepts`` and
            ``get_documents_concepts_`` for the parsed document.
        :raises xml.parsers.expat.ExpatError: if *xml* is not well formed.
        """
        dom = parseString(xml)
        self.get_xmlns(dom)
        list_concepts = self.get_concepts(dom)
        documents_concepts_list = self.get_documents_concepts_(dom)
        return (list_concepts, documents_concepts_list)

    def get_xmlns(self, dom):
        """Store in ``self._ns`` the prefix bound to the retrieval namespace.

        Scans the attributes of every ``rdf:RDF`` element and stops at the
        first one whose value is ``RETRIEVAL_NS_URI``. ``self._ns`` is left
        as ``''`` when the namespace is not declared on any ``rdf:RDF``
        element, or when it is declared as the default namespace.
        """
        self._ns = ''
        for rdf in dom.getElementsByTagName('rdf:RDF'):
            for key, value in rdf.attributes.items():
                if value == self.RETRIEVAL_NS_URI:
                    # 'xmlns:wls' -> 'wls'. A bare 'xmlns' (default namespace
                    # declaration) has no ':' and yields '' instead of
                    # raising IndexError.
                    self._ns = key.partition(':')[2]
                    return

    def _qname(self, local_name):
        """Return *local_name* qualified with the retrieval prefix, if any."""
        if self._ns:
            return self._ns + ':' + local_name
        return local_name

    @staticmethod
    def _node_text(node, default=''):
        """Return the data of *node*'s first child, or *default* if absent."""
        child = node.firstChild
        if child is None:
            return default
        return getattr(child, 'data', default)

    def _element_text(self, parent, tag_name, default='', last=False):
        """Return the text of one *tag_name* descendant of *parent*.

        The first match is used unless *last* is true. *default* is returned
        when there is no such element or when it has no text.
        """
        elements = parent.getElementsByTagName(tag_name)
        if not elements:
            return default
        return self._node_text(elements[-1 if last else 0], default)

    def _native_content(self, description):
        """Return the first ``wlr:hasNativeContent`` text of the first
        ``mediaUnit`` under *description*, or ``''`` when there is none."""
        mediaunits = description.getElementsByTagName('mediaUnit')
        if not mediaunits:
            return ''
        return self._element_text(mediaunits[0], 'wlr:hasNativeContent')

    def get_concepts(self, dom):
        """Return the concepts described in the result set annotations.

        Only ``annotation`` elements that are direct children of a
        ``resultSet`` element are considered. Each ``rdf:Description`` below
        them that has a non-empty ``dc:title`` yields one dict with the keys
        ``about``, ``title``, ``abstract``, ``url_image`` and ``score``.
        Missing values default to ``''`` (``'0'`` for the score). When a
        title, score or description element is repeated, the last occurrence
        wins.
        """
        concept_list = []
        for node in dom.getElementsByTagName('annotation'):
            if node.parentNode.localName != 'resultSet':
                continue
            for description in node.getElementsByTagName('rdf:Description'):
                atitle = self._element_text(description, 'dc:title', last=True)
                if atitle == '':
                    # Untitled descriptions are not concepts.
                    continue
                concept_list.append({
                    'about': description.getAttribute('rdf:about'),
                    'title': atitle,
                    'abstract': self._element_text(
                        description, self._qname('hasDescription'), last=True),
                    # Computed afresh for every description so that a concept
                    # without an image never inherits the previous one's URL.
                    'url_image': self._native_content(description),
                    'score': self._element_text(
                        description, self._qname('hasScore'), '0', last=True),
                })
        return concept_list

    def get_documents_concepts_(self, dom):
        """Return, for every concept, the list of documents attached to it.

        Each ``rdf:Bag`` found under a ``resultSet`` element stands for one
        concept; its ``rdf:li`` children reference, through ``rdf:resource``,
        the ``rdf:Description`` elements (matched on ``rdf:about``) of the
        concept's documents.

        :returns: one list per bag, in document order, each holding one dict
            per matching description with the keys ``id``, ``score``,
            ``rank``, ``isLinkedTo``, ``image_path`` and ``abstract``.
            Missing values default to ``''`` (``'0'`` for the score) instead
            of aborting the whole parse.
        """
        concepts_with_documents_list = []
        for node in dom.getElementsByTagName('resultSet'):
            # Identical for every bag of this result set: look it up once.
            description_elts = node.getElementsByTagName('rdf:Description')
            for rdf_bag in node.getElementsByTagName('rdf:Bag'):
                hits_of_concept = set(
                    rdf_li.getAttribute('rdf:resource')
                    for rdf_li in rdf_bag.getElementsByTagName('rdf:li'))
                concept_documents_list = []
                for description in description_elts:
                    if description.getAttribute('rdf:about') not in hits_of_concept:
                        continue
                    linked_elts = description.getElementsByTagName(
                        self._qname('isLinkedTo'))
                    if linked_elts:
                        is_linked_to = linked_elts[0].getAttribute('rdf:resource')
                    else:
                        is_linked_to = ''
                    concept_documents_list.append({
                        'id': self._element_text(description, 'dc:identifier'),
                        'score': self._element_text(
                            description, self._qname('hasScore'), '0'),
                        'rank': self._element_text(
                            description, self._qname('hasRank')),
                        'isLinkedTo': is_linked_to,
                        'image_path': self._native_content(description),
                        'abstract': self._element_text(
                            description, self._qname('hasDescription')),
                    })
                concepts_with_documents_list.append(concept_documents_list)
        return concepts_with_documents_list

    def get_document_ids_concepts_(self, dom, list_links_concepts):
        """Return the document identifiers referenced by each concept.

        :param dom: parsed document containing ``resource`` elements.
        :param list_links_concepts: one container of resource URIs per
            concept.
        :returns: a list parallel to *list_links_concepts*; entry *i* holds
            the ``dc:identifier`` texts of every ``resource`` element whose
            ``uri`` attribute appears in ``list_links_concepts[i]``.
            Identifiers without text are skipped.
        """
        document_ids_concepts_list = [[] for _ in list_links_concepts]

        for node in dom.getElementsByTagName('resource'):
            uri = node.getAttribute('uri')
            document_ids = None  # extracted lazily, at most once per resource
            for index, list_links_of_a_concept in enumerate(list_links_concepts):
                if uri not in list_links_of_a_concept:
                    continue
                if document_ids is None:
                    document_ids = [
                        dc_identifier.firstChild.data
                        for dc_identifier in node.getElementsByTagName('dc:identifier')
                        if dc_identifier.firstChild is not None]
                document_ids_concepts_list[index].extend(document_ids)
        return document_ids_concepts_list