|
27
|
1 |
''' |
|
|
2 |
Created on 7 aout 2012 |
|
|
3 |
|
|
|
4 |
@author: gerard |
|
|
5 |
''' |
|
|
6 |
import logging |
|
|
7 |
import simplejson |
|
|
8 |
import locale |
|
|
9 |
from datetime import datetime |
|
|
10 |
|
|
|
11 |
from django.core.cache import cache |
|
|
12 |
from document.models import Annotationdocument |
|
|
13 |
from document.models import Tag, Cluster |
|
|
14 |
from mediapartdb.MediapartReader import MediapartReader |
|
|
15 |
from dataparser.ClientDocumentsGetAttributes import ClientDocumentsGetAttributes |
|
|
16 |
|
|
|
17 |
logger = logging.getLogger('document') |
|
|
18 |
|
|
|
19 |
# List of documents of a cluster with annotations |
|
|
20 |
class Documents(object): |
|
|
21 |
|
|
|
22 |
def __init__(self, request): |
|
|
23 |
self.request = request |
|
|
24 |
|
|
|
25 |
    def get_documents(self,query,cluster,offset,count):
        """Build the JSON payload describing the documents of one cluster.

        Two input modes:
          * ``query == 0`` -- the query id, cluster id, offset and count are
            read from the HTTP request via ClientDocumentsGetAttributes;
          * otherwise -- the explicit ``query``/``cluster``/``offset``/``count``
            arguments are used.

        Two data sources, selected on ``query_id``:
          * no query id -> the cluster is loaded from the database (Cluster
            model) and every document of the cluster is serialized;
          * query id present -> the cluster comes from the cache (see
            get_contextual_data) and documents are filtered by category,
            date range and offset/count pagination.

        Returns a JSON string (simplejson.dumps of the built dict).  On
        invalid input it returns a hand-written raw JSON string such as
        '{"error msg": "no offset defined"}' instead -- callers must accept
        both shapes.
        """
        # NOTE(review): the local name 'json' shadows the conventional json
        # module name (this module uses simplejson, so no actual clash).
        json = {}

        if query == 0:
            # Parameters come from the HTTP request, not from the arguments.
            attr = ClientDocumentsGetAttributes(self.request)

            if not attr.get_cluster():
                json = '{"error msg": "no cluster_id defined"}'
                return json

            if attr.get_offset() == '':
                json = '{"error msg": "no offset defined"}'
                return json

            if attr.get_count() == '':
                json = '{"error msg": "no count defined"}'
                return json

            # Echo the effective cluster/offset back in the payload.
            json['cluster_id'] = int(attr.get_cluster())
            json['offset'] = int(attr.get_offset())

            query_id = int(attr.get_query_id())
            cluster_id = int(attr.get_cluster())
            offset = int(attr.get_offset())
            count=int(attr.get_count())
        else:
            # Explicit-argument mode: trust the caller's values.
            json['cluster_id'] = int(cluster)
            json['offset'] = int(offset)

            query_id = int(query)
            cluster_id = int(cluster)
            offset = int(offset)
            count=int(count)

        # Persist the effective parameters in the session for later requests.
        self.request.session['query'] = query_id
        self.request.session['cluster'] = cluster_id
        self.request.session['offset'] = offset
        self.request.session['count'] = count

        '''print self.request.session['json']
        json_treemap = simplejson.loads(self.request.session['json'])
        print json_treemap.query'''

        # session['json'] is assumed to hold the treemap JSON stored by an
        # earlier view -- TODO confirm which view writes it; a missing key
        # raises KeyError here.
        d = simplejson.loads(self.request.session['json'])
        print d
        # Copy the original query description into the response.
        jsonquery = {'text': d['query']['text']}
        jsonquery['categories'] = d['query']['categories']
        jsonquery['from_date'] = d['query']['from_date']
        jsonquery['to_date'] = d['query']['to_date']
        json['query'] = jsonquery

        json['documents'] = []
        # Counts every document that passes the filters (not only the ones
        # within the offset/count window); reported as 'total_count' below.
        article_index=0
        #if no query_id it is a cluster saved in database
        if not query_id:
            #json = '{"error msg": "query_id is not defined"}'
            try:
                # Re-binds the 'cluster' parameter to the Cluster model row.
                cluster = Cluster.objects.get(pk=cluster_id)
            except Cluster.DoesNotExist:
                json = '{"error": "Invalid cluster id"}'
                logger.info(json)
                return json
            json['cluster_title'] = cluster.title
            reader = MediapartReader()
            # NOTE: no offset/count pagination on the database path -- every
            # document of the cluster is serialized.
            for thedocument in cluster.document.all():
                article_index += 1
                jsonarticle = {'id':str(thedocument.documentId)}
                jsonarticle['title'] = str(thedocument.title)
                jsonarticle['abstract'] = str(thedocument.description)
                jsonarticle['url_document'] = reader.get_url(str(thedocument.documentId))
                # TODO
                jsonarticle['url_image'] = thedocument.image.url
                '''jsonarticle['date'] = datetime.fromtimestamp(int(reader.get_date(str(thedocument.documentId)))).isoformat() + '.0Z'''

                # Date is a Unix timestamp in the Mediapart DB; rendered as
                # dd-mm-yyyy here (the cache path below uses '%d %B %Y').
                jsonarticle['date'] =(datetime.fromtimestamp(int(reader.get_date(str(thedocument.documentId))))).strftime('%d-%m-%Y')

                jsonarticle['category'] = reader.get_category(str(thedocument.documentId))

                # Per-document weight within this cluster.
                clusterDoc = cluster.clusterdocumentweight_set.get(document=thedocument)
                jsonarticle['weight'] = clusterDoc.weight
                tags = reader.get_tags(str(thedocument.documentId))
                jsonarticle['tags'] = []
                #tags in mediapart
                for tag in tags:
                    # Mediapart tags arrive windows-1252 encoded; re-encode utf8.
                    jsontag = {'title':tag[0].decode("windows-1252").encode("utf8")}
                    jsonarticle['tags'].append(jsontag)

                #tags in periplus
                tags = thedocument.tag_set.all()
                for tag in tags:
                    jsontag = {'title':tag.value}
                    jsonarticle['tags'].append(jsontag)

                author = self.get_author(str(thedocument.documentId))
                jsonarticle['author'] = []
                jsonauthor = {'id':author['id'], 'name':author['name'], 'url':'http://www.mediapart.fr/biographie/'+str(author['id'])}
                jsonarticle['author'].append(jsonauthor)

                # Appended before 'annotations' is filled -- safe because the
                # same dict object is mutated afterwards.
                json['documents'].append(jsonarticle)
                jsonarticle['annotations'] = []

                for theannotationdoc in thedocument.annotationdocument_set.all():
                    #Take only the public annotations
                    if theannotationdoc.visibility == 1:
                        jsonannotation = {'id':theannotationdoc.id}
                        jsonannotation['user'] = theannotationdoc.user.username
                        # Test the scope of the annotation (a part of an article or the global article)
                        if theannotationdoc.annoted_text:
                            jsonannotation['annotated_text'] = theannotationdoc.annoted_text
                        jsonannotation['text'] = theannotationdoc.description

                        jsonannotation['tags'] = []
                        for theannotationdoctag in theannotationdoc.tag_set.all():
                            logger.info('DOCUMENT_TAG_VALUE_OF_ANNOTATION == '+str(theannotationdoctag.value))
                            # NOTE(review): the {'id': ...} dict is immediately
                            # overwritten by the {'title': ...} one -- the first
                            # assignment is dead code; confirm intent.
                            jsontag = {'id': theannotationdoctag.value}
                            jsontag = {'title':str(theannotationdoctag.value)}
                            #TO DO URL ?
                            jsonannotation['tags'].append(jsontag)

                        jsonarticle['annotations'].append(jsonannotation)

        #if query_id it is a cluster saved in cache
        else:
            logger.info('query_id present')
            dico = self.get_contextual_data(query_id)
            if dico['weblab_data']:
                # weblab_data is a 2-tuple: concept list + per-concept
                # detailed document lists, indexed by cluster_id.
                list_concepts, concepts_with_detailed_documents_list = dico['weblab_data']
                filtering = dico['filtering_params']
                if not list_concepts:
                    json = '{"error msg": "no data for the query id"}'
                    return json
                if int(cluster_id) >= len(list_concepts):
                    json = '{"error msg": "invalid cluster id"}'
                    return json
                categories = filtering['categories']
                print 'get_documents !!!!'
                print categories
                from_date = filtering['from_date']
                print 'from_date'
                print from_date
                if from_date == '':
                    # Empty filter means "no lower bound" (epoch 0).
                    from_date = 0
                to_date = filtering['to_date']
                print 'to_date'
                print to_date
                if to_date == '':
                    # Empty filter means "no upper bound" (timestamp sentinel).
                    to_date = 9999999999
                json['cluster_title'] = list_concepts[cluster_id]['title']
                for document in concepts_with_detailed_documents_list[cluster_id]:
                    #Filtering by category
                    if (categories != [] and document['category'] in categories) or (categories == []):
                        #Filtering by date
                        if int(document['date']) >= int(from_date) and int(document['date']) < int(to_date):
                            article_index += 1
                            #Filtering by offset
                            if article_index - 1 >= offset and article_index - 1 < offset + count:
                                jsonarticle = {'id':document['id']}
                                jsonarticle['title'] = document['title']
                                jsonarticle['abstract'] = document['abstract']
                                jsonarticle['url_document'] = document['url']
                                # TODO
                                jsonarticle['url_image'] = document['image_path']
                                #
                                '''jsonarticle['date'] = datetime.fromtimestamp(int(document['date'])).isoformat() + '.0Z'''
                                # Locale-dependent month name in the date; uses
                                # the process default locale.
                                locale.setlocale(locale.LC_ALL,'')
                                jsonarticle['date'] =(datetime.fromtimestamp(int(document['date']))).strftime('%d %B %Y')
                                jsonarticle['category'] = document['category']
                                jsonarticle['weight'] = float(document['weight'])
                                reader = MediapartReader()
                                tags = reader.get_tags(str(document['id']))
                                jsonarticle['tags'] = []
                                for tag in tags:
                                    # Same windows-1252 -> utf8 re-encoding as
                                    # the database path above.
                                    jsontag = {'title':tag[0].decode("windows-1252").encode("utf8")}
                                    jsonarticle['tags'].append(jsontag)
                                author = self.get_author(document['id'])
                                print document['id']
                                jsonarticle['author'] = []
                                jsonauthor = {'id':author['id'], 'name':author['name'], 'url':'http://www.mediapart.fr/biographie/'+str(author['id'])}
                                jsonarticle['author'].append(jsonauthor)

                                json['documents'].append(jsonarticle)
                                jsonarticle['annotations'] = []

                                # NOTE(review): iterates ALL annotation rows for
                                # every article (no filter on the document) --
                                # presumably every public annotation is attached
                                # to every article here; confirm intent.
                                annotations = Annotationdocument.objects.all()
                                for annotation in annotations:
                                    #Take only the public annotations
                                    if annotation.visibility == 1:
                                        jsonannotation = {'id':annotation.id}
                                        jsonannotation['user'] = annotation.user.username
                                        # Test the scope of the annotation (a part of an article or the global article)
                                        if annotation.annoted_text:
                                            jsonannotation['annotated_text'] = annotation.annoted_text
                                        jsonannotation['text'] = annotation.description

                                        jsonannotation['tags'] = []
                                        tags = Tag.objects.filter(annotationdocument_id=annotation.id)

                                        for tag in tags:
                                            # NOTE(review): first dict is dead
                                            # code, overwritten immediately.
                                            jsontag = {'id': tag.value}
                                            jsontag = {'title':str(tag.value)}
                                            #TO DO URL ?
                                            jsonannotation['tags'].append(jsontag)

                                        jsonarticle['annotations'].append(jsonannotation)
            else:
                # No cached weblab data at all for this query id.
                json = '{"Error: Invalid query id"}'
                return json
        # Total number of matching documents (not limited by offset/count).
        json['total_count'] = article_index
        result = simplejson.dumps(json)
        return result
|
|
237 |
|
|
|
238 |
def get_author(self, document_id): |
|
|
239 |
reader = MediapartReader() |
|
|
240 |
dico = reader.get_author(document_id) |
|
|
241 |
return dico |
|
|
242 |
|
|
|
243 |
def get_contextual_data(self, query_id): |
|
|
244 |
query_context = cache.get(query_id) |
|
|
245 |
if not query_context: |
|
|
246 |
print "Error: Invalid query id:"+query_id |
|
|
247 |
logger.info("Error: Invalid query id:"+query_id) |
|
|
248 |
weblab_data=None |
|
|
249 |
query_context ={'filtering_params':{'from_date':0, 'to_date':0, 'categories':[]}} |
|
|
250 |
else: |
|
|
251 |
weblab_data = cache.get(query_context['weblab_data_key']) |
|
|
252 |
|
|
|
253 |
return {'weblab_data':weblab_data, 'filtering_params':query_context['filtering_params']} |