#!/usr/bin/env python
# coding=utf-8

from lxml import etree
from iri_tweet.models import setup_database, Tweet, User
from optparse import OptionParser #@UnresolvedImport
from sqlalchemy import Table, Column, BigInteger
from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
    get_logger)
import anyjson
import datetime
import requests
import os.path
import re
import sys
import time
import uuid #@UnresolvedImport
# ...
                      help="merge tweet ensembles, choosing the first ensemble", metavar="MERGE", default=False)
    parser.add_option("-L", "--list-conf", dest="listconf",
                      help="list of files to process", metavar="LIST_CONF", default=None)
    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
                      help="trigger polemic extended mode", metavar="EXTENDED", default=False)
    parser.add_option("-b", "--base-url", dest="base_url",
                      help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
    parser.add_option("-p", "--project", dest="project_id",
                      help="project id", metavar="PROJECT_ID", default=None)
    parser.add_option("-P", "--post-param", dest="post_param",
                      help="extra request parameters, as a JSON object", metavar="POST_PARAM", default=None)
    parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
                      help="path to a file listing user screen names, one per line", metavar="USER_WHITELIST", default=None)

    set_logging_options(parser)
# ... (elided: option parsing and database setup defining conn, session, metadata and tweet_exclude_table)
    metadata.create_all(bind=conn, tables=[tweet_exclude_table])

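    # The exclude file lists one rule per line: either "id=<tweet id>" or
    # "<field><op><value>", where op "=" means equality and "~" a SQL LIKE
    # match; a "user_" prefix on the field filters on the User table instead
    # of Tweet. For instance (the column names here are assumptions):
    #   id=123456789
    #   user_screen_name=somebot
    #   text~%RT%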
    if options.exclude and os.path.exists(options.exclude):
        with open(options.exclude, 'r+') as f:
            tei = tweet_exclude_table.insert()
            ex_regexp = re.compile(r"(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)
            for line in f:
                res = ex_regexp.match(line.strip())
                if res:
                    if res.group('field') == "id":
                        conn.execute(tei.values(id=long(res.group('value'))))
                    else:
                        exclude_query = session.query(Tweet)
                        filter_obj = Tweet
                        filter_field = res.group('field')
                        if filter_field.startswith("user_"):
                            exclude_query = exclude_query.join(User)
                            filter_obj = User
                            filter_field = filter_field[len("user_"):]

                        if res.group('op') == "=":
                            exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value'))
                        else:
                            exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).like(res.group('value')))

                        for t in exclude_query.all():
                            conn.execute(tei.values(id=t.id))

    user_whitelist_file = options.user_whitelist
    user_whitelist = None

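    # A list-configuration file drives several exports in one run. Its assumed
    # shape, inferred from the tags parsed below (duration in seconds):
    #
    # <twitter_export>
    #   <file>
    #     <path>/path/to/project.ldt</path>  <!-- or <project_id>...</project_id> -->
    #     <start_date>...</start_date>
    #     <end_date>...</end_date>
    #     <hashtags>mytag</hashtags>
    #   </file>
    # </twitter_export>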
    if options.listconf:

        parameters = []
        confdoc = etree.parse(options.listconf)
        for node in confdoc.xpath("/twitter_export/file"):
            params = {}
            for snode in node:
                if snode.tag == "path":
                    params['content_file'] = snode.text
                    params['content_file_write'] = snode.text
                elif snode.tag == "project_id":
                    params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
                    params['project_id'] = snode.text
                elif snode.tag == "start_date":
                    params['start_date'] = snode.text
                elif snode.tag == "end_date":
                    params['end_date'] = snode.text
                elif snode.tag == "duration":
                    params['duration'] = int(snode.text)
                elif snode.tag == "hashtags":
                    params['hashtags'] = [snode.text]
            if options.hashtag or 'hashtags' not in params:
                params['hashtags'] = options.hashtag
            parameters.append(params)
    else:
        if options.project_id:
            content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
        else:
            content_file = options.content_file
        parameters = [{
            'start_date': options.start_date,
            'end_date': options.end_date,
            'duration': options.duration,
            'content_file': content_file,
            'content_file_write': content_file,
            'hashtags': options.hashtag,
            'project_id': options.project_id
        }]
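
    # Extra request parameters (an API key, for instance) can be passed on the
    # command line as a JSON object; they are sent as query parameters on both
    # the GET and the PUT against the platform.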
    post_param = {}
    if options.post_param:
        post_param = anyjson.loads(options.post_param)

    for params in parameters:

        get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable

        start_date_str = params.get("start_date", None)
        end_date_str = params.get("end_date", None)
        duration = params.get("duration", None)
        content_file = params.get("content_file", None)
        content_file_write = params.get("content_file_write", None)
        hashtags = params.get('hashtags', [])

        if user_whitelist_file:
            with open(user_whitelist_file, 'r+') as f:
                user_whitelist = list(set([s.strip() for s in f]))

        start_date = None
        ts = None
        if start_date_str:
            start_date = parse_date(start_date_str)
            ts = time.mktime(start_date.timetuple())

        root = None
        ensemble_parent = None

        # TODO: analyse the situation: ldt or iri file? filename set or not?

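        # The annotation document either comes from the platform REST API
        # (content_file is then an http(s) URL, possibly built from a project
        # id) or from a local ldt/iri file on disk.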
        if content_file and content_file.startswith("http"):

            get_logger().debug("url : " + content_file) #@UndefinedVariable

            r = requests.get(content_file, params=post_param)
            get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
            project = r.json()
            root = etree.fromstring(project["ldt"])

        elif content_file and os.path.exists(content_file):

            doc = etree.parse(content_file)
            root = doc.getroot()

        content_id = None

        if root is None:

            root = etree.Element(u"iri")

            # ... (elided: the medias node is created here)
            media = etree.SubElement(medias, u"media", {u"pict": u"", u"src": unicode(options.content), u"video": unicode(options.video), u"id": unicode(options.content_id), u"extra": u""})

            annotations = etree.SubElement(root, u"annotations")
            content = etree.SubElement(annotations, u"content", {u"id": unicode(options.content_id)})
            ensemble_parent = content

            content_id = options.content_id

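        # Still no insertion point: sniff the file type. An "ldt" project
        # stores tweets under annotations/content, an "iri" document under
        # body/ensembles; missing nodes are created along the way and the
        # content id is kept for later API calls.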
        if ensemble_parent is None:
            file_type = None
            for node in root:
                pass  # ... (elided: sets file_type to "ldt" or "iri" from the node tags)
            if file_type == "ldt":
                annotations_node = root.find(u"annotations")
                if annotations_node is None:
                    annotations_node = etree.SubElement(root, u"annotations")
                content_node = annotations_node.find(u"content")
                if content_node is None:
                    content_node = etree.SubElement(annotations_node, u"content", id=media.get(u"id"))
                ensemble_parent = content_node
                content_id = content_node.get(u"id")
            elif file_type == "iri":
                body_node = root.find(u"body")
                if body_node is None:
                    body_node = etree.SubElement(root, u"body")
                ensembles_node = body_node.find(u"ensembles")
                if ensembles_node is None:
                    ensembles_node = etree.SubElement(body_node, u"ensembles")
                ensemble_parent = ensembles_node
                content_id = root.xpath("head/meta[@name='id']/@content")[0]

        if ensemble_parent is None:
            get_logger().error("Cannot process file") #@UndefinedVariable
            sys.exit()

        # ... (elided: the tweet ensemble and decoupage nodes are created here)
        etree.SubElement(decoupage, u"title").text = unicode(options.name)
        etree.SubElement(decoupage, u"abstract").text = unicode(options.name)

        elements = etree.SubElement(decoupage, u"elements")

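        # Work out the end of the time slice: an explicit end date wins, then
        # start + duration; failing that, the content duration is fetched from
        # the platform, which reports it in milliseconds (hence the /1000).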
        end_date = None
        if end_date_str:
            end_date = parse_date(end_date_str)
        elif start_date and duration:
            end_date = start_date + datetime.timedelta(seconds=duration)
        elif start_date and options.base_url:
            # get duration from api
            content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
            r = requests.get(content_url)
            duration = int(r.json()['duration'])
            get_logger().debug("get duration " + content_url) #@UndefinedVariable
            get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable

            end_date = start_date + datetime.timedelta(seconds=int(duration / 1000))

        query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)

        query_res = query.all()
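        # One pass over the matching tweets; created_at is converted to a unix
        # timestamp, presumably to position each element relative to the start
        # timestamp ts computed above.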
        for tw in query_res:
            tweet_ts_dt = tw.created_at
            tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
            # ... (elided: element construction for each tweet)

        output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)

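        # Write back: PUT the updated project JSON to the platform when the
        # destination is a URL, otherwise serialize the tree to a local file.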
        if content_file_write and content_file_write.startswith("http"):

            project["ldt"] = output_data

            get_logger().debug("write http " + content_file_write) #@UndefinedVariable
            get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
            get_logger().debug("write http " + repr(project)) #@UndefinedVariable
            r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type': 'application/json'}, params=post_param)
            get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
            if r.status_code != requests.codes.ok:
                r.raise_for_status()
        else:
            if content_file_write and os.path.exists(content_file_write):
                dest_file_name = content_file_write
            else:
                dest_file_name = options.filename

            get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
            output = open(dest_file_name, "w")