1 #!/usr/bin/env python |
1 #!/usr/bin/env python |
2 # coding=utf-8 |
2 # coding=utf-8 |
3 |
3 |
4 from lxml import etree |
4 from lxml import etree |
5 from iri_tweet.models import setup_database |
5 from iri_tweet.models import setup_database, Tweet, User |
6 from optparse import OptionParser #@UnresolvedImport |
6 from optparse import OptionParser #@UnresolvedImport |
7 from sqlalchemy import Table, Column, BigInteger |
7 from sqlalchemy import Table, Column, BigInteger, event, bindparam |
|
8 from sqlalchemy.sql import select, func |
8 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, |
9 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, |
9 get_logger) |
10 get_logger) |
10 import anyjson |
11 import anyjson |
11 import datetime |
12 import datetime |
12 import httplib2 |
13 import requests |
13 import os.path |
14 import os.path |
14 import re |
15 import re |
15 import sys |
16 import sys |
16 import time |
17 import time |
17 import uuid #@UnresolvedImport |
18 import uuid #@UnresolvedImport |
22 # self.id = id |
23 # self.id = id |
23 # |
24 # |
24 # def __repr__(self): |
25 # def __repr__(self): |
25 # return "<TweetExclude(id=%d)>" % (self.id) |
26 # return "<TweetExclude(id=%d)>" % (self.id) |
26 |
27 |
|
# REST endpoints of the LDT platform, appended to the --base-url option.
LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
|
30 |
|
def re_fn(expr, item):
    """SQLite ``regexp`` callback: case-insensitive regex search.

    Registered via ``create_function('regexp', 2, re_fn)`` so SQL filters
    can use the ``regexp`` operator.  Returns True when *expr* matches
    anywhere in *item*, False otherwise.  Matches are logged at debug level.
    """
    pattern = re.compile(expr, re.I)
    found = pattern.search(item)
    if found is None:
        return False
    # Log only on a hit, mirroring the SQL-side filtering for debugging.
    get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item))
    return True
27 |
37 |
28 def parse_polemics(tw, extended_mode): |
38 def parse_polemics(tw, extended_mode): |
29 """ |
39 """ |
30 parse polemics in text and return a list of polemic code. None if not polemic found |
40 parse polemics in text and return a list of polemic code. None if not polemic found |
31 """ |
41 """ |
85 help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False) |
95 help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False) |
86 parser.add_option("-L", "--list-conf", dest="listconf", |
96 parser.add_option("-L", "--list-conf", dest="listconf", |
87 help="list of file to process", metavar="LIST_CONF", default=None) |
97 help="list of file to process", metavar="LIST_CONF", default=None) |
88 parser.add_option("-E", "--extended", dest="extended_mode", action="store_true", |
98 parser.add_option("-E", "--extended", dest="extended_mode", action="store_true", |
89 help="Trigger polemic extended mode", metavar="EXTENDED", default=False) |
99 help="Trigger polemic extended mode", metavar="EXTENDED", default=False) |
|
100 parser.add_option("-b", "--base-url", dest="base_url", |
|
101 help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/") |
|
102 parser.add_option("-p", "--project", dest="project_id", |
|
103 help="Project id", metavar="PROJECT_ID", default=None) |
|
104 parser.add_option("-P", "--post-param", dest="post_param", |
|
105 help="Post param", metavar="POST_PARAM", default=None) |
90 parser.add_option("--user-whitelist", dest="user_whitelist", action="store", |
106 parser.add_option("--user-whitelist", dest="user_whitelist", action="store", |
91 help="A list of user screen name", metavar="USER_WHITELIST",default=None) |
107 help="A list of user screen name", metavar="USER_WHITELIST",default=None) |
92 |
108 |
93 |
109 |
94 set_logging_options(parser) |
110 set_logging_options(parser) |
114 conn_str = 'sqlite:///' + conn_str |
130 conn_str = 'sqlite:///' + conn_str |
115 |
131 |
116 engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False) |
132 engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False) |
117 conn = None |
133 conn = None |
118 try : |
134 try : |
119 conn = engine.connect() |
135 conn = engine.connect() |
|
136 @event.listens_for(conn, "begin") |
|
137 def do_begin(conn): |
|
138 conn.connection.create_function('regexp', 2, re_fn) |
120 session = None |
139 session = None |
121 try : |
140 try : |
122 session = Session(bind=conn) |
141 session = Session(bind=conn) |
123 tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY']) |
142 tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY']) |
124 #mapper(TweetExclude, tweet_exclude_table) |
143 #mapper(TweetExclude, tweet_exclude_table) |
125 metadata.create_all(bind=conn, tables=[tweet_exclude_table]) |
144 metadata.create_all(bind=conn, tables=[tweet_exclude_table]) |
126 |
145 |
127 if options.exclude and os.path.exists(options.exclude): |
146 if options.exclude and os.path.exists(options.exclude): |
128 with open(options.exclude, 'r+') as f: |
147 with open(options.exclude, 'r+') as f: |
129 tei = tweet_exclude_table.insert() |
148 tei = tweet_exclude_table.insert() |
|
149 ex_regexp = re.compile("(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I) |
130 for line in f: |
150 for line in f: |
131 conn.execute(tei.values(id=long(line.strip()))) |
151 res = ex_regexp.match(line.strip()) |
|
152 if res: |
|
153 if res.group('field') == "id": |
|
154 conn.execute(tei.values(id=res.group('value'))) |
|
155 else: |
|
156 exclude_query = session.query(Tweet) |
|
157 filter_obj = Tweet |
|
158 filter_field = res.group('field') |
|
159 if filter_field.startswith("user__"): |
|
160 exclude_query = exclude_query.outerjoin(User, Tweet.user_id==User.id) |
|
161 filter_obj = User |
|
162 filter_field = filter_field[len("user__"):] |
|
163 |
|
164 if res.group('op') == "=": |
|
165 exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value')) |
|
166 else: |
|
167 exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).op('regexp')(res.group('value'))) |
|
168 |
|
169 test_query = select([func.count()]).where(tweet_exclude_table.c.id==bindparam('t_id')) |
|
170 for t in exclude_query.all(): |
|
171 get_logger().debug("t : " + repr(t)) |
|
172 if conn.execute(test_query, t_id=t.id).fetchone()[0] == 0: |
|
173 conn.execute(tei.values(id=t.id)) |
|
174 |
132 user_whitelist_file = options.user_whitelist |
175 user_whitelist_file = options.user_whitelist |
133 user_whitelist = None |
176 user_whitelist = None |
134 |
177 |
135 if options.listconf: |
178 if options.listconf: |
136 |
179 |
139 for node in confdoc.xpath("/twitter_export/file"): |
182 for node in confdoc.xpath("/twitter_export/file"): |
140 params = {} |
183 params = {} |
141 for snode in node: |
184 for snode in node: |
142 if snode.tag == "path": |
185 if snode.tag == "path": |
143 params['content_file'] = snode.text |
186 params['content_file'] = snode.text |
|
187 params['content_file_write'] = snode.text |
|
188 elif snode.tag == "project_id": |
|
189 params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json" |
|
190 params['content_file_write'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json" |
|
191 params['project_id'] = snode.text |
144 elif snode.tag == "start_date": |
192 elif snode.tag == "start_date": |
145 params['start_date'] = snode.text |
193 params['start_date'] = snode.text |
146 elif snode.tag == "end_date": |
194 elif snode.tag == "end_date": |
147 params['end_date'] = snode.text |
195 params['end_date'] = snode.text |
148 elif snode.tag == "duration": |
196 elif snode.tag == "duration": |
150 elif snode.tag == "hashtags": |
198 elif snode.tag == "hashtags": |
151 params['hashtags'] = [snode.text] |
199 params['hashtags'] = [snode.text] |
152 if options.hashtag or 'hashtags' not in params : |
200 if options.hashtag or 'hashtags' not in params : |
153 params['hashtags'] = options.hashtag |
201 params['hashtags'] = options.hashtag |
154 parameters.append(params) |
202 parameters.append(params) |
155 else: |
203 else: |
|
204 if options.project_id: |
|
205 content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json" |
|
206 else: |
|
207 content_file = options.content_file |
156 parameters = [{ |
208 parameters = [{ |
157 'start_date': options.start_date, |
209 'start_date': options.start_date, |
158 'end_date' : options.end_date, |
210 'end_date' : options.end_date, |
159 'duration' : options.duration, |
211 'duration' : options.duration, |
160 'content_file' : options.content_file, |
212 'content_file' : content_file, |
161 'hashtags' : options.hashtag |
213 'content_file_write' : content_file, |
|
214 'hashtags' : options.hashtag, |
|
215 'project_id' : options.project_id |
162 }] |
216 }] |
163 |
217 post_param = {} |
|
218 if options.post_param: |
|
219 post_param = anyjson.loads(options.post_param) |
|
220 |
164 for params in parameters: |
221 for params in parameters: |
165 |
222 |
166 get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable |
223 get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable |
167 |
224 |
168 start_date_str = params.get("start_date",None) |
225 start_date_str = params.get("start_date",None) |
169 end_date_str = params.get("end_date", None) |
226 end_date_str = params.get("end_date", None) |
170 duration = params.get("duration", None) |
227 duration = params.get("duration", None) |
171 content_file = params.get("content_file", None) |
228 content_file = params.get("content_file", None) |
|
229 content_file_write = params.get("content_file_write", None) |
172 hashtags = params.get('hashtags', []) |
230 hashtags = params.get('hashtags', []) |
173 |
231 |
174 if user_whitelist_file: |
232 if user_whitelist_file: |
175 with open(user_whitelist_file, 'r+') as f: |
233 with open(user_whitelist_file, 'r+') as f: |
176 user_whitelist = list(set([s.strip() for s in f])) |
234 user_whitelist = list(set([s.strip() for s in f])) |
179 ts = None |
237 ts = None |
180 if start_date_str: |
238 if start_date_str: |
181 start_date = parse_date(start_date_str) |
239 start_date = parse_date(start_date_str) |
182 ts = time.mktime(start_date.timetuple()) |
240 ts = time.mktime(start_date.timetuple()) |
183 |
241 |
184 end_date = None |
|
185 if end_date_str: |
|
186 end_date = parse_date(end_date_str) |
|
187 elif start_date and duration: |
|
188 end_date = start_date + datetime.timedelta(seconds=duration) |
|
189 |
|
190 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist) |
|
191 |
|
192 query_res = query.all() |
|
193 |
242 |
194 root = None |
243 root = None |
195 ensemble_parent = None |
244 ensemble_parent = None |
196 |
245 |
197 #to do : analyse situation ldt or iri ? filename set or not ? |
246 #to do : analyse situation ldt or iri ? filename set or not ? |
198 |
247 |
199 if content_file and content_file.find("http") == 0: |
248 if content_file and content_file.find("http") == 0: |
200 |
249 |
201 get_logger().debug("url : " + content_file) #@UndefinedVariable |
250 get_logger().debug("url : " + content_file) #@UndefinedVariable |
202 |
251 |
203 h = httplib2.Http() |
252 r = requests.get(content_file, params=post_param) |
204 resp, content = h.request(content_file) |
253 #get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable |
205 |
254 project = r.json() |
206 get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable |
255 text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S) |
207 |
256 root = etree.fromstring(text_match.group(1) if text_match else project['ldt']) |
208 project = anyjson.deserialize(content) |
|
209 root = etree.fromstring(project["ldt"]) |
|
210 |
257 |
211 elif content_file and os.path.exists(content_file): |
258 elif content_file and os.path.exists(content_file): |
212 |
259 |
213 doc = etree.parse(content_file) |
260 doc = etree.parse(content_file) |
214 root = doc.getroot() |
261 root = doc.getroot() |
215 |
262 |
|
263 content_id = None |
216 |
264 |
217 if root is None: |
265 if root is None: |
218 |
266 |
219 root = etree.Element(u"iri") |
267 root = etree.Element(u"iri") |
220 |
268 |
224 media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""}) |
272 media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""}) |
225 |
273 |
226 annotations = etree.SubElement(root, u"annotations") |
274 annotations = etree.SubElement(root, u"annotations") |
227 content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)}) |
275 content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)}) |
228 ensemble_parent = content |
276 ensemble_parent = content |
|
277 |
|
278 content_id = options.content_id |
229 |
279 |
230 |
280 |
231 if ensemble_parent is None: |
281 if ensemble_parent is None: |
232 file_type = None |
282 file_type = None |
233 for node in root: |
283 for node in root: |
247 annotations_node = etree.SubElement(root, u"annotations") |
297 annotations_node = etree.SubElement(root, u"annotations") |
248 content_node = annotations_node.find(u"content") |
298 content_node = annotations_node.find(u"content") |
249 if content_node is None: |
299 if content_node is None: |
250 content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id")) |
300 content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id")) |
251 ensemble_parent = content_node |
301 ensemble_parent = content_node |
|
302 content_id = content_node.get(u"id") |
252 elif file_type == "iri": |
303 elif file_type == "iri": |
253 body_node = root.find(u"body") |
304 body_node = root.find(u"body") |
254 if body_node is None: |
305 if body_node is None: |
255 body_node = etree.SubElement(root, u"body") |
306 body_node = etree.SubElement(root, u"body") |
256 ensembles_node = body_node.find(u"ensembles") |
307 ensembles_node = body_node.find(u"ensembles") |
257 if ensembles_node is None: |
308 if ensembles_node is None: |
258 ensembles_node = etree.SubElement(body_node, u"ensembles") |
309 ensembles_node = etree.SubElement(body_node, u"ensembles") |
259 ensemble_parent = ensembles_node |
310 ensemble_parent = ensembles_node |
|
311 content_id = root.xpath("head/meta[@name='id']/@content")[0] |
260 |
312 |
261 |
313 |
262 if ensemble_parent is None: |
314 if ensemble_parent is None: |
263 get_logger().error("Can not process file") #@UndefinedVariable |
315 get_logger().error("Can not process file") #@UndefinedVariable |
264 sys.exit() |
316 sys.exit() |
282 |
334 |
283 etree.SubElement(decoupage, u"title").text = unicode(options.name) |
335 etree.SubElement(decoupage, u"title").text = unicode(options.name) |
284 etree.SubElement(decoupage, u"abstract").text = unicode(options.name) |
336 etree.SubElement(decoupage, u"abstract").text = unicode(options.name) |
285 |
337 |
286 elements = etree.SubElement(decoupage, u"elements") |
338 elements = etree.SubElement(decoupage, u"elements") |
|
339 |
|
340 end_date = None |
|
341 if end_date_str: |
|
342 end_date = parse_date(end_date_str) |
|
343 elif start_date and duration: |
|
344 end_date = start_date + datetime.timedelta(seconds=duration) |
|
345 elif start_date and options.base_url: |
|
346 # get duration from api |
|
347 content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json" |
|
348 r = requests.get(content_url) |
|
349 duration = int(r.json()['duration']) |
|
350 get_logger().debug("get duration " + content_url) #@UndefinedVariable |
|
351 get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable |
|
352 |
|
353 end_date = start_date + datetime.timedelta(seconds=int(duration/1000)) |
|
354 |
|
355 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist) |
|
356 |
|
357 query_res = query.all() |
287 |
358 |
288 |
359 |
289 for tw in query_res: |
360 for tw in query_res: |
290 tweet_ts_dt = tw.created_at |
361 tweet_ts_dt = tw.created_at |
291 tweet_ts = int(time.mktime(tweet_ts_dt.timetuple())) |
362 tweet_ts = int(time.mktime(tweet_ts_dt.timetuple())) |
331 |
402 |
332 |
403 |
333 |
404 |
334 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True) |
405 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True) |
335 |
406 |
336 if content_file and content_file.find("http") == 0: |
407 if content_file_write and content_file_write.find("http") == 0: |
337 |
408 |
338 project["ldt"] = output_data |
409 project["ldt"] = output_data |
339 body = anyjson.serialize(project) |
410 post_param = {} |
340 get_logger().debug("write http " + content_file) #@UndefinedVariable |
411 if options.post_param: |
341 get_logger().debug("write http " + repr(body)) #@UndefinedVariable |
412 post_param = anyjson.loads(options.post_param) |
342 h = httplib2.Http() |
413 |
343 resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body) |
414 get_logger().debug("write http " + content_file_write) #@UndefinedVariable |
344 get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable |
415 get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable |
345 if resp.status != 200: |
416 get_logger().debug("write http " + repr(project)) #@UndefinedVariable |
346 get_logger().error("Error http " + repr(resp) + " content " + content) #@UndefinedVariable |
417 r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type':'application/json'}, params=post_param); |
347 raise Exception("Error writing content : %d : %s"%(resp.status, resp.reason)) |
418 get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable |
|
419 if r.status_code != requests.codes.ok: |
|
420 r.raise_for_status() |
348 else: |
421 else: |
349 if content_file and os.path.exists(content_file): |
422 if content_file_write and os.path.exists(content_file_write): |
350 dest_file_name = content_file |
423 dest_file_name = content_file_write |
351 else: |
424 else: |
352 dest_file_name = options.filename |
425 dest_file_name = options.filename |
353 |
426 |
354 get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable |
427 get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable |
355 output = open(dest_file_name, "w") |
428 output = open(dest_file_name, "w") |