#!/usr/bin/env python
# coding=utf-8
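"""
Export tweets from an iri_tweet database to an IRI/LDT project file (XML),
turning each tweet into a timed annotation element and tagging it with
polemic codes (++ -> OK, -- -> KO, ?? -> Q, == -> REF).
"""
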
from lxml import etree
from iri_tweet.models import setup_database
from optparse import OptionParser  #@UnresolvedImport
from sqlalchemy import Table, Column, BigInteger
from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
    get_logger)
import anyjson
import datetime
import httplib2
import os.path
import re
import sys
import time
import uuid  #@UnresolvedImport
from dateutil.parser import parse as parse_date

#class TweetExclude(object):
#    def __init__(self, id):
#        self.id = id
#
#    def __repr__(self):
#        return "<TweetExclude(id=%d)>" % (self.id)

def parse_polemics(tw, extended_mode):
    """
    Parse the polemic markers in a tweet's text and return a list of polemic
    codes, or None if no polemic is found.
    """
    # Use a dict keyed by code so each code is reported at most once.
    polemics = {}
    for m in re.finditer(r"(\+\+|--|\?\?|==)", tw.text):
        pol_link = {
            '++': u'OK',
            '--': u'KO',
            '??': u'Q',
            '==': u'REF'}[m.group(1)]
        polemics[pol_link] = pol_link

    if extended_mode:
        # In extended mode, any question mark counts as a question...
        if "?" in tw.text:
            polemics["Q"] = "Q"

        # ...and any url entity counts as a reference.
        for entity in tw.entity_list:
            if entity.type == "entity_url":
                polemics["REF"] = "REF"

    if len(polemics) > 0:
        return polemics.keys()
    else:
        return None

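# Illustrative example (assuming a tweet object whose text is
# u"++ good point ==" and that carries no url entities):
#
#   parse_polemics(tw, False)  ->  ['OK', 'REF']   (order not guaranteed)
#   parse_polemics(tw, True)   ->  ['OK', 'REF']
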
def get_options():
    parser = OptionParser()
    parser.add_option("-f", "--file", dest="filename",
        help="write export to file", metavar="FILE", default="project.ldt")
    parser.add_option("-d", "--database", dest="database",
        help="input database", metavar="DATABASE")
    parser.add_option("-s", "--start-date", dest="start_date",
        help="start date", metavar="START_DATE", default=None)
    parser.add_option("-e", "--end-date", dest="end_date",
        help="end date", metavar="END_DATE", default=None)
    parser.add_option("-I", "--content-file", dest="content_file",
        help="content file", metavar="CONTENT_FILE")
    parser.add_option("-c", "--content", dest="content",
        help="content url", metavar="CONTENT")
    parser.add_option("-V", "--video-url", dest="video",
        help="video url", metavar="VIDEO")
    parser.add_option("-i", "--content-id", dest="content_id",
        help="content id", metavar="CONTENT_ID")
    parser.add_option("-x", "--exclude", dest="exclude",
        help="file containing the tweet ids to exclude, one per line", metavar="EXCLUDE")
    parser.add_option("-C", "--color", dest="color",
        help="color code", metavar="COLOR", default="16763904")
    parser.add_option("-H", "--hashtag", dest="hashtag",
        help="hashtag to filter on (repeatable)", metavar="HASHTAG", default=[], action="append")
    parser.add_option("-D", "--duration", dest="duration", type="int",
        help="duration in seconds", metavar="DURATION", default=None)
    parser.add_option("-n", "--name", dest="name",
        help="cutting name", metavar="NAME", default=u"Tweets")
    parser.add_option("-R", "--replace", dest="replace", action="store_true",
        help="replace the tweet ensemble", metavar="REPLACE", default=False)
    parser.add_option("-m", "--merge", dest="merge", action="store_true",
        help="merge into the tweet ensemble, choosing the first ensemble", metavar="MERGE", default=False)
    parser.add_option("-L", "--list-conf", dest="listconf",
        help="list of files to process", metavar="LIST_CONF", default=None)
    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
        help="trigger polemic extended mode", metavar="EXTENDED", default=False)
    parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
        help="file containing a list of user screen names, one per line", metavar="USER_WHITELIST", default=None)

    set_logging_options(parser)

    return parser.parse_args() + (parser,)

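# Illustrative invocation (the script name and option values are hypothetical):
#
#   python export_tweets_ldt.py -d tweets.db -s "2011-01-01T14:00:00" \
#       -D 3600 -H iri -f project.ldt
#
# This would export one hour of tweets tagged #iri, starting at the given
# date, into project.ldt.
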
if __name__ == "__main__":

    (options, args, parser) = get_options()

    set_logging(options)

    get_logger().debug("OPTIONS : " + repr(options))  #@UndefinedVariable

    if len(sys.argv) == 1 or options.database is None:
        parser.print_help()
        sys.exit(1)

    conn_str = options.database.strip()
    # Default to sqlite when the connection string carries no scheme.
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose - options.quiet) > 0), create_all=False)
    conn = None
    try:
        conn = engine.connect()
        session = None
        try:
            session = Session(bind=conn)
            # Temporary table holding the tweet ids to exclude from the export.
            tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
            #mapper(TweetExclude, tweet_exclude_table)
            metadata.create_all(bind=conn, tables=[tweet_exclude_table])

            # The exclude file is expected to hold one tweet id per line.
            if options.exclude and os.path.exists(options.exclude):
                with open(options.exclude, 'r') as f:
                    tei = tweet_exclude_table.insert()
                    for line in f:
                        conn.execute(tei.values(id=long(line.strip())))
            user_whitelist_file = options.user_whitelist
            user_whitelist = None

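            # For reference, a list-conf file matching the parsing below might
            # look like this (illustrative values):
            #
            # <twitter_export>
            #   <file>
            #     <path>project.ldt</path>
            #     <start_date>2011-01-01T14:00:00</start_date>
            #     <end_date>2011-01-01T15:00:00</end_date>
            #     <duration>3600</duration>
            #     <hashtags>iri</hashtags>
            #   </file>
            # </twitter_export>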
            if options.listconf:

                parameters = []
                confdoc = etree.parse(options.listconf)
                for node in confdoc.xpath("/twitter_export/file"):
                    params = {}
                    for snode in node:
                        if snode.tag == "path":
                            params['content_file'] = snode.text
                        elif snode.tag == "start_date":
                            params['start_date'] = snode.text
                        elif snode.tag == "end_date":
                            params['end_date'] = snode.text
                        elif snode.tag == "duration":
                            params['duration'] = int(snode.text)
                        elif snode.tag == "hashtags":
                            params['hashtags'] = [snode.text]
                    # Command-line hashtags take precedence over the conf file.
                    if options.hashtag or 'hashtags' not in params:
                        params['hashtags'] = options.hashtag
                    parameters.append(params)
            else:
                parameters = [{
                    'start_date': options.start_date,
                    'end_date': options.end_date,
                    'duration': options.duration,
                    'content_file': options.content_file,
                    'hashtags': options.hashtag
                }]

            for params in parameters:

                get_logger().debug("PARAMETERS " + repr(params))  #@UndefinedVariable

                start_date_str = params.get("start_date", None)
                end_date_str = params.get("end_date", None)
                duration = params.get("duration", None)
                content_file = params.get("content_file", None)
                hashtags = params.get('hashtags', [])

                if user_whitelist_file:
                    with open(user_whitelist_file, 'r') as f:
                        user_whitelist = list(set([s.strip() for s in f]))

                start_date = None
                ts = None
                if start_date_str:
                    start_date = parse_date(start_date_str)
                    ts = time.mktime(start_date.timetuple())

                end_date = None
                if end_date_str:
                    end_date = parse_date(end_date_str)
                elif start_date and duration:
                    end_date = start_date + datetime.timedelta(seconds=duration)

                query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)

                query_res = query.all()

                root = None
                ensemble_parent = None

                # TODO: analyse the situation: ldt or iri file? filename set or not?

                if content_file and content_file.startswith("http"):

                    get_logger().debug("url : " + content_file)  #@UndefinedVariable

                    h = httplib2.Http()
                    resp, content = h.request(content_file)

                    get_logger().debug("url response " + repr(resp) + " content " + repr(content))  #@UndefinedVariable

                    project = anyjson.deserialize(content)
                    root = etree.fromstring(project["ldt"])

                elif content_file and os.path.exists(content_file):

                    doc = etree.parse(content_file)
                    root = doc.getroot()

                if root is None:

                    root = etree.Element(u"iri")

                    project = etree.SubElement(root, u"project", {u"abstract": u"Polemics Tweets", u"title": u"Polemic Tweets", u"user": u"IRI Web", u"id": unicode(uuid.uuid4())})

                    medias = etree.SubElement(root, u"medias")
                    media = etree.SubElement(medias, u"media", {u"pict": u"", u"src": unicode(options.content), u"video": unicode(options.video), u"id": unicode(options.content_id), u"extra": u""})

                    annotations = etree.SubElement(root, u"annotations")
                    content = etree.SubElement(annotations, u"content", {u"id": unicode(options.content_id)})
                    ensemble_parent = content

                if ensemble_parent is None:
                    # Detect the file type from the root's children: an "ldt"
                    # project holds a <project> node, an "iri" file a <head> node.
                    file_type = None
                    for node in root:
                        if node.tag == "project":
                            file_type = "ldt"
                            break
                        elif node.tag == "head":
                            file_type = "iri"
                            break

                    if file_type == "ldt":
                        media_nodes = root.xpath("//media")
                        if len(media_nodes) > 0:
                            media = media_nodes[0]
                        annotations_node = root.find(u"annotations")
                        if annotations_node is None:
                            annotations_node = etree.SubElement(root, u"annotations")
                        content_node = annotations_node.find(u"content")
                        if content_node is None:
                            content_node = etree.SubElement(annotations_node, u"content", id=media.get(u"id"))
                        ensemble_parent = content_node
                    elif file_type == "iri":
                        body_node = root.find(u"body")
                        if body_node is None:
                            body_node = etree.SubElement(root, u"body")
                        ensembles_node = body_node.find(u"ensembles")
                        if ensembles_node is None:
                            ensembles_node = etree.SubElement(body_node, u"ensembles")
                        ensemble_parent = ensembles_node

                if ensemble_parent is None:
                    get_logger().error("Cannot process file")  #@UndefinedVariable
                    sys.exit(1)

                if options.replace:
                    for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
                        if ens.get("id", "").startswith("tweet_"):
                            ensemble_parent.remove(ens)

                ensemble = None
                elements = None

                if options.merge:
                    ensemble = ensemble_parent.find(u"ensemble")
                    if ensemble is not None:
                        elements = ensemble.find(u".//elements")

                if ensemble is None or elements is None:
                    ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id": u"tweet_" + unicode(uuid.uuid4()), u"title": u"Ensemble Twitter", u"author": u"IRI Web", u"abstract": u"Ensemble Twitter"})
                    decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})

                    etree.SubElement(decoupage, u"title").text = unicode(options.name)
                    etree.SubElement(decoupage, u"abstract").text = unicode(options.name)

                    elements = etree.SubElement(decoupage, u"elements")

                for tw in query_res:
                    tweet_ts_dt = tw.created_at
                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
                    if ts is None:
                        ts = tweet_ts
                    # Timestamp relative to the start date, in milliseconds.
                    tweet_ts_rel = (tweet_ts - ts) * 1000
                    username = None
                    profile_url = ""
                    if tw.user is not None:
                        username = tw.user.name
                        profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
                    if not username:
                        username = "anon."

                    element = etree.SubElement(elements, u"element", {u"id": unicode(uuid.uuid4()) + u"-" + unicode(tw.id), u"color": unicode(options.color), u"author": unicode(username), u"date": unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur": u"0", u"src": unicode(profile_url)})
                    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
                    etree.SubElement(element, u"abstract").text = unicode(tw.text)

                    tags_node = etree.SubElement(element, u"tags")

                    for entity in tw.entity_list:
                        if entity.type == u'entity_hashtag':
                            etree.SubElement(tags_node, u"tag").text = entity.hashtag.text

                    meta_element = etree.SubElement(element, u'meta')

                    polemics_list = parse_polemics(tw, options.extended_mode)
                    if polemics_list:
                        polemics_element = etree.Element(u'polemics')
                        for pol in polemics_list:
                            etree.SubElement(polemics_element, u'polemic').text = pol
                        meta_element.append(polemics_element)

                    etree.SubElement(meta_element, u"source", attrib={"url": u"http://dev.twitter.com", "mimetype": u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
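
                # An emitted element looks roughly like this (illustrative
                # values, nested under ensemble > decoupage > elements):
                #
                # <element id="..." color="16763904" author="alice"
                #          date="2011/01/01" begin="12000" dur="0" src="...">
                #   <title>alice: ++ good point</title>
                #   <abstract>++ good point</abstract>
                #   <tags><tag>iri</tag></tags>
                #   <meta>
                #     <polemics><polemic>OK</polemic></polemics>
                #     <source url="http://dev.twitter.com" mimetype="application/json">
                #       <![CDATA[{...}]]></source>
                #   </meta>
                # </element>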

                # When merging, re-sort all elements by their begin timecode,
                # since merged and newly added elements may interleave.
                if options.merge:
                    elements[:] = sorted(elements, key=lambda n: int(n.get('begin')))

                output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)

                if content_file and content_file.startswith("http"):

                    project["ldt"] = output_data
                    body = anyjson.serialize(project)
                    get_logger().debug("write http " + content_file)  #@UndefinedVariable
                    get_logger().debug("write http " + repr(body))  #@UndefinedVariable
                    h = httplib2.Http()
                    resp, content = h.request(content_file, "PUT", headers={'content-type': 'application/json'}, body=body)
                    get_logger().debug("write http " + repr(resp) + " content " + content)  #@UndefinedVariable
                else:
                    if content_file and os.path.exists(content_file):
                        dest_file_name = content_file
                    else:
                        dest_file_name = options.filename

                    get_logger().debug("WRITE : " + dest_file_name)  #@UndefinedVariable
                    with open(dest_file_name, "w") as output:
                        output.write(output_data)

        finally:
            if session:
                session.close()
    finally:
        if conn:
            conn.close()