|
1 #!/usr/bin/env python |
|
2 # coding=utf-8 |
|
3 |
|
4 from lxml import etree |
|
5 from models import * |
|
6 from optparse import OptionParser |
|
7 from sqlalchemy import Table, Column, Integer, BigInteger, String, MetaData, \ |
|
8 ForeignKey |
|
9 from sqlalchemy.orm import sessionmaker, mapper |
|
10 from sqlalchemy.sql import select |
|
11 import datetime |
|
12 import email.utils |
|
13 import logging |
|
14 import os |
|
15 import os.path |
|
16 import re |
|
17 import sys |
|
18 import time |
|
19 import uuid |
|
20 |
|
21 #class TweetExclude(object): |
|
22 # def __init__(self, id): |
|
23 # self.id = id |
|
24 # |
|
25 # def __repr__(self): |
|
26 # return "<TweetExclude(id=%d)>" % (self.id) |
|
27 |
|
def parse_date(date_str):
    """Parse an RFC 2822 style date string into a naive ``datetime``.

    The string is decoded with ``email.utils.parsedate_tz``. Only the
    year, month, day, hour, minute and second fields are used, so a
    timezone offset present in the input is ignored and the time is
    returned exactly as written in the string.

    Args:
        date_str: the textual date, e.g. ``"Mon, 20 Nov 1995 19:12:08 -0500"``.

    Returns:
        A ``datetime.datetime`` without tzinfo.

    Raises:
        ValueError: if ``date_str`` is empty or cannot be parsed.
    """
    ts = email.utils.parsedate_tz(date_str) if date_str else None
    if ts is None:
        raise ValueError("unable to parse date: %r" % (date_str,))
    # Only the first six fields are meaningful; the following ones
    # (weekday, yearday, dst) are documented as not usable, so they must
    # not leak into the datetime constructor as a microsecond value.
    return datetime.datetime(*ts[:6])
|
31 |
|
def get_options():
    """Build the command-line parser and parse the program arguments.

    Besides the options declared here, ``set_logging_options`` (defined
    outside this block) is given the parser so it can register its own
    options before parsing.

    Returns:
        The ``(options, args)`` pair returned by ``OptionParser.parse_args``.
    """
    parser = OptionParser()
    parser.add_option("-f", "--file", dest="filename",
                      help="write export to file", metavar="FILE", default="project_enmi.ldt")
    parser.add_option("-d", "--database", dest="database",
                      help="Input database", metavar="DATABASE")
    parser.add_option("-s", "--start-date", dest="start_date",
                      help="start date", metavar="START_DATE")
    parser.add_option("-e", "--end-date", dest="end_date",
                      help="end date", metavar="END_DATE")
    parser.add_option("-I", "--content-file", dest="content_file",
                      help="Content file", metavar="CONTENT_FILE")
    parser.add_option("-c", "--content", dest="content",
                      help="Content url", metavar="CONTENT")
    parser.add_option("-V", "--video-url", dest="video",
                      help="video url", metavar="VIDEO")
    parser.add_option("-i", "--content-id", dest="content_id",
                      help="Content id", metavar="CONTENT_ID")
    parser.add_option("-x", "--exclude", dest="exclude",
                      help="file containing the id to exclude", metavar="EXCLUDE")
    parser.add_option("-C", "--color", dest="color",
                      help="Color code", metavar="COLOR", default="16763904")
    parser.add_option("-H", "--hashtag", dest="hashtag",
                      help="Hashtag", metavar="HASHTAG", default="enmi")
    parser.add_option("-D", "--duration", dest="duration", type="int",
                      help="Duration", metavar="DURATION", default=None)
    parser.add_option("-n", "--name", dest="name",
                      help="Cutting name", metavar="NAME", default=u"Tweets")
    parser.add_option("-R", "--replace", dest="replace", action="store_true",
                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
    parser.add_option("-l", "--log", dest="logfile",
                      help="log to file", metavar="LOG", default="stderr")
    # The script body reads ``options.listconf``; declare it so the attribute
    # always exists (None when the option is not given).
    parser.add_option("-L", "--list-conf", dest="listconf",
                      help="XML file listing the content files to process",
                      metavar="LIST_CONF", default=None)

    set_logging_options(parser)

    return parser.parse_args()
|
69 |
|
70 |
|
if __name__ == "__main__":
    # Script entry point: export the tweets matching a hashtag and a time
    # window from the SQLite database into an XML project/content file.

    (options, args) = get_options()

    set_logging(options)

    logging.debug("OPTIONS : " + repr(options))

    engine, metadata = setup_database('sqlite:///' + options.database,
                                      echo=((options.verbose - options.quiet) > 0),
                                      create_all=False)

    Session = sessionmaker()
    conn = engine.connect()
    try:
        session = Session(bind=conn)
        try:
            # Temporary table holding the ids of the tweets to leave out of
            # the export; it is bound to this connection only.
            metadata = MetaData(bind=conn)
            tweet_exclude_table = Table("tweet_exclude", metadata,
                                        Column('id', BigInteger, primary_key=True),
                                        prefixes=['TEMPORARY'])
            metadata.create_all()

            if options.exclude and os.path.exists(options.exclude):
                # Read-only access is enough; one tweet id per line.
                with open(options.exclude, 'r') as f:
                    tei = tweet_exclude_table.insert()
                    for line in f:
                        line = line.strip()
                        if not line:
                            # Tolerate blank lines (e.g. a trailing newline).
                            continue
                        conn.execute(tei.values(id=int(line)))

            # ``listconf`` may not be declared by the option parser; treat a
            # missing attribute the same as "not given".
            listconf = getattr(options, "listconf", None)

            if listconf:
                # One export per <file> node of the configuration document.
                parameters = []
                confdoc = etree.parse(listconf)
                for node in confdoc.xpath("/twitter_export/file"):
                    params = {}
                    for snode in node:
                        if snode.tag == "path":
                            params['content_file'] = snode.text
                        elif snode.tag == "start_date":
                            params['start_date'] = snode.text
                        elif snode.tag == "end_date":
                            params['end_date'] = snode.text
                        elif snode.tag == "duration":
                            params['duration'] = int(snode.text)
                    parameters.append(params)
            else:
                # Single export described by the command-line options.
                parameters = [{
                    'start_date': options.start_date,
                    'end_date': options.end_date,
                    'duration': options.duration,
                    'content_file': options.content_file,
                }]

            for params in parameters:

                logging.debug("PARAMETERS " + repr(params))

                start_date_str = params.get("start_date", None)
                end_date_str = params.get("end_date", None)
                duration = params.get("duration", None)
                content_file = params.get("content_file", None)

                if not start_date_str:
                    raise ValueError("a start date is required")

                start_date = parse_date(start_date_str)
                # Epoch seconds of the start; tweet offsets are relative to it.
                ts = time.mktime(start_date.timetuple())

                if end_date_str:
                    end_date = parse_date(end_date_str)
                else:
                    if duration is None:
                        raise ValueError("either an end date or a duration is required")
                    end_date = start_date + datetime.timedelta(seconds=duration)

                query_res = (session.query(Tweet)
                             .join(EntityHashtag)
                             .join(Hashtag)
                             .filter(~Tweet.id.in_(select([tweet_exclude_table.c.id])))
                             .filter(Hashtag.text.contains(options.hashtag))
                             .filter(Tweet.created_at >= start_date)
                             .filter(Tweet.created_at <= end_date)
                             .all())

                root = None
                ensemble_parent = None

                # Decide once whether an existing content file is updated in
                # place or a new project document is created.
                use_existing = bool(content_file and os.path.exists(content_file))

                if use_existing:
                    doc = etree.parse(content_file)
                    root = doc.getroot()
                    ensemble_parent = root.xpath("//ensembles")[0]
                else:
                    root = etree.Element(u"iri")

                    etree.SubElement(root, u"project", {
                        u"abstract": u"Twitter comments on ENMI",
                        u"title": u"Twitter comments on ENMI 2009",
                        u"user": u"IRI Web",
                        u"id": unicode(uuid.uuid4())})

                    medias = etree.SubElement(root, u"medias")
                    etree.SubElement(medias, u"media", {
                        u"pict": u"",
                        u"src": unicode(options.content),
                        u"video": unicode(options.video),
                        u"id": unicode(options.content_id),
                        u"extra": u""})

                    annotations = etree.SubElement(root, u"annotations")
                    content = etree.SubElement(annotations, u"content",
                                               {u"id": unicode(options.content_id)})
                    ensemble_parent = content

                if options.replace:
                    # Snapshot the children first: removing nodes while
                    # iterating over the live tree can skip siblings.
                    for ens in list(ensemble_parent.iterchildren(tag=u"ensemble")):
                        if ens.get("id", "").startswith("tweet_"):
                            ensemble_parent.remove(ens)

                ensemble = etree.SubElement(ensemble_parent, u"ensemble", {
                    u"id": u"tweet_" + unicode(uuid.uuid4()),
                    u"title": u"Ensemble Twitter",
                    u"author": u"IRI Web",
                    u"abstract": u"Ensemble Twitter pour ENMI 2009"})
                decoupage = etree.SubElement(ensemble, u"decoupage", {
                    u"id": unicode(uuid.uuid4()),
                    u"author": u"IRI Web"})

                etree.SubElement(decoupage, u"title").text = unicode(options.name)
                etree.SubElement(decoupage, u"abstract").text = unicode(options.name)

                elements = etree.SubElement(decoupage, u"elements")

                for tw in query_res:
                    tweet_ts_dt = tw.created_at
                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
                    # Offset from the start date, in milliseconds.
                    tweet_ts_rel = (tweet_ts - ts) * 1000
                    username = None
                    if tw.user is not None:
                        username = tw.user.name
                    if not username:
                        username = "anon."
                    element = etree.SubElement(elements, u"element", {
                        u"id": unicode(uuid.uuid4()) + u"-" + unicode(tw.id),
                        u"color": unicode(options.color),
                        u"author": unicode(username),
                        u"date": unicode(tweet_ts_dt.strftime("%Y/%m/%d")),
                        u"begin": unicode(tweet_ts_rel),
                        u"dur": u"0",
                        u"src": u""})
                    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
                    etree.SubElement(element, u"abstract").text = unicode(tw.text)

                    tags_node = etree.SubElement(element, u"tags")

                    for entity in tw.entity_list:
                        if entity.type == u'entity_hashtag':
                            etree.SubElement(tags_node, u"tag").text = entity.hashtag.text

                output_path = content_file if use_existing else options.filename

                # tostring() with an explicit encoding yields bytes, hence the
                # binary mode; ``with`` closes the file even if writing fails.
                with open(output_path, "wb") as output:
                    output.write(etree.tostring(root, encoding="utf-8", method="xml",
                                                pretty_print=True, xml_declaration=True))

        finally:
            session.close()
    finally:
        conn.close()