#!/usr/bin/env python
# coding=utf-8
"""Export tweets stored in a SQLite database as an ``iri`` XML document.

Tweets whose text contains a given hashtag and whose ``created_at_ts`` lies
inside a time window are read from the ``tweet_tweet`` / ``tweet_user``
tables and written as ``element`` nodes of a new ``ensemble`` / ``decoupage``
pair.  The ensemble is either appended to an existing content file (``-I``)
or placed in a freshly built document written to ``-f``.

Recovered from the Mercurial diff b7f4b0554ef8 -> bb44692e09ee that adds
script/rest/export_twitter.py (Tue Jan 11 11:17:17 2011 +0100).
"""

import datetime
import email.utils
import json
import os
import os.path
import re
import sys
import time
import uuid
from optparse import OptionParser
from sqlite3 import Row, connect, register_adapter, register_converter

try:
    from lxml import etree
except ImportError:
    # Deferred failure: main() reports the missing package, while the
    # date/geo helpers below stay importable without lxml installed.
    etree = None

try:
    text_type = unicode  # Python 2
except NameError:
    text_type = str  # Python 3


def parse_date(date_str):
    """Convert an RFC 2822 style date string to a numeric timestamp.

    The date fields are handed to ``time.mktime`` and the UTC offset parsed
    from the string (expressed in seconds by ``email.utils.parsedate_tz``)
    is subtracted.

    NOTE(review): ``time.mktime`` interprets the fields in the machine's
    local zone, so the result is a true Unix timestamp only when that zone
    is UTC.  This is kept on purpose because the ``created_at_ts`` values it
    is compared against are presumably produced the same way -- confirm
    against the code that fills the database.

    Raises:
        ValueError: if ``date_str`` cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(date_str)
    if parsed is None:
        raise ValueError("unparsable date: %r" % (date_str,))
    # The offset is None when the string carries no timezone information.
    offset = parsed[9] or 0
    return time.mktime(parsed[0:9]) - offset


def adapt_datetime(ts):
    """Return ``ts`` (a ``datetime``) as a ``time.mktime`` timestamp."""
    return time.mktime(ts.timetuple())


def adapt_geo(geo):
    """Serialize a geo structure to its JSON text representation."""
    return json.dumps(geo)


def convert_geo(s):
    """Parse the JSON text of a ``geo`` column back into Python objects."""
    return json.loads(s)


register_adapter(datetime.datetime, adapt_datetime)
register_converter("geo", convert_geo)

columns_tweet = [
    u'favorited', u'truncated', u'text', u'created_at', u'source',
    u'in_reply_to_status_id', u'in_reply_to_screen_name',
    u'in_reply_to_user_id', u'geo', u'id', u'user',
]
columns_user = [
    u'id', u'verified', u'profile_sidebar_fill_color', u'profile_text_color',
    u'followers_count', u'protected', u'location',
    u'profile_background_color', u'utc_offset', u'statuses_count',
    u'description', u'friends_count', u'profile_link_color',
    u'profile_image_url', u'notifications', u'geo_enabled',
    u'profile_background_image_url', u'screen_name',
    u'profile_background_tile', u'favourites_count', u'name', u'url',
    u'created_at', u'time_zone', u'profile_sidebar_border_color',
    u'following',
]

# Compiled once; re.U lets \w match non-ASCII word characters.
HASHTAG_RE = re.compile(u"#(\\w+)", re.U)

TWEET_QUERY = (
    "select tt.id, tt.text, tt.created_at_ts, tu.name, tu.screen_name "
    "from tweet_tweet as tt join tweet_user as tu on tt.user = tu.rowid "
    "where text like ? and tt.created_at_ts >= ? and tt.created_at_ts <= ? "
    "and tt.id not in (select id from tweet_exclude) "
    "order by tt.created_at_ts asc;"
)


def extract_hashtags(text):
    """Return the distinct hashtags of ``text`` (without ``#``), in order
    of first appearance."""
    seen = set()
    tags = []
    for match in HASHTAG_RE.finditer(text):
        tag = match.group(1)
        if tag not in seen:
            seen.add(tag)
            tags.append(tag)
    return tags


def load_excluded_ids(path):
    """Read one integer tweet id per line from ``path``; blank lines are
    skipped.

    Raises:
        ValueError: if a non-blank line is not an integer.
    """
    ids = []
    with open(path, "r") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if line:
                ids.append(int(line))
    return ids


def _build_option_parser():
    """Create the command-line parser with the script's options."""
    parser = OptionParser()
    parser.add_option("-f", "--file", dest="filename",
                      help="write export to file", metavar="FILE", default="project_enmi.ldt")
    parser.add_option("-d", "--database", dest="database",
                      help="Input database", metavar="DATABASE")
    parser.add_option("-s", "--start-date", dest="start_date",
                      help="start date", metavar="START_DATE")
    parser.add_option("-e", "--end-date", dest="end_date",
                      help="end date", metavar="END_DATE")
    parser.add_option("-I", "--content-file", dest="content_file",
                      help="Content file", metavar="CONTENT_FILE")
    parser.add_option("-c", "--content", dest="content",
                      help="Content url", metavar="CONTENT")
    parser.add_option("-v", "--video-url", dest="video",
                      help="video url", metavar="VIDEO")
    parser.add_option("-i", "--content-id", dest="content_id",
                      help="Content id", metavar="CONTENT_ID")
    parser.add_option("-x", "--exclude", dest="exclude",
                      help="file containing the id to exclude", metavar="EXCLUDE")
    parser.add_option("-C", "--color", dest="color",
                      help="Color code", metavar="COLOR", default="16763904")
    parser.add_option("-H", "--hashtag", dest="hashtag",
                      help="Hashtag", metavar="HASHTAG", default="enmi09")
    parser.add_option("-D", "--duration", dest="duration", type="int",
                      help="Duration", metavar="DURATION", default=None)
    parser.add_option("-n", "--name", dest="name",
                      help="Cuttting name", metavar="NAME", default=u"Tweets")
    parser.add_option("-R", "--replace", dest="replace", action="store_true",
                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
    return parser


def _resolve_time_window(parser, options):
    """Return ``(ts, te)``, the inclusive timestamp bounds of the export.

    ``te`` comes from the end date when given, otherwise from the start
    date plus the duration.  Missing or unparsable values end the program
    through ``parser.error``.
    """
    if not options.start_date:
        parser.error("a start date (-s/--start-date) is required")
    try:
        ts = int(parse_date(options.start_date))
        te = int(parse_date(options.end_date)) if options.end_date else None
    except ValueError as err:
        parser.error(str(err))
    if te is None:
        if options.duration is None:
            parser.error("either an end date (-e) or a duration (-D) is required")
        te = ts + options.duration
    return ts, te


def _open_document(parser, options, use_content_file):
    """Return ``(root, ensemble_parent)`` for the output document.

    With ``use_content_file`` the existing file is parsed and its first
    ``ensembles`` element receives the new ensemble; otherwise a new
    ``iri`` document is built and the ``content`` node plays that role.
    """
    if use_content_file:
        root = etree.parse(options.content_file).getroot()
        found = root.xpath("//ensembles")
        if not found:
            parser.error("no <ensembles> element in %s" % options.content_file)
        return root, found[0]

    root = etree.Element(u"iri")
    etree.SubElement(root, u"project", {
        u"abstract": u"Twitter comments on ENMI",
        u"title": u"Twitter comments on ENMI 2009",
        u"user": u"IRI Web",
        u"id": text_type(uuid.uuid4()),
    })
    medias = etree.SubElement(root, u"medias")
    etree.SubElement(medias, u"media", {
        u"pict": u"",
        u"src": text_type(options.content),
        u"video": text_type(options.video),
        u"id": text_type(options.content_id),
        u"extra": u"",
    })
    annotations = etree.SubElement(root, u"annotations")
    content = etree.SubElement(annotations, u"content", {u"id": text_type(options.content_id)})
    return root, content


def _append_tweets(elements, rows, ts, color):
    """Add one ``element`` node per row of ``rows`` under ``elements``.

    ``begin`` is the tweet's distance from ``ts`` in milliseconds.
    """
    for row in rows:
        tweet_ts = int(row["created_at_ts"])
        tweet_day = datetime.datetime.fromtimestamp(tweet_ts).strftime("%Y/%m/%d")
        element = etree.SubElement(elements, u"element", {
            u"id": text_type(uuid.uuid4()) + u"-" + text_type(row["id"]),
            u"color": text_type(color),
            u"author": text_type(row["name"]),
            u"date": text_type(tweet_day),
            u"begin": text_type((tweet_ts - ts) * 1000),
            u"dur": u"0",
            u"src": u"",
        })
        etree.SubElement(element, u"title").text = text_type(row["name"]) + u": " + text_type(row["text"])
        etree.SubElement(element, u"abstract").text = text_type(row["text"])

        tags_node = etree.SubElement(element, u"tags")
        for tag in extract_hashtags(row["text"]):
            etree.SubElement(tags_node, u"tag").text = tag


def main(argv=None):
    """Run the export; ``argv`` defaults to the process arguments."""
    parser = _build_option_parser()
    options, _args = parser.parse_args(argv)

    if etree is None:
        parser.error("the lxml package is required to build the export")
    if not options.database:
        parser.error("an input database (-d/--database) is required")

    ts, te = _resolve_time_window(parser, options)
    # Decided once so that reading and writing agree on the target file.
    use_content_file = bool(options.content_file and os.path.exists(options.content_file))

    conn = connect(options.database)
    try:
        conn.row_factory = Row
        cursor = conn.cursor()

        cursor.execute("create temporary table tweet_exclude (id)")
        if options.exclude and os.path.exists(options.exclude):
            cursor.executemany(
                "insert into tweet_exclude (id) values (?)",
                [(tweet_id,) for tweet_id in load_excluded_ids(options.exclude)],
            )

        hashtag = u"%#" + text_type(options.hashtag) + u"%"
        cursor.execute(TWEET_QUERY, (hashtag, ts, te))

        root, ensemble_parent = _open_document(parser, options, use_content_file)

        if options.replace:
            # Iterate over a snapshot so removals cannot disturb the walk.
            for ens in list(ensemble_parent.iterchildren(tag=u"ensemble")):
                if ens.get("id", "").startswith("tweet_"):
                    ensemble_parent.remove(ens)

        ensemble = etree.SubElement(ensemble_parent, u"ensemble", {
            u"id": u"tweet_" + text_type(uuid.uuid4()),
            u"title": u"Ensemble Twitter",
            u"author": u"IRI Web",
            u"abstract": u"Ensemble Twitter pour ENMI 2009",
        })
        decoupage = etree.SubElement(ensemble, u"decoupage", {
            u"id": text_type(uuid.uuid4()),
            u"author": u"IRI Web",
        })
        etree.SubElement(decoupage, u"title").text = text_type(options.name)
        etree.SubElement(decoupage, u"abstract").text = text_type(options.name)
        elements = etree.SubElement(decoupage, u"elements")

        _append_tweets(elements, cursor, ts, options.color)
    finally:
        conn.close()

    output_path = options.content_file if use_content_file else options.filename
    # etree.tostring() with an encoding yields bytes, hence binary mode.
    with open(output_path, "wb") as output:
        output.write(etree.tostring(root, encoding="utf-8", method="xml",
                                    pretty_print=True, xml_declaration=True))


if __name__ == "__main__":
    sys.exit(main())