#!/usr/bin/env python
# coding=utf-8
"""Export tweets from a sqlite database to an IRI "ldt" XML project file.

Tweets carrying a given hashtag and falling inside a time window are
serialized as <element> nodes under a fresh tweet <ensemble>, either
appended into an existing content file or into a newly built document.
Python 2 script (uses `unicode`, `long` and u"" literals).
"""

from lxml import etree
from models import *
from optparse import OptionParser
from sqlalchemy import Table, Column, Integer, BigInteger, String, MetaData, \
    ForeignKey, create_engine
from sqlalchemy.orm import sessionmaker, mapper
from sqlalchemy.sql import select
import datetime
import time
import email.utils
import logging
import os
import os.path
import re
import sys
import uuid

#class TweetExclude(object):
#    def __init__(self, id):
#        self.id = id
#
#    def __repr__(self):
#        return "" % (self.id)


def parse_date(date_str):
    """Parse an RFC 2822 date string into a naive datetime.

    The timezone offset returned by parsedate_tz is deliberately ignored:
    the result is compared against equally naive database timestamps.
    """
    ts = email.utils.parsedate_tz(date_str)
    # BUGFIX: only fields 0..5 (year..second) belong to the datetime
    # constructor; ts[6] is the weekday and must not leak into microseconds.
    return datetime.datetime(*ts[0:6])


if __name__ == "__main__":

    parser = OptionParser()
    parser.add_option("-f", "--file", dest="filename",
                      help="write export to file", metavar="FILE", default="project_enmi.ldt")
    parser.add_option("-d", "--database", dest="database",
                      help="Input database", metavar="DATABASE")
    parser.add_option("-s", "--start-date", dest="start_date",
                      help="start date", metavar="START_DATE")
    parser.add_option("-e", "--end-date", dest="end_date",
                      help="end date", metavar="END_DATE")
    parser.add_option("-I", "--content-file", dest="content_file",
                      help="Content file", metavar="CONTENT_FILE")
    parser.add_option("-c", "--content", dest="content",
                      help="Content url", metavar="CONTENT")
    parser.add_option("-V", "--video-url", dest="video",
                      help="video url", metavar="VIDEO")
    parser.add_option("-i", "--content-id", dest="content_id",
                      help="Content id", metavar="CONTENT_ID")
    parser.add_option("-x", "--exclude", dest="exclude",
                      help="file containing the id to exclude", metavar="EXCLUDE")
    parser.add_option("-C", "--color", dest="color",
                      help="Color code", metavar="COLOR", default="16763904")
    parser.add_option("-H", "--hashtag", dest="hashtag",
                      help="Hashtag", metavar="HASHTAG", default="enmi")
    parser.add_option("-D", "--duration", dest="duration", type="int",
                      help="Duration", metavar="DURATION", default=None)
    parser.add_option("-n", "--name", dest="name",
                      help="Cutting name", metavar="NAME", default=u"Tweets")
    parser.add_option("-R", "--replace", dest="replace", action="store_true",
                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
    parser.add_option("-l", "--log", dest="logfile",
                      help="log to file", metavar="LOG", default="stderr")
    parser.add_option("-v", dest="verbose", action="count",
                      help="verbose", metavar="VERBOSE", default=0)
    parser.add_option("-q", dest="quiet", action="count",
                      help="quiet", metavar="QUIET", default=0)
    parser.add_option("-L", dest="listconf",
                      help="file containing the list of file to process", metavar="LIST", default=0)

    (options, args) = parser.parse_args()

    logging_config = {}

    if options.logfile == "stdout":
        logging_config["stream"] = sys.stdout
    elif options.logfile == "stderr":
        logging_config["stream"] = sys.stderr
    else:
        logging_config["filename"] = options.logfile

    # Each -v lowers the threshold by 10, each -q raises it; clamp the result
    # into the valid NOTSET..CRITICAL range.
    logging_config["level"] = max(logging.NOTSET, min(logging.CRITICAL, logging.WARNING - 10 * options.verbose + 10 * options.quiet))

    logging.basicConfig(**logging_config)

    logging.debug("OPTIONS : " + repr(options))

    engine = create_engine('sqlite:///' + options.database, echo=((options.verbose - options.quiet) > 0))
    Session = sessionmaker()

    conn = engine.connect()
    try:
        session = Session(bind=conn)
        try:
            metadata = MetaData(bind=conn)
            # Connection-local TEMPORARY table holding the tweet ids to skip.
            tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
            #mapper(TweetExclude, tweet_exclude_table)
            metadata.create_all()

            if options.exclude and os.path.exists(options.exclude):
                # One tweet id per line.
                with open(options.exclude, 'r+') as f:
                    tei = tweet_exclude_table.insert()
                    for line in f:
                        conn.execute(tei.values(id=long(line.strip())))

            if options.listconf:
                # Batch mode: one export per /twitter_export/file node of the
                # configuration document.
                parameters = []
                confdoc = etree.parse(options.listconf)
                for node in confdoc.xpath("/twitter_export/file"):
                    params = {}
                    for snode in node:
                        if snode.tag == "path":
                            params['content_file'] = snode.text
                        elif snode.tag == "start_date":
                            params['start_date'] = snode.text
                        elif snode.tag == "end_date":
                            params['end_date'] = snode.text
                        elif snode.tag == "duration":
                            params['duration'] = int(snode.text)
                    parameters.append(params)
            else:
                # Single-file mode driven entirely by command-line options.
                parameters = [{
                    'start_date': options.start_date,
                    'end_date': options.end_date,
                    'duration': options.duration,
                    # BUGFIX: was "otions.content_file" (NameError).
                    'content_file': options.content_file,
                }]

            for params in parameters:

                logging.debug("PARAMETERS " + repr(params))

                start_date_str = params.get("start_date", None)
                end_date_str = params.get("end_date", None)
                duration = params.get("duration", None)
                content_file = params.get("content_file", None)

                start_date = parse_date(start_date_str)
                ts = time.mktime(start_date.timetuple())

                if end_date_str:
                    end_date = parse_date(end_date_str)
                    te = time.mktime(end_date.timetuple())
                else:
                    # NOTE(review): a duration is required when no end date is
                    # given; otherwise this raises TypeError on None.
                    te = ts + duration
                    end_date = start_date + datetime.timedelta(seconds=duration)

                # Tweets in the window carrying the hashtag, minus excluded ids.
                query_res = session.query(Tweet).join(EntityHashtag).join(Hashtag).filter(~Tweet.id.in_(select([tweet_exclude_table.c.id]))).filter(Hashtag.text.contains(options.hashtag)).filter(Tweet.created_at >= start_date).filter(Tweet.created_at <= end_date).all()

                #hashtag = u"%#"+unicode(options.hashtag)+u"%"
                #cursor.execute("select tt.id, tt.text, tt.created_at_ts, tu.name, tu.screen_name from tweet_tweet as tt join tweet_user as tu on tt.user = tu.rowid where text like ? and tt.created_at_ts >= ? and tt.created_at_ts <= ? and tt.id not in (select id from tweet_exclude) order by tt.created_at_ts asc;", (hashtag,ts,te));

                root = None
                ensemble_parent = None

                if content_file and os.path.exists(content_file):
                    # Update mode: append into the existing project document.
                    doc = etree.parse(content_file)
                    root = doc.getroot()
                    ensemble_parent = root.xpath("//ensembles")[0]
                else:
                    # Build a minimal project document from scratch.
                    root = etree.Element(u"iri")

                    project = etree.SubElement(root, u"project", {u"abstract": u"Twitter comments on ENMI", u"title": u"Twitter comments on ENMI 2009", u"user": u"IRI Web", u"id": unicode(uuid.uuid4())})

                    medias = etree.SubElement(root, u"medias")
                    media = etree.SubElement(medias, u"media", {u"pict": u"", u"src": unicode(options.content), u"video": unicode(options.video), u"id": unicode(options.content_id), u"extra": u""})

                    annotations = etree.SubElement(root, u"annotations")
                    content = etree.SubElement(annotations, u"content", {u"id": unicode(options.content_id)})
                    ensemble_parent = content

                if options.replace:
                    # Drop previously generated tweet ensembles (ids prefixed
                    # "tweet_") before re-exporting.
                    for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
                        if ens.get("id", "").startswith("tweet_"):
                            ensemble_parent.remove(ens)

                ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id": u"tweet_" + unicode(uuid.uuid4()), u"title": u"Ensemble Twitter", u"author": u"IRI Web", u"abstract": u"Ensemble Twitter pour ENMI 2009"})
                decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})

                etree.SubElement(decoupage, u"title").text = unicode(options.name)
                etree.SubElement(decoupage, u"abstract").text = unicode(options.name)

                elements = etree.SubElement(decoupage, u"elements")

                for tw in query_res:
                    tweet_ts_dt = tw.created_at
                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
                    # Element begin time: milliseconds relative to window start.
                    tweet_ts_rel = (tweet_ts - ts) * 1000
                    username = None
                    if tw.user is not None:
                        username = tw.user.name
                    if not username:
                        username = "anon."
                    element = etree.SubElement(elements, u"element", {u"id": unicode(uuid.uuid4()) + u"-" + unicode(tw.id), u"color": unicode(options.color), u"author": unicode(username), u"date": unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur": u"0", u"src": u""})
                    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
                    etree.SubElement(element, u"abstract").text = unicode(tw.text)

                    tags_node = etree.SubElement(element, u"tags")

                    for entity in tw.entity_list:
                        if entity.type == u'entity_hashtag':
                            etree.SubElement(tags_node, u"tag").text = entity.hashtag.text

                # Write back in place when updating an existing content file,
                # otherwise to the output file given on the command line.
                if content_file and os.path.exists(content_file):
                    dest = content_file
                else:
                    dest = options.filename
                with open(dest, "w") as output:
                    output.write(etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True))

        finally:
            session.close()
    finally:
        conn.close()