#!/usr/bin/env python
# coding=utf-8
"""Export tweets from an SQLite database into an ``<iri>`` XML document.

Tweets carrying a given hashtag and created inside a time window are read
through SQLAlchemy and appended, as an ``<ensemble>`` of ``<element>`` nodes,
either to an existing content file or to a freshly created document.
"""
# NOTE: reconstructed from the removed side of
# ``diff -r 54d7f1486ac4 -r 4daf47fcf792 script/iri_tweet/export_twitter_alchemy.py``.

from lxml import etree
from models import *
from optparse import OptionParser
from sqlalchemy import Table, Column, Integer, BigInteger, String, MetaData, \
    ForeignKey
from sqlalchemy.orm import sessionmaker, mapper
from sqlalchemy.sql import select
import datetime
import email.utils
import logging
import os
import os.path
import re
import sys
import time
import uuid


def parse_date(date_str):
    """Parse an RFC 2822 style date string into a naive ``datetime``.

    The timezone offset returned by ``email.utils.parsedate_tz`` is not
    applied; only the year..second fields are used.

    Raises:
        ValueError: if ``date_str`` is empty or cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(date_str) if date_str else None
    if parsed is None:
        raise ValueError("unparseable date: %r" % (date_str,))
    # Only the first six fields (year..second) are meaningful; index 6 is the
    # weekday slot and must not be forwarded as the microsecond argument.
    return datetime.datetime(*parsed[:6])


def get_options():
    """Build the command-line parser and return ``(options, args)``."""
    parser = OptionParser()
    parser.add_option("-f", "--file", dest="filename",
                      help="write export to file", metavar="FILE", default="project_enmi.ldt")
    parser.add_option("-d", "--database", dest="database",
                      help="Input database", metavar="DATABASE")
    parser.add_option("-s", "--start-date", dest="start_date",
                      help="start date", metavar="START_DATE")
    parser.add_option("-e", "--end-date", dest="end_date",
                      help="end date", metavar="END_DATE")
    parser.add_option("-I", "--content-file", dest="content_file",
                      help="Content file", metavar="CONTENT_FILE")
    parser.add_option("-c", "--content", dest="content",
                      help="Content url", metavar="CONTENT")
    parser.add_option("-V", "--video-url", dest="video",
                      help="video url", metavar="VIDEO")
    parser.add_option("-i", "--content-id", dest="content_id",
                      help="Content id", metavar="CONTENT_ID")
    parser.add_option("-x", "--exclude", dest="exclude",
                      help="file containing the id to exclude", metavar="EXCLUDE")
    parser.add_option("-C", "--color", dest="color",
                      help="Color code", metavar="COLOR", default="16763904")
    parser.add_option("-H", "--hashtag", dest="hashtag",
                      help="Hashtag", metavar="HASHTAG", default="enmi")
    parser.add_option("-D", "--duration", dest="duration", type="int",
                      help="Duration", metavar="DURATION", default=None)
    parser.add_option("-n", "--name", dest="name",
                      help="Cutting name", metavar="NAME", default=u"Tweets")
    parser.add_option("-R", "--replace", dest="replace", action="store_true",
                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
    # The script reads ``options.listconf``; without this option that attribute
    # would not exist on the parsed values.
    parser.add_option("-L", "--list-conf", dest="listconf",
                      help="XML file listing the content files to process",
                      metavar="LIST_CONF", default=None)
    # NOTE(review): set_logging_options is defined elsewhere; if it also
    # registers -l/--log, this declaration will conflict -- confirm.
    parser.add_option("-l", "--log", dest="logfile",
                      help="log to file", metavar="LOG", default="stderr")

    set_logging_options(parser)

    return parser.parse_args()


def _load_excluded_ids(conn, exclude_table, path):
    """Insert the tweet ids listed in ``path`` (one per line) into ``exclude_table``."""
    insert = exclude_table.insert()
    # Read-only access is enough; blank lines are skipped instead of crashing.
    with open(path, 'r') as exclude_file:
        for line in exclude_file:
            line = line.strip()
            if line:
                conn.execute(insert.values(id=long(line)))


def _read_parameters(options):
    """Return the list of per-file parameter dicts to process.

    With ``options.listconf`` set, one dict is built per
    ``/twitter_export/file`` node of that XML file; otherwise a single dict is
    built from the command-line options.
    """
    if not options.listconf:
        return [{
            'start_date': options.start_date,
            'end_date': options.end_date,
            'duration': options.duration,
            'content_file': options.content_file,
        }]

    parameters = []
    confdoc = etree.parse(options.listconf)
    for node in confdoc.xpath("/twitter_export/file"):
        params = {}
        for snode in node:
            if snode.tag == "path":
                params['content_file'] = snode.text
            elif snode.tag == "start_date":
                params['start_date'] = snode.text
            elif snode.tag == "end_date":
                params['end_date'] = snode.text
            elif snode.tag == "duration":
                params['duration'] = int(snode.text)
        parameters.append(params)
    return parameters


def _time_window(params):
    """Return ``(start_date, end_date, start_timestamp)`` for ``params``.

    The end date is taken from ``end_date`` when present, otherwise computed
    from ``duration`` (in seconds).

    Raises:
        ValueError: if the start date is missing/unparseable, or if neither an
            end date nor a duration is available.
    """
    start_date = parse_date(params.get("start_date", None))
    start_ts = time.mktime(start_date.timetuple())

    end_date_str = params.get("end_date", None)
    duration = params.get("duration", None)
    if end_date_str:
        end_date = parse_date(end_date_str)
    elif duration is not None:
        end_date = start_date + datetime.timedelta(seconds=duration)
    else:
        raise ValueError("either an end date or a duration is required")
    return start_date, end_date, start_ts


def _open_document(existing_content_file, options):
    """Return ``(root, ensemble_parent)`` for the document to fill.

    ``existing_content_file`` is the path of a content file to extend, or
    ``None`` to build a new ``<iri>`` document from the command-line options.
    """
    if existing_content_file:
        root = etree.parse(existing_content_file).getroot()
        return root, root.xpath("//ensembles")[0]

    root = etree.Element(u"iri")

    etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})

    medias = etree.SubElement(root, u"medias")
    etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})

    annotations = etree.SubElement(root, u"annotations")
    content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
    return root, content


def _append_tweet(elements, tw, start_ts, color):
    """Append one ``<element>`` describing tweet ``tw`` to ``elements``."""
    tweet_ts_dt = tw.created_at
    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
    # Offset of the tweet from the start of the window, in milliseconds.
    tweet_ts_rel = (tweet_ts-start_ts) * 1000
    username = None
    if tw.user is not None:
        username = tw.user.name
    if not username:
        username = "anon."

    element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":u""})
    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
    etree.SubElement(element, u"abstract").text = unicode(tw.text)

    tags_node = etree.SubElement(element, u"tags")

    for entity in tw.entity_list:
        if entity.type == u'entity_hashtag':
            etree.SubElement(tags_node,u"tag").text = entity.hashtag.text


def _export(session, exclude_table, params, options):
    """Query the tweets for one parameter set and write the resulting document."""
    logging.debug("PARAMETERS %r", params)

    start_date, end_date, start_ts = _time_window(params)

    content_file = params.get("content_file", None)
    existing_content_file = None
    if content_file and os.path.exists(content_file):
        existing_content_file = content_file

    query_res = (
        session.query(Tweet)
        .join(EntityHashtag)
        .join(Hashtag)
        .filter(~Tweet.id.in_(select([exclude_table.c.id])))
        .filter(Hashtag.text.contains(options.hashtag))
        .filter(Tweet.created_at >= start_date)
        .filter(Tweet.created_at <= end_date)
        .all()
    )

    root, ensemble_parent = _open_document(existing_content_file, options)

    if options.replace:
        # Snapshot the children first: removing nodes while iterating the live
        # iterator can skip elements.
        for ens in list(ensemble_parent.iterchildren(tag=u"ensemble")):
            if ens.get("id","").startswith("tweet_"):
                ensemble_parent.remove(ens)

    ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter pour ENMI 2009"})
    decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})

    etree.SubElement(decoupage, u"title").text = unicode(options.name)
    etree.SubElement(decoupage, u"abstract").text = unicode(options.name)

    elements = etree.SubElement(decoupage, u"elements")

    for tw in query_res:
        _append_tweet(elements, tw, start_ts, options.color)

    # An existing content file is rewritten in place; otherwise the export goes
    # to the file given with -f/--file.
    output_path = existing_content_file or options.filename
    # tostring() with an explicit encoding yields encoded bytes, hence "wb";
    # the context manager closes the file even if writing fails.
    with open(output_path, "wb") as output:
        output.write(etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True))


def main():
    """Script entry point: parse options, connect to the database and export."""
    (options, _args) = get_options()

    set_logging(options)

    logging.debug("OPTIONS : %r", options)

    # The metadata returned here is not used; a connection-bound MetaData is
    # created below for the temporary exclusion table.
    engine, _ = setup_database('sqlite:///'+options.database, echo=((options.verbose-options.quiet)>0), create_all = False)

    Session = sessionmaker()
    conn = engine.connect()
    try:
        session = Session(bind=conn)
        try:
            metadata = MetaData(bind=conn)
            tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
            metadata.create_all()

            if options.exclude and os.path.exists(options.exclude):
                _load_excluded_ids(conn, tweet_exclude_table, options.exclude)

            for params in _read_parameters(options):
                _export(session, tweet_exclude_table, params, options)
        finally:
            session.close()
    finally:
        conn.close()


if __name__ == "__main__":
    main()