--- a/script/iri_tweet/export_twitter_alchemy.py Tue Jan 18 10:08:03 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,216 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-
-from lxml import etree
-from models import *
-from optparse import OptionParser
-from sqlalchemy import Table, Column, Integer, BigInteger, String, MetaData, \
- ForeignKey
-from sqlalchemy.orm import sessionmaker, mapper
-from sqlalchemy.sql import select
-import datetime
-import email.utils
-import logging
-import os
-import os.path
-import re
-import sys
-import time
-import uuid
-
-#class TweetExclude(object):
-# def __init__(self, id):
-# self.id = id
-#
-# def __repr__(self):
-# return "<TweetExclude(id=%d)>" % (self.id)
-
-def parse_date(date_str):
-    """Parse an RFC 2822 style date string into a naive datetime.
-
-    The string is parsed with email.utils.parsedate_tz. Only the calendar
-    and clock fields (year, month, day, hour, minute, second) are used; the
-    timezone offset reported by the parser is ignored, so the result holds
-    the wall-clock values exactly as written in the string.
-
-    Raises ValueError when the string cannot be parsed.
-    """
-    ts = email.utils.parsedate_tz(date_str)
-    if ts is None:
-        # parsedate_tz signals failure by returning None instead of raising.
-        raise ValueError("unable to parse date: %r" % (date_str,))
-    # Take only the first six fields: the seventh is the weekday slot and
-    # must not be fed to datetime() as a microsecond value.
-    return datetime.datetime(*ts[:6])
-
-def get_options():
-    """Build the command-line parser and parse sys.argv.
-
-    Returns the (options, args) pair produced by OptionParser.parse_args().
-
-    set_logging_options() is defined outside this file section; it is
-    presumably what registers the verbose/quiet options read by the main
-    script -- TODO confirm.
-    """
-    parser = OptionParser()
-
-    # Output / input locations.
-    parser.add_option("-f", "--file", dest="filename",
-                      help="write export to file", metavar="FILE", default="project_enmi.ldt")
-    parser.add_option("-d", "--database", dest="database",
-                      help="Input database", metavar="DATABASE")
-
-    # Time window of the tweets to export.
-    parser.add_option("-s", "--start-date", dest="start_date",
-                      help="start date", metavar="START_DATE")
-    parser.add_option("-e", "--end-date", dest="end_date",
-                      help="end date", metavar="END_DATE")
-    parser.add_option("-D", "--duration", dest="duration", type="int",
-                      help="Duration", metavar="DURATION", default=None)
-
-    # Description of the content the tweets are attached to.
-    parser.add_option("-I", "--content-file", dest="content_file",
-                      help="Content file", metavar="CONTENT_FILE")
-    parser.add_option("-c", "--content", dest="content",
-                      help="Content url", metavar="CONTENT")
-    parser.add_option("-V", "--video-url", dest="video",
-                      help="video url", metavar="VIDEO")
-    parser.add_option("-i", "--content-id", dest="content_id",
-                      help="Content id", metavar="CONTENT_ID")
-
-    # The main script reads options.listconf, so the attribute must always
-    # exist; it defaults to None (single export driven by the options above).
-    parser.add_option("-L", "--list-conf", dest="listconf",
-                      help="XML file listing the exports to process", metavar="LIST_CONF", default=None)
-
-    # Filtering and presentation.
-    parser.add_option("-x", "--exclude", dest="exclude",
-                      help="file containing the id to exclude", metavar="EXCLUDE")
-    parser.add_option("-C", "--color", dest="color",
-                      help="Color code", metavar="COLOR", default="16763904")
-    parser.add_option("-H", "--hashtag", dest="hashtag",
-                      help="Hashtag", metavar="HASHTAG", default="enmi")
-    parser.add_option("-n", "--name", dest="name",
-                      help="Cutting name", metavar="NAME", default=u"Tweets")
-    parser.add_option("-R", "--replace", dest="replace", action="store_true",
-                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
-    parser.add_option("-l", "--log", dest="logfile",
-                      help="log to file", metavar="LOG", default="stderr")
-
-    set_logging_options(parser)
-
-    return parser.parse_args()
-
-
-if __name__ == "__main__" :
-
-    # Command-line driver: select the tweets carrying a given hashtag inside
-    # a time window from the database and write them as an "ensemble" of
-    # annotations into an XML project file (created from scratch, or an
-    # existing content file updated in place).
-    (options, args) = get_options()
-
-    set_logging(options)
-
-    logging.debug("OPTIONS : " + repr(options))
-
-    engine, metadata = setup_database('sqlite:///'+options.database, echo=((options.verbose-options.quiet)>0), create_all = False)
-
-    Session = sessionmaker()
-    conn = engine.connect()
-    try :
-        session = Session(bind=conn)
-        try :
-
-            # Temporary table holding the ids of the tweets to leave out.
-            metadata = MetaData(bind=conn)
-            tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
-            #mapper(TweetExclude, tweet_exclude_table)
-            metadata.create_all()
-
-            if options.exclude and os.path.exists(options.exclude):
-                # The file is only read, so do not require write permission.
-                with open(options.exclude, 'r') as f:
-                    tei = tweet_exclude_table.insert()
-                    for line in f:
-                        line = line.strip()
-                        if not line:
-                            # Tolerate blank lines (e.g. a trailing newline).
-                            continue
-                        conn.execute(tei.values(id=long(line)))
-
-            # The list-conf option may not be registered by the parser;
-            # treat a missing attribute as "not given".
-            listconf = getattr(options, "listconf", None)
-
-            if listconf:
-
-                # One export per /twitter_export/file node of the XML file.
-                parameters = []
-                confdoc = etree.parse(listconf)
-                for node in confdoc.xpath("/twitter_export/file"):
-                    params = {}
-                    for snode in node:
-                        if snode.tag == "path":
-                            params['content_file'] = snode.text
-                        elif snode.tag == "start_date":
-                            params['start_date'] = snode.text
-                        elif snode.tag == "end_date":
-                            params['end_date'] = snode.text
-                        elif snode.tag == "duration":
-                            params['duration'] = int(snode.text)
-                    parameters.append(params)
-            else:
-                # Single export described by the command-line options.
-                parameters = [{
-                    'start_date': options.start_date,
-                    'end_date' : options.end_date,
-                    'duration' : options.duration,
-                    'content_file' : options.content_file
-                }]
-
-            for params in parameters:
-
-                logging.debug("PARAMETERS " + repr(params))
-
-                start_date_str = params.get("start_date",None)
-                end_date_str = params.get("end_date", None)
-                duration = params.get("duration", None)
-                content_file = params.get("content_file", None)
-
-                if not start_date_str:
-                    raise ValueError("a start date is required")
-
-                start_date = parse_date(start_date_str)
-                # Start of the window in seconds; tweet offsets are
-                # computed relative to it below.
-                ts = time.mktime(start_date.timetuple())
-
-                if end_date_str:
-                    end_date = parse_date(end_date_str)
-                elif duration is not None:
-                    end_date = start_date + datetime.timedelta(seconds=duration)
-                else:
-                    raise ValueError("either an end date or a duration is required")
-
-                # NOTE: the legacy raw-SQL version of this query ordered the
-                # rows by creation time; no ordering is requested here.
-                query_res = session.query(Tweet).join(EntityHashtag).join(Hashtag).filter(~Tweet.id.in_(select([tweet_exclude_table.c.id]))).filter(Hashtag.text.contains(options.hashtag)).filter(Tweet.created_at >= start_date).filter(Tweet.created_at <= end_date).all()
-
-                root = None
-                ensemble_parent = None
-
-                if content_file and os.path.exists(content_file):
-
-                    # Update an existing document: ensembles are attached
-                    # to its first <ensembles> element.
-                    doc = etree.parse(content_file)
-                    root = doc.getroot()
-
-                    ensemble_parent = root.xpath("//ensembles")[0]
-
-                else:
-                    # Build a new project document from scratch.
-                    root = etree.Element(u"iri")
-
-                    etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
-
-                    medias = etree.SubElement(root, u"medias")
-                    etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
-
-                    annotations = etree.SubElement(root, u"annotations")
-                    content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
-                    ensemble_parent = content
-
-                if options.replace:
-                    # Iterate over a snapshot: removing children while
-                    # walking the live iterator can skip siblings.
-                    for ens in list(ensemble_parent.iterchildren(tag=u"ensemble")):
-                        if ens.get("id","").startswith("tweet_"):
-                            ensemble_parent.remove(ens)
-
-                ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter pour ENMI 2009"})
-                decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
-
-                etree.SubElement(decoupage, u"title").text = unicode(options.name)
-                etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
-
-                elements = etree.SubElement(decoupage, u"elements")
-
-                for tw in query_res:
-                    tweet_ts_dt = tw.created_at
-                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
-                    # Offset from the start of the window, in milliseconds.
-                    tweet_ts_rel = (tweet_ts-ts) * 1000
-                    username = None
-                    if tw.user is not None:
-                        username = tw.user.name
-                    if not username:
-                        username = "anon."
-                    element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":u""})
-                    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
-                    etree.SubElement(element, u"abstract").text = unicode(tw.text)
-
-                    tags_node = etree.SubElement(element, u"tags")
-
-                    for entity in tw.entity_list:
-                        if entity.type == u'entity_hashtag':
-                            etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
-
-                # An existing content file is overwritten in place;
-                # otherwise the export goes to the --file target.
-                if content_file and os.path.exists(content_file):
-                    output_path = content_file
-                else:
-                    output_path = options.filename
-
-                # "with" guarantees the handle is closed even if
-                # serialisation or the write fails.
-                with open(output_path, "w") as output:
-                    output.write(etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True))
-
-        finally:
-            session.close()
-    finally:
-        conn.close()