#!/usr/bin/env python
# coding=utf-8
from lxml import etree
from models import *
from optparse import OptionParser
from sqlalchemy import Table, Column, Integer, BigInteger, String, MetaData, \
ForeignKey
from sqlalchemy.orm import sessionmaker, mapper
from sqlalchemy.sql import select
from utils import *
import datetime
import email.utils
import logging
import os
import os.path
import re
import re
import sys
import time
import uuid
#class TweetExclude(object):
# def __init__(self, id):
# self.id = id
#
# def __repr__(self):
# return "<TweetExclude(id=%d)>" % (self.id)
def parse_date(date_str):
    """Parse an RFC 2822 date string (tweet 'created_at' format) into a
    naive :class:`datetime.datetime`.

    The timezone offset parsed by ``parsedate_tz`` is deliberately ignored,
    so the result keeps the clock time of the source string.
    """
    ts = email.utils.parsedate_tz(date_str)
    # parsedate_tz returns a 10-tuple (year, month, day, hour, minute,
    # second, weekday, yearday, isdst, tz_offset).  Only the first six
    # fields are valid datetime arguments: the previous ts[0:7] slice
    # passed the weekday as the microsecond argument, silently producing
    # a wrong (0-6) microsecond value.
    return datetime.datetime(*ts[:6])
def get_options():
    """Declare the command-line interface and return ``(options, args)``.

    Options are declared from a spec table rather than repeated
    ``add_option`` calls; logging-related options are appended by
    ``set_logging_options`` from utils.
    """
    parser = OptionParser()
    option_specs = [
        (("-f", "--file"),
         dict(dest="filename", help="write export to file",
              metavar="FILE", default="project_enmi.ldt")),
        (("-d", "--database"),
         dict(dest="database", help="Input database", metavar="DATABASE")),
        (("-s", "--start-date"),
         dict(dest="start_date", help="start date", metavar="START_DATE")),
        (("-e", "--end-date"),
         dict(dest="end_date", help="end date", metavar="END_DATE")),
        (("-I", "--content-file"),
         dict(dest="content_file", help="Content file", metavar="CONTENT_FILE")),
        (("-c", "--content"),
         dict(dest="content", help="Content url", metavar="CONTENT")),
        (("-V", "--video-url"),
         dict(dest="video", help="video url", metavar="VIDEO")),
        (("-i", "--content-id"),
         dict(dest="content_id", help="Content id", metavar="CONTENT_ID")),
        (("-x", "--exclude"),
         dict(dest="exclude", help="file containing the id to exclude",
              metavar="EXCLUDE")),
        (("-C", "--color"),
         dict(dest="color", help="Color code", metavar="COLOR",
              default="16763904")),
        (("-H", "--hashtag"),
         dict(dest="hashtag", help="Hashtag", metavar="HASHTAG",
              default="enmi")),
        (("-D", "--duration"),
         dict(dest="duration", type="int", help="Duration",
              metavar="DURATION", default=None)),
        (("-n", "--name"),
         dict(dest="name", help="Cutting name", metavar="NAME",
              default=u"Tweets")),
        (("-R", "--replace"),
         dict(dest="replace", action="store_true",
              help="Replace tweet ensemble", metavar="REPLACE",
              default=False)),
        (("-L", "--list-conf"),
         dict(dest="listconf", help="list of file to process",
              metavar="LIST_CONF", default=None)),
    ]
    for flags, kwargs in option_specs:
        parser.add_option(*flags, **kwargs)

    set_logging_options(parser)

    return parser.parse_args()
if __name__ == "__main__" :
(options, args) = get_options()
set_logging(options)
logging.debug("OPTIONS : " + repr(options))
engine, metadata = setup_database('sqlite:///'+options.database, echo=((options.verbose-options.quiet)>0), create_all = False)
Session = sessionmaker()
conn = engine.connect()
try :
session = Session(bind=conn)
try :
metadata = MetaData(bind=conn)
tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
#mapper(TweetExclude, tweet_exclude_table)
metadata.create_all()
if options.exclude and os.path.exists(options.exclude):
with open(options.exclude, 'r+') as f:
tei = tweet_exclude_table.insert()
for line in f:
conn.execute(tei.values(id=long(line.strip())))
if options.listconf:
parameters = []
confdoc = etree.parse(options.listconf)
for node in confdoc.xpath("/twitter_export/file"):
params = {}
for snode in node:
if snode.tag == "path":
params['content_file'] = snode.text
elif snode.tag == "start_date":
params['start_date'] = snode.text
elif snode.tag == "end_date":
params['end_date'] = snode.text
elif snode.tag == "duration":
params['duration'] = int(snode.text)
parameters.append(params)
else:
parameters = [{
'start_date': options.start_date,
'end_date' : options.end_date,
'duration' : options.duration,
'content_file' : otions.content_file
}]
for params in parameters:
logging.debug("PARAMETERS " + repr(params))
start_date_str = params.get("start_date",None)
end_date_str = params.get("end_date", None)
duration = params.get("duration", None)
content_file = params.get("content_file", None)
start_date = parse_date(start_date_str)
ts = time.mktime(start_date.timetuple())
if end_date_str:
end_date = parse_date(end_date_str)
te = time.mktime(end_date.timetuple())
else:
te = ts + duration
end_date = start_date + datetime.timedelta(seconds=duration)
query_res = session.query(Tweet).join(EntityHashtag).join(Hashtag).filter(~Tweet.id.in_(select([tweet_exclude_table.c.id]))).filter(Hashtag.text.contains(options.hashtag)).filter(Tweet.created_at >= start_date).filter(Tweet.created_at <= end_date).all()
root = None
ensemble_parent = None
if content_file and os.path.exists(content_file):
doc = etree.parse(content_file)
root = doc.getroot()
ensemble_parent = root.xpath("//ensembles")[0]
else:
root = etree.Element(u"iri")
project = etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
medias = etree.SubElement(root, u"medias")
media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
annotations = etree.SubElement(root, u"annotations")
content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
ensemble_parent = content
if options.replace:
for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
if ens.get("id","").startswith("tweet_"):
ensemble_parent.remove(ens)
ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter pour ENMI 2009"})
decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
etree.SubElement(decoupage, u"title").text = unicode(options.name)
etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
elements = etree.SubElement(decoupage, u"elements")
for tw in query_res:
tweet_ts_dt = tw.created_at
tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
tweet_ts_rel = (tweet_ts-ts) * 1000
username = None
if tw.user is not None:
username = tw.user.name
if not username:
username = "anon."
element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":u""})
etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
etree.SubElement(element, u"abstract").text = unicode(tw.text)
tags_node = etree.SubElement(element, u"tags")
for entity in tw.entity_list:
if entity.type == u'entity_hashtag':
etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
meta_element = etree.SubElement(element, u'meta')
polemics_element = etree.Element(u'polemics')
polemic_added = False
for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
polemic_added = True
pol_link = {
'++' : u'OK',
'--' : u'KO',
'??' : u'Q',
'==' : u'REF'}[m.group(1)]
etree.SubElement(polemics_element, u'polemic').text = pol_link
if polemic_added:
meta_element.append(polemics_element)
etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.original_json))
if content_file and os.path.exists(content_file):
dest_file_name = content_file
else:
dest_file_name = options.filename
logging.debug("WRITE : " + dest_file_name)
output = open(content_file, "w")
output.write(etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True))
output.flush()
output.close()
finally:
session.close()
finally:
conn.close()