diff -r f7febf052997 -r e0dbcf98c13e server/lib/iri_tweet/export_twitter_alchemy.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/server/lib/iri_tweet/export_twitter_alchemy.py Tue Feb 14 18:38:48 2012 +0100
@@ -0,0 +1,339 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+from lxml import etree
+from models import setup_database
+from optparse import OptionParser #@UnresolvedImport
+from sqlalchemy import Table, Column, BigInteger
+from utils import (parse_date, set_logging_options, set_logging, get_filter_query,
+                   get_logger)
+import anyjson
+import datetime
+import httplib2
+import os.path
+import re
+import sys
+import time
+import uuid #@UnresolvedImport
+
+#class TweetExclude(object):
+#    def __init__(self, id):
+#        self.id = id
+#
+#    def __repr__(self):
+#        return "<TweetExclude(id=%s)>" % (self.id)
+
+
+def parse_polemics(tw, extended_mode):
+    """
+    Parse polemic markers in the tweet text and return a list of polemic
+    codes, or None if no polemic marker is found.
+    """
+    polemics = {}
+    for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)", tw.text):
+        pol_link = {
+            '++' : u'OK',
+            '--' : u'KO',
+            '??' : u'Q',
+            '==' : u'REF'}[m.group(1)]
+        polemics[pol_link] = pol_link
+
+    if extended_mode:
+        if "?" in tw.text:
+            polemics["Q"] = "Q"
+
+        for entity in tw.entity_list:
+            if entity.type == "entity_url":
+                polemics["REF"] = "REF"
+
+    if len(polemics) > 0:
+        return polemics.keys()
+    else:
+        return None
+
+def get_options():
+    parser = OptionParser()
+    parser.add_option("-f", "--file", dest="filename",
+                      help="write export to file", metavar="FILE", default="project.ldt")
+    parser.add_option("-d", "--database", dest="database",
+                      help="Input database", metavar="DATABASE")
+    parser.add_option("-s", "--start-date", dest="start_date",
+                      help="start date", metavar="START_DATE", default=None)
+    parser.add_option("-e", "--end-date", dest="end_date",
+                      help="end date", metavar="END_DATE", default=None)
+    parser.add_option("-I", "--content-file", dest="content_file",
+                      help="Content file", metavar="CONTENT_FILE")
+    parser.add_option("-c", "--content", dest="content",
+                      help="Content url", metavar="CONTENT")
+    parser.add_option("-V", "--video-url", dest="video",
+                      help="video url", metavar="VIDEO")
+    parser.add_option("-i", "--content-id", dest="content_id",
+                      help="Content id", metavar="CONTENT_ID")
+    parser.add_option("-x", "--exclude", dest="exclude",
+                      help="file containing the ids to exclude", metavar="EXCLUDE")
+    parser.add_option("-C", "--color", dest="color",
+                      help="Color code", metavar="COLOR", default="16763904")
+    parser.add_option("-H", "--hashtag", dest="hashtag",
+                      help="Hashtag", metavar="HASHTAG", default=[], action="append")
+    parser.add_option("-D", "--duration", dest="duration", type="int",
+                      help="Duration", metavar="DURATION", default=None)
+    parser.add_option("-n", "--name", dest="name",
+                      help="Cutting name", metavar="NAME", default=u"Tweets")
+    parser.add_option("-R", "--replace", dest="replace", action="store_true",
+                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
+    parser.add_option("-L", "--list-conf", dest="listconf",
+                      help="list of files to process", metavar="LIST_CONF", default=None)
+    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
+                      help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
+    parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
+                      help="A list of user screen names", metavar="USER_WHITELIST", default=None)
+
+    set_logging_options(parser)
+
+    return parser.parse_args() + (parser,)
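+
+# Note: get_options() returns a 3-tuple (options, args, parser):
+# OptionParser.parse_args() yields (options, args), and the parser itself is
+# appended so that the main block below can call parser.print_help().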
+
+
+if __name__ == "__main__":
+
+    (options, args, parser) = get_options()
+
+    set_logging(options)
+
+    get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
+
+    if len(sys.argv) == 1 or options.database is None:
+        parser.print_help()
+        sys.exit(1)
+
+    conn_str = options.database.strip()
+    if not re.match("^\w+://.+", conn_str):
+        conn_str = 'sqlite:///' + conn_str
+
+    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose - options.quiet) > 0), create_all=False)
+    conn = None
+    try:
+        conn = engine.connect()
+        session = None
+        try:
+            session = Session(bind=conn)
+            tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
+            #mapper(TweetExclude, tweet_exclude_table)
+            metadata.create_all(bind=conn, tables=[tweet_exclude_table])
+
+            if options.exclude and os.path.exists(options.exclude):
+                with open(options.exclude, 'r') as f:
+                    tei = tweet_exclude_table.insert()
+                    for line in f:
+                        conn.execute(tei.values(id=long(line.strip())))
+
+            user_whitelist_file = options.user_whitelist
+            user_whitelist = None
+
+            if options.listconf:
+
+                parameters = []
+                confdoc = etree.parse(options.listconf)
+                for node in confdoc.xpath("/twitter_export/file"):
+                    params = {}
+                    for snode in node:
+                        if snode.tag == "path":
+                            params['content_file'] = snode.text
+                        elif snode.tag == "start_date":
+                            params['start_date'] = snode.text
+                        elif snode.tag == "end_date":
+                            params['end_date'] = snode.text
+                        elif snode.tag == "duration":
+                            params['duration'] = int(snode.text)
+                        elif snode.tag == "hashtags":
+                            params['hashtags'] = [snode.text]
+                    if options.hashtag or 'hashtags' not in params:
+                        params['hashtags'] = options.hashtag
+                    parameters.append(params)
+            else:
+                parameters = [{
+                    'start_date': options.start_date,
+                    'end_date': options.end_date,
+                    'duration': options.duration,
+                    'content_file': options.content_file,
+                    'hashtags': options.hashtag
+                }]
+
+            for params in parameters:
+
+                get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
+
+                start_date_str = params.get("start_date", None)
+                end_date_str = params.get("end_date", None)
+                duration = params.get("duration", None)
+                content_file = params.get("content_file", None)
+                hashtags = params.get('hashtags', [])
+
+                if user_whitelist_file:
+                    with open(user_whitelist_file, 'r') as f:
+                        user_whitelist = list(set([s.strip() for s in f]))
+
+                start_date = None
+                ts = None
+                if start_date_str:
+                    start_date = parse_date(start_date_str)
+                    ts = time.mktime(start_date.timetuple())
+
+                end_date = None
+                if end_date_str:
+                    end_date = parse_date(end_date_str)
+                elif start_date and duration:
+                    end_date = start_date + datetime.timedelta(seconds=duration)
+
+                query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
+
+                query_res = query.all()
+
+                root = None
+                ensemble_parent = None
+
+                # TODO: analyse the situation: ldt or iri? filename set or not?
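+                # Three possible sources for the target document: an LDT project
+                # fetched over HTTP (JSON whose "ldt" field holds the XML), a
+                # local content file, or, if neither yields a root, a new <iri>
+                # document built from the command-line options.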
+                if content_file and content_file.find("http") == 0:
+
+                    get_logger().debug("url : " + content_file) #@UndefinedVariable
+
+                    h = httplib2.Http()
+                    resp, content = h.request(content_file)
+
+                    get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
+
+                    project = anyjson.deserialize(content)
+                    root = etree.fromstring(project["ldt"])
+
+                elif content_file and os.path.exists(content_file):
+
+                    doc = etree.parse(content_file)
+                    root = doc.getroot()
+
+                if root is None:
+
+                    root = etree.Element(u"iri")
+
+                    project = etree.SubElement(root, u"project", {u"abstract": u"Polemic Tweets", u"title": u"Polemic Tweets", u"user": u"IRI Web", u"id": unicode(uuid.uuid4())})
+
+                    medias = etree.SubElement(root, u"medias")
+                    media = etree.SubElement(medias, u"media", {u"pict": u"", u"src": unicode(options.content), u"video": unicode(options.video), u"id": unicode(options.content_id), u"extra": u""})
+
+                    annotations = etree.SubElement(root, u"annotations")
+                    content = etree.SubElement(annotations, u"content", {u"id": unicode(options.content_id)})
+                    ensemble_parent = content
+
+                if ensemble_parent is None:
+                    file_type = None
+                    for node in root:
+                        if node.tag == "project":
+                            file_type = "ldt"
+                            break
+                        elif node.tag == "head":
+                            file_type = "iri"
+                            break
+
+                    if file_type == "ldt":
+                        media_nodes = root.xpath("//media")
+                        if len(media_nodes) > 0:
+                            media = media_nodes[0]
+                        annotations_node = root.find(u"annotations")
+                        if annotations_node is None:
+                            annotations_node = etree.SubElement(root, u"annotations")
+                        content_node = annotations_node.find(u"content")
+                        if content_node is None:
+                            content_node = etree.SubElement(annotations_node, u"content", id=media.get(u"id"))
+                        ensemble_parent = content_node
+                    elif file_type == "iri":
+                        body_node = root.find(u"body")
+                        if body_node is None:
+                            body_node = etree.SubElement(root, u"body")
+                        ensembles_node = body_node.find(u"ensembles")
+                        if ensembles_node is None:
+                            ensembles_node = etree.SubElement(body_node, u"ensembles")
+                        ensemble_parent = ensembles_node
+
+                if ensemble_parent is None:
+                    get_logger().error("Cannot process file") #@UndefinedVariable
+                    sys.exit(1)
+
+                if options.replace:
+                    for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
+                        if ens.get("id", "").startswith("tweet_"):
+                            ensemble_parent.remove(ens)
+
+                ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id": u"tweet_" + unicode(uuid.uuid4()), u"title": u"Ensemble Twitter", u"author": u"IRI Web", u"abstract": u"Ensemble Twitter"})
+                decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
+
+                etree.SubElement(decoupage, u"title").text = unicode(options.name)
+                etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
+
+                elements = etree.SubElement(decoupage, u"elements")
+
+                for tw in query_res:
+                    tweet_ts_dt = tw.created_at
+                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
+                    if ts is None:
+                        ts = tweet_ts
+                    tweet_ts_rel = (tweet_ts - ts) * 1000
+                    username = None
+                    profile_url = ""
+                    if tw.user is not None:
+                        username = tw.user.name
+                        profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
+                    if not username:
+                        username = "anon."
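+                    # Each tweet becomes an <element> on the timeline: "begin" is
+                    # the tweet timestamp relative to the reference ts (the start
+                    # date if given, otherwise the first tweet), in milliseconds.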
+                    element = etree.SubElement(elements, u"element", {u"id": unicode(uuid.uuid4()) + u"-" + unicode(tw.id), u"color": unicode(options.color), u"author": unicode(username), u"date": unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur": u"0", u"src": unicode(profile_url)})
+                    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
+                    etree.SubElement(element, u"abstract").text = unicode(tw.text)
+
+                    tags_node = etree.SubElement(element, u"tags")
+
+                    for entity in tw.entity_list:
+                        if entity.type == u'entity_hashtag':
+                            etree.SubElement(tags_node, u"tag").text = entity.hashtag.text
+
+                    meta_element = etree.SubElement(element, u'meta')
+
+                    polemics_list = parse_polemics(tw, options.extended_mode)
+                    if polemics_list:
+                        polemics_element = etree.Element(u'polemics')
+                        for pol in polemics_list:
+                            etree.SubElement(polemics_element, u'polemic').text = pol
+                        meta_element.append(polemics_element)
+
+                    etree.SubElement(meta_element, u"source", attrib={"url": u"http://dev.twitter.com", "mimetype": u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
+
+                output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)
+
+                if content_file and content_file.find("http") == 0:
+
+                    project["ldt"] = output_data
+                    body = anyjson.serialize(project)
+                    get_logger().debug("write http " + content_file) #@UndefinedVariable
+                    get_logger().debug("write http " + repr(body)) #@UndefinedVariable
+                    h = httplib2.Http()
+                    resp, content = h.request(content_file, "PUT", headers={'content-type': 'application/json'}, body=body)
+                    get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
+                else:
+                    if content_file and os.path.exists(content_file):
+                        dest_file_name = content_file
+                    else:
+                        dest_file_name = options.filename
+
+                    get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
+                    output = open(dest_file_name, "w")
+                    output.write(output_data)
+                    output.flush()
+                    output.close()
+
+        finally:
+            if session:
+                session.close()
+    finally:
+        if conn:
+            conn.close()
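For reference, a typical invocation of the new script might look like the following; the database path, content file, duration, and hashtag are illustrative only, and the accepted date format depends on what utils.parse_date supports:

    python export_twitter_alchemy.py -d tweets.db -I project.ldt \
        -s "2012-02-14T18:00:00" -D 3600 -H mytag -R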