#!/usr/bin/env python
# coding=utf-8
"""Export tweets from an iri_tweet database into an LDT/IRI "polemic" XML project.

Tweets are read through the project's SQLAlchemy models, filtered by date
range, hashtags, a user whitelist and an exclusion-id list, then appended as
<element> nodes under an <ensemble>/<decoupage> structure of either an
existing project (local file or remote LDT platform URL) or a freshly
created one.  The result is written back to a file or PUT to the origin URL.

NOTE: this is Python 2 code (uses ``unicode``/``long`` and py2 ``optparse``).
"""

from lxml import etree
from models import setup_database
from optparse import OptionParser #@UnresolvedImport
from sqlalchemy import Table, Column, BigInteger
from utils import (parse_date, set_logging_options, set_logging, get_filter_query,
    get_logger)
import anyjson
import datetime
import httplib2
import os.path
import re
import sys
import time
import uuid #@UnresolvedImport


def parse_polemics(tw, extended_mode):
    """Return the list of polemic codes found in a tweet, or None.

    Scans ``tw.text`` for the polemic markers ``++`` (OK), ``--`` (KO),
    ``??`` (Q) and ``==`` (REF).  In extended mode a single ``?`` anywhere in
    the text also counts as Q, and any attached url entity counts as REF.
    Each code is reported at most once.
    """
    polemics = {}
    for m in re.finditer(r"(\+\+|\-\-|\?\?|\=\=)", tw.text):
        pol_link = {
            '++' : u'OK',
            '--' : u'KO',
            '??' : u'Q',
            '==' : u'REF'}[m.group(1)]
        polemics[pol_link] = pol_link

    if extended_mode:
        if "?" in tw.text:
            polemics["Q"] = "Q"

        for entity in tw.entity_list:
            if entity.type == "entity_url":
                polemics["REF"] = "REF"

    if len(polemics) > 0:
        return polemics.keys()
    else:
        return None


def get_options():
    """Build the command-line parser and parse sys.argv.

    Returns ``(options, args, parser)`` — the parser is returned too so the
    caller can print usage on bad input.
    """
    parser = OptionParser()
    parser.add_option("-f", "--file", dest="filename",
                      help="write export to file", metavar="FILE", default="project.ldt")
    parser.add_option("-d", "--database", dest="database",
                      help="Input database", metavar="DATABASE")
    parser.add_option("-s", "--start-date", dest="start_date",
                      help="start date", metavar="START_DATE", default=None)
    parser.add_option("-e", "--end-date", dest="end_date",
                      help="end date", metavar="END_DATE", default=None)
    parser.add_option("-I", "--content-file", dest="content_file",
                      help="Content file", metavar="CONTENT_FILE")
    parser.add_option("-c", "--content", dest="content",
                      help="Content url", metavar="CONTENT")
    parser.add_option("-V", "--video-url", dest="video",
                      help="video url", metavar="VIDEO")
    parser.add_option("-i", "--content-id", dest="content_id",
                      help="Content id", metavar="CONTENT_ID")
    parser.add_option("-x", "--exclude", dest="exclude",
                      help="file containing the id to exclude", metavar="EXCLUDE")
    parser.add_option("-C", "--color", dest="color",
                      help="Color code", metavar="COLOR", default="16763904")
    parser.add_option("-H", "--hashtag", dest="hashtag",
                      help="Hashtag", metavar="HASHTAG", default=[], action="append")
    parser.add_option("-D", "--duration", dest="duration", type="int",
                      help="Duration", metavar="DURATION", default=None)
    parser.add_option("-n", "--name", dest="name",
                      help="Cutting name", metavar="NAME", default=u"Tweets")
    parser.add_option("-R", "--replace", dest="replace", action="store_true",
                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
    parser.add_option("-m", "--merge", dest="merge", action="store_true",
                      help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
    parser.add_option("-L", "--list-conf", dest="listconf",
                      help="list of file to process", metavar="LIST_CONF", default=None)
    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
                      help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
    parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
                      help="A list of user screen name", metavar="USER_WHITELIST", default=None)

    set_logging_options(parser)

    return parser.parse_args() + (parser,)


if __name__ == "__main__" :

    (options, args, parser) = get_options()

    set_logging(options)

    get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable

    if len(sys.argv) == 1 or options.database is None:
        parser.print_help()
        sys.exit(1)

    # Bare paths are treated as sqlite database files.
    conn_str = options.database.strip()
    if not re.match(r"^\w+://.+", conn_str):
        conn_str = 'sqlite:///' + conn_str

    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose - options.quiet) > 0), create_all=False)
    conn = None
    try:
        conn = engine.connect()
        session = None
        try:
            session = Session(bind=conn)
            # Temporary table holding tweet ids to exclude; joined against by
            # get_filter_query, dropped automatically with the connection.
            tweet_exclude_table = Table("tweet_exclude", metadata,
                                        Column('id', BigInteger, primary_key=True),
                                        prefixes=['TEMPORARY'])
            metadata.create_all(bind=conn, tables=[tweet_exclude_table])

            if options.exclude and os.path.exists(options.exclude):
                # FIX: open read-only ('r'), not 'r+' — the file is only read.
                with open(options.exclude, 'r') as f:
                    tei = tweet_exclude_table.insert()
                    for line in f:
                        conn.execute(tei.values(id=long(line.strip())))

            user_whitelist_file = options.user_whitelist
            user_whitelist = None

            if options.listconf:
                # Batch mode: one parameter dict per <file> node of the
                # configuration document; command-line hashtags override.
                parameters = []
                confdoc = etree.parse(options.listconf)
                for node in confdoc.xpath("/twitter_export/file"):
                    params = {}
                    for snode in node:
                        if snode.tag == "path":
                            params['content_file'] = snode.text
                        elif snode.tag == "start_date":
                            params['start_date'] = snode.text
                        elif snode.tag == "end_date":
                            params['end_date'] = snode.text
                        elif snode.tag == "duration":
                            params['duration'] = int(snode.text)
                        elif snode.tag == "hashtags":
                            params['hashtags'] = [snode.text]
                    if options.hashtag or 'hashtags' not in params:
                        params['hashtags'] = options.hashtag
                    parameters.append(params)
            else:
                parameters = [{
                    'start_date': options.start_date,
                    'end_date': options.end_date,
                    'duration': options.duration,
                    'content_file': options.content_file,
                    'hashtags': options.hashtag
                }]

            for params in parameters:

                get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable

                start_date_str = params.get("start_date", None)
                end_date_str = params.get("end_date", None)
                duration = params.get("duration", None)
                content_file = params.get("content_file", None)
                hashtags = params.get('hashtags', [])

                if user_whitelist_file:
                    # FIX: read-only open; deduplicate screen names.
                    with open(user_whitelist_file, 'r') as f:
                        user_whitelist = list(set([s.strip() for s in f]))

                start_date = None
                ts = None  # reference timestamp (s); tweet 'begin' is relative to it
                if start_date_str:
                    start_date = parse_date(start_date_str)
                    ts = time.mktime(start_date.timetuple())

                end_date = None
                if end_date_str:
                    end_date = parse_date(end_date_str)
                elif start_date and duration:
                    end_date = start_date + datetime.timedelta(seconds=duration)

                query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)

                query_res = query.all()

                root = None
                ensemble_parent = None

                # Load the target project: remote LDT platform (json payload
                # with an "ldt" xml string), local file, or build a new one.
                if content_file and content_file.find("http") == 0:

                    get_logger().debug("url : " + content_file) #@UndefinedVariable

                    h = httplib2.Http()
                    resp, content = h.request(content_file)

                    get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable

                    project = anyjson.deserialize(content)
                    root = etree.fromstring(project["ldt"])

                elif content_file and os.path.exists(content_file):

                    doc = etree.parse(content_file)
                    root = doc.getroot()

                if root is None:
                    # No existing project: create a minimal "iri" document.
                    root = etree.Element(u"iri")

                    project = etree.SubElement(root, u"project", {u"abstract": u"Polemics Tweets", u"title": u"Polemic Tweets", u"user": u"IRI Web", u"id": unicode(uuid.uuid4())})

                    medias = etree.SubElement(root, u"medias")
                    media = etree.SubElement(medias, u"media", {u"pict": u"", u"src": unicode(options.content), u"video": unicode(options.video), u"id": unicode(options.content_id), u"extra": u""})

                    annotations = etree.SubElement(root, u"annotations")
                    content = etree.SubElement(annotations, u"content", {u"id": unicode(options.content_id)})
                    ensemble_parent = content

                if ensemble_parent is None:
                    # Existing document: detect its flavour from the first
                    # recognisable child ("project" => ldt, "head" => iri).
                    file_type = None
                    for node in root:
                        if node.tag == "project":
                            file_type = "ldt"
                            break
                        elif node.tag == "head":
                            file_type = "iri"
                            break

                    if file_type == "ldt":
                        media_nodes = root.xpath("//media")
                        if len(media_nodes) > 0:
                            media = media_nodes[0]
                        annotations_node = root.find(u"annotations")
                        if annotations_node is None:
                            annotations_node = etree.SubElement(root, u"annotations")
                        content_node = annotations_node.find(u"content")
                        if content_node is None:
                            content_node = etree.SubElement(annotations_node, u"content", id=media.get(u"id"))
                        ensemble_parent = content_node
                    elif file_type == "iri":
                        body_node = root.find(u"body")
                        if body_node is None:
                            body_node = etree.SubElement(root, u"body")
                        ensembles_node = body_node.find(u"ensembles")
                        if ensembles_node is None:
                            ensembles_node = etree.SubElement(body_node, u"ensembles")
                        ensemble_parent = ensembles_node

                if ensemble_parent is None:
                    get_logger().error("Can not process file") #@UndefinedVariable
                    # FIX: exit with a failure status (sys.exit() reported 0).
                    sys.exit(1)

                if options.replace:
                    # Drop every previously generated tweet ensemble.
                    for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
                        if ens.get("id", "").startswith("tweet_"):
                            ensemble_parent.remove(ens)

                ensemble = None
                elements = None

                if options.merge:
                    ensemble = ensemble_parent.find(u"ensemble")
                    if ensemble is not None:
                        elements = ensemble.find(u".//elements")

                if ensemble is None or elements is None:
                    ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id": u"tweet_" + unicode(uuid.uuid4()), u"title": u"Ensemble Twitter", u"author": u"IRI Web", u"abstract": u"Ensemble Twitter"})
                    decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})

                    etree.SubElement(decoupage, u"title").text = unicode(options.name)
                    etree.SubElement(decoupage, u"abstract").text = unicode(options.name)

                    elements = etree.SubElement(decoupage, u"elements")

                for tw in query_res:
                    tweet_ts_dt = tw.created_at
                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
                    if ts is None:
                        # No start date given: first tweet defines time zero.
                        ts = tweet_ts
                    tweet_ts_rel = (tweet_ts - ts) * 1000  # milliseconds
                    username = None
                    profile_url = ""
                    if tw.user is not None:
                        username = tw.user.name
                        profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
                    if not username:
                        username = "anon."

                    element = etree.SubElement(elements, u"element", {u"id": unicode(uuid.uuid4()) + u"-" + unicode(tw.id), u"color": unicode(options.color), u"author": unicode(username), u"date": unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur": u"0", u"src": unicode(profile_url)})
                    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
                    etree.SubElement(element, u"abstract").text = unicode(tw.text)

                    tags_node = etree.SubElement(element, u"tags")

                    for entity in tw.entity_list:
                        if entity.type == u'entity_hashtag':
                            etree.SubElement(tags_node, u"tag").text = entity.hashtag.text

                    meta_element = etree.SubElement(element, u'meta')

                    polemics_list = parse_polemics(tw, options.extended_mode)
                    if polemics_list:
                        polemics_element = etree.Element(u'polemics')
                        for pol in polemics_list:
                            etree.SubElement(polemics_element, u'polemic').text = pol
                        meta_element.append(polemics_element)

                    etree.SubElement(meta_element, u"source", attrib={"url": u"http://dev.twitter.com", "mimetype": u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))

                if options.merge:
                    # Merged ensembles interleave old and new elements:
                    # re-sort them in place by their begin timecode.
                    elements[:] = sorted(elements, key=lambda n: int(n.get('begin')))

                output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)

                if content_file and content_file.find("http") == 0:
                    # Push the updated project back to the LDT platform.
                    project["ldt"] = output_data
                    body = anyjson.serialize(project)
                    get_logger().debug("write http " + content_file) #@UndefinedVariable
                    get_logger().debug("write http " + repr(body)) #@UndefinedVariable
                    h = httplib2.Http()
                    resp, content = h.request(content_file, "PUT", headers={'content-type': 'application/json'}, body=body)
                    get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
                else:
                    # Overwrite the source file when there is one, otherwise
                    # fall back to the -f/--file destination.
                    if content_file and os.path.exists(content_file):
                        dest_file_name = content_file
                    else:
                        dest_file_name = options.filename

                    get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
                    # FIX: context manager guarantees the file is closed even
                    # if the write raises (was manual open/flush/close).
                    with open(dest_file_name, "w") as output:
                        output.write(output_data)

        finally:
            if session:
                session.close()
    finally:
        if conn:
            conn.close()