--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/export_twitter_alchemy.py Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,362 @@
+#!/usr/bin/env python
+# coding=utf-8
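+"""
+Export tweets from an iri_tweet database to an iri/ldt project file,
+turning polemic markers (++, --, ??, ==) into polemic annotations.
+"""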
+
+from lxml import etree
+from iri_tweet.models import setup_database
+from optparse import OptionParser #@UnresolvedImport
+from sqlalchemy import Table, Column, BigInteger
+from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
+ get_logger)
+import anyjson
+import datetime
+import httplib2
+import os.path
+import re
+import sys
+import time
+import uuid #@UnresolvedImport
+from dateutil.parser import parse as parse_date
+
+#class TweetExclude(object):
+# def __init__(self, id):
+# self.id = id
+#
+# def __repr__(self):
+# return "<TweetExclude(id=%d)>" % (self.id)
+
+
+def parse_polemics(tw, extended_mode):
+ """
+ parse polemics in text and return a list of polemic code. None if not polemic found
+ """
+ polemics = {}
+    for m in re.finditer(r"(\+\+|--|\?\?|==)", tw.text):
+ pol_link = {
+ '++' : u'OK',
+ '--' : u'KO',
+ '??' : u'Q',
+ '==' : u'REF'}[m.group(1)]
+ polemics[pol_link] = pol_link
+
+ if extended_mode:
+ if "?" in tw.text:
+ polemics["Q"] = "Q"
+
+ for entity in tw.entity_list:
+ if entity.type == "entity_url":
+ polemics["REF"] = "REF"
+
+ if len(polemics) > 0:
+ return polemics.keys()
+ else:
+ return None
+
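+# example invocation (hypothetical paths and dates, for illustration only):
+#   export_twitter_alchemy.py -d tweets.db -i media-1 -s "2012-01-07T16:00:00" \
+#       -D 3600 -f project.ldt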
+def get_options():
+ parser = OptionParser()
+ parser.add_option("-f", "--file", dest="filename",
+ help="write export to file", metavar="FILE", default="project.ldt")
+ parser.add_option("-d", "--database", dest="database",
+ help="Input database", metavar="DATABASE")
+ parser.add_option("-s", "--start-date", dest="start_date",
+ help="start date", metavar="START_DATE", default=None)
+ parser.add_option("-e", "--end-date", dest="end_date",
+ help="end date", metavar="END_DATE", default=None)
+ parser.add_option("-I", "--content-file", dest="content_file",
+ help="Content file", metavar="CONTENT_FILE")
+ parser.add_option("-c", "--content", dest="content",
+ help="Content url", metavar="CONTENT")
+ parser.add_option("-V", "--video-url", dest="video",
+ help="video url", metavar="VIDEO")
+ parser.add_option("-i", "--content-id", dest="content_id",
+ help="Content id", metavar="CONTENT_ID")
+ parser.add_option("-x", "--exclude", dest="exclude",
+ help="file containing the id to exclude", metavar="EXCLUDE")
+ parser.add_option("-C", "--color", dest="color",
+ help="Color code", metavar="COLOR", default="16763904")
+ parser.add_option("-H", "--hashtag", dest="hashtag",
+ help="Hashtag", metavar="HASHTAG", default=[], action="append")
+ parser.add_option("-D", "--duration", dest="duration", type="int",
+ help="Duration", metavar="DURATION", default=None)
+ parser.add_option("-n", "--name", dest="name",
+ help="Cutting name", metavar="NAME", default=u"Tweets")
+ parser.add_option("-R", "--replace", dest="replace", action="store_true",
+ help="Replace tweet ensemble", metavar="REPLACE", default=False)
+ parser.add_option("-m", "--merge", dest="merge", action="store_true",
+ help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
+ parser.add_option("-L", "--list-conf", dest="listconf",
+ help="list of file to process", metavar="LIST_CONF", default=None)
+ parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
+ help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
+ parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
+ help="A list of user screen name", metavar="USER_WHITELIST",default=None)
+
+
+ set_logging_options(parser)
+
+
+ return parser.parse_args() + (parser,)
+
+
+if __name__ == "__main__" :
+
+ (options, args, parser) = get_options()
+
+ set_logging(options)
+
+ get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
+
+ if len(sys.argv) == 1 or options.database is None:
+ parser.print_help()
+ sys.exit(1)
+
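+    # a bare path is treated as a sqlite database file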
+ conn_str = options.database.strip()
+    if not re.match(r"^\w+://.+", conn_str):
+ conn_str = 'sqlite:///' + conn_str
+
+ engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+ conn = None
+ try :
+ conn = engine.connect()
+ session = None
+ try :
+ session = Session(bind=conn)
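+            # temporary table holding the tweet ids to exclude from the export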
+ tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
+ #mapper(TweetExclude, tweet_exclude_table)
+ metadata.create_all(bind=conn, tables=[tweet_exclude_table])
+
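+            # populate the exclude table, one tweet id per line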
+            if options.exclude and os.path.exists(options.exclude):
+                with open(options.exclude, 'r') as f:
+                    tei = tweet_exclude_table.insert()
+                    for line in f:
+                        conn.execute(tei.values(id=long(line.strip())))
+ user_whitelist_file = options.user_whitelist
+ user_whitelist = None
+
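+            # a list-conf xml file yields one parameter set per <file> node;
+            # otherwise a single parameter set is built from the command-line options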
+ if options.listconf:
+
+ parameters = []
+ confdoc = etree.parse(options.listconf)
+ for node in confdoc.xpath("/twitter_export/file"):
+ params = {}
+ for snode in node:
+ if snode.tag == "path":
+ params['content_file'] = snode.text
+ elif snode.tag == "start_date":
+ params['start_date'] = snode.text
+ elif snode.tag == "end_date":
+ params['end_date'] = snode.text
+ elif snode.tag == "duration":
+ params['duration'] = int(snode.text)
+ elif snode.tag == "hashtags":
+ params['hashtags'] = [snode.text]
+ if options.hashtag or 'hashtags' not in params :
+ params['hashtags'] = options.hashtag
+ parameters.append(params)
+ else:
+ parameters = [{
+ 'start_date': options.start_date,
+ 'end_date' : options.end_date,
+ 'duration' : options.duration,
+ 'content_file' : options.content_file,
+ 'hashtags' : options.hashtag
+ }]
+
+ for params in parameters:
+
+ get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
+
+ start_date_str = params.get("start_date",None)
+ end_date_str = params.get("end_date", None)
+ duration = params.get("duration", None)
+ content_file = params.get("content_file", None)
+ hashtags = params.get('hashtags', [])
+
+                if user_whitelist_file:
+                    with open(user_whitelist_file, 'r') as f:
+                        user_whitelist = list(set([s.strip() for s in f]))
+
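+                # resolve the time window: the end date is either given
+                # explicitly or derived from the start date plus a duration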
+ start_date = None
+ ts = None
+ if start_date_str:
+ start_date = parse_date(start_date_str)
+ ts = time.mktime(start_date.timetuple())
+
+ end_date = None
+ if end_date_str:
+ end_date = parse_date(end_date_str)
+ elif start_date and duration:
+ end_date = start_date + datetime.timedelta(seconds=duration)
+
+ query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
+
+ query_res = query.all()
+
+ root = None
+ ensemble_parent = None
+
+                # TODO: analyse the situation: ldt or iri file? filename set or not?
+
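+                # load the target project: a remote ldt project over http, or a local file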
+                if content_file and content_file.startswith("http"):
+
+ get_logger().debug("url : " + content_file) #@UndefinedVariable
+
+ h = httplib2.Http()
+ resp, content = h.request(content_file)
+
+ get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
+
+ project = anyjson.deserialize(content)
+ root = etree.fromstring(project["ldt"])
+
+ elif content_file and os.path.exists(content_file):
+
+ doc = etree.parse(content_file)
+ root = doc.getroot()
+
+
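+                # no existing document: build a minimal iri document skeleton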
+ if root is None:
+
+ root = etree.Element(u"iri")
+
+ project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
+
+ medias = etree.SubElement(root, u"medias")
+ media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
+
+ annotations = etree.SubElement(root, u"annotations")
+ content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
+ ensemble_parent = content
+
+
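+                # existing document: detect its flavour (ldt project or iri file)
+                # and locate the node that will receive the tweet ensemble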
+ if ensemble_parent is None:
+ file_type = None
+ for node in root:
+ if node.tag == "project":
+ file_type = "ldt"
+ break
+ elif node.tag == "head":
+ file_type = "iri"
+ break
+
+ if file_type == "ldt":
+ media_nodes = root.xpath("//media")
+ if len(media_nodes) > 0:
+ media = media_nodes[0]
+ annotations_node = root.find(u"annotations")
+ if annotations_node is None:
+ annotations_node = etree.SubElement(root, u"annotations")
+ content_node = annotations_node.find(u"content")
+ if content_node is None:
+ content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
+ ensemble_parent = content_node
+ elif file_type == "iri":
+ body_node = root.find(u"body")
+ if body_node is None:
+ body_node = etree.SubElement(root, u"body")
+ ensembles_node = body_node.find(u"ensembles")
+ if ensembles_node is None:
+ ensembles_node = etree.SubElement(body_node, u"ensembles")
+ ensemble_parent = ensembles_node
+
+
+                if ensemble_parent is None:
+                    get_logger().error("Cannot process file: no parent node found for the tweet ensemble") #@UndefinedVariable
+                    sys.exit(1)
+
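+                # --replace drops the ensembles produced by previous exports (id "tweet_*")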
+ if options.replace:
+ for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
+ if ens.get("id","").startswith("tweet_"):
+ ensemble_parent.remove(ens)
+
+ ensemble = None
+ elements = None
+
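+                # --merge reuses the first existing ensemble instead of creating a new one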
+ if options.merge:
+ ensemble = ensemble_parent.find(u"ensemble")
+ if ensemble is not None:
+ elements = ensemble.find(u".//elements")
+
+ if ensemble is None or elements is None:
+ ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
+ decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
+
+ etree.SubElement(decoupage, u"title").text = unicode(options.name)
+ etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
+
+ elements = etree.SubElement(decoupage, u"elements")
+
+
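+                # one <element> per tweet, positioned in milliseconds relative to the
+                # start timestamp (or to the first tweet when no start date is given)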
+ for tw in query_res:
+ tweet_ts_dt = tw.created_at
+ tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
+ if ts is None:
+ ts = tweet_ts
+ tweet_ts_rel = (tweet_ts-ts) * 1000
+ username = None
+ profile_url = ""
+ if tw.user is not None:
+ username = tw.user.name
+ profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
+ if not username:
+ username = "anon."
+
+ element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
+ etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
+ etree.SubElement(element, u"abstract").text = unicode(tw.text)
+
+ tags_node = etree.SubElement(element, u"tags")
+
+ for entity in tw.entity_list:
+ if entity.type == u'entity_hashtag':
+ etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
+
+ meta_element = etree.SubElement(element, u'meta')
+
+ polemics_list = parse_polemics(tw, options.extended_mode)
+ if polemics_list:
+ polemics_element = etree.Element(u'polemics')
+ for pol in polemics_list:
+ etree.SubElement(polemics_element, u'polemic').text = pol
+ meta_element.append(polemics_element)
+
+ etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
+
+                # sort the elements by time code
+                if options.merge :
+                    # merging can interleave new and pre-existing elements,
+                    # so re-order them in place by their begin attribute
+                    elements[:] = sorted(elements, key=lambda n: int(n.get('begin')))
+
+ output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)
+
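+                # write the result back: PUT the json project for http sources,
+                # otherwise write the xml to the source file or to --file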
+                if content_file and content_file.startswith("http"):
+
+ project["ldt"] = output_data
+ body = anyjson.serialize(project)
+ get_logger().debug("write http " + content_file) #@UndefinedVariable
+ get_logger().debug("write http " + repr(body)) #@UndefinedVariable
+ h = httplib2.Http()
+ resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
+ get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
+ else:
+ if content_file and os.path.exists(content_file):
+ dest_file_name = content_file
+ else:
+ dest_file_name = options.filename
+
+ get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
+                    with open(dest_file_name, "w") as output:
+                        output.write(output_data)
+
+ finally:
+ if session:
+ session.close()
+ finally:
+ if conn:
+ conn.close()