diff -r 92429e14ca48 -r 7d87ba8cc268 script/utils/export_twitter_alchemy.py
--- a/script/utils/export_twitter_alchemy.py	Fri Nov 29 18:14:45 2013 +0100
+++ b/script/utils/export_twitter_alchemy.py	Mon Dec 02 16:38:41 2013 +0100
@@ -5,7 +5,7 @@
 from iri_tweet.models import setup_database, Tweet, User
 from sqlalchemy import Table, Column, BigInteger, event, bindparam
 from sqlalchemy.sql import select, func
-from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
+from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
     get_logger)
 import argparse
 import anyjson
@@ -17,29 +17,31 @@
 import time
 import uuid #@UnresolvedImport
 from dateutil.parser import parse as parse_date
+import bisect
 
 #class TweetExclude(object):
 #    def __init__(self, id):
 #        self.id = id
-#        
+#
 #    def __repr__(self):
 #        return "<TweetExclude(id=%s)>" % (self.id)
 
 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
 
-def re_fn(expr, item):    
+
+def re_fn(expr, item):
     reg = re.compile(expr, re.I)
     res = reg.search(item)
     if res:
         get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
-    return res is not None    
+    return res is not None
 
 def parse_polemics(tw, extended_mode):
     """
     parse polemics in text and return a list of polemic code. None if not polemic found
     """
-    polemics = {}    
+    polemics = {}
     for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
         pol_link = {
             '++' : u'OK',
@@ -47,26 +49,26 @@
             '??' : u'Q',
             '==' : u'REF'}[m.group(1)]
         polemics[pol_link] = pol_link
-    
+
     if extended_mode:
         if "?" in tw.text:
             polemics["Q"] = "Q"
-    
+
         for entity in tw.entity_list:
             if entity.type == "entity_url":
-                polemics["REF"] = "REF"    
-    
+                polemics["REF"] = "REF"
+
     if len(polemics) > 0:
         return polemics.keys()
     else:
         return None
 
 def get_options():
-    
+
     usage = "usage: %(prog)s [options]"
-    
+
     parser = argparse.ArgumentParser(usage)
-    
+
     parser.add_argument("-f", "--file", dest="filename",
                         help="write export to file", metavar="FILE", default="project.ldt")
     parser.add_argument("-d", "--database", dest="database",
@@ -88,7 +90,7 @@
     parser.add_argument("-C", "--color", dest="color",
                         help="Color code", metavar="COLOR", default="16763904")
     parser.add_argument("-H", "--hashtag", dest="hashtag",
-                        help="Hashtag", metavar="HASHTAG", default=[], action="append")    
+                        help="Hashtag", metavar="HASHTAG", default=[], action="append")
     parser.add_argument("-D", "--duration", dest="duration", type=int,
                         help="Duration", metavar="DURATION", default=None)
     parser.add_argument("-n", "--name", dest="name",
@@ -103,50 +105,84 @@
                         help="Trigger polemic extended mode", default=False)
     parser.add_argument("-b", "--base-url", dest="base_url",
                         help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
-    parser.add_argument("-p", "--project", dest="project_id", 
+    parser.add_argument("-p", "--project", dest="project_id",
                         help="Project id", metavar="PROJECT_ID", default=None)
-    parser.add_argument("-P", "--post-param", dest="post_param", 
-                        help="Post param", metavar="POST_PARAM", default=None) 
+    parser.add_argument("-P", "--post-param", dest="post_param",
+                        help="Post param", metavar="POST_PARAM", default=None)
     parser.add_argument("--user-whitelist", dest="user_whitelist", action="store",
                         help="A list of user screen name", metavar="USER_WHITELIST",default=None)
-    
-    
+    parser.add_argument("--cut", dest="cuts", action="append",
+                        help="A cut with the format <start>::<duration>", metavar="CUT", default=[])
+
     set_logging_options(parser)
 
-    return (parser.parse_args(), parser)
+    return (parser.parse_args(), parser)
+
+
+def find_delta(deltas, ts):
+    i = bisect.bisect_right(deltas, (ts+1,0))
+    if i:
+        return deltas[i-1]
+    return (0,0)
+
+
+def parse_duration(s):
+    try:
+        return int(s)
+    except ValueError:
+        parts = s.split(":")
+        if len(parts) < 2:
+            raise ValueError("Bad duration format")
+        time_params = {
+            'hours': int(parts[0]),
+            'minutes': int(parts[1]),
+            'seconds': int(parts[2]) if len(parts)>2 else 0
+        }
+        return int(datetime.timedelta(**time_params).total_seconds()*1000)
+
+
 if __name__ == "__main__" :
 
     (options, parser) = get_options()
-    
+
     set_logging(options)
-    
+
     get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
-    
+
+
+    deltas = [(0,0)]
+    total_delta = 0
+    if options.cuts:
+        cuts_raw = sorted([tuple([parse_duration(s) for s in c.split("::")]) for c in options.cuts])
+        for c, d in cuts_raw:
+            deltas.append((c+total_delta, -1))
+            total_delta += d
+            deltas.append((c+total_delta, total_delta))
+
     if len(sys.argv) == 1 or options.database is None:
         parser.print_help()
         sys.exit(1)
-    
+
     conn_str = options.database.strip()
     if not re.match("^\w+://.+", conn_str):
         conn_str = 'sqlite:///' + conn_str
 
-    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)    
+    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+
+    conn = None
 
     try :
         conn = engine.connect()
         @event.listens_for(conn, "begin")
         def do_begin(conn):
-            conn.connection.create_function('regexp', 2, re_fn)    
+            conn.connection.create_function('regexp', 2, re_fn)
 
         session = None
         try :
-            session = Session(bind=conn)    
+            session = Session(bind=conn)
             tweet_exclude_table = Table("tweet_exclude", metadata,
                                         Column('id', BigInteger, primary_key=True),
                                         prefixes=['TEMPORARY'])
             #mapper(TweetExclude, tweet_exclude_table)
             metadata.create_all(bind=conn, tables=[tweet_exclude_table])
-    
+
             if options.exclude and os.path.exists(options.exclude):
                 with open(options.exclude, 'r+') as f:
                     tei = tweet_exclude_table.insert()
@@ -154,7 +190,7 @@
                     for line in f:
                         res = ex_regexp.match(line.strip())
                         if res:
-                            if res.group('field') == "id":    
+                            if res.group('field') == "id":
                                 conn.execute(tei.values(id=res.group('value')))
                             else:
                                 exclude_query = session.query(Tweet)
@@ -163,24 +199,24 @@
                                 if filter_field.startswith("user__"):
                                     exclude_query = exclude_query.outerjoin(User, Tweet.user_id==User.id)
                                     filter_obj = User
-                                    filter_field = filter_field[len("user__"):]    
+                                    filter_field = filter_field[len("user__"):]
                                 if res.group('op') == "=":
                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value'))
                                 else:
                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).op('regexp')(res.group('value')))
-                    
+
                                 test_query = select([func.count()]).where(tweet_exclude_table.c.id==bindparam('t_id'))
                                 for t in exclude_query.all():
                                     get_logger().debug("t : " + repr(t))
                                     if conn.execute(test_query, t_id=t.id).fetchone()[0] == 0:
                                         conn.execute(tei.values(id=t.id))
-        
+
         user_whitelist_file = options.user_whitelist
         user_whitelist = None
-        
+
         if options.listconf:
-            
+
             parameters = []
             confdoc = etree.parse(options.listconf)
             for node in confdoc.xpath("/twitter_export/file"):
@@ -208,7 +244,7 @@
             if options.project_id:
                 content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
             else:
-                content_file = options.content_file    
+                content_file = options.content_file
             parameters = [{
                 'start_date': options.start_date,
                 'end_date' : options.end_date,
@@ -216,72 +252,72 @@
                 'content_file' : content_file,
                 'content_file_write' : content_file,
                 'hashtags' : options.hashtag,
-                'project_id' : options.project_id    
+                'project_id' : options.project_id
             }]
 
         post_param = {}
         if options.post_param:
             post_param = anyjson.loads(options.post_param)
 
         for params in parameters:
-            
+
             get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
-            
+
             start_date_str = params.get("start_date",None)
             end_date_str = params.get("end_date", None)
             duration = params.get("duration", None)
             content_file = params.get("content_file", None)
             content_file_write = params.get("content_file_write", None)
             hashtags = params.get('hashtags', [])
-            
+
             if user_whitelist_file:
                 with open(user_whitelist_file, 'r+') as f:
                     user_whitelist = list(set([s.strip() for s in f]))
-            
+
             start_date = None
             ts = None
             if start_date_str:
-                start_date = parse_date(start_date_str)    
+                start_date = parse_date(start_date_str)
                 ts = time.mktime(start_date.timetuple())
-    
-    
+
+
             root = None
             ensemble_parent = None
-    
+
             #to do : analyse situation ldt or iri ? filename set or not ?
-    
+
             if content_file and content_file.find("http") == 0:
-    
+
                 get_logger().debug("url : " + content_file) #@UndefinedVariable
-    
-                r = requests.get(content_file, params=post_param)    
-                get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable    
+
+                r = requests.get(content_file, params=post_param)
+                get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
                 project = r.json()
                 text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
                 root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
-    
+
             elif content_file and os.path.exists(content_file):
                 doc = etree.parse(content_file)
                 root = doc.getroot()
-            
-            content_id = None    
-            
+
+            content_id = None
+
             if root is None:
-                
+
                 root = etree.Element(u"iri")
-    
+
                 project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
-    
+
                 medias = etree.SubElement(root, u"medias")
                 media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
-    
-                annotations = etree.SubElement(root, u"annotations")    
+
+                annotations = etree.SubElement(root, u"annotations")
                 content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
                 ensemble_parent = content
-    
+
                 content_id = options.content_id
-    
-    
+
+
             if ensemble_parent is None:
                 file_type = None
                 for node in root:
@@ -291,7 +327,7 @@
                 elif node.tag == "head":
                     file_type = "iri"
                     break
-    
+
             if file_type == "ldt":
                 media_nodes = root.xpath("//media")
                 if len(media_nodes) > 0:
@@ -309,8 +345,8 @@
                     get_logger().info("No display node found. Will not update display")
Will not update display") display_content_node = None else: - display_content_node = display_nodes[0] - + display_content_node = display_nodes[0] + elif file_type == "iri": body_node = root.find(u"body") if body_node is None: @@ -321,45 +357,45 @@ ensemble_parent = ensembles_node content_id = root.xpath("head/meta[@name='id']/@content")[0] display_content_node = None - - + + if ensemble_parent is None: get_logger().error("Can not process file") #@UndefinedVariable sys.exit() - + if options.replace: for ens in ensemble_parent.iterchildren(tag=u"ensemble"): - ens_id = ens.get("id","") - if ens_id.startswith("tweet_"): + ens_id = ens.get("id","") + if ens_id.startswith("tweet_"): ensemble_parent.remove(ens) # remove in display nodes if display_content_node is not None: for cut_display in display_content_node.iterchildren(): if cut_display.get('idens','') == ens_id: display_content_node.remove(cut_display) - + ensemble = None elements = None - + if options.merge: for ens in ensemble_parent.findall(u"ensemble"): if ens.get('id',"").startswith("tweet_"): ensemble = ens break - if ensemble is not None: + if ensemble is not None: elements = ensemble.find(u".//elements") decoupage = ensemble.find(u"decoupage") - + if ensemble is None or elements is None: ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"}) decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"}) - + etree.SubElement(decoupage, u"title").text = unicode(options.name) etree.SubElement(decoupage, u"abstract").text = unicode(options.name) - + elements = etree.SubElement(decoupage, u"elements") - ensemble_id = ensemble.get('id', '') + ensemble_id = ensemble.get('id', '') decoupage_id = decoupage.get('id', '') if decoupage is not None else None end_date = None @@ -367,7 +403,7 @@ end_date = parse_date(end_date_str) elif start_date and duration: end_date = start_date + datetime.timedelta(seconds=duration) - elif start_date and options.base_url: + elif start_date and options.base_url: # get duration from api content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json" r = requests.get(content_url) @@ -376,18 +412,27 @@ get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable end_date = start_date + datetime.timedelta(seconds=int(duration/1000)) - + + if end_date and deltas: + end_date = end_date + datetime.timedelta(milliseconds=deltas[-1][1]) query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist) - + query_res = query.all() - + for tw in query_res: tweet_ts_dt = tw.created_at tweet_ts = int(time.mktime(tweet_ts_dt.timetuple())) if ts is None: ts = tweet_ts tweet_ts_rel = (tweet_ts-ts) * 1000 + if deltas: + d = find_delta(tweet_ts_rel, deltas) + if d[1] < 0: + continue + else : + tweet_ts_rel -= d[1] + username = None profile_url = "" if tw.user is not None: @@ -395,19 +440,19 @@ profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else "" if not username: username = "anon." 
-                
+
                 element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
                 etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
                 etree.SubElement(element, u"abstract").text = unicode(tw.text)
-    
+
                 tags_node = etree.SubElement(element, u"tags")
-    
+
                 for entity in tw.entity_list:
-                    if entity.type == u'entity_hashtag':    
+                    if entity.type == u'entity_hashtag':
                         etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
-    
+
                 meta_element = etree.SubElement(element, u'meta')
-    
+
                 polemics_list = parse_polemics(tw, options.extended_mode)
                 if polemics_list:
                     polemics_element = etree.Element(u'polemics')
@@ -416,15 +461,15 @@
                     meta_element.append(polemics_element)
 
                 etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
-    
+
             # sort by tc in <elements>
             if options.merge :
                 # remove all elements and put them in a array
                 # sort them with tc
                 #put them back
                 elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
-            
-            #add to display node    
+
+            #add to display node
             if display_content_node is not None:
                 display_dec = None
                 for dec in display_content_node.iterchildren(tag=u"decoupage"):
@@ -433,15 +478,15 @@
                         break
                 if display_dec is None and ensemble_id and decoupage_id:
                     etree.SubElement(display_content_node, u"decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})
-        
-            output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)    
-    
+
+            output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)
+
             if content_file_write and content_file_write.find("http") == 0:
-    
+
                 project["ldt"] = output_data
                 project['owner'] = project['owner'].replace('%7E','~')
                 project['contents'] = [c_url.replace('%7E','~') for c_url in project['contents']]
-    
+
                 post_param = {}
                 if options.post_param:
                     post_param = anyjson.loads(options.post_param)
@@ -458,13 +503,13 @@
                 dest_file_name = content_file_write
             else:
                 dest_file_name = options.filename
-    
+
             get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
             output = open(dest_file_name, "w")
             output.write(output_data)
             output.flush()
             output.close()
-    
+
     finally:
         if session:
             session.close()
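
Note on the new --cut option, with a standalone sketch of how the pieces compose (the two functions are copied from the patch; the cut values and timestamps below are hypothetical). parse_duration accepts either raw milliseconds or an HH:MM[:SS] clock value. Each cut appends two entries to deltas: (start, -1) marks tweets that fall inside the removed segment, and (start + accumulated duration, accumulated duration) shifts every later tweet back by the material removed so far; the accumulation of total_delta suggests cut starts are expressed in the already-cut timeline. find_delta returns the rightmost entry at or before a tweet's relative timestamp; probing with (ts+1, 0) makes it sort after both entries recorded at the same timestamp.

    import bisect
    import datetime

    def parse_duration(s):
        # "30000" -> 30000 ms; "01:30" or "01:30:15" -> HH:MM[:SS] as ms
        try:
            return int(s)
        except ValueError:
            parts = s.split(":")
            if len(parts) < 2:
                raise ValueError("Bad duration format")
            time_params = {
                'hours': int(parts[0]),
                'minutes': int(parts[1]),
                'seconds': int(parts[2]) if len(parts) > 2 else 0
            }
            return int(datetime.timedelta(**time_params).total_seconds() * 1000)

    def find_delta(deltas, ts):
        # rightmost (timestamp, delta) entry with timestamp <= ts
        i = bisect.bisect_right(deltas, (ts + 1, 0))
        if i:
            return deltas[i - 1]
        return (0, 0)

    cuts = ["60000::30000", "120000::60000"]  # hypothetical --cut arguments

    deltas = [(0, 0)]
    total_delta = 0
    for start, dur in sorted(tuple(parse_duration(s) for s in spec.split("::")) for spec in cuts):
        deltas.append((start + total_delta, -1))           # inside the cut: drop the tweet
        total_delta += dur
        deltas.append((start + total_delta, total_delta))  # after the cut: shift left

    for ts in (30000, 75000, 100000):
        d = find_delta(deltas, ts)
        print("%d -> %s" % (ts, "dropped" if d[1] < 0 else "kept at %d" % (ts - d[1])))
    # prints: 30000 -> kept at 30000, 75000 -> dropped, 100000 -> kept at 70000

With those values the first cut removes 60s-90s of source material, so a tweet at 75s disappears and a tweet at 100s lands at 70s in the edited timeline, matching how the export shifts tweet_ts_rel before writing the element nodes.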