# HG changeset patch
# User Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
# Date 1358309063 -3600
# Node ID bc29a6fbb8e8c62b809c892edf53a72173f5784f
# Parent  38ff25c1db25e0d4de4b0c96a9f2c96ff4de53ed
Various corrections for export tweet alchemy. A project can now be given.

diff -r 38ff25c1db25 -r bc29a6fbb8e8 script/lib/iri_tweet/iri_tweet/utils.py
--- a/script/lib/iri_tweet/iri_tweet/utils.py	Fri Jan 11 11:59:03 2013 +0100
+++ b/script/lib/iri_tweet/iri_tweet/utils.py	Wed Jan 16 05:04:23 2013 +0100
@@ -624,3 +624,4 @@
         writer.flush()

     return writer
+
diff -r 38ff25c1db25 -r bc29a6fbb8e8 script/utils/export_twitter_alchemy.py
--- a/script/utils/export_twitter_alchemy.py	Fri Jan 11 11:59:03 2013 +0100
+++ b/script/utils/export_twitter_alchemy.py	Wed Jan 16 05:04:23 2013 +0100
@@ -2,14 +2,14 @@
 # coding=utf-8

 from lxml import etree
-from iri_tweet.models import setup_database
+from iri_tweet.models import setup_database, Tweet, User
 from optparse import OptionParser #@UnresolvedImport
 from sqlalchemy import Table, Column, BigInteger
 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
     get_logger)
 import anyjson
 import datetime
-import httplib2
+import requests
 import os.path
 import re
 import sys
@@ -24,6 +24,9 @@
 #    def __repr__(self):
 #        return "" % (self.id)

+LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
+LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
+

 def parse_polemics(tw, extended_mode):
     """
@@ -87,6 +90,12 @@
                       help="list of file to process", metavar="LIST_CONF", default=None)
     parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
                       help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
+    parser.add_option("-b", "--base-url", dest="base_url",
+                      help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
+    parser.add_option("-p", "--project", dest="project_id",
+                      help="Project id", metavar="PROJECT_ID", default=None)
+    parser.add_option("-P", "--post-param", dest="post_param",
+                      help="Post param", metavar="POST_PARAM", default=None)
     parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)

@@ -127,8 +136,30 @@
     if options.exclude and os.path.exists(options.exclude):
         with open(options.exclude, 'r+') as f:
             tei = tweet_exclude_table.insert()
+            ex_regexp = re.compile("(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)
             for line in f:
-                conn.execute(tei.values(id=long(line.strip())))
+                res = ex_regexp.match(line.strip())
+                if res:
+                    if res.group('field') == "id":
+                        conn.execute(tei.values(id=res.group('value')))
+                    else:
+                        exclude_query = session.query(Tweet)
+                        filter_obj = Tweet
+                        filter_field = res.group('field')
+                        if filter_field.startswith("user_"):
+                            exclude_query = exclude_query.join(User)
+                            filter_obj = User
+                            filter_field = filter_field[len("user_"):]
+
+
+                        if res.group('op') == "=":
+                            exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field) == res.group('value'))
+                        else:
+                            exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field).like(res.group('value')))
+
+                        for t in exclude_query.all():
+                            conn.execute(tei.values(id=t.id))
+

     user_whitelist_file = options.user_whitelist
     user_whitelist = None
@@ -141,6 +172,10 @@
             for snode in node:
                 if snode.tag == "path":
                     params['content_file'] = snode.text
+                    params['content_file_write'] = snode.text
+                elif snode.tag == "project_id":
+                    params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
+                    params['project_id'] = snode.text
                 elif snode.tag == "start_date":
                     params['start_date'] = snode.text
                 elif snode.tag == "end_date":
@@ -152,15 +187,24 @@
                 if options.hashtag or 'hashtags' not in params :
                     params['hashtags'] = options.hashtag
             parameters.append(params)
-    else:
+    else:
+        if options.project_id:
+            content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
+        else:
+            content_file = options.content_file
         parameters = [{
             'start_date': options.start_date,
             'end_date' : options.end_date,
             'duration' : options.duration,
-            'content_file' : options.content_file,
-            'hashtags' : options.hashtag
+            'content_file' : content_file,
+            'content_file_write' : content_file,
+            'hashtags' : options.hashtag,
+            'project_id' : options.project_id
         }]
-
+    post_param = {}
+    if options.post_param:
+        post_param = anyjson.loads(options.post_param)
+
     for params in parameters:

         get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
@@ -169,6 +213,7 @@
         end_date_str = params.get("end_date", None)
         duration = params.get("duration", None)
         content_file = params.get("content_file", None)
+        content_file_write = params.get("content_file_write", None)
         hashtags = params.get('hashtags', [])

         if user_whitelist_file:
@@ -181,15 +226,6 @@
             start_date = parse_date(start_date_str)
             ts = time.mktime(start_date.timetuple())

-            end_date = None
-            if end_date_str:
-                end_date = parse_date(end_date_str)
-            elif start_date and duration:
-                end_date = start_date + datetime.timedelta(seconds=duration)
-
-            query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
-
-            query_res = query.all()

         root = None
         ensemble_parent = None
@@ -200,19 +236,17 @@

             get_logger().debug("url : " + content_file) #@UndefinedVariable

-            h = httplib2.Http()
-            resp, content = h.request(content_file)
-
-            get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
-
-            project = anyjson.deserialize(content)
+            r = requests.get(content_file, params=post_param)
+            get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
+            project = r.json()
             root = etree.fromstring(project["ldt"])

         elif content_file and os.path.exists(content_file):
             doc = etree.parse(content_file)
             root = doc.getroot()
-
+
+        content_id = None

         if root is None:
@@ -227,6 +261,8 @@

             content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
             ensemble_parent = content
+            content_id = options.content_id
+

         if ensemble_parent is None:
             file_type = None
@@ -249,6 +285,7 @@
                 if content_node is None:
                     content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
                 ensemble_parent = content_node
+                content_id = content_node.get(u"id")
             elif file_type == "iri":
                 body_node = root.find(u"body")
                 if body_node is None:
@@ -257,6 +294,7 @@
                 if ensembles_node is None:
                     ensembles_node = etree.SubElement(body_node, u"ensembles")
                 ensemble_parent = ensembles_node
+                content_id = root.xpath("head/meta[@name='id']/@content")[0]


         if ensemble_parent is None:
@@ -285,6 +323,25 @@

             elements = etree.SubElement(decoupage, u"elements")

+        end_date = None
+        if end_date_str:
+            end_date = parse_date(end_date_str)
+        elif start_date and duration:
+            end_date = start_date + datetime.timedelta(seconds=duration)
+        elif start_date and options.base_url:
+            # get duration from api
+            content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
+            r = requests.get(content_url)
+            duration = int(r.json()['duration'])
+            get_logger().debug("get duration " + content_url) #@UndefinedVariable
+            get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
+
+            end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
+
+        query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
+
+        query_res = query.all()
+
         for tw in query_res:

             tweet_ts_dt = tw.created_at
@@ -333,21 +390,23 @@

         output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)

-        if content_file and content_file.find("http") == 0:
+        if content_file_write and content_file_write.find("http") == 0:

             project["ldt"] = output_data
-            body = anyjson.serialize(project)
-            get_logger().debug("write http " + content_file) #@UndefinedVariable
-            get_logger().debug("write http " + repr(body)) #@UndefinedVariable
-            h = httplib2.Http()
-            resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
-            get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
-            if resp.status != 200:
-                get_logger().error("Error http " + repr(resp) + " content " + content) #@UndefinedVariable
-                raise Exception("Error writing content : %d : %s"%(resp.status, resp.reason))
+
+            post_param = {}
+            if options.post_param:
+                post_param = anyjson.loads(options.post_param)
+
+            get_logger().debug("write http " + content_file_write) #@UndefinedVariable
+            get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
+            get_logger().debug("write http " + repr(project)) #@UndefinedVariable
+            r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type':'application/json'}, params=post_param);
+            get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
+            if r.status_code != requests.codes.ok:
+                r.raise_for_status()
         else:
-            if content_file and os.path.exists(content_file):
-                dest_file_name = content_file
+            if content_file_write and os.path.exists(content_file_write):
+                dest_file_name = content_file_write
             else:
                 dest_file_name = options.filename
diff -r 38ff25c1db25 -r bc29a6fbb8e8 script/utils/merge_tweets.py
--- a/script/utils/merge_tweets.py	Fri Jan 11 11:59:03 2013 +0100
+++ b/script/utils/merge_tweets.py	Wed Jan 16 05:04:23 2013 +0100
@@ -91,7 +91,7 @@

             session_tgt.flush()

-        show_progress(i+1, count_tw, progress_text+tweet.text, 70)
+        show_progress(i+1, count_tw, repr(progress_text+tweet.text), 70)

     session_tgt.commit()
     print u"%d new tweet added" % (added)
diff -r 38ff25c1db25 -r bc29a6fbb8e8 script/virtualenv/res/lib/lib_create_env.py
--- a/script/virtualenv/res/lib/lib_create_env.py	Fri Jan 11 11:59:03 2013 +0100
+++ b/script/virtualenv/res/lib/lib_create_env.py	Wed Jan 16 05:04:23 2013 +0100
@@ -29,7 +29,7 @@
     'TWEEPY': {'setup': 'tweepy', 'url':'https://github.com/tweepy/tweepy/archive/1.12.tar.gz', 'local':"tweepy-1.12.tar.gz", 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
     'TWITTER': {'setup': 'twitter', 'url':'http://pypi.python.org/packages/source/t/twitter/twitter-1.9.0.tar.gz', 'local':"twitter-1.9.0.tar.gz", 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
     'TWITTER-TEXT': {'setup': 'twitter-text', 'url':'https://github.com/dryan/twitter-text-py/archive/master.tar.gz', 'local':"twitter-text-1.0.4.tar.gz", 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
-    'REQUESTS': {'setup': 'requests', 'url':'https://github.com/kennethreitz/requests/archive/v1.0.2.tar.gz', 'local':'requests-v1.0.2.tar.gz', 'install' : {'method':'pip', 'option_str': None, 'dict_extra_env': None}},
+    'REQUESTS': {'setup': 'requests', 'url':'https://github.com/kennethreitz/requests/archive/v1.1.0.tar.gz', 'local':'requests-v1.1.0.tar.gz', 'install' : {'method':'pip', 'option_str': None, 'dict_extra_env': None}},
 }

 if system_str == 'Windows':
diff -r 38ff25c1db25 -r bc29a6fbb8e8 script/virtualenv/res/src/requests-1.1.0.tar.gz
Binary file script/virtualenv/res/src/requests-1.1.0.tar.gz has changed
diff -r 38ff25c1db25 -r bc29a6fbb8e8 script/virtualenv/res/src/requests-v1.0.2.tar.gz
Binary file script/virtualenv/res/src/requests-v1.0.2.tar.gz has changed
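
For reference, the --exclude file parsed in export_twitter_alchemy.py above now accepts one filter per line in the form field=value or field~pattern: "=" is an exact match, "~" a SQL LIKE match, and a "user_" prefix on the field name moves the filter from the Tweet table to the joined User table. The snippet below is a minimal standalone sketch of that line format only; the parse_exclude_line helper and the sample values are hypothetical and are not part of this changeset.

# Illustrative sketch (not part of the changeset): how exclude-file lines are interpreted.
import re

EX_REGEXP = re.compile(r"(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)

def parse_exclude_line(line):
    # Hypothetical helper: split a line such as "user_screen_name=bob" or "text~%flood%"
    # into (field, op, value); returns None when the line does not match the pattern.
    res = EX_REGEXP.match(line.strip())
    if res is None:
        return None
    return res.group('field'), res.group('op'), res.group('value')

if __name__ == "__main__":
    for sample in ["id=123456", "user_screen_name=bob", "text~%flood%"]:
        print(parse_exclude_line(sample))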