--- a/script/utils/export_twitter_alchemy.py Fri Jan 11 11:59:03 2013 +0100
+++ b/script/utils/export_twitter_alchemy.py Wed Jan 16 05:04:23 2013 +0100
@@ -2,14 +2,14 @@
# coding=utf-8
from lxml import etree
-from iri_tweet.models import setup_database
+from iri_tweet.models import setup_database, Tweet, User
from optparse import OptionParser #@UnresolvedImport
from sqlalchemy import Table, Column, BigInteger
from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
get_logger)
import anyjson
import datetime
-import httplib2
+import requests
import os.path
import re
import sys
@@ -24,6 +24,9 @@
# def __repr__(self):
# return "<TweetExclude(id=%d)>" % (self.id)
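+# paths of the platform REST endpoints, appended to the base URL given with --base-url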
+LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
+LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
+
def parse_polemics(tw, extended_mode):
"""
@@ -87,6 +90,12 @@
help="list of file to process", metavar="LIST_CONF", default=None)
parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
+ parser.add_option("-b", "--base-url", dest="base_url",
+ help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
+ parser.add_option("-p", "--project", dest="project_id",
+ help="Project ID on the platform", metavar="PROJECT_ID", default=None)
+ parser.add_option("-P", "--post-param", dest="post_param",
+ help="Extra request parameters as a JSON object, sent as query parameters with the platform REST calls", metavar="POST_PARAM", default=None)
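+ # example invocation (illustrative values): -p 42 -P '{"key": "value"}' -b http://ldt.iri.centrepompidou.fr/ldtplatform/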
parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
help="A list of user screen name", metavar="USER_WHITELIST",default=None)
@@ -127,8 +136,30 @@
if options.exclude and os.path.exists(options.exclude):
with open(options.exclude, 'r+') as f:
tei = tweet_exclude_table.insert()
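+ # exclude entries are either "id=<tweet id>", "<field>=<value>" for an exact match
+ # or "<field>~<pattern>" for a SQL LIKE match; fields prefixed with "user_" are
+ # matched against the User table instead of Tweet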
+ ex_regexp = re.compile("(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)
for line in f:
- conn.execute(tei.values(id=long(line.strip())))
+ res = ex_regexp.match(line.strip())
+ if res:
+ if res.group('field') == "id":
+ conn.execute(tei.values(id=long(res.group('value'))))
+ else:
+ exclude_query = session.query(Tweet)
+ filter_obj = Tweet
+ filter_field = res.group('field')
+ if filter_field.startswith("user_"):
+ exclude_query = exclude_query.join(User)
+ filter_obj = User
+ filter_field = filter_field[len("user_"):]
+
+ if res.group('op') == "=":
+ exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value'))
+ else:
+ exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).like(res.group('value')))
+
+ for t in exclude_query.all():
+ conn.execute(tei.values(id=t.id))
+
user_whitelist_file = options.user_whitelist
user_whitelist = None
@@ -141,6 +172,10 @@
for snode in node:
if snode.tag == "path":
params['content_file'] = snode.text
+ params['content_file_write'] = snode.text
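+ # a <project_id> node makes the content be fetched from the platform project REST API rather than from a local path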
+ elif snode.tag == "project_id":
+ params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
+ params['project_id'] = snode.text
elif snode.tag == "start_date":
params['start_date'] = snode.text
elif snode.tag == "end_date":
@@ -152,15 +187,24 @@
if options.hashtag or 'hashtags' not in params :
params['hashtags'] = options.hashtag
parameters.append(params)
- else:
+ else:
+ if options.project_id:
+ content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
+ else:
+ content_file = options.content_file
parameters = [{
'start_date': options.start_date,
'end_date' : options.end_date,
'duration' : options.duration,
- 'content_file' : options.content_file,
- 'hashtags' : options.hashtag
+ 'content_file' : content_file,
+ 'content_file_write' : content_file,
+ 'hashtags' : options.hashtag,
+ 'project_id' : options.project_id
}]
-
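+ # optional extra parameters (JSON object) forwarded as query parameters when reading and writing the project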
+ post_param = {}
+ if options.post_param:
+ post_param = anyjson.loads(options.post_param)
+
for params in parameters:
get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
@@ -169,6 +213,7 @@
end_date_str = params.get("end_date", None)
duration = params.get("duration", None)
content_file = params.get("content_file", None)
+ content_file_write = params.get("content_file_write", None)
hashtags = params.get('hashtags', [])
if user_whitelist_file:
@@ -181,15 +226,6 @@
start_date = parse_date(start_date_str)
ts = time.mktime(start_date.timetuple())
- end_date = None
- if end_date_str:
- end_date = parse_date(end_date_str)
- elif start_date and duration:
- end_date = start_date + datetime.timedelta(seconds=duration)
-
- query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
-
- query_res = query.all()
root = None
ensemble_parent = None
@@ -200,19 +236,17 @@
get_logger().debug("url : " + content_file) #@UndefinedVariable
- h = httplib2.Http()
- resp, content = h.request(content_file)
-
- get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
-
- project = anyjson.deserialize(content)
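+ # fetch the project definition from the platform: a JSON document embedding the "ldt" XML payload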
+ r = requests.get(content_file, params=post_param)
+ get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
+ project = r.json()
root = etree.fromstring(project["ldt"])
elif content_file and os.path.exists(content_file):
doc = etree.parse(content_file)
root = doc.getroot()
-
+
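+ # keep the content id so the content duration can be requested from the API later if needed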
+ content_id = None
if root is None:
@@ -227,6 +261,8 @@
content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
ensemble_parent = content
+ content_id = options.content_id
+
if ensemble_parent is None:
file_type = None
@@ -249,6 +285,7 @@
if content_node is None:
content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
ensemble_parent = content_node
+ content_id = content_node.get(u"id")
elif file_type == "iri":
body_node = root.find(u"body")
if body_node is None:
@@ -257,6 +294,7 @@
if ensembles_node is None:
ensembles_node = etree.SubElement(body_node, u"ensembles")
ensemble_parent = ensembles_node
+ content_id = root.xpath("head/meta[@name='id']/@content")[0]
if ensemble_parent is None:
@@ -285,6 +323,25 @@
elements = etree.SubElement(decoupage, u"elements")
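+ # without an explicit end date or duration, query the platform content API for the media duration (in milliseconds)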
+ end_date = None
+ if end_date_str:
+ end_date = parse_date(end_date_str)
+ elif start_date and duration:
+ end_date = start_date + datetime.timedelta(seconds=duration)
+ elif start_date and options.base_url:
+ # get duration from api
+ content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
+ r = requests.get(content_url)
+ duration = int(r.json()['duration'])
+ get_logger().debug("get duration " + content_url) #@UndefinedVariable
+ get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
+
+ end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
+
+ query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
+
+ query_res = query.all()
+
for tw in query_res:
tweet_ts_dt = tw.created_at
@@ -333,21 +390,23 @@
output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)
- if content_file and content_file.find("http") == 0:
+ if content_file_write and content_file_write.find("http") == 0:
project["ldt"] = output_data
- body = anyjson.serialize(project)
- get_logger().debug("write http " + content_file) #@UndefinedVariable
- get_logger().debug("write http " + repr(body)) #@UndefinedVariable
- h = httplib2.Http()
- resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
- get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
- if resp.status != 200:
- get_logger().error("Error http " + repr(resp) + " content " + content) #@UndefinedVariable
- raise Exception("Error writing content : %d : %s"%(resp.status, resp.reason))
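+ # push the updated project back to the platform with a PUT, reusing the --post-param query parameters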
+ get_logger().debug("write http " + content_file_write) #@UndefinedVariable
+ get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
+ get_logger().debug("write http " + repr(project)) #@UndefinedVariable
+ r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type':'application/json'}, params=post_param)
+ get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
+ if r.status_code != requests.codes.ok:
+ r.raise_for_status()
else:
- if content_file and os.path.exists(content_file):
- dest_file_name = content_file
+ if content_file_write and os.path.exists(content_file_write):
+ dest_file_name = content_file_write
else:
dest_file_name = options.filename