--- a/utils/export_annotations.py Wed Oct 15 18:44:05 2014 +0200
+++ b/utils/export_annotations.py Thu Oct 16 03:22:00 2014 +0200
@@ -2,10 +2,6 @@
# coding=utf-8
from lxml import etree
-from iri_tweet.models import setup_database, Tweet, User
-from sqlalchemy.sql import select, func
-from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
- get_logger)
import argparse
import json
import datetime
@@ -17,6 +13,7 @@
import uuid #@UnresolvedImport
from dateutil.parser import parse as parse_date
import bisect
+import logging
#class TweetExclude(object):
# def __init__(self, id):
@@ -29,6 +26,9 @@
LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
DEFAULT_ANNOTATION_CHANNEL = 'ANNOT'
+def get_logger():
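+    """Return the module logger (local stand-in for the helper formerly imported from iri_tweet.utils)."""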
+ return logging.getLogger(__name__)
+
def get_filter(start_date, end_date, events, channels, user_whitelist):
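+    """Build the query filter list from the given date range, events, channels and user whitelist."""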
res = []
@@ -43,6 +43,7 @@
res.append({'name': 'channel', 'op': "in", 'val':channels })
if user_whitelist:
res.append({'name': 'user', 'op': "in", 'val':user_whitelist })
+ return res
# def parse_polemics(tw, extended_mode):
# """
@@ -70,6 +71,56 @@
# else:
# return None
+def set_logging(options, plogger=None, queue=None):
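+    """Configure the logger from the -l/-v/-q options (local replacement for the removed iri_tweet.utils.set_logging)."""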
+
+ logging_config = {
+ "format" : '%(asctime)s %(levelname)s:%(name)s:%(message)s',
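+        # Threshold starts at WARNING; each -v lowers it one level, each -q raises it,
+        # clamped between NOTSET and CRITICAL.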
+ "level" : max(logging.NOTSET, min(logging.CRITICAL, logging.WARNING - 10 * options.verbose + 10 * options.quiet)), #@UndefinedVariable
+ }
+
+ if options.logfile == "stdout":
+ logging_config["stream"] = sys.stdout
+ elif options.logfile == "stderr":
+ logging_config["stream"] = sys.stderr
+ else:
+ logging_config["filename"] = options.logfile
+
+ logger = plogger
+ if logger is None:
+ logger = get_logger() #@UndefinedVariable
+
+ if len(logger.handlers) == 0:
+ filename = logging_config.get("filename")
+ if queue is not None:
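+            # NOTE: QueueHandler is no longer imported here after dropping the iri_tweet
+            # dependency; this branch is only taken when a caller passes an explicit queue.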
+ hdlr = QueueHandler(queue, True)
+ elif filename:
+ mode = logging_config.get("filemode", 'a')
+ hdlr = logging.FileHandler(filename, mode) #@UndefinedVariable
+ else:
+ stream = logging_config.get("stream")
+ hdlr = logging.StreamHandler(stream) #@UndefinedVariable
+
+ fs = logging_config.get("format", logging.BASIC_FORMAT) #@UndefinedVariable
+ dfs = logging_config.get("datefmt", None)
+ fmt = logging.Formatter(fs, dfs) #@UndefinedVariable
+ hdlr.setFormatter(fmt)
+ logger.addHandler(hdlr)
+ level = logging_config.get("level")
+ if level is not None:
+ logger.setLevel(level)
+
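+    # Treat a net-positive verbosity (-v outweighing -q) as debug mode.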
+    options.debug = (options.verbose - options.quiet > 0)
+ return logger
+
+def set_logging_options(parser):
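+    """Register the shared logging command-line options (-l/--log, -v, -q) on the parser."""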
+    parser.add_argument("-l", "--log", dest="logfile",
+                        help="log to file, or 'stdout'/'stderr' to log to the console", metavar="LOG", default="stderr")
+    parser.add_argument("-v", dest="verbose", action="count",
+                        help="increase verbosity (repeatable)", default=0)
+    parser.add_argument("-q", dest="quiet", action="count",
+                        help="decrease verbosity (repeatable)", default=0)
+
+
def get_options():
usage = "usage: %(prog)s [options]"
@@ -116,6 +167,8 @@
help="Project id", metavar="PROJECT_ID", default=None)
parser.add_argument("-P", "--post-param", dest="post_param",
help="Post param", metavar="POST_PARAM", default=None)
+    parser.add_argument("-B", "--batch-size", dest="batch_size", type=int,
+                        help="Batch size (results per page) for annotation requests", metavar="BATCH_SIZE", default=500)
parser.add_argument("--user-whitelist", dest="user_whitelist", action="store",
help="A list of user screen name", metavar="USER_WHITELIST",default=None)
parser.add_argument("--cut", dest="cuts", action="append",
@@ -154,11 +207,11 @@
page += 1
params['page'] = page
resp = requests.get(url, params=params, headers=headers)
- if resp.code != 200:
+ if resp.status_code != requests.codes.ok:
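+            # Stop the generator on any non-OK response.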
return
resp_json = resp.json()
page_nb = resp_json.get('total_pages', 1)
- for item in resp_json.get('results', []):
+ for item in resp_json.get('objects', []):
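+            # Items are returned under 'objects', with paging info in 'total_pages'.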
#TODO: add progress log
yield item
@@ -181,7 +234,7 @@
total_delta += d
deltas.append((c+total_delta, total_delta))
- if len(sys.argv) == 1 or options.database is None:
+ if len(sys.argv) == 1 or options.annot_url is None:
parser.print_help()
sys.exit(1)
@@ -194,7 +247,7 @@
parameters = []
confdoc = etree.parse(options.listconf)
- for node in confdoc.xpath("/twitter_export/file"):
+ for node in confdoc.xpath("/annotation_export/file"):
params = {}
for snode in node:
if snode.tag == "path":
@@ -369,7 +422,7 @@
decoupage = ensemble.find(u"decoupage")
if ensemble is None or elements is None:
- ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"annot_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
+ ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"annot_" + unicode(uuid.uuid4()), u"title":u"Ensemble Annotation", u"author":u"IRI Web", u"abstract":u"Ensemble Annotation"})
decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
etree.SubElement(decoupage, u"title").text = unicode(options.name)
@@ -388,9 +441,10 @@
elif start_date and options.base_url:
# get duration from api
content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
- r = requests.get(content_url)
+ get_logger().debug("get duration " + content_url) #@UndefinedVariable
+ r = requests.get(content_url, params=post_param)
+ get_logger().debug("get duration resp " + repr(r)) #@UndefinedVariable
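+        # The content API reports 'duration' in milliseconds; it is converted to seconds below.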
duration = int(r.json()['duration'])
- get_logger().debug("get duration " + content_url) #@UndefinedVariable
get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
@@ -399,16 +453,16 @@
end_date = end_date + datetime.timedelta(milliseconds=deltas[-1][1])
- filters = get_filter(start_date, end_date, user_whitelist)
+ filters = get_filter(start_date, end_date, events, channels, user_whitelist)
headers = {'Content-Type': 'application/json'}
params = { 'q':json.dumps({'filters':filters}), 'results_per_page': options.batch_size}
- for annot in build_annotation_iterator(url, params, headers)::
+ for annot in build_annotation_iterator(annotation_url, params, headers):
#TODO : check timezone !!!
- annot_ts_dt = annot['ts']
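+        # 'ts' is a string timestamp; parse it into a datetime before building the epoch value.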
+ annot_ts_dt = parse_date(annot['ts'])
annot_ts = int(time.mktime(annot_ts_dt.timetuple()))
if ts is None:
ts = annot_ts