# HG changeset patch
# User ymh
# Date 1413422520 -7200
# Node ID ebfd0d3cffab4d7c9f8534b763f0fa5960c6d3b8
# Parent  dd91da180852b69fd73d642b97e0eefd4aaa8345
Correction on export_annotation. First working version

diff -r dd91da180852 -r ebfd0d3cffab README.md
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Thu Oct 16 03:22:00 2014 +0200
@@ -0,0 +1,14 @@
+
+# Mons software suite
+
+To create the virtualenv:
+
+```
+STATIC_DEPS=true pip install -r requirements.txt
+```
+
+Usage for export_annotations.py:
+
+```
+python export_annotations.py -a http://localhost:8080/p/api/annotations -b http://localhost/~ymh/platform/ldtplatform/ -p -E test -H ANNOT -v -v -s "2014-06-19T12:14:48+02" -R -P "{\"username\": \"\",\"api_key\":\"\"}"
+```
diff -r dd91da180852 -r ebfd0d3cffab annot-server/webapp/api.py
--- a/annot-server/webapp/api.py	Wed Oct 15 18:44:05 2014 +0200
+++ b/annot-server/webapp/api.py	Thu Oct 16 03:22:00 2014 +0200
@@ -3,6 +3,8 @@
 # Copyright (c) 2014 IRI
 #
 
+import sys
+
 import flask.ext.restless
 
 import database
@@ -15,4 +17,4 @@
                    methods=['GET', 'POST', 'PUT', 'DELETE'],
                    url_prefix='/api/v1',
                    primary_key='uuid',
-                   max_results_per_page=-1)
+                   max_results_per_page=sys.maxint)
diff -r dd91da180852 -r ebfd0d3cffab requirements.txt
--- a/requirements.txt	Wed Oct 15 18:44:05 2014 +0200
+++ b/requirements.txt	Thu Oct 16 03:22:00 2014 +0200
@@ -7,6 +7,7 @@
 Werkzeug==0.9.6
 autobahn==0.9.1
 itsdangerous==0.24
+lxml==3.4.0
 mimerender==0.5.4
 ntplib==0.3.2
 psycopg2==2.5.4
diff -r dd91da180852 -r ebfd0d3cffab utils/export_annotations.py
--- a/utils/export_annotations.py	Wed Oct 15 18:44:05 2014 +0200
+++ b/utils/export_annotations.py	Thu Oct 16 03:22:00 2014 +0200
@@ -2,10 +2,6 @@
 # coding=utf-8
 
 from lxml import etree
-from iri_tweet.models import setup_database, Tweet, User
-from sqlalchemy.sql import select, func
-from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
-    get_logger)
 import argparse
 import json
 import datetime
@@ -17,6 +13,7 @@
 import uuid #@UnresolvedImport
 from dateutil.parser import parse as parse_date
 import bisect
+import logging
 
 #class TweetExclude(object):
 #    def __init__(self, id):
@@ -29,6 +26,9 @@
 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
 DEFAULT_ANNOTATION_CHANNEL = 'ANNOT'
 
+def get_logger():
+    return logging.getLogger(__name__)
+
 def get_filter(start_date, end_date, events, channels, user_whitelist):
 
     res = []
@@ -43,6 +43,7 @@
         res.append({'name': 'channel', 'op': "in", 'val':channels })
     if user_whitelist:
         res.append({'name': 'user', 'op': "in", 'val':user_whitelist })
+    return res
 
 # def parse_polemics(tw, extended_mode):
 #     """
@@ -70,6 +71,56 @@
 #     else:
 #         return None
 
+def set_logging(options, plogger=None, queue=None):
+
+    logging_config = {
+        "format" : '%(asctime)s %(levelname)s:%(name)s:%(message)s',
+        "level" : max(logging.NOTSET, min(logging.CRITICAL, logging.WARNING - 10 * options.verbose + 10 * options.quiet)), #@UndefinedVariable
+    }
+
+    if options.logfile == "stdout":
+        logging_config["stream"] = sys.stdout
+    elif options.logfile == "stderr":
+        logging_config["stream"] = sys.stderr
+    else:
+        logging_config["filename"] = options.logfile
+
+    logger = plogger
+    if logger is None:
+        logger = get_logger() #@UndefinedVariable
+
+    if len(logger.handlers) == 0:
+        filename = logging_config.get("filename")
+        if queue is not None:
+            hdlr = QueueHandler(queue, True)
+        elif filename:
+            mode = logging_config.get("filemode", 'a')
+            hdlr = logging.FileHandler(filename, mode) #@UndefinedVariable
+        else:
logging_config.get("stream") + hdlr = logging.StreamHandler(stream) #@UndefinedVariable + + fs = logging_config.get("format", logging.BASIC_FORMAT) #@UndefinedVariable + dfs = logging_config.get("datefmt", None) + fmt = logging.Formatter(fs, dfs) #@UndefinedVariable + hdlr.setFormatter(fmt) + logger.addHandler(hdlr) + level = logging_config.get("level") + if level is not None: + logger.setLevel(level) + + options.debug = (options.verbose-options.quiet > 0) + return logger + +def set_logging_options(parser): + parser.add_argument("-l", "--log", dest="logfile", + help="log to file", metavar="LOG", default="stderr") + parser.add_argument("-v", dest="verbose", action="count", + help="verbose", default=0) + parser.add_argument("-q", dest="quiet", action="count", + help="quiet", default=0) + + def get_options(): usage = "usage: %(prog)s [options]" @@ -116,6 +167,8 @@ help="Project id", metavar="PROJECT_ID", default=None) parser.add_argument("-P", "--post-param", dest="post_param", help="Post param", metavar="POST_PARAM", default=None) + parser.add_argument("-B", "--batch-size", dest="batch_size", type=int, + help="Batch size for annotation request", metavar="BATCH_SIZE", default=500) parser.add_argument("--user-whitelist", dest="user_whitelist", action="store", help="A list of user screen name", metavar="USER_WHITELIST",default=None) parser.add_argument("--cut", dest="cuts", action="append", @@ -154,11 +207,11 @@ page += 1 params['page'] = page resp = requests.get(url, params=params, headers=headers) - if resp.code != 200: + if resp.status_code != requests.codes.ok: return resp_json = resp.json() page_nb = resp_json.get('total_pages', 1) - for item in resp_json.get('results', []): + for item in resp_json.get('objects', []): #TODO: add progress log yield item @@ -181,7 +234,7 @@ total_delta += d deltas.append((c+total_delta, total_delta)) - if len(sys.argv) == 1 or options.database is None: + if len(sys.argv) == 1 or options.annot_url is None: parser.print_help() sys.exit(1) @@ -194,7 +247,7 @@ parameters = [] confdoc = etree.parse(options.listconf) - for node in confdoc.xpath("/twitter_export/file"): + for node in confdoc.xpath("/annotation_export/file"): params = {} for snode in node: if snode.tag == "path": @@ -369,7 +422,7 @@ decoupage = ensemble.find(u"decoupage") if ensemble is None or elements is None: - ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"annot_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"}) + ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"annot_" + unicode(uuid.uuid4()), u"title":u"Ensemble Annotation", u"author":u"IRI Web", u"abstract":u"Ensemble Annotation"}) decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"}) etree.SubElement(decoupage, u"title").text = unicode(options.name) @@ -388,9 +441,10 @@ elif start_date and options.base_url: # get duration from api content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json" - r = requests.get(content_url) + get_logger().debug("get duration " + content_url) #@UndefinedVariable + r = requests.get(content_url, params=post_param) + get_logger().debug("get duration resp " + repr(r)) #@UndefinedVariable duration = int(r.json()['duration']) - get_logger().debug("get duration " + content_url) #@UndefinedVariable get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable end_date = start_date + datetime.timedelta(seconds=int(duration/1000)) 
@@ -399,16 +453,16 @@
 
         end_date = end_date + datetime.timedelta(milliseconds=deltas[-1][1])
 
-    filters = get_filter(start_date, end_date, user_whitelist)
+    filters = get_filter(start_date, end_date, events, channels, user_whitelist)
     headers = {'Content-Type': 'application/json'}
     params = { 'q':json.dumps({'filters':filters}), 'results_per_page': options.batch_size}
 
-    for annot in build_annotation_iterator(url, params, headers)::
+    for annot in build_annotation_iterator(annotation_url, params, headers):
         #TODO : check timezone !!!
-        annot_ts_dt = annot['ts']
+        annot_ts_dt = parse_date(annot['ts'])
         annot_ts = int(time.mktime(annot_ts_dt.timetuple()))
         if ts is None:
             ts = annot_ts