Fix export_annotations; first working version
author ymh <ymh.work@gmail.com>
Thu, 16 Oct 2014 03:22:00 +0200
changeset 26 ebfd0d3cffab
parent 25 dd91da180852
child 27 68b29e36c9a2
Fix export_annotations; first working version
README.md
annot-server/webapp/api.py
requirements.txt
utils/export_annotations.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Thu Oct 16 03:22:00 2014 +0200
@@ -0,0 +1,14 @@
+
+# Mons software suite
+
+To install the dependencies in a virtualenv (`STATIC_DEPS=true` makes lxml build its libxml2/libxslt dependencies statically):
+
+```
+STATIC_DEPS=true pip install -r requirements.txt
+```
+
+Example usage for export_annotations.py:
+
+```
+python export_annotations.py -a http://localhost:8080/p/api/annotations -b http://localhost/~ymh/platform/ldtplatform/ -p <project_guid> -E test -H ANNOT -v -v -s "2014-06-19T12:14:48+02" -R -P "{\"username\": \"<username>\",\"api_key\":\"<username api key>\"}"
+```
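+
+The options used above correspond to the argument parser in
+`utils/export_annotations.py`: `-a` is the annotation server API URL, `-b`
+the base URL of the LDT platform, `-p` the project GUID, `-s` the start
+date, `-v` (repeatable) raises the log verbosity, and `-P` passes the API
+credentials as a JSON string. `-B`/`--batch-size` (default 500) sets how
+many annotations are fetched per request. Run the script with `--help` for
+the remaining options.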
--- a/annot-server/webapp/api.py	Wed Oct 15 18:44:05 2014 +0200
+++ b/annot-server/webapp/api.py	Thu Oct 16 03:22:00 2014 +0200
@@ -3,6 +3,8 @@
 # Copyright (c) 2014 IRI
 #
 
+import sys
+
 import flask.ext.restless
 
 import database
@@ -15,4 +17,4 @@
     methods=['GET', 'POST', 'PUT', 'DELETE'],
     url_prefix='/api/v1',
     primary_key='uuid',
-    max_results_per_page=-1)
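+    # replace -1 (not treated as "unlimited") with sys.maxint to effectively disable the per-page cap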
+    max_results_per_page=sys.maxint)
--- a/requirements.txt	Wed Oct 15 18:44:05 2014 +0200
+++ b/requirements.txt	Thu Oct 16 03:22:00 2014 +0200
@@ -7,6 +7,7 @@
 Werkzeug==0.9.6
 autobahn==0.9.1
 itsdangerous==0.24
+lxml==3.4.0
 mimerender==0.5.4
 ntplib==0.3.2
 psycopg2==2.5.4
--- a/utils/export_annotations.py	Wed Oct 15 18:44:05 2014 +0200
+++ b/utils/export_annotations.py	Thu Oct 16 03:22:00 2014 +0200
@@ -2,10 +2,6 @@
 # coding=utf-8
 
 from lxml import etree
-from iri_tweet.models import setup_database, Tweet, User
-from sqlalchemy.sql import select, func
-from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
-    get_logger)
 import argparse
 import json
 import datetime
@@ -17,6 +13,7 @@
 import uuid #@UnresolvedImport
 from dateutil.parser import parse as parse_date
 import bisect
+import logging
 
 #class TweetExclude(object):
 #    def __init__(self, id):
@@ -29,6 +26,9 @@
 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
 DEFAULT_ANNOTATION_CHANNEL = 'ANNOT'
 
+def get_logger():
+    return logging.getLogger(__name__)
+
 
 def get_filter(start_date, end_date, events, channels, user_whitelist):
     res = []
@@ -43,6 +43,7 @@
         res.append({'name': 'channel', 'op': "in", 'val':channels })
     if user_whitelist:
         res.append({'name': 'user', 'op': "in", 'val':user_whitelist })
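+    # each filter is a Flask-Restless search expression: {'name': <field>, 'op': <operator>, 'val': <value>}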
+    return res
 
 # def parse_polemics(tw, extended_mode):
 #     """
@@ -70,6 +71,56 @@
 #     else:
 #         return None
 
+def set_logging(options, plogger=None, queue=None):
+
+    logging_config = {
+        "format" : '%(asctime)s %(levelname)s:%(name)s:%(message)s',
+        "level" : max(logging.NOTSET, min(logging.CRITICAL, logging.WARNING - 10 * options.verbose + 10 * options.quiet)), #@UndefinedVariable
+    }
+
+    if options.logfile == "stdout":
+        logging_config["stream"] = sys.stdout
+    elif options.logfile == "stderr":
+        logging_config["stream"] = sys.stderr
+    else:
+        logging_config["filename"] = options.logfile
+
+    logger = plogger
+    if logger is None:
+        logger = get_logger() #@UndefinedVariable
+
+    if len(logger.handlers) == 0:
+        filename = logging_config.get("filename")
+        if queue is not None:
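+            # assumes a QueueHandler class is available (e.g. imported elsewhere in this module);
+            # this branch is only taken when a queue is passed in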
+            hdlr = QueueHandler(queue, True)
+        elif filename:
+            mode = logging_config.get("filemode", 'a')
+            hdlr = logging.FileHandler(filename, mode) #@UndefinedVariable
+        else:
+            stream = logging_config.get("stream")
+            hdlr = logging.StreamHandler(stream) #@UndefinedVariable
+
+        fs = logging_config.get("format", logging.BASIC_FORMAT) #@UndefinedVariable
+        dfs = logging_config.get("datefmt", None)
+        fmt = logging.Formatter(fs, dfs) #@UndefinedVariable
+        hdlr.setFormatter(fmt)
+        logger.addHandler(hdlr)
+        level = logging_config.get("level")
+        if level is not None:
+            logger.setLevel(level)
+
+    options.debug = (options.verbose - options.quiet > 0)
+    return logger
+
+def set_logging_options(parser):
+    parser.add_argument("-l", "--log", dest="logfile",
+                      help="log to file", metavar="LOG", default="stderr")
+    parser.add_argument("-v", dest="verbose", action="count",
+                      help="verbose", default=0)
+    parser.add_argument("-q", dest="quiet", action="count",
+                      help="quiet", default=0)
+
+
 def get_options():
 
     usage = "usage: %(prog)s [options]"
@@ -116,6 +167,8 @@
                       help="Project id", metavar="PROJECT_ID", default=None)
     parser.add_argument("-P", "--post-param", dest="post_param",
                       help="Post param", metavar="POST_PARAM", default=None)
+    parser.add_argument("-B", "--batch-size", dest="batch_size", type=int,
+                      help="Batch size for annotation request", metavar="BATCH_SIZE", default=500)
     parser.add_argument("--user-whitelist", dest="user_whitelist", action="store",
                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
     parser.add_argument("--cut", dest="cuts", action="append",
@@ -154,11 +207,11 @@
         page += 1
         params['page'] = page
         resp = requests.get(url, params=params, headers=headers)
-        if resp.code != 200:
+        if resp.status_code != requests.codes.ok:
             return
         resp_json = resp.json()
         page_nb = resp_json.get('total_pages', 1)
-        for item in resp_json.get('results', []):
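+        # Flask-Restless puts each page's records under 'objects' (with 'total_pages' alongside), not 'results'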
+        for item in resp_json.get('objects', []):
             #TODO: add progress log
             yield item
 
@@ -181,7 +234,7 @@
             total_delta += d
             deltas.append((c+total_delta, total_delta))
 
-    if len(sys.argv) == 1 or options.database is None:
+    if len(sys.argv) == 1 or options.annot_url is None:
         parser.print_help()
         sys.exit(1)
 
@@ -194,7 +247,7 @@
 
         parameters = []
         confdoc = etree.parse(options.listconf)
-        for node in confdoc.xpath("/twitter_export/file"):
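+        # the list-conf XML is now rooted at <annotation_export> (was <twitter_export>), one <file> element per export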
+        for node in confdoc.xpath("/annotation_export/file"):
             params = {}
             for snode in node:
                 if snode.tag == "path":
@@ -369,7 +422,7 @@
                 decoupage = ensemble.find(u"decoupage")
 
         if ensemble is None or elements is None:
-            ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"annot_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
+            ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"annot_" + unicode(uuid.uuid4()), u"title":u"Ensemble Annotation", u"author":u"IRI Web", u"abstract":u"Ensemble Annotation"})
             decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
 
             etree.SubElement(decoupage, u"title").text = unicode(options.name)
@@ -388,9 +441,10 @@
         elif start_date and options.base_url:
             # get duration from api
             content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
-            r = requests.get(content_url)
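+            # the --post-param credentials (username / api key) are also needed here, sent as query parameters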
+            get_logger().debug("get duration " + content_url) #@UndefinedVariable
+            r = requests.get(content_url, params=post_param)
+            get_logger().debug("get duration resp " + repr(r)) #@UndefinedVariable
             duration = int(r.json()['duration'])
-            get_logger().debug("get duration " + content_url) #@UndefinedVariable
             get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
 
             end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
@@ -399,16 +453,16 @@
             end_date = end_date + datetime.timedelta(milliseconds=deltas[-1][1])
 
 
-        filters = get_filter(start_date, end_date, user_whitelist)
+        filters = get_filter(start_date, end_date, events, channels, user_whitelist)
 
         headers = {'Content-Type': 'application/json'}
 
         params = { 'q':json.dumps({'filters':filters}), 'results_per_page': options.batch_size}
 
 
-        for annot in build_annotation_iterator(url, params, headers)::
+        for annot in build_annotation_iterator(annotation_url, params, headers):
             #TODO : check timezone !!!
-            annot_ts_dt = annot['ts']
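+            # 'ts' arrives as an ISO 8601 string; parse it to a datetime before converting to a Unix timestamp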
+            annot_ts_dt = parse_date(annot['ts'])
             annot_ts = int(time.mktime(annot_ts_dt.timetuple()))
             if ts is None:
                 ts = annot_ts