utils/export_annotations.py
changeset 26:ebfd0d3cffab
parent 24:eb1f7b06001f
child 30:c2294ac6e875
25:dd91da180852 26:ebfd0d3cffab
--- a/utils/export_annotations.py
+++ b/utils/export_annotations.py
@@ -1,13 +1,9 @@
 #!/usr/bin/env python
 # coding=utf-8
 
 from lxml import etree
-from iri_tweet.models import setup_database, Tweet, User
-from sqlalchemy.sql import select, func
-from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
-    get_logger)
 import argparse
 import json
 import datetime
 import requests
 import os.path
@@ -15,10 +11,11 @@
 import sys
 import time
 import uuid #@UnresolvedImport
 from dateutil.parser import parse as parse_date
 import bisect
+import logging
 
 #class TweetExclude(object):
 #    def __init__(self, id):
 #        self.id = id
 #
@@ -26,10 +23,13 @@
 #        return "<TweetExclude(id=%d)>" % (self.id)
 
 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
 DEFAULT_ANNOTATION_CHANNEL = 'ANNOT'
+
+def get_logger():
+    return logging.getLogger(__name__)
 
 
 def get_filter(start_date, end_date, events, channels, user_whitelist):
     res = []
     #TODO: check timezone...
@@ -41,10 +41,11 @@
         res.append({'name': 'event', 'op': "in", 'val':events })
     if channels:
         res.append({'name': 'channel', 'op': "in", 'val':channels })
     if user_whitelist:
         res.append({'name': 'user', 'op': "in", 'val':user_whitelist })
+    return res
 
 # def parse_polemics(tw, extended_mode):
 #     """
 #     parse polemics in text and return a list of polemic code. None if not polemic found
 #     """
@@ -67,10 +68,60 @@
 #
 #     if len(polemics) > 0:
 #         return polemics.keys()
 #     else:
 #         return None
+
+def set_logging(options, plogger=None, queue=None):
+
+    logging_config = {
+        "format" : '%(asctime)s %(levelname)s:%(name)s:%(message)s',
+        "level" : max(logging.NOTSET, min(logging.CRITICAL, logging.WARNING - 10 * options.verbose + 10 * options.quiet)), #@UndefinedVariable
+    }
+
+    if options.logfile == "stdout":
+        logging_config["stream"] = sys.stdout
+    elif options.logfile == "stderr":
+        logging_config["stream"] = sys.stderr
+    else:
+        logging_config["filename"] = options.logfile
+
+    logger = plogger
+    if logger is None:
+        logger = get_logger() #@UndefinedVariable
+
+    if len(logger.handlers) == 0:
+        filename = logging_config.get("filename")
+        if queue is not None:
+            hdlr = QueueHandler(queue, True)
+        elif filename:
+            mode = logging_config.get("filemode", 'a')
+            hdlr = logging.FileHandler(filename, mode) #@UndefinedVariable
+        else:
+            stream = logging_config.get("stream")
+            hdlr = logging.StreamHandler(stream) #@UndefinedVariable
+
+        fs = logging_config.get("format", logging.BASIC_FORMAT) #@UndefinedVariable
+        dfs = logging_config.get("datefmt", None)
+        fmt = logging.Formatter(fs, dfs) #@UndefinedVariable
+        hdlr.setFormatter(fmt)
+        logger.addHandler(hdlr)
+        level = logging_config.get("level")
+        if level is not None:
+            logger.setLevel(level)
+
+    options.debug = (options.verbose-options.quiet > 0)
+    return logger
+
+def set_logging_options(parser):
+    parser.add_argument("-l", "--log", dest="logfile",
+                      help="log to file", metavar="LOG", default="stderr")
+    parser.add_argument("-v", dest="verbose", action="count",
+                      help="verbose", default=0)
+    parser.add_argument("-q", dest="quiet", action="count",
+                      help="quiet", default=0)
+
 
 def get_options():
 
     usage = "usage: %(prog)s [options]"
 
@@ -114,10 +165,12 @@
                       help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
     parser.add_argument("-p", "--project", dest="project_id",
                       help="Project id", metavar="PROJECT_ID", default=None)
     parser.add_argument("-P", "--post-param", dest="post_param",
                       help="Post param", metavar="POST_PARAM", default=None)
+    parser.add_argument("-B", "--batch-size", dest="batch_size", type=int,
+                      help="Batch size for annotation request", metavar="BATCH_SIZE", default=500)
     parser.add_argument("--user-whitelist", dest="user_whitelist", action="store",
                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
     parser.add_argument("--cut", dest="cuts", action="append",
                       help="A cut with the forma <ts in ms>::<duration>", metavar="CUT", default=[])
 
@@ -152,15 +205,15 @@
     page_nb = 1
     while page < page_nb:
         page += 1
         params['page'] = page
         resp = requests.get(url, params=params, headers=headers)
-        if resp.code != 200:
+        if resp.status_code != requests.codes.ok:
             return
         resp_json = resp.json()
         page_nb = resp_json.get('total_pages', 1)
-        for item in resp_json.get('results', []):
+        for item in resp_json.get('objects', []):
             #TODO: add progress log
             yield item
 
 
 if __name__ == "__main__" :
@@ -179,11 +232,11 @@
         for c, d in cuts_raw:
             deltas.append((c+total_delta, -1))
             total_delta += d
             deltas.append((c+total_delta, total_delta))
 
-    if len(sys.argv) == 1 or options.database is None:
+    if len(sys.argv) == 1 or options.annot_url is None:
         parser.print_help()
         sys.exit(1)
 
     user_whitelist_file = options.user_whitelist
     user_whitelist = None
@@ -192,11 +245,11 @@
 
     if options.listconf:
 
         parameters = []
         confdoc = etree.parse(options.listconf)
-        for node in confdoc.xpath("/twitter_export/file"):
+        for node in confdoc.xpath("/annotation_export/file"):
             params = {}
             for snode in node:
                 if snode.tag == "path":
                     params['content_file'] = snode.text
                     params['content_file_write'] = snode.text
@@ -367,11 +420,11 @@
             if ensemble is not None:
                 elements = ensemble.find(u".//elements")
                 decoupage = ensemble.find(u"decoupage")
 
         if ensemble is None or elements is None:
-            ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"annot_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
+            ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"annot_" + unicode(uuid.uuid4()), u"title":u"Ensemble Annotation", u"author":u"IRI Web", u"abstract":u"Ensemble Annotation"})
             decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
 
             etree.SubElement(decoupage, u"title").text = unicode(options.name)
             etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
 
@@ -386,31 +439,32 @@
         elif start_date and duration:
             end_date = start_date + datetime.timedelta(seconds=duration)
         elif start_date and options.base_url:
            # get duration from api
             content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
-            r = requests.get(content_url)
+            get_logger().debug("get duration " + content_url) #@UndefinedVariable
+            r = requests.get(content_url, params=post_param)
+            get_logger().debug("get duration resp " + repr(r)) #@UndefinedVariable
             duration = int(r.json()['duration'])
-            get_logger().debug("get duration " + content_url) #@UndefinedVariable
             get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
 
             end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
 
         if end_date and deltas:
             end_date = end_date + datetime.timedelta(milliseconds=deltas[-1][1])
 
 
-        filters = get_filter(start_date, end_date, user_whitelist)
+        filters = get_filter(start_date, end_date, events, channels, user_whitelist)
 
         headers = {'Content-Type': 'application/json'}
 
         params = { 'q':json.dumps({'filters':filters}), 'results_per_page': options.batch_size}
 
 
-        for annot in build_annotation_iterator(url, params, headers)::
+        for annot in build_annotation_iterator(annotation_url, params, headers):
             #TODO : check timezone !!!
-            annot_ts_dt = annot['ts']
+            annot_ts_dt = parse_date(annot['ts'])
             annot_ts = int(time.mktime(annot_ts_dt.timetuple()))
             if ts is None:
                 ts = annot_ts
             annot_ts_rel = (annot_ts-ts) * 1000
             if deltas: