# return "<TweetExclude(id=%d)>" % (self.id)

# Relative REST API paths on the LDT platform; joined onto options.base_url
# when building content/project URLs (see the duration lookup further below).
LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
# Default channel identifier for annotations (presumably used when no
# channel filter is given — confirm against the callers not visible here).
DEFAULT_ANNOTATION_CHANNEL = 'ANNOT'
|
28 |
|
def get_logger():
    """Return the logger associated with this module's name."""
    logger = logging.getLogger(__name__)
    return logger
31 |
31 |
32 |
32 |
33 def get_filter(start_date, end_date, events, channels, user_whitelist): |
33 def get_filter(start_date, end_date, events, channels, user_whitelist): |
34 res = [] |
34 res = [] |
35 #TODO: check timezone... |
35 #TODO: check timezone... |
41 res.append({'name': 'event', 'op': "in", 'val':events }) |
41 res.append({'name': 'event', 'op': "in", 'val':events }) |
42 if channels: |
42 if channels: |
43 res.append({'name': 'channel', 'op': "in", 'val':channels }) |
43 res.append({'name': 'channel', 'op': "in", 'val':channels }) |
44 if user_whitelist: |
44 if user_whitelist: |
45 res.append({'name': 'user', 'op': "in", 'val':user_whitelist }) |
45 res.append({'name': 'user', 'op': "in", 'val':user_whitelist }) |
|
46 return res |
46 |
47 |
47 # def parse_polemics(tw, extended_mode): |
48 # def parse_polemics(tw, extended_mode): |
48 # """ |
49 # """ |
# parse polemics in text and return a list of polemic codes. None if no polemic is found
50 # """ |
51 # """ |
67 # |
68 # |
68 # if len(polemics) > 0: |
69 # if len(polemics) > 0: |
69 # return polemics.keys() |
70 # return polemics.keys() |
70 # else: |
71 # else: |
71 # return None |
72 # return None |
|
73 |
|
def set_logging(options, plogger=None, queue=None):
    """Configure and return a logger based on command-line *options*.

    The effective level is ``WARNING`` lowered by 10 per ``-v`` and raised
    by 10 per ``-q``, clamped to the ``[NOTSET, CRITICAL]`` range.
    ``options.logfile`` selects the destination: the literal strings
    ``"stdout"``/``"stderr"`` pick the corresponding stream, anything else
    is treated as a file path (opened in append mode).

    A handler is attached only when the logger has none yet, so calling
    this twice on the same logger does not duplicate output.  When *queue*
    is given, a ``QueueHandler`` is used instead of a stream/file handler.

    Side effect: sets ``options.debug`` to True when verbosity outweighs
    quietness.  Returns the configured logger (either *plogger* or the
    module logger from ``get_logger()``).
    """
    raw_level = logging.WARNING - 10 * options.verbose + 10 * options.quiet
    level = max(logging.NOTSET, min(logging.CRITICAL, raw_level))  # clamp

    # Resolve the output target: a stream for stdout/stderr, else a filename.
    stream = None
    filename = None
    if options.logfile == "stdout":
        stream = sys.stdout
    elif options.logfile == "stderr":
        stream = sys.stderr
    else:
        filename = options.logfile

    logger = plogger if plogger is not None else get_logger()  # @UndefinedVariable

    # Only attach a handler the first time this logger is configured.
    if not logger.handlers:
        if queue is not None:
            handler = QueueHandler(queue, True)
        elif filename:
            handler = logging.FileHandler(filename, 'a')  # @UndefinedVariable
        else:
            handler = logging.StreamHandler(stream)  # @UndefinedVariable
        formatter = logging.Formatter(
            '%(asctime)s %(levelname)s:%(name)s:%(message)s', None)  # @UndefinedVariable
        handler.setFormatter(formatter)
        logger.addHandler(handler)
        logger.setLevel(level)

    options.debug = (options.verbose - options.quiet > 0)
    return logger
|
114 |
|
def set_logging_options(parser):
    """Register the shared logging command-line options on *parser*.

    Adds ``-l/--log`` (log destination, default ``"stderr"``), and the
    repeatable counters ``-v`` (verbose) and ``-q`` (quiet), both
    defaulting to 0.  These are the options later consumed by
    ``set_logging``.
    """
    parser.add_argument("-l", "--log", dest="logfile", default="stderr",
                        metavar="LOG", help="log to file")
    parser.add_argument("-v", dest="verbose", default=0, action="count",
                        help="verbose")
    parser.add_argument("-q", dest="quiet", default=0, action="count",
                        help="quiet")
|
122 |
72 |
123 |
73 def get_options(): |
124 def get_options(): |
74 |
125 |
75 usage = "usage: %(prog)s [options]" |
126 usage = "usage: %(prog)s [options]" |
76 |
127 |
114 help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/") |
165 help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/") |
115 parser.add_argument("-p", "--project", dest="project_id", |
166 parser.add_argument("-p", "--project", dest="project_id", |
116 help="Project id", metavar="PROJECT_ID", default=None) |
167 help="Project id", metavar="PROJECT_ID", default=None) |
117 parser.add_argument("-P", "--post-param", dest="post_param", |
168 parser.add_argument("-P", "--post-param", dest="post_param", |
118 help="Post param", metavar="POST_PARAM", default=None) |
169 help="Post param", metavar="POST_PARAM", default=None) |
|
170 parser.add_argument("-B", "--batch-size", dest="batch_size", type=int, |
|
171 help="Batch size for annotation request", metavar="BATCH_SIZE", default=500) |
119 parser.add_argument("--user-whitelist", dest="user_whitelist", action="store", |
172 parser.add_argument("--user-whitelist", dest="user_whitelist", action="store", |
120 help="A list of user screen name", metavar="USER_WHITELIST",default=None) |
173 help="A list of user screen name", metavar="USER_WHITELIST",default=None) |
121 parser.add_argument("--cut", dest="cuts", action="append", |
174 parser.add_argument("--cut", dest="cuts", action="append", |
122 help="A cut with the forma <ts in ms>::<duration>", metavar="CUT", default=[]) |
175 help="A cut with the forma <ts in ms>::<duration>", metavar="CUT", default=[]) |
123 |
176 |
152 page_nb = 1 |
205 page_nb = 1 |
153 while page < page_nb: |
206 while page < page_nb: |
154 page += 1 |
207 page += 1 |
155 params['page'] = page |
208 params['page'] = page |
156 resp = requests.get(url, params=params, headers=headers) |
209 resp = requests.get(url, params=params, headers=headers) |
157 if resp.code != 200: |
210 if resp.status_code != requests.codes.ok: |
158 return |
211 return |
159 resp_json = resp.json() |
212 resp_json = resp.json() |
160 page_nb = resp_json.get('total_pages', 1) |
213 page_nb = resp_json.get('total_pages', 1) |
161 for item in resp_json.get('results', []): |
214 for item in resp_json.get('objects', []): |
162 #TODO: add progress log |
215 #TODO: add progress log |
163 yield item |
216 yield item |
164 |
217 |
165 |
218 |
166 if __name__ == "__main__" : |
219 if __name__ == "__main__" : |
179 for c, d in cuts_raw: |
232 for c, d in cuts_raw: |
180 deltas.append((c+total_delta, -1)) |
233 deltas.append((c+total_delta, -1)) |
181 total_delta += d |
234 total_delta += d |
182 deltas.append((c+total_delta, total_delta)) |
235 deltas.append((c+total_delta, total_delta)) |
183 |
236 |
184 if len(sys.argv) == 1 or options.database is None: |
237 if len(sys.argv) == 1 or options.annot_url is None: |
185 parser.print_help() |
238 parser.print_help() |
186 sys.exit(1) |
239 sys.exit(1) |
187 |
240 |
188 user_whitelist_file = options.user_whitelist |
241 user_whitelist_file = options.user_whitelist |
189 user_whitelist = None |
242 user_whitelist = None |
367 if ensemble is not None: |
420 if ensemble is not None: |
368 elements = ensemble.find(u".//elements") |
421 elements = ensemble.find(u".//elements") |
369 decoupage = ensemble.find(u"decoupage") |
422 decoupage = ensemble.find(u"decoupage") |
370 |
423 |
371 if ensemble is None or elements is None: |
424 if ensemble is None or elements is None: |
372 ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"annot_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"}) |
425 ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"annot_" + unicode(uuid.uuid4()), u"title":u"Ensemble Annotation", u"author":u"IRI Web", u"abstract":u"Ensemble Annotation"}) |
373 decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"}) |
426 decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"}) |
374 |
427 |
375 etree.SubElement(decoupage, u"title").text = unicode(options.name) |
428 etree.SubElement(decoupage, u"title").text = unicode(options.name) |
376 etree.SubElement(decoupage, u"abstract").text = unicode(options.name) |
429 etree.SubElement(decoupage, u"abstract").text = unicode(options.name) |
377 |
430 |
386 elif start_date and duration: |
439 elif start_date and duration: |
387 end_date = start_date + datetime.timedelta(seconds=duration) |
440 end_date = start_date + datetime.timedelta(seconds=duration) |
388 elif start_date and options.base_url: |
441 elif start_date and options.base_url: |
389 # get duration from api |
442 # get duration from api |
390 content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json" |
443 content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json" |
391 r = requests.get(content_url) |
444 get_logger().debug("get duration " + content_url) #@UndefinedVariable |
|
445 r = requests.get(content_url, params=post_param) |
|
446 get_logger().debug("get duration resp " + repr(r)) #@UndefinedVariable |
392 duration = int(r.json()['duration']) |
447 duration = int(r.json()['duration']) |
393 get_logger().debug("get duration " + content_url) #@UndefinedVariable |
|
394 get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable |
448 get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable |
395 |
449 |
396 end_date = start_date + datetime.timedelta(seconds=int(duration/1000)) |
450 end_date = start_date + datetime.timedelta(seconds=int(duration/1000)) |
397 |
451 |
398 if end_date and deltas: |
452 if end_date and deltas: |
399 end_date = end_date + datetime.timedelta(milliseconds=deltas[-1][1]) |
453 end_date = end_date + datetime.timedelta(milliseconds=deltas[-1][1]) |
400 |
454 |
401 |
455 |
402 filters = get_filter(start_date, end_date, user_whitelist) |
456 filters = get_filter(start_date, end_date, events, channels, user_whitelist) |
403 |
457 |
404 headers = {'Content-Type': 'application/json'} |
458 headers = {'Content-Type': 'application/json'} |
405 |
459 |
406 params = { 'q':json.dumps({'filters':filters}), 'results_per_page': options.batch_size} |
460 params = { 'q':json.dumps({'filters':filters}), 'results_per_page': options.batch_size} |
407 |
461 |
408 |
462 |
409 for annot in build_annotation_iterator(url, params, headers):: |
463 for annot in build_annotation_iterator(annotation_url, params, headers): |
410 #TODO : check timezone !!! |
464 #TODO : check timezone !!! |
411 annot_ts_dt = annot['ts'] |
465 annot_ts_dt = parse_date(annot['ts']) |
412 annot_ts = int(time.mktime(annot_ts_dt.timetuple())) |
466 annot_ts = int(time.mktime(annot_ts_dt.timetuple())) |
413 if ts is None: |
467 if ts is None: |
414 ts = annot_ts |
468 ts = annot_ts |
415 annot_ts_rel = (annot_ts-ts) * 1000 |
469 annot_ts_rel = (annot_ts-ts) * 1000 |
416 if deltas: |
470 if deltas: |