# HG changeset patch
# User Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
# Date 1325949164 -3600
# Node ID b9243ade95e2723f84d82c4ece25e77a4e75a889
# Parent d3b86c65c98015e5ce3b5705245ecf751b6ab447
code cleaning and reorganisation for scripts

diff -r d3b86c65c980 -r b9243ade95e2 script/lib/iri_tweet/create_twitter_export_conf.py
--- a/script/lib/iri_tweet/create_twitter_export_conf.py	Tue Dec 20 16:26:34 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-from lxml import etree
-from optparse import OptionParser #@UnresolvedImport
-
-def get_options():
-
-    parser = OptionParser()
-
-    parser.add_option("-f", "--file", dest="outputfile",
-                      help="destination filename", metavar="FILE", default="twitter_export_conf.xml")
-    parser.add_option("-i", "--input", dest="inputfile",
-                      help="inputfile", metavar="INPUT", default=None)
-
-    return parser.parse_args()
-
-if __name__ == "__main__":
-    (options, args) = get_options()
-
-    dest_filename = options.outputfile
-
-    path_list = []
-    if options.inputfile is None:
-        path_list = args
-    else:
-        with open(options.inputfile, 'r') as fi:
-            path_list = fi
-
-
-    root = etree.Element("twitter_export")
-
-
-    for path in path_list:
-
-        iri_doc = etree.parse(path)
-        media_nodes = iri_doc.xpath("/iri/body/medias/media[@id='video']/video")
-        duration = int(media_nodes[0].get("dur"))/1000
-
-        file_elem = etree.SubElement(root, "file")
-        etree.SubElement(file_elem, "path").text = path
-        etree.SubElement(file_elem, "start_date")
-        etree.SubElement(file_elem, "duration").text = unicode(duration)
-
-    tree = etree.ElementTree(root)
-    tree.write(dest_filename, encoding="utf-8", pretty_print=True, xml_declaration=True)
\ No newline at end of file
diff -r d3b86c65c980 -r b9243ade95e2 script/lib/iri_tweet/export_twitter_alchemy.py
--- a/script/lib/iri_tweet/export_twitter_alchemy.py	Tue Dec 20 16:26:34 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,361 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-
-from lxml import etree
-from models import setup_database
-from optparse import OptionParser #@UnresolvedImport
-from sqlalchemy import Table, Column, BigInteger
-from utils import (parse_date, set_logging_options, set_logging, get_filter_query,
-    get_logger)
-import anyjson
-import datetime
-import httplib2
-import os.path
-import re
-import sys
-import time
-import uuid #@UnresolvedImport
-
-#class TweetExclude(object):
-#    def __init__(self, id):
-#        self.id = id
-#
-#    def __repr__(self):
-#        return "" % (self.id)
-
-
-def parse_polemics(tw, extended_mode):
-    """
-    parse polemics in text and return a list of polemic code. None if not polemic found
-    """
-    polemics = {}
-    for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
-        pol_link = {
-            '++' : u'OK',
-            '--' : u'KO',
-            '??' : u'Q',
-            '==' : u'REF'}[m.group(1)]
-        polemics[pol_link] = pol_link
-
-    if extended_mode:
-        if "?" in tw.text:
-            polemics["Q"] = "Q"
-
-        for entity in tw.entity_list:
-            if entity.type == "entity_url":
-                polemics["REF"] = "REF"
-
-    if len(polemics) > 0:
-        return polemics.keys()
-    else:
-        return None
-
-def get_options():
-    parser = OptionParser()
-    parser.add_option("-f", "--file", dest="filename",
-                      help="write export to file", metavar="FILE", default="project.ldt")
-    parser.add_option("-d", "--database", dest="database",
-                      help="Input database", metavar="DATABASE")
-    parser.add_option("-s", "--start-date", dest="start_date",
-                      help="start date", metavar="START_DATE", default=None)
-    parser.add_option("-e", "--end-date", dest="end_date",
-                      help="end date", metavar="END_DATE", default=None)
-    parser.add_option("-I", "--content-file", dest="content_file",
-                      help="Content file", metavar="CONTENT_FILE")
-    parser.add_option("-c", "--content", dest="content",
-                      help="Content url", metavar="CONTENT")
-    parser.add_option("-V", "--video-url", dest="video",
-                      help="video url", metavar="VIDEO")
-    parser.add_option("-i", "--content-id", dest="content_id",
-                      help="Content id", metavar="CONTENT_ID")
-    parser.add_option("-x", "--exclude", dest="exclude",
-                      help="file containing the id to exclude", metavar="EXCLUDE")
-    parser.add_option("-C", "--color", dest="color",
-                      help="Color code", metavar="COLOR", default="16763904")
-    parser.add_option("-H", "--hashtag", dest="hashtag",
-                      help="Hashtag", metavar="HASHTAG", default=[], action="append")
-    parser.add_option("-D", "--duration", dest="duration", type="int",
-                      help="Duration", metavar="DURATION", default=None)
-    parser.add_option("-n", "--name", dest="name",
-                      help="Cutting name", metavar="NAME", default=u"Tweets")
-    parser.add_option("-R", "--replace", dest="replace", action="store_true",
-                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
-    parser.add_option("-m", "--merge", dest="merge", action="store_true",
-                      help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
-    parser.add_option("-L", "--list-conf", dest="listconf",
-                      help="list of file to process", metavar="LIST_CONF", default=None)
-    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
-                      help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
-    parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
-                      help="A list of user screen name", metavar="USER_WHITELIST", default=None)
-
-
-    set_logging_options(parser)
-
-
-    return parser.parse_args() + (parser,)
-
-
-if __name__ == "__main__" :
-
-    (options, args, parser) = get_options()
-
-    set_logging(options)
-
-    get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
-
-    if len(sys.argv) == 1 or options.database is None:
-        parser.print_help()
-        sys.exit(1)
-
-    conn_str = options.database.strip()
-    if not re.match("^\w+://.+", conn_str):
-        conn_str = 'sqlite:///' + conn_str
-
-    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
-    conn = None
-    try :
-        conn = engine.connect()
-        session = None
-        try :
-            session = Session(bind=conn)
-            tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
-            #mapper(TweetExclude, tweet_exclude_table)
-            metadata.create_all(bind=conn, tables=[tweet_exclude_table])
-
-            if options.exclude and os.path.exists(options.exclude):
-                with open(options.exclude, 'r+') as f:
-                    tei = tweet_exclude_table.insert()
-                    for line in f:
-                        conn.execute(tei.values(id=long(line.strip())))
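For reference, a standalone sketch of the `++`/`--`/`??`/`==` marker mapping that parse_polemics above implements; FakeTweet is a hypothetical stand-in for the ORM tweet object and the sample text is illustrative:

```python
# -*- coding: utf-8 -*-
# Standalone sketch of the polemic marker mapping used by parse_polemics.
# FakeTweet is a hypothetical stand-in for the SQLAlchemy Tweet object.
import re

POLEMIC_CODES = {'++': u'OK', '--': u'KO', '??': u'Q', '==': u'REF'}

class FakeTweet(object):
    def __init__(self, text):
        self.text = text
        self.entity_list = []

def polemic_codes(tw):
    # scan the tweet text for markers and deduplicate the resulting codes
    codes = set()
    for m in re.finditer(r"(\+\+|\-\-|\?\?|\=\=)", tw.text):
        codes.add(POLEMIC_CODES[m.group(1)])
    return sorted(codes) or None

print(polemic_codes(FakeTweet(u"++ great point ?? but is it sourced ==")))
# -> [u'OK', u'Q', u'REF']
```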
-            user_whitelist_file = options.user_whitelist
-            user_whitelist = None
-
-            if options.listconf:
-
-                parameters = []
-                confdoc = etree.parse(options.listconf)
-                for node in confdoc.xpath("/twitter_export/file"):
-                    params = {}
-                    for snode in node:
-                        if snode.tag == "path":
-                            params['content_file'] = snode.text
-                        elif snode.tag == "start_date":
-                            params['start_date'] = snode.text
-                        elif snode.tag == "end_date":
-                            params['end_date'] = snode.text
-                        elif snode.tag == "duration":
-                            params['duration'] = int(snode.text)
-                        elif snode.tag == "hashtags":
-                            params['hashtags'] = [snode.text]
-                    if options.hashtag or 'hashtags' not in params :
-                        params['hashtags'] = options.hashtag
-                    parameters.append(params)
-            else:
-                parameters = [{
-                    'start_date': options.start_date,
-                    'end_date' : options.end_date,
-                    'duration' : options.duration,
-                    'content_file' : options.content_file,
-                    'hashtags' : options.hashtag
-                }]
-
-            for params in parameters:
-
-                get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
-
-                start_date_str = params.get("start_date",None)
-                end_date_str = params.get("end_date", None)
-                duration = params.get("duration", None)
-                content_file = params.get("content_file", None)
-                hashtags = params.get('hashtags', [])
-
-                if user_whitelist_file:
-                    with open(user_whitelist_file, 'r+') as f:
-                        user_whitelist = list(set([s.strip() for s in f]))
-
-                start_date = None
-                ts = None
-                if start_date_str:
-                    start_date = parse_date(start_date_str)
-                    ts = time.mktime(start_date.timetuple())
-
-                end_date = None
-                if end_date_str:
-                    end_date = parse_date(end_date_str)
-                elif start_date and duration:
-                    end_date = start_date + datetime.timedelta(seconds=duration)
-
-                query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
-
-                query_res = query.all()
-
-                root = None
-                ensemble_parent = None
-
-                #to do : analyse situation ldt or iri ? filename set or not ?
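The listconf branch above consumes the twitter_export XML written by create_twitter_export_conf.py. A minimal sketch of a conforming document and of the per-file parameter extraction (path and dates are illustrative values):

```python
# Sketch: parse a minimal twitter_export configuration the way the loop above does.
from lxml import etree

CONF = """<twitter_export>
  <file>
    <path>/data/enmi/session1.iri</path>
    <start_date>2011-12-20T10:00:00</start_date>
    <duration>3600</duration>
  </file>
</twitter_export>"""

confdoc = etree.fromstring(CONF)
for node in confdoc.xpath("/twitter_export/file"):
    params = dict((snode.tag, snode.text) for snode in node)
    params['duration'] = int(params['duration'])  # seconds, as in the export loop
    print(params)
```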
-
-                if content_file and content_file.find("http") == 0:
-
-                    get_logger().debug("url : " + content_file) #@UndefinedVariable
-
-                    h = httplib2.Http()
-                    resp, content = h.request(content_file)
-
-                    get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
-
-                    project = anyjson.deserialize(content)
-                    root = etree.fromstring(project["ldt"])
-
-                elif content_file and os.path.exists(content_file):
-
-                    doc = etree.parse(content_file)
-                    root = doc.getroot()
-
-
-                if root is None:
-
-                    root = etree.Element(u"iri")
-
-                    project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
-
-                    medias = etree.SubElement(root, u"medias")
-                    media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
-
-                    annotations = etree.SubElement(root, u"annotations")
-                    content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
-                    ensemble_parent = content
-
-
-                if ensemble_parent is None:
-                    file_type = None
-                    for node in root:
-                        if node.tag == "project":
-                            file_type = "ldt"
-                            break
-                        elif node.tag == "head":
-                            file_type = "iri"
-                            break
-
-                    if file_type == "ldt":
-                        media_nodes = root.xpath("//media")
-                        if len(media_nodes) > 0:
-                            media = media_nodes[0]
-                        annotations_node = root.find(u"annotations")
-                        if annotations_node is None:
-                            annotations_node = etree.SubElement(root, u"annotations")
-                        content_node = annotations_node.find(u"content")
-                        if content_node is None:
-                            content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
-                        ensemble_parent = content_node
-                    elif file_type == "iri":
-                        body_node = root.find(u"body")
-                        if body_node is None:
-                            body_node = etree.SubElement(root, u"body")
-                        ensembles_node = body_node.find(u"ensembles")
-                        if ensembles_node is None:
-                            ensembles_node = etree.SubElement(body_node, u"ensembles")
-                        ensemble_parent = ensembles_node
-
-
-                if ensemble_parent is None:
-                    get_logger().error("Can not process file") #@UndefinedVariable
-                    sys.exit()
-
-                if options.replace:
-                    for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
-                        if ens.get("id","").startswith("tweet_"):
-                            ensemble_parent.remove(ens)
-
-                ensemble = None
-                elements = None
-
-                if options.merge:
-                    ensemble = ensemble_parent.find(u"ensemble")
-                    if ensemble is not None:
-                        elements = ensemble.find(u".//elements")
-
-                if ensemble is None or elements is None:
-                    ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
-                    decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
-
-                    etree.SubElement(decoupage, u"title").text = unicode(options.name)
-                    etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
-
-                    elements = etree.SubElement(decoupage, u"elements")
-
-
-                for tw in query_res:
-                    tweet_ts_dt = tw.created_at
-                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
-                    if ts is None:
-                        ts = tweet_ts
-                    tweet_ts_rel = (tweet_ts-ts) * 1000
-                    username = None
-                    profile_url = ""
-                    if tw.user is not None:
-                        username = tw.user.name
-                        profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
-                    if not username:
-                        username = "anon."
-
-                    element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
-                    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
-                    etree.SubElement(element, u"abstract").text = unicode(tw.text)
-
-                    tags_node = etree.SubElement(element, u"tags")
-
-                    for entity in tw.entity_list:
-                        if entity.type == u'entity_hashtag':
-                            etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
-
-                    meta_element = etree.SubElement(element, u'meta')
-
-                    polemics_list = parse_polemics(tw, options.extended_mode)
-                    if polemics_list:
-                        polemics_element = etree.Element(u'polemics')
-                        for pol in polemics_list:
-                            etree.SubElement(polemics_element, u'polemic').text = pol
-                        meta_element.append(polemics_element)
-
-                    etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
-
-                # sort by tc in
-                if options.merge :
-                    # remove all elements and put them in a array
-                    # sort them with tc
-                    #put them back
-                    elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
-
-
-
-
-                output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)
-
-                if content_file and content_file.find("http") == 0:
-
-                    project["ldt"] = output_data
-                    body = anyjson.serialize(project)
-                    get_logger().debug("write http " + content_file) #@UndefinedVariable
-                    get_logger().debug("write http " + repr(body)) #@UndefinedVariable
-                    h = httplib2.Http()
-                    resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
-                    get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
-                else:
-                    if content_file and os.path.exists(content_file):
-                        dest_file_name = content_file
-                    else:
-                        dest_file_name = options.filename
-
-                    get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
-                    output = open(dest_file_name, "w")
-                    output.write(output_data)
-                    output.flush()
-                    output.close()
-
-        finally:
-            if session:
-                session.close()
-    finally:
-        if conn:
-            conn.close()
diff -r d3b86c65c980 -r b9243ade95e2 script/lib/iri_tweet/utils.py
--- a/script/lib/iri_tweet/utils.py	Tue Dec 20 16:26:34 2011 +0100
+++ b/script/lib/iri_tweet/utils.py	Sat Jan 07 16:12:44 2012 +0100
@@ -10,6 +10,7 @@
 import logging
 import os.path
 import sys
+import math
 import twitter.oauth #@UnresolvedImport
 import twitter.oauth_dance #@UnresolvedImport
 import twitter_text #@UnresolvedImport
@@ -171,7 +172,7 @@
 
 class TwitterProcessor(object):
 
-    def __init__(self, json_dict, json_txt, source_id, session, access_token=None, token_filename=None):
+    def __init__(self, json_dict, json_txt, source_id, session, access_token=None, token_filename=None, user_query_twitter=False):
 
         if json_dict is None and json_txt is None:
             raise TwitterProcessorException("No json")
@@ -194,10 +195,11 @@
         self.token_filename = token_filename
         self.access_token = access_token
         self.obj_buffer = ObjectsBuffer()
+        self.user_query_twitter = user_query_twitter
 
 
-    def __get_user(self, user_dict, do_merge, query_twitter = False):
+    def __get_user(self, user_dict, do_merge):
         get_logger().debug("Get user : " + repr(user_dict)) #@UndefinedVariable
 
         user_dict = adapt_fields(user_dict, fields_adapter["stream"]["user"])
 
@@ -243,7 +245,7 @@
 
         user_created_at = user_dict.get("created_at", None)
user_dict.get("created_at", None) - if user_created_at is None and query_twitter: + if user_created_at is None and self.user_query_twitter: if self.access_token is not None: acess_token_key, access_token_secret = self.access_token @@ -333,7 +335,7 @@ return EntityHashtag, entity_dict def process_user_mentions(): - user_mention = self.__get_user(ind, False, False) + user_mention = self.__get_user(ind, False) if user_mention is None: entity_dict['user_id'] = None else: @@ -598,3 +600,17 @@ raise except: self.handleError(record) + +def show_progress(current_line, total_line, label, width): + + percent = (float(current_line) / float(total_line)) * 100.0 + + marks = math.floor(width * (percent / 100.0)) + spaces = math.floor(width - marks) + + loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']' + + sys.stdout.write(u"%s %d%% %d/%d - %r\r" % (loader, percent, current_line - 1, total_line - 1, label[:50].rjust(50))) #takes the header into account + if percent >= 100: + sys.stdout.write("\n") + sys.stdout.flush() diff -r d3b86c65c980 -r b9243ade95e2 script/rest/search_twitter.py --- a/script/rest/search_twitter.py Tue Dec 20 16:26:34 2011 +0100 +++ b/script/rest/search_twitter.py Sat Jan 07 16:12:44 2012 +0100 @@ -17,8 +17,6 @@ help="verbose", metavar="VERBOSE", default=0) parser.add_option("-q", dest="quiet", action="count", help="quiet", metavar="QUIET", default=0) - parser.add_option("-r", "--request", dest="request", - help="twitter request", metavar="REQUEST", default=0) parser.add_option("-Q", dest="query", help="query", metavar="QUERY") parser.add_option("-P", dest="rpp", metavar="RPP", default="50", @@ -27,9 +25,6 @@ help="Token file name") - #add request token - #add - return parser.parse_args() if __name__ == "__main__": diff -r d3b86c65c980 -r b9243ade95e2 script/stream/get_stats.py --- a/script/stream/get_stats.py Tue Dec 20 16:26:34 2011 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ - -import httplib2 -import anyjson -from lxml import etree -import sys -import pprint - -def get_stats(url): - - h = httplib2.Http() - resp, content = h.request(url) - #project = anyjson.deserialize(content) - root = etree.fromstring(content) - - #get all annotations - res_xpath = root.xpath("//ensemble[starts-with(@id,'tweet_')]//element") - - total_annot = len(res_xpath) - total_with_polemic = 0 - total_by_type = {} - - - for annot in res_xpath: - polemic_list = annot.xpath("meta/polemics/polemic") - if len(polemic_list)> 0: - total_with_polemic += 1 - for polemic_item in polemic_list: - pol_type = polemic_item.text - total_by_type[pol_type] = total_by_type.get(pol_type,0) + 1 - - - return {"total_annotations": total_annot, "total_with_polemics": total_with_polemic, "polemic_by_type": total_by_type} - -if __name__ == "__main__": - - pp = pprint.PrettyPrinter(indent=4, width=1) - - pp.pprint(get_stats(sys.argv[1])) \ No newline at end of file diff -r d3b86c65c980 -r b9243ade95e2 script/stream/recorder_tweetstream.py --- a/script/stream/recorder_tweetstream.py Tue Dec 20 16:26:34 2011 +0100 +++ b/script/stream/recorder_tweetstream.py Sat Jan 07 16:12:44 2012 +0100 @@ -229,7 +229,7 @@ self.stop_event.set() -def process_tweet(tweet, source_id, session, access_token, logger): +def process_tweet(tweet, source_id, session, access_token, twitter_query_user, logger): try: tweet_obj = anyjson.deserialize(tweet) if 'text' not in tweet_obj: @@ -241,7 +241,7 @@ screen_name = tweet_obj['user']['screen_name'] logger.info(u"Process_tweet from %s : %s" % (screen_name, 
         logger.debug(u"Process_tweet :" + repr(tweet))
-        processor = utils.TwitterProcessor(tweet_obj, tweet, source_id, session, access_token, None)
+        processor = utils.TwitterProcessor(tweet_obj, tweet, source_id, session, access_token, None, twitter_query_user)
         processor.process()
     except Exception as e:
         message = u"Error %s processing tweet %s" % (repr(e), tweet)
@@ -263,6 +263,7 @@
 
     def __init__(self, session_maker, queue, options, access_token, stop_event, logger_queue, parent_pid):
         super(TweetProcess, self).__init__(session_maker, queue, options, access_token, stop_event, logger_queue, parent_pid)
+        self.twitter_query_user = options.twitter_query_user
 
 
     def do_run(self):
@@ -277,7 +278,7 @@
             except Exception as e:
                 self.logger.debug('Process tweet exception in loop : ' + repr(e))
                 continue
-            process_tweet(tweet_txt, source_id, session, self.access_token, self.logger)
+            process_tweet(tweet_txt, source_id, session, self.access_token, self.twitter_query_user, self.logger)
             session.commit()
         finally:
             session.rollback()
@@ -345,6 +346,9 @@
                       help="number of process.\nIf 0, only the lefovers of the database are processed.\nIf 1, no postprocessing is done on the tweets.", metavar="PROCESS_NB", default=2, type='int')
     parser.add_option("--url", dest="url",
                       help="The twitter url to connect to.", metavar="URL", default=tweetstream.FilterStream.url)
+    parser.add_option("--query-user", dest="twitter_query_user", action="store_true",
+                      help="Query twitter for users", default=False, metavar="QUERY_USER")
+
 
     utils.set_logging_options(parser)
 
diff -r d3b86c65c980 -r b9243ade95e2 script/utils/create_twitter_export_conf.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/create_twitter_export_conf.py	Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,43 @@
+from lxml import etree
+from optparse import OptionParser #@UnresolvedImport
+
+def get_options():
+
+    parser = OptionParser()
+
+    parser.add_option("-f", "--file", dest="outputfile",
+                      help="destination filename", metavar="FILE", default="twitter_export_conf.xml")
+    parser.add_option("-i", "--input", dest="inputfile",
+                      help="inputfile", metavar="INPUT", default=None)
+
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    (options, args) = get_options()
+
+    dest_filename = options.outputfile
+
+    path_list = []
+    if options.inputfile is None:
+        path_list = args
+    else:
+        with open(options.inputfile, 'r') as fi:
+            path_list = fi
+
+
+    root = etree.Element("twitter_export")
+
+
+    for path in path_list:
+
+        iri_doc = etree.parse(path)
+        media_nodes = iri_doc.xpath("/iri/body/medias/media[@id='video']/video")
+        duration = int(media_nodes[0].get("dur"))/1000
+
+        file_elem = etree.SubElement(root, "file")
+        etree.SubElement(file_elem, "path").text = path
+        etree.SubElement(file_elem, "start_date")
+        etree.SubElement(file_elem, "duration").text = unicode(duration)
+
+    tree = etree.ElementTree(root)
+    tree.write(dest_filename, encoding="utf-8", pretty_print=True, xml_declaration=True)
diff -r d3b86c65c980 -r b9243ade95e2 script/utils/export_twitter_alchemy.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/export_twitter_alchemy.py	Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,362 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+from lxml import etree
+from iri_tweet.models import setup_database
+from optparse import OptionParser #@UnresolvedImport
+from sqlalchemy import Table, Column, BigInteger
+from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
+    get_logger)
+import anyjson
+import datetime
+import httplib2
+import os.path
+import re
+import sys
+import time
+import uuid #@UnresolvedImport
+from dateutil.parser import parse as parse_date
+
+#class TweetExclude(object):
+#    def __init__(self, id):
+#        self.id = id
+#
+#    def __repr__(self):
+#        return "" % (self.id)
+
+
+def parse_polemics(tw, extended_mode):
+    """
+    Parse polemics in the tweet text and return a list of polemic codes. Return None if no polemic is found.
+    """
+    polemics = {}
+    for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
+        pol_link = {
+            '++' : u'OK',
+            '--' : u'KO',
+            '??' : u'Q',
+            '==' : u'REF'}[m.group(1)]
+        polemics[pol_link] = pol_link
+
+    if extended_mode:
+        if "?" in tw.text:
+            polemics["Q"] = "Q"
+
+        for entity in tw.entity_list:
+            if entity.type == "entity_url":
+                polemics["REF"] = "REF"
+
+    if len(polemics) > 0:
+        return polemics.keys()
+    else:
+        return None
+
+def get_options():
+    parser = OptionParser()
+    parser.add_option("-f", "--file", dest="filename",
+                      help="write export to file", metavar="FILE", default="project.ldt")
+    parser.add_option("-d", "--database", dest="database",
+                      help="Input database", metavar="DATABASE")
+    parser.add_option("-s", "--start-date", dest="start_date",
+                      help="start date", metavar="START_DATE", default=None)
+    parser.add_option("-e", "--end-date", dest="end_date",
+                      help="end date", metavar="END_DATE", default=None)
+    parser.add_option("-I", "--content-file", dest="content_file",
+                      help="Content file", metavar="CONTENT_FILE")
+    parser.add_option("-c", "--content", dest="content",
+                      help="Content url", metavar="CONTENT")
+    parser.add_option("-V", "--video-url", dest="video",
+                      help="video url", metavar="VIDEO")
+    parser.add_option("-i", "--content-id", dest="content_id",
+                      help="Content id", metavar="CONTENT_ID")
+    parser.add_option("-x", "--exclude", dest="exclude",
+                      help="file containing the ids to exclude", metavar="EXCLUDE")
+    parser.add_option("-C", "--color", dest="color",
+                      help="Color code", metavar="COLOR", default="16763904")
+    parser.add_option("-H", "--hashtag", dest="hashtag",
+                      help="Hashtag", metavar="HASHTAG", default=[], action="append")
+    parser.add_option("-D", "--duration", dest="duration", type="int",
+                      help="Duration", metavar="DURATION", default=None)
+    parser.add_option("-n", "--name", dest="name",
+                      help="Cutting name", metavar="NAME", default=u"Tweets")
+    parser.add_option("-R", "--replace", dest="replace", action="store_true",
+                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
+    parser.add_option("-m", "--merge", dest="merge", action="store_true",
+                      help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
+    parser.add_option("-L", "--list-conf", dest="listconf",
+                      help="list of files to process", metavar="LIST_CONF", default=None)
+    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
+                      help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
+    parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
+                      help="A list of user screen names", metavar="USER_WHITELIST", default=None)
+
+
+    set_logging_options(parser)
+
+
+    return parser.parse_args() + (parser,)
+
+
+if __name__ == "__main__" :
+
+    (options, args, parser) = get_options()
+
+    set_logging(options)
+
+    get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
+
+    if len(sys.argv) == 1 or options.database is None:
+        parser.print_help()
+        sys.exit(1)
+
+    conn_str = options.database.strip()
+    if not re.match("^\w+://.+", conn_str):
+        conn_str = 'sqlite:///' + conn_str
+
+    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+    conn = None
+    try :
+        conn = engine.connect()
+        session = None
+        try :
+            session = Session(bind=conn)
+            tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
+            #mapper(TweetExclude, tweet_exclude_table)
+            metadata.create_all(bind=conn, tables=[tweet_exclude_table])
+
+            if options.exclude and os.path.exists(options.exclude):
+                with open(options.exclude, 'r+') as f:
+                    tei = tweet_exclude_table.insert()
+                    for line in f:
+                        conn.execute(tei.values(id=long(line.strip())))
+            user_whitelist_file = options.user_whitelist
+            user_whitelist = None
+
+            if options.listconf:
+
+                parameters = []
+                confdoc = etree.parse(options.listconf)
+                for node in confdoc.xpath("/twitter_export/file"):
+                    params = {}
+                    for snode in node:
+                        if snode.tag == "path":
+                            params['content_file'] = snode.text
+                        elif snode.tag == "start_date":
+                            params['start_date'] = snode.text
+                        elif snode.tag == "end_date":
+                            params['end_date'] = snode.text
+                        elif snode.tag == "duration":
+                            params['duration'] = int(snode.text)
+                        elif snode.tag == "hashtags":
+                            params['hashtags'] = [snode.text]
+                    if options.hashtag or 'hashtags' not in params :
+                        params['hashtags'] = options.hashtag
+                    parameters.append(params)
+            else:
+                parameters = [{
+                    'start_date': options.start_date,
+                    'end_date' : options.end_date,
+                    'duration' : options.duration,
+                    'content_file' : options.content_file,
+                    'hashtags' : options.hashtag
+                }]
+
+            for params in parameters:
+
+                get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
+
+                start_date_str = params.get("start_date",None)
+                end_date_str = params.get("end_date", None)
+                duration = params.get("duration", None)
+                content_file = params.get("content_file", None)
+                hashtags = params.get('hashtags', [])
+
+                if user_whitelist_file:
+                    with open(user_whitelist_file, 'r+') as f:
+                        user_whitelist = list(set([s.strip() for s in f]))
+
+                start_date = None
+                ts = None
+                if start_date_str:
+                    start_date = parse_date(start_date_str)
+                    ts = time.mktime(start_date.timetuple())
+
+                end_date = None
+                if end_date_str:
+                    end_date = parse_date(end_date_str)
+                elif start_date and duration:
+                    end_date = start_date + datetime.timedelta(seconds=duration)
+
+                query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
+
+                query_res = query.all()
+
+                root = None
+                ensemble_parent = None
+
+                # TODO: analyse the situation: ldt or iri? filename set or not?
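One behavioural change in the move: the relocated script now takes parse_date from dateutil.parser rather than from iri_tweet.utils (the python-dateutil-1.5 tarball under script/virtualenv/res is touched by this changeset). A small sketch of how the export window above is resolved when only a start date and a duration are given; the date string is illustrative:

```python
# Sketch: resolve the export window as the code above does when only
# start_date and duration are known (illustrative values).
import datetime
from dateutil.parser import parse as parse_date

start_date = parse_date("2011-12-20T10:00:00")
duration = 3600  # seconds, as read from the <duration> element

end_date = start_date + datetime.timedelta(seconds=duration)
print(end_date)  # -> 2011-12-20 11:00:00
```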
+
+                if content_file and content_file.find("http") == 0:
+
+                    get_logger().debug("url : " + content_file) #@UndefinedVariable
+
+                    h = httplib2.Http()
+                    resp, content = h.request(content_file)
+
+                    get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
+
+                    project = anyjson.deserialize(content)
+                    root = etree.fromstring(project["ldt"])
+
+                elif content_file and os.path.exists(content_file):
+
+                    doc = etree.parse(content_file)
+                    root = doc.getroot()
+
+
+                if root is None:
+
+                    root = etree.Element(u"iri")
+
+                    project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
+
+                    medias = etree.SubElement(root, u"medias")
+                    media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
+
+                    annotations = etree.SubElement(root, u"annotations")
+                    content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
+                    ensemble_parent = content
+
+
+                if ensemble_parent is None:
+                    file_type = None
+                    for node in root:
+                        if node.tag == "project":
+                            file_type = "ldt"
+                            break
+                        elif node.tag == "head":
+                            file_type = "iri"
+                            break
+
+                    if file_type == "ldt":
+                        media_nodes = root.xpath("//media")
+                        if len(media_nodes) > 0:
+                            media = media_nodes[0]
+                        annotations_node = root.find(u"annotations")
+                        if annotations_node is None:
+                            annotations_node = etree.SubElement(root, u"annotations")
+                        content_node = annotations_node.find(u"content")
+                        if content_node is None:
+                            content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
+                        ensemble_parent = content_node
+                    elif file_type == "iri":
+                        body_node = root.find(u"body")
+                        if body_node is None:
+                            body_node = etree.SubElement(root, u"body")
+                        ensembles_node = body_node.find(u"ensembles")
+                        if ensembles_node is None:
+                            ensembles_node = etree.SubElement(body_node, u"ensembles")
+                        ensemble_parent = ensembles_node
+
+
+                if ensemble_parent is None:
+                    get_logger().error("Cannot process file") #@UndefinedVariable
+                    sys.exit()
+
+                if options.replace:
+                    for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
+                        if ens.get("id","").startswith("tweet_"):
+                            ensemble_parent.remove(ens)
+
+                ensemble = None
+                elements = None
+
+                if options.merge:
+                    ensemble = ensemble_parent.find(u"ensemble")
+                    if ensemble is not None:
+                        elements = ensemble.find(u".//elements")
+
+                if ensemble is None or elements is None:
+                    ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
+                    decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
+
+                    etree.SubElement(decoupage, u"title").text = unicode(options.name)
+                    etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
+
+                    elements = etree.SubElement(decoupage, u"elements")
+
+
+                for tw in query_res:
+                    tweet_ts_dt = tw.created_at
+                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
+                    if ts is None:
+                        ts = tweet_ts
+                    tweet_ts_rel = (tweet_ts-ts) * 1000
+                    username = None
+                    profile_url = ""
+                    if tw.user is not None:
+                        username = tw.user.name
+                        profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
+                    if not username:
+                        username = "anon."
+
+                    element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
+                    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
+                    etree.SubElement(element, u"abstract").text = unicode(tw.text)
+
+                    tags_node = etree.SubElement(element, u"tags")
+
+                    for entity in tw.entity_list:
+                        if entity.type == u'entity_hashtag':
+                            etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
+
+                    meta_element = etree.SubElement(element, u'meta')
+
+                    polemics_list = parse_polemics(tw, options.extended_mode)
+                    if polemics_list:
+                        polemics_element = etree.Element(u'polemics')
+                        for pol in polemics_list:
+                            etree.SubElement(polemics_element, u'polemic').text = pol
+                        meta_element.append(polemics_element)
+
+                    etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
+
+                # sort by tc
+                if options.merge :
+                    # remove all elements and put them in an array
+                    # sort them by tc
+                    # put them back
+                    elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
+
+
+
+
+                output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)
+
+                if content_file and content_file.find("http") == 0:
+
+                    project["ldt"] = output_data
+                    body = anyjson.serialize(project)
+                    get_logger().debug("write http " + content_file) #@UndefinedVariable
+                    get_logger().debug("write http " + repr(body)) #@UndefinedVariable
+                    h = httplib2.Http()
+                    resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
+                    get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
+                else:
+                    if content_file and os.path.exists(content_file):
+                        dest_file_name = content_file
+                    else:
+                        dest_file_name = options.filename
+
+                    get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
+                    output = open(dest_file_name, "w")
+                    output.write(output_data)
+                    output.flush()
+                    output.close()
+
+        finally:
+            if session:
+                session.close()
+    finally:
+        if conn:
+            conn.close()
diff -r d3b86c65c980 -r b9243ade95e2 script/utils/get_stats.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/get_stats.py	Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,38 @@
+
+import httplib2
+import anyjson
+from lxml import etree
+import sys
+import pprint
+
+def get_stats(url):
+
+    h = httplib2.Http()
+    resp, content = h.request(url)
+    #project = anyjson.deserialize(content)
+    root = etree.fromstring(content)
+
+    #get all annotations
+    res_xpath = root.xpath("//ensemble[starts-with(@id,'tweet_')]//element")
+
+    total_annot = len(res_xpath)
+    total_with_polemic = 0
+    total_by_type = {}
+
+
+    for annot in res_xpath:
+        polemic_list = annot.xpath("meta/polemics/polemic")
+        if len(polemic_list)> 0:
+            total_with_polemic += 1
+            for polemic_item in polemic_list:
+                pol_type = polemic_item.text
+                total_by_type[pol_type] = total_by_type.get(pol_type,0) + 1
+
+
+    return {"total_annotations": total_annot, "total_with_polemics": total_with_polemic, "polemic_by_type": total_by_type}
+
+if __name__ == "__main__":
+
+    pp = pprint.PrettyPrinter(indent=4, width=1)
+
+    pp.pprint(get_stats(sys.argv[1]))
\ No newline at end of file
diff -r d3b86c65c980 -r b9243ade95e2 script/utils/merge_tweets.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/merge_tweets.py	Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,105 @@
+#from models import setup_database
+from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
+from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress
+import argparse
+import sys
+import re
+import anyjson
+import math
+import codecs
+
+def get_option():
+
+    parser = argparse.ArgumentParser(description='Merge tweets databases')
+
+    parser.add_argument("-l", "--log", dest="logfile",
+                        help="log to file", metavar="LOG", default="stderr")
+    parser.add_argument("-v", dest="verbose", action="count",
+                        help="verbose", default=0)
+    parser.add_argument("-q", dest="quiet", action="count",
+                        help="quiet", default=0)
+    parser.add_argument("--query-user", dest="query_user", action="store_true",
+                        help="Query twitter for user information", default=False)
+    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
+                        help="Token file name")
+
+
+    parser.add_argument("source", action="store", nargs=1, type=str, metavar="SOURCE")
+    parser.add_argument("target", action="store", nargs=1, type=str, metavar="TARGET")
+
+
+    return parser.parse_args()
+
+if __name__ == "__main__":
+
+    sys.stdout = codecs.getwriter(sys.stdout.encoding)(sys.stdout)
+
+    options = get_option()
+
+    access_token = None
+    if options.query_user:
+        access_token = get_oauth_token(options.token_filename)
+
+    #open source
+    src_conn_str = options.source[0].strip()
+    if not re.match("^\w+://.+", src_conn_str):
+        src_conn_str = 'sqlite:///' + src_conn_str
+    tgt_conn_str = options.target[0].strip()
+    if not re.match("^\w+://.+", tgt_conn_str):
+        tgt_conn_str = 'sqlite:///' + tgt_conn_str
+
+
+    engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+    engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+
+    conn_src = conn_tgt = session_src = session_tgt = None
+
+    try:
+        #conn_src = engine_src.connect()
+        #conn_tgt = engine_tgt.connect()
+        session_src = Session_src()
+        session_tgt = Session_tgt()
+
+        count_tw_query = Tweet.__table__.count()
+
+        count_tw = engine_src.scalar(count_tw_query)
+
+        if count_tw == 0:
+            print "No tweet to process : exit"
+            sys.exit()
+
+        query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
+        added = 0
+
+        for i,tweet in enumerate(query_src):
+
+            tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()
+
+            progress_text = u"Process: "
+            if tweet_count == 0:
+                added += 1
+                progress_text = u"Adding : "
+                tweet_source = tweet.tweet_source.original_json
+
+                tweet_obj = anyjson.deserialize(tweet_source)
+                if 'text' not in tweet_obj:
+                    tweet_log = TweetLog(tweet_source_id=tweet.tweet_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
+                    session_tgt.add(tweet_log)
+                else:
+                    tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user)
+                    tp.process()
+
+                session_tgt.flush()
+
+            show_progress(i+1, count_tw, progress_text+tweet.text, 70)
+
+        session_tgt.commit()
+        print u"%d new tweets added" % (added)
+
+    finally:
+        session_tgt.close() if session_tgt is not None else None
+        session_src.close() if session_src is not None else None
+        conn_tgt.close() if conn_tgt is not None else None
+        conn_src.close() if conn_src is not None else None
+
+
\ No newline at end of file
diff -r d3b86c65c980 -r b9243ade95e2 script/virtualenv/res/python-dateutil-1.5.tar.gz
Binary file script/virtualenv/res/python-dateutil-1.5.tar.gz has changed
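For reference, a self-contained sketch of the show_progress helper added to iri_tweet/utils.py and driven by merge_tweets.py above; this version drops the header-line offset used by the original and can be run standalone:

```python
# Sketch of the fixed-width progress bar added to iri_tweet/utils.py:
# a "[===   ]" loader redrawn in place with a carriage return.
import math
import sys
import time

def show_progress(current_line, total_line, label, width):
    percent = (float(current_line) / float(total_line)) * 100.0
    marks = int(math.floor(width * (percent / 100.0)))
    spaces = width - marks
    loader = u'[' + (u'=' * marks) + (u' ' * spaces) + u']'
    sys.stdout.write(u"%s %d%% %d/%d - %r\r" % (loader, percent, current_line, total_line, label[:50].rjust(50)))
    if percent >= 100:
        sys.stdout.write("\n")
    sys.stdout.flush()

if __name__ == "__main__":
    for i in range(1, 101):
        show_progress(i, 100, u"Adding : sample tweet text", 60)
        time.sleep(0.01)
```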