--- a/script/lib/iri_tweet/create_twitter_export_conf.py Tue Dec 20 16:26:34 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-from lxml import etree
-from optparse import OptionParser #@UnresolvedImport
-
-def get_options():
-
- parser = OptionParser()
-
- parser.add_option("-f", "--file", dest="outputfile",
- help="destination filename", metavar="FILE", default="twitter_export_conf.xml")
- parser.add_option("-i", "--input", dest="inputfile",
- help="inputfile", metavar="INPUT", default=None)
-
- return parser.parse_args()
-
-if __name__ == "__main__":
- (options, args) = get_options()
-
- dest_filename = options.outputfile
-
- path_list = []
- if options.inputfile is None:
- path_list = args
- else:
- with open(options.inputfile, 'r') as fi:
- path_list = fi
-
-
- root = etree.Element("twitter_export")
-
-
- for path in path_list:
-
- iri_doc = etree.parse(path)
- media_nodes = iri_doc.xpath("/iri/body/medias/media[@id='video']/video")
- duration = int(media_nodes[0].get("dur"))/1000
-
- file_elem = etree.SubElement(root, "file")
- etree.SubElement(file_elem, "path").text = path
- etree.SubElement(file_elem, "start_date")
- etree.SubElement(file_elem, "duration").text = unicode(duration)
-
- tree = etree.ElementTree(root)
- tree.write(dest_filename, encoding="utf-8", pretty_print=True, xml_declaration=True)
\ No newline at end of file
--- a/script/lib/iri_tweet/export_twitter_alchemy.py Tue Dec 20 16:26:34 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,361 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-
-from lxml import etree
-from models import setup_database
-from optparse import OptionParser #@UnresolvedImport
-from sqlalchemy import Table, Column, BigInteger
-from utils import (parse_date, set_logging_options, set_logging, get_filter_query,
- get_logger)
-import anyjson
-import datetime
-import httplib2
-import os.path
-import re
-import sys
-import time
-import uuid #@UnresolvedImport
-
-#class TweetExclude(object):
-# def __init__(self, id):
-# self.id = id
-#
-# def __repr__(self):
-# return "<TweetExclude(id=%d)>" % (self.id)
-
-
-def parse_polemics(tw, extended_mode):
- """
- parse polemics in text and return a list of polemic code. None if not polemic found
- """
- polemics = {}
- for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
- pol_link = {
- '++' : u'OK',
- '--' : u'KO',
- '??' : u'Q',
- '==' : u'REF'}[m.group(1)]
- polemics[pol_link] = pol_link
-
- if extended_mode:
- if "?" in tw.text:
- polemics["Q"] = "Q"
-
- for entity in tw.entity_list:
- if entity.type == "entity_url":
- polemics["REF"] = "REF"
-
- if len(polemics) > 0:
- return polemics.keys()
- else:
- return None
-
-def get_options():
- parser = OptionParser()
- parser.add_option("-f", "--file", dest="filename",
- help="write export to file", metavar="FILE", default="project.ldt")
- parser.add_option("-d", "--database", dest="database",
- help="Input database", metavar="DATABASE")
- parser.add_option("-s", "--start-date", dest="start_date",
- help="start date", metavar="START_DATE", default=None)
- parser.add_option("-e", "--end-date", dest="end_date",
- help="end date", metavar="END_DATE", default=None)
- parser.add_option("-I", "--content-file", dest="content_file",
- help="Content file", metavar="CONTENT_FILE")
- parser.add_option("-c", "--content", dest="content",
- help="Content url", metavar="CONTENT")
- parser.add_option("-V", "--video-url", dest="video",
- help="video url", metavar="VIDEO")
- parser.add_option("-i", "--content-id", dest="content_id",
- help="Content id", metavar="CONTENT_ID")
- parser.add_option("-x", "--exclude", dest="exclude",
- help="file containing the id to exclude", metavar="EXCLUDE")
- parser.add_option("-C", "--color", dest="color",
- help="Color code", metavar="COLOR", default="16763904")
- parser.add_option("-H", "--hashtag", dest="hashtag",
- help="Hashtag", metavar="HASHTAG", default=[], action="append")
- parser.add_option("-D", "--duration", dest="duration", type="int",
- help="Duration", metavar="DURATION", default=None)
- parser.add_option("-n", "--name", dest="name",
- help="Cutting name", metavar="NAME", default=u"Tweets")
- parser.add_option("-R", "--replace", dest="replace", action="store_true",
- help="Replace tweet ensemble", metavar="REPLACE", default=False)
- parser.add_option("-m", "--merge", dest="merge", action="store_true",
- help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
- parser.add_option("-L", "--list-conf", dest="listconf",
- help="list of file to process", metavar="LIST_CONF", default=None)
- parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
- help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
- parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
- help="A list of user screen name", metavar="USER_WHITELIST",default=None)
-
-
- set_logging_options(parser)
-
-
- return parser.parse_args() + (parser,)
-
-
-if __name__ == "__main__" :
-
- (options, args, parser) = get_options()
-
- set_logging(options)
-
- get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
-
- if len(sys.argv) == 1 or options.database is None:
- parser.print_help()
- sys.exit(1)
-
- conn_str = options.database.strip()
- if not re.match("^\w+://.+", conn_str):
- conn_str = 'sqlite:///' + conn_str
-
- engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
- conn = None
- try :
- conn = engine.connect()
- session = None
- try :
- session = Session(bind=conn)
- tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
- #mapper(TweetExclude, tweet_exclude_table)
- metadata.create_all(bind=conn, tables=[tweet_exclude_table])
-
- if options.exclude and os.path.exists(options.exclude):
- with open(options.exclude, 'r+') as f:
- tei = tweet_exclude_table.insert()
- for line in f:
- conn.execute(tei.values(id=long(line.strip())))
- user_whitelist_file = options.user_whitelist
- user_whitelist = None
-
- if options.listconf:
-
- parameters = []
- confdoc = etree.parse(options.listconf)
- for node in confdoc.xpath("/twitter_export/file"):
- params = {}
- for snode in node:
- if snode.tag == "path":
- params['content_file'] = snode.text
- elif snode.tag == "start_date":
- params['start_date'] = snode.text
- elif snode.tag == "end_date":
- params['end_date'] = snode.text
- elif snode.tag == "duration":
- params['duration'] = int(snode.text)
- elif snode.tag == "hashtags":
- params['hashtags'] = [snode.text]
- if options.hashtag or 'hashtags' not in params :
- params['hashtags'] = options.hashtag
- parameters.append(params)
- else:
- parameters = [{
- 'start_date': options.start_date,
- 'end_date' : options.end_date,
- 'duration' : options.duration,
- 'content_file' : options.content_file,
- 'hashtags' : options.hashtag
- }]
-
- for params in parameters:
-
- get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
-
- start_date_str = params.get("start_date",None)
- end_date_str = params.get("end_date", None)
- duration = params.get("duration", None)
- content_file = params.get("content_file", None)
- hashtags = params.get('hashtags', [])
-
- if user_whitelist_file:
- with open(user_whitelist_file, 'r+') as f:
- user_whitelist = list(set([s.strip() for s in f]))
-
- start_date = None
- ts = None
- if start_date_str:
- start_date = parse_date(start_date_str)
- ts = time.mktime(start_date.timetuple())
-
- end_date = None
- if end_date_str:
- end_date = parse_date(end_date_str)
- elif start_date and duration:
- end_date = start_date + datetime.timedelta(seconds=duration)
-
- query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
-
- query_res = query.all()
-
- root = None
- ensemble_parent = None
-
- #to do : analyse situation ldt or iri ? filename set or not ?
-
- if content_file and content_file.find("http") == 0:
-
- get_logger().debug("url : " + content_file) #@UndefinedVariable
-
- h = httplib2.Http()
- resp, content = h.request(content_file)
-
- get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
-
- project = anyjson.deserialize(content)
- root = etree.fromstring(project["ldt"])
-
- elif content_file and os.path.exists(content_file):
-
- doc = etree.parse(content_file)
- root = doc.getroot()
-
-
- if root is None:
-
- root = etree.Element(u"iri")
-
- project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
-
- medias = etree.SubElement(root, u"medias")
- media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
-
- annotations = etree.SubElement(root, u"annotations")
- content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
- ensemble_parent = content
-
-
- if ensemble_parent is None:
- file_type = None
- for node in root:
- if node.tag == "project":
- file_type = "ldt"
- break
- elif node.tag == "head":
- file_type = "iri"
- break
-
- if file_type == "ldt":
- media_nodes = root.xpath("//media")
- if len(media_nodes) > 0:
- media = media_nodes[0]
- annotations_node = root.find(u"annotations")
- if annotations_node is None:
- annotations_node = etree.SubElement(root, u"annotations")
- content_node = annotations_node.find(u"content")
- if content_node is None:
- content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
- ensemble_parent = content_node
- elif file_type == "iri":
- body_node = root.find(u"body")
- if body_node is None:
- body_node = etree.SubElement(root, u"body")
- ensembles_node = body_node.find(u"ensembles")
- if ensembles_node is None:
- ensembles_node = etree.SubElement(body_node, u"ensembles")
- ensemble_parent = ensembles_node
-
-
- if ensemble_parent is None:
- get_logger().error("Can not process file") #@UndefinedVariable
- sys.exit()
-
- if options.replace:
- for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
- if ens.get("id","").startswith("tweet_"):
- ensemble_parent.remove(ens)
-
- ensemble = None
- elements = None
-
- if options.merge:
- ensemble = ensemble_parent.find(u"ensemble")
- if ensemble is not None:
- elements = ensemble.find(u".//elements")
-
- if ensemble is None or elements is None:
- ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
- decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
-
- etree.SubElement(decoupage, u"title").text = unicode(options.name)
- etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
-
- elements = etree.SubElement(decoupage, u"elements")
-
-
- for tw in query_res:
- tweet_ts_dt = tw.created_at
- tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
- if ts is None:
- ts = tweet_ts
- tweet_ts_rel = (tweet_ts-ts) * 1000
- username = None
- profile_url = ""
- if tw.user is not None:
- username = tw.user.name
- profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
- if not username:
- username = "anon."
-
- element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
- etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
- etree.SubElement(element, u"abstract").text = unicode(tw.text)
-
- tags_node = etree.SubElement(element, u"tags")
-
- for entity in tw.entity_list:
- if entity.type == u'entity_hashtag':
- etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
-
- meta_element = etree.SubElement(element, u'meta')
-
- polemics_list = parse_polemics(tw, options.extended_mode)
- if polemics_list:
- polemics_element = etree.Element(u'polemics')
- for pol in polemics_list:
- etree.SubElement(polemics_element, u'polemic').text = pol
- meta_element.append(polemics_element)
-
- etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
-
- # sort by tc in
- if options.merge :
- # remove all elements and put them in a array
- # sort them with tc
- #put them back
- elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
-
-
-
-
- output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)
-
- if content_file and content_file.find("http") == 0:
-
- project["ldt"] = output_data
- body = anyjson.serialize(project)
- get_logger().debug("write http " + content_file) #@UndefinedVariable
- get_logger().debug("write http " + repr(body)) #@UndefinedVariable
- h = httplib2.Http()
- resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
- get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
- else:
- if content_file and os.path.exists(content_file):
- dest_file_name = content_file
- else:
- dest_file_name = options.filename
-
- get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
- output = open(dest_file_name, "w")
- output.write(output_data)
- output.flush()
- output.close()
-
- finally:
- if session:
- session.close()
- finally:
- if conn:
- conn.close()
--- a/script/lib/iri_tweet/utils.py Tue Dec 20 16:26:34 2011 +0100
+++ b/script/lib/iri_tweet/utils.py Sat Jan 07 16:12:44 2012 +0100
@@ -10,6 +10,7 @@
import logging
import os.path
import sys
+import math
import twitter.oauth #@UnresolvedImport
import twitter.oauth_dance #@UnresolvedImport
import twitter_text #@UnresolvedImport
@@ -171,7 +172,7 @@
class TwitterProcessor(object):
- def __init__(self, json_dict, json_txt, source_id, session, access_token=None, token_filename=None):
+ def __init__(self, json_dict, json_txt, source_id, session, access_token=None, token_filename=None, user_query_twitter=False):
if json_dict is None and json_txt is None:
raise TwitterProcessorException("No json")
@@ -194,10 +195,11 @@
self.token_filename = token_filename
self.access_token = access_token
self.obj_buffer = ObjectsBuffer()
+ self.user_query_twitter = user_query_twitter
- def __get_user(self, user_dict, do_merge, query_twitter = False):
+ def __get_user(self, user_dict, do_merge):
get_logger().debug("Get user : " + repr(user_dict)) #@UndefinedVariable
user_dict = adapt_fields(user_dict, fields_adapter["stream"]["user"])
@@ -243,7 +245,7 @@
user_created_at = user_dict.get("created_at", None)
- if user_created_at is None and query_twitter:
+ if user_created_at is None and self.user_query_twitter:
if self.access_token is not None:
acess_token_key, access_token_secret = self.access_token
@@ -333,7 +335,7 @@
return EntityHashtag, entity_dict
def process_user_mentions():
- user_mention = self.__get_user(ind, False, False)
+ user_mention = self.__get_user(ind, False)
if user_mention is None:
entity_dict['user_id'] = None
else:
@@ -598,3 +600,17 @@
raise
except:
self.handleError(record)
+
+def show_progress(current_line, total_line, label, width):
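+    """Draw a one-line text progress bar on stdout: [====    ] NN% done/total - label."""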
+
+ percent = (float(current_line) / float(total_line)) * 100.0
+
+ marks = math.floor(width * (percent / 100.0))
+ spaces = math.floor(width - marks)
+
+ loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']'
+
+ sys.stdout.write(u"%s %d%% %d/%d - %r\r" % (loader, percent, current_line - 1, total_line - 1, label[:50].rjust(50))) #takes the header into account
+ if percent >= 100:
+ sys.stdout.write("\n")
+ sys.stdout.flush()
--- a/script/rest/search_twitter.py Tue Dec 20 16:26:34 2011 +0100
+++ b/script/rest/search_twitter.py Sat Jan 07 16:12:44 2012 +0100
@@ -17,8 +17,6 @@
help="verbose", metavar="VERBOSE", default=0)
parser.add_option("-q", dest="quiet", action="count",
help="quiet", metavar="QUIET", default=0)
- parser.add_option("-r", "--request", dest="request",
- help="twitter request", metavar="REQUEST", default=0)
parser.add_option("-Q", dest="query",
help="query", metavar="QUERY")
parser.add_option("-P", dest="rpp", metavar="RPP", default="50",
@@ -27,9 +25,6 @@
help="Token file name")
- #add request token
- #add
-
return parser.parse_args()
if __name__ == "__main__":
--- a/script/stream/get_stats.py Tue Dec 20 16:26:34 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-
-import httplib2
-import anyjson
-from lxml import etree
-import sys
-import pprint
-
-def get_stats(url):
-
- h = httplib2.Http()
- resp, content = h.request(url)
- #project = anyjson.deserialize(content)
- root = etree.fromstring(content)
-
- #get all annotations
- res_xpath = root.xpath("//ensemble[starts-with(@id,'tweet_')]//element")
-
- total_annot = len(res_xpath)
- total_with_polemic = 0
- total_by_type = {}
-
-
- for annot in res_xpath:
- polemic_list = annot.xpath("meta/polemics/polemic")
- if len(polemic_list)> 0:
- total_with_polemic += 1
- for polemic_item in polemic_list:
- pol_type = polemic_item.text
- total_by_type[pol_type] = total_by_type.get(pol_type,0) + 1
-
-
- return {"total_annotations": total_annot, "total_with_polemics": total_with_polemic, "polemic_by_type": total_by_type}
-
-if __name__ == "__main__":
-
- pp = pprint.PrettyPrinter(indent=4, width=1)
-
- pp.pprint(get_stats(sys.argv[1]))
\ No newline at end of file
--- a/script/stream/recorder_tweetstream.py Tue Dec 20 16:26:34 2011 +0100
+++ b/script/stream/recorder_tweetstream.py Sat Jan 07 16:12:44 2012 +0100
@@ -229,7 +229,7 @@
self.stop_event.set()
-def process_tweet(tweet, source_id, session, access_token, logger):
+def process_tweet(tweet, source_id, session, access_token, twitter_query_user, logger):
try:
tweet_obj = anyjson.deserialize(tweet)
if 'text' not in tweet_obj:
@@ -241,7 +241,7 @@
screen_name = tweet_obj['user']['screen_name']
logger.info(u"Process_tweet from %s : %s" % (screen_name, tweet_obj['text']))
logger.debug(u"Process_tweet :" + repr(tweet))
- processor = utils.TwitterProcessor(tweet_obj, tweet, source_id, session, access_token, None)
+ processor = utils.TwitterProcessor(tweet_obj, tweet, source_id, session, access_token, None, twitter_query_user)
processor.process()
except Exception as e:
message = u"Error %s processing tweet %s" % (repr(e), tweet)
@@ -263,6 +263,7 @@
def __init__(self, session_maker, queue, options, access_token, stop_event, logger_queue, parent_pid):
super(TweetProcess, self).__init__(session_maker, queue, options, access_token, stop_event, logger_queue, parent_pid)
+ self.twitter_query_user = options.twitter_query_user
def do_run(self):
@@ -277,7 +278,7 @@
except Exception as e:
self.logger.debug('Process tweet exception in loop : ' + repr(e))
continue
- process_tweet(tweet_txt, source_id, session, self.access_token, self.logger)
+ process_tweet(tweet_txt, source_id, session, self.access_token, self.twitter_query_user, self.logger)
session.commit()
finally:
session.rollback()
@@ -345,6 +346,9 @@
help="number of process.\nIf 0, only the lefovers of the database are processed.\nIf 1, no postprocessing is done on the tweets.", metavar="PROCESS_NB", default=2, type='int')
parser.add_option("--url", dest="url",
help="The twitter url to connect to.", metavar="URL", default=tweetstream.FilterStream.url)
+ parser.add_option("--query-user", dest="twitter_query_user", action="store_true",
+ help="Query twitter for users", default=False, metavar="QUERY_USER")
+
utils.set_logging_options(parser)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/create_twitter_export_conf.py Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,43 @@
+from lxml import etree
+from optparse import OptionParser #@UnresolvedImport
+
+def get_options():
+
+ parser = OptionParser()
+
+ parser.add_option("-f", "--file", dest="outputfile",
+ help="destination filename", metavar="FILE", default="twitter_export_conf.xml")
+ parser.add_option("-i", "--input", dest="inputfile",
+ help="inputfile", metavar="INPUT", default=None)
+
+ return parser.parse_args()
+
+if __name__ == "__main__":
+ (options, args) = get_options()
+
+ dest_filename = options.outputfile
+
+ path_list = []
+ if options.inputfile is None:
+ path_list = args
+ else:
+ with open(options.inputfile, 'r') as fi:
+ # read all paths now (stripped of newlines), before the with block closes the file
+ path_list = [line.strip() for line in fi]
+
+
+ root = etree.Element("twitter_export")
+
+
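+ # for each listed IRI content file, read the main video duration (ms -> s) and add a <file> entry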
+ for path in path_list:
+
+ iri_doc = etree.parse(path)
+ media_nodes = iri_doc.xpath("/iri/body/medias/media[@id='video']/video")
+ duration = int(media_nodes[0].get("dur"))/1000
+
+ file_elem = etree.SubElement(root, "file")
+ etree.SubElement(file_elem, "path").text = path
+ etree.SubElement(file_elem, "start_date")
+ etree.SubElement(file_elem, "duration").text = unicode(duration)
+
+ tree = etree.ElementTree(root)
+ tree.write(dest_filename, encoding="utf-8", pretty_print=True, xml_declaration=True)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/export_twitter_alchemy.py Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,362 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+from lxml import etree
+from iri_tweet.models import setup_database
+from optparse import OptionParser #@UnresolvedImport
+from sqlalchemy import Table, Column, BigInteger
+from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
+ get_logger)
+import anyjson
+import datetime
+import httplib2
+import os.path
+import re
+import sys
+import time
+import uuid #@UnresolvedImport
+from dateutil.parser import parse as parse_date
+
+#class TweetExclude(object):
+# def __init__(self, id):
+# self.id = id
+#
+# def __repr__(self):
+# return "<TweetExclude(id=%d)>" % (self.id)
+
+
+def parse_polemics(tw, extended_mode):
+ """
+ Parse polemic markers (++, --, ??, ==) in the tweet text and return the list of matching polemic codes, or None if none are found.
+ """
+ polemics = {}
+ for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
+ pol_link = {
+ '++' : u'OK',
+ '--' : u'KO',
+ '??' : u'Q',
+ '==' : u'REF'}[m.group(1)]
+ polemics[pol_link] = pol_link
+
+ if extended_mode:
+ if "?" in tw.text:
+ polemics["Q"] = "Q"
+
+ for entity in tw.entity_list:
+ if entity.type == "entity_url":
+ polemics["REF"] = "REF"
+
+ if len(polemics) > 0:
+ return polemics.keys()
+ else:
+ return None
+
+def get_options():
+ parser = OptionParser()
+ parser.add_option("-f", "--file", dest="filename",
+ help="write export to file", metavar="FILE", default="project.ldt")
+ parser.add_option("-d", "--database", dest="database",
+ help="Input database", metavar="DATABASE")
+ parser.add_option("-s", "--start-date", dest="start_date",
+ help="start date", metavar="START_DATE", default=None)
+ parser.add_option("-e", "--end-date", dest="end_date",
+ help="end date", metavar="END_DATE", default=None)
+ parser.add_option("-I", "--content-file", dest="content_file",
+ help="Content file", metavar="CONTENT_FILE")
+ parser.add_option("-c", "--content", dest="content",
+ help="Content url", metavar="CONTENT")
+ parser.add_option("-V", "--video-url", dest="video",
+ help="video url", metavar="VIDEO")
+ parser.add_option("-i", "--content-id", dest="content_id",
+ help="Content id", metavar="CONTENT_ID")
+ parser.add_option("-x", "--exclude", dest="exclude",
+ help="file containing the id to exclude", metavar="EXCLUDE")
+ parser.add_option("-C", "--color", dest="color",
+ help="Color code", metavar="COLOR", default="16763904")
+ parser.add_option("-H", "--hashtag", dest="hashtag",
+ help="Hashtag", metavar="HASHTAG", default=[], action="append")
+ parser.add_option("-D", "--duration", dest="duration", type="int",
+ help="Duration", metavar="DURATION", default=None)
+ parser.add_option("-n", "--name", dest="name",
+ help="Cutting name", metavar="NAME", default=u"Tweets")
+ parser.add_option("-R", "--replace", dest="replace", action="store_true",
+ help="Replace tweet ensemble", metavar="REPLACE", default=False)
+ parser.add_option("-m", "--merge", dest="merge", action="store_true",
+ help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
+ parser.add_option("-L", "--list-conf", dest="listconf",
+ help="list of file to process", metavar="LIST_CONF", default=None)
+ parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
+ help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
+ parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
+ help="A list of user screen name", metavar="USER_WHITELIST",default=None)
+
+
+ set_logging_options(parser)
+
+
+ return parser.parse_args() + (parser,)
+
+
+if __name__ == "__main__" :
+
+ (options, args, parser) = get_options()
+
+ set_logging(options)
+
+ get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
+
+ if len(sys.argv) == 1 or options.database is None:
+ parser.print_help()
+ sys.exit(1)
+
+ conn_str = options.database.strip()
+ if not re.match("^\w+://.+", conn_str):
+ conn_str = 'sqlite:///' + conn_str
+
+ engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+ conn = None
+ try :
+ conn = engine.connect()
+ session = None
+ try :
+ session = Session(bind=conn)
+ tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
+ #mapper(TweetExclude, tweet_exclude_table)
+ metadata.create_all(bind=conn, tables=[tweet_exclude_table])
+
+ if options.exclude and os.path.exists(options.exclude):
+ with open(options.exclude, 'r+') as f:
+ tei = tweet_exclude_table.insert()
+ for line in f:
+ conn.execute(tei.values(id=long(line.strip())))
+ user_whitelist_file = options.user_whitelist
+ user_whitelist = None
+
+ if options.listconf:
+
+ parameters = []
+ confdoc = etree.parse(options.listconf)
+ for node in confdoc.xpath("/twitter_export/file"):
+ params = {}
+ for snode in node:
+ if snode.tag == "path":
+ params['content_file'] = snode.text
+ elif snode.tag == "start_date":
+ params['start_date'] = snode.text
+ elif snode.tag == "end_date":
+ params['end_date'] = snode.text
+ elif snode.tag == "duration":
+ params['duration'] = int(snode.text)
+ elif snode.tag == "hashtags":
+ params['hashtags'] = [snode.text]
+ if options.hashtag or 'hashtags' not in params :
+ params['hashtags'] = options.hashtag
+ parameters.append(params)
+ else:
+ parameters = [{
+ 'start_date': options.start_date,
+ 'end_date' : options.end_date,
+ 'duration' : options.duration,
+ 'content_file' : options.content_file,
+ 'hashtags' : options.hashtag
+ }]
+
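+ # each parameter set describes one content file / time window to export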
+ for params in parameters:
+
+ get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
+
+ start_date_str = params.get("start_date",None)
+ end_date_str = params.get("end_date", None)
+ duration = params.get("duration", None)
+ content_file = params.get("content_file", None)
+ hashtags = params.get('hashtags', [])
+
+ if user_whitelist_file:
+ with open(user_whitelist_file, 'r+') as f:
+ user_whitelist = list(set([s.strip() for s in f]))
+
+ start_date = None
+ ts = None
+ if start_date_str:
+ start_date = parse_date(start_date_str)
+ ts = time.mktime(start_date.timetuple())
+
+ end_date = None
+ if end_date_str:
+ end_date = parse_date(end_date_str)
+ elif start_date and duration:
+ end_date = start_date + datetime.timedelta(seconds=duration)
+
+ query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
+
+ query_res = query.all()
+
+ root = None
+ ensemble_parent = None
+
+ #to do : analyse situation ldt or iri ? filename set or not ?
+
+ if content_file and content_file.find("http") == 0:
+
+ get_logger().debug("url : " + content_file) #@UndefinedVariable
+
+ h = httplib2.Http()
+ resp, content = h.request(content_file)
+
+ get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
+
+ project = anyjson.deserialize(content)
+ root = etree.fromstring(project["ldt"])
+
+ elif content_file and os.path.exists(content_file):
+
+ doc = etree.parse(content_file)
+ root = doc.getroot()
+
+
+ if root is None:
+
+ root = etree.Element(u"iri")
+
+ project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
+
+ medias = etree.SubElement(root, u"medias")
+ media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
+
+ annotations = etree.SubElement(root, u"annotations")
+ content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
+ ensemble_parent = content
+
+
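+ # no annotations container yet : detect the document flavour (ldt project vs iri file) to find where the tweet ensemble belongs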
+ if ensemble_parent is None:
+ file_type = None
+ for node in root:
+ if node.tag == "project":
+ file_type = "ldt"
+ break
+ elif node.tag == "head":
+ file_type = "iri"
+ break
+
+ if file_type == "ldt":
+ media_nodes = root.xpath("//media")
+ if len(media_nodes) > 0:
+ media = media_nodes[0]
+ annotations_node = root.find(u"annotations")
+ if annotations_node is None:
+ annotations_node = etree.SubElement(root, u"annotations")
+ content_node = annotations_node.find(u"content")
+ if content_node is None:
+ content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
+ ensemble_parent = content_node
+ elif file_type == "iri":
+ body_node = root.find(u"body")
+ if body_node is None:
+ body_node = etree.SubElement(root, u"body")
+ ensembles_node = body_node.find(u"ensembles")
+ if ensembles_node is None:
+ ensembles_node = etree.SubElement(body_node, u"ensembles")
+ ensemble_parent = ensembles_node
+
+
+ if ensemble_parent is None:
+ get_logger().error("Can not process file") #@UndefinedVariable
+ sys.exit()
+
+ if options.replace:
+ for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
+ if ens.get("id","").startswith("tweet_"):
+ ensemble_parent.remove(ens)
+
+ ensemble = None
+ elements = None
+
+ if options.merge:
+ ensemble = ensemble_parent.find(u"ensemble")
+ if ensemble is not None:
+ elements = ensemble.find(u".//elements")
+
+ if ensemble is None or elements is None:
+ ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
+ decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
+
+ etree.SubElement(decoupage, u"title").text = unicode(options.name)
+ etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
+
+ elements = etree.SubElement(decoupage, u"elements")
+
+
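+ # one <element> per tweet, positioned in milliseconds relative to the start timestamp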
+ for tw in query_res:
+ tweet_ts_dt = tw.created_at
+ tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
+ if ts is None:
+ ts = tweet_ts
+ tweet_ts_rel = (tweet_ts-ts) * 1000
+ username = None
+ profile_url = ""
+ if tw.user is not None:
+ username = tw.user.name
+ profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
+ if not username:
+ username = "anon."
+
+ element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
+ etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
+ etree.SubElement(element, u"abstract").text = unicode(tw.text)
+
+ tags_node = etree.SubElement(element, u"tags")
+
+ for entity in tw.entity_list:
+ if entity.type == u'entity_hashtag':
+ etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
+
+ meta_element = etree.SubElement(element, u'meta')
+
+ polemics_list = parse_polemics(tw, options.extended_mode)
+ if polemics_list:
+ polemics_element = etree.Element(u'polemics')
+ for pol in polemics_list:
+ etree.SubElement(polemics_element, u'polemic').text = pol
+ meta_element.append(polemics_element)
+
+ etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
+
+ # when merging into an existing ensemble, re-sort all elements in place by their begin timecode
+ if options.merge:
+     elements[:] = sorted(elements, key=lambda n: int(n.get('begin')))
+
+
+
+
+ output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)
+
+ if content_file and content_file.find("http") == 0:
+
+ project["ldt"] = output_data
+ body = anyjson.serialize(project)
+ get_logger().debug("write http " + content_file) #@UndefinedVariable
+ get_logger().debug("write http " + repr(body)) #@UndefinedVariable
+ h = httplib2.Http()
+ resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
+ get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
+ else:
+ if content_file and os.path.exists(content_file):
+ dest_file_name = content_file
+ else:
+ dest_file_name = options.filename
+
+ get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
+ output = open(dest_file_name, "w")
+ output.write(output_data)
+ output.flush()
+ output.close()
+
+ finally:
+ if session:
+ session.close()
+ finally:
+ if conn:
+ conn.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/get_stats.py Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,38 @@
+
+import httplib2
+import anyjson
+from lxml import etree
+import sys
+import pprint
+
+def get_stats(url):
+
+ h = httplib2.Http()
+ resp, content = h.request(url)
+ #project = anyjson.deserialize(content)
+ root = etree.fromstring(content)
+
+ #get all annotations
+ res_xpath = root.xpath("//ensemble[starts-with(@id,'tweet_')]//element")
+
+ total_annot = len(res_xpath)
+ total_with_polemic = 0
+ total_by_type = {}
+
+
+ for annot in res_xpath:
+ polemic_list = annot.xpath("meta/polemics/polemic")
+ if len(polemic_list)> 0:
+ total_with_polemic += 1
+ for polemic_item in polemic_list:
+ pol_type = polemic_item.text
+ total_by_type[pol_type] = total_by_type.get(pol_type,0) + 1
+
+
+ return {"total_annotations": total_annot, "total_with_polemics": total_with_polemic, "polemic_by_type": total_by_type}
+
+if __name__ == "__main__":
+
+ pp = pprint.PrettyPrinter(indent=4, width=1)
+
+ pp.pprint(get_stats(sys.argv[1]))
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/merge_tweets.py Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,105 @@
+#from models import setup_database
+from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
+from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress
+import argparse
+import sys
+import re
+import anyjson
+import math
+import codecs
+
+def get_option():
+
+ parser = argparse.ArgumentParser(description='Merge two tweet databases')
+
+ parser.add_argument("-l", "--log", dest="logfile",
+ help="log to file", metavar="LOG", default="stderr")
+ parser.add_argument("-v", dest="verbose", action="count",
+ help="verbose", default=0)
+ parser.add_argument("-q", dest="quiet", action="count",
+ help="quiet", default=0)
+ parser.add_argument("--query-user", dest="query_user", action="store_true",
+ help="Query twitter for user information", default=False)
+ parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
+ help="Token file name")
+
+
+ parser.add_argument("source", action="store", nargs=1, type=str, metavar="SOURCE")
+ parser.add_argument("target", action="store", nargs=1, type=str, metavar="TARGET")
+
+
+ return parser.parse_args()
+
+if __name__ == "__main__":
+
+ sys.stdout = codecs.getwriter(sys.stdout.encoding or 'utf-8')(sys.stdout) # encoding is None when stdout is redirected
+
+ options = get_option()
+
+ access_token = None
+ if options.query_user:
+ access_token = get_oauth_token(options.token_filename)
+
+ #open source
+ src_conn_str = options.source[0].strip()
+ if not re.match("^\w+://.+", src_conn_str):
+ src_conn_str = 'sqlite:///' + src_conn_str
+ tgt_conn_str = options.target[0].strip()
+ if not re.match("^\w+://.+", tgt_conn_str):
+ tgt_conn_str = 'sqlite:///' + tgt_conn_str
+
+
+ engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+ engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+
+ conn_src = conn_tgt = session_src = session_tgt = None
+
+ try:
+ #conn_src = engine_src.connect()
+ #conn_tgt = engine_tgt.connect()
+ session_src = Session_src()
+ session_tgt = Session_tgt()
+
+ count_tw_query = Tweet.__table__.count()
+
+ count_tw = engine_src.scalar(count_tw_query)
+
+ if count_tw == 0:
+ print "No tweet to process : exit"
+ sys.exit()
+
+ query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
+ added = 0
+
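+ # replay each source tweet's original JSON through TwitterProcessor so it is inserted into the target database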
+ for i,tweet in enumerate(query_src):
+
+ tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()
+
+ progress_text = u"Process: "
+ if tweet_count == 0:
+ added += 1
+ progress_text = u"Adding : "
+ tweet_source = tweet.tweet_source.original_json
+
+ tweet_obj = anyjson.deserialize(tweet_source)
+ if 'text' not in tweet_obj:
+ # NOTE: 'source_id' was undefined here; assume the merged tweet's source row id is intended
+ tweet_log = TweetLog(tweet_source_id=tweet.tweet_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
+ session_tgt.add(tweet_log)
+ else:
+ tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user)
+ tp.process()
+
+ session_tgt.flush()
+
+ show_progress(i+1, count_tw, progress_text+tweet.text, 70)
+
+ session_tgt.commit()
+ print u"%d new tweet added" % (added)
+
+ finally:
+ if session_tgt is not None: session_tgt.close()
+ if session_src is not None: session_src.close()
+ if conn_tgt is not None: conn_tgt.close()
+ if conn_src is not None: conn_src.close()
+
+
\ No newline at end of file
Binary file script/virtualenv/res/python-dateutil-1.5.tar.gz has changed