# HG changeset patch # User Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com> # Date 1294741037 -3600 # Node ID bb44692e09ee295ce91f6e9fc7b83f56737b10c2 # Parent b7f4b0554ef82018210b3577cdb95416237491e1 script apres traitement enmi diff -r b7f4b0554ef8 -r bb44692e09ee .hgignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgignore Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,7 @@ + +syntax: regexp +^script/stream/virtualenv/twitter_env$ +syntax: regexp +^script/stream/virtualenv$ +syntax: regexp +^script/rest/virtualenv$ \ No newline at end of file diff -r b7f4b0554ef8 -r bb44692e09ee .project --- a/.project Mon Dec 13 19:19:55 2010 +0100 +++ b/.project Tue Jan 11 11:17:17 2011 +0100 @@ -5,7 +5,13 @@ + + org.python.pydev.PyDevBuilder + + + + org.python.pydev.pythonNature diff -r b7f4b0554ef8 -r bb44692e09ee .pydevproject --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.pydevproject Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,7 @@ + + + + +Default +python 2.6 + diff -r b7f4b0554ef8 -r bb44692e09ee .settings/org.eclipse.core.resources.prefs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.settings/org.eclipse.core.resources.prefs Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,4 @@ +#Fri Jan 07 10:05:33 CET 2011 +eclipse.preferences.version=1 +encoding//script/iri_tweet/export_twitter_alchemy.py=utf-8 +encoding//script/rest/export_twitter.py=utf-8 diff -r b7f4b0554ef8 -r bb44692e09ee script/backup/enmi2010_twitter.db Binary file script/backup/enmi2010_twitter.db has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/backup/enmi2010_twitter_rest.db Binary file script/backup/enmi2010_twitter_rest.db has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/backup/export_tweet_enmi2010.db Binary file script/backup/export_tweet_enmi2010.db has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/iri_tweet/__init__.py diff -r b7f4b0554ef8 -r bb44692e09ee script/iri_tweet/create_twitter_export_conf.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/iri_tweet/create_twitter_export_conf.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,43 @@ +from lxml import etree +from optparse import OptionParser + +def get_options(): + + parser = OptionParser() + + parser.add_option("-f", "--file", dest="outputfile", + help="destination filename", metavar="FILE", default="twitter_export_conf.xml") + parser.add_option("-i", "--input", dest="inputfile", + help="inputfile", metavar="INPUT", default=None) + + return parser.parse_args() + +if __name__ == "__main__": + (options, args) = get_options() + + dest_filename = options.outputfile + + path_list = [] + if options.inputfile is None: + path_list = args + else: + with open(options.inputfile, 'r') as fi: + path_list = fi + + + root = etree.Element("twitter_export") + + + for path in path_list: + + iri_doc = etree.parse(path) + media_nodes = iri_doc.xpath("/iri/body/medias/media[@id='video']/video") + duration = int(media_nodes[0].get("dur"))/1000 + + file_elem = etree.SubElement(root, "file") + etree.SubElement(file_elem, "path").text = path + etree.SubElement(file_elem, "start_date") + etree.SubElement(file_elem, "duration").text = unicode(duration) + + tree = etree.ElementTree(root) + tree.write(dest_filename, encoding="utf-8", pretty_print=True, xml_declaration=True) \ No newline at end of file diff -r b7f4b0554ef8 -r bb44692e09ee script/iri_tweet/export_tweet_db.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/iri_tweet/export_tweet_db.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,78 @@ +from models import * +from utils import * +from optparse import OptionParser +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +import logging +import sqlite3 +import sys + + +# 'entities': "tweet_entity", +# 'user': "tweet_user" + +def get_option(): + + parser = OptionParser() + + parser.add_option("-l", "--log", dest="logfile", + help="log to file", metavar="LOG", default="stderr") + parser.add_option("-v", dest="verbose", action="count", + help="verbose", metavar="VERBOSE", default=0) + parser.add_option("-q", dest="quiet", action="count", + help="quiet", metavar="QUIET", default=0) + + return parser.parse_args() + +if __name__ == "__main__": + + (options, args) = get_option() + + logging_config = {} + + if options.logfile == "stdout": + logging_config["stream"] = sys.stdout + elif options.logfile == "stderr": + logging_config["stream"] = sys.stderr + else: + logging_config["filename"] = options.logfile + + logging_config["level"] = max(logging.NOTSET, min(logging.CRITICAL, logging.WARNING - 10 * options.verbose + 10 * options.quiet)) + + logging.basicConfig(**logging_config) + + with sqlite3.connect(args[0]) as conn_in: + engine = create_engine('sqlite:///'+args[1], echo=((options.verbose-options.quiet)>0)) + metadata = Base.metadata + metadata.create_all(engine) + Session = sessionmaker(bind=engine) + session = Session() + try: + curs_in = conn_in.cursor() + fields_mapping = {} + for i,res in enumerate(curs_in.execute("select json from tweet_tweet;")): + logging.debug("main loop %d : %s" % (i, res[0])) + json = eval(res[0]) + if "metadata" in json: + from_twitter_rest(json, res[0], session) + else: + from_twitter_stream(json, res[0], session) + #if "user_mentions" in json["entities"]: + # for hash in json["entities"]["user_mentions"]: + ## for key,value in hash.items(): + # if key not in fields_mapping or fields_mapping[key] is type(None): + # fields_mapping[key] = type(value) + + + #for key,value in fields_mapping.items(): + # print key,value.__name__ + session.commit() + logging.debug("main : %d tweet processed" % (i+1)) + except Exception, e: + session.rollback() + raise e + finally: + session.close() + + + \ No newline at end of file diff -r b7f4b0554ef8 -r bb44692e09ee script/iri_tweet/export_twitter_alchemy.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/iri_tweet/export_twitter_alchemy.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,230 @@ +#!/usr/bin/env python +# coding=utf-8 + +from lxml import etree +from models import * +from optparse import OptionParser +from sqlalchemy import Table, Column, Integer, BigInteger, String, MetaData, \ + ForeignKey, create_engine +from sqlalchemy.orm import sessionmaker, mapper +from sqlalchemy.sql import select +import datetime +import time +import email.utils +import logging +import os +import os.path +import re +import sys +import uuid + +#class TweetExclude(object): +# def __init__(self, id): +# self.id = id +# +# def __repr__(self): +# return "" % (self.id) + +def parse_date(date_str): + ts = email.utils.parsedate_tz(date_str) + return datetime.datetime(*ts[0:7]) + + +if __name__ == "__main__" : + + parser = OptionParser() + parser.add_option("-f", "--file", dest="filename", + help="write export to file", metavar="FILE", default="project_enmi.ldt") + parser.add_option("-d", "--database", dest="database", + help="Input database", metavar="DATABASE") + parser.add_option("-s", "--start-date", dest="start_date", + help="start date", metavar="START_DATE") + parser.add_option("-e", "--end-date", dest="end_date", + help="end date", metavar="END_DATE") + parser.add_option("-I", "--content-file", dest="content_file", + help="Content file", metavar="CONTENT_FILE") + parser.add_option("-c", "--content", dest="content", + help="Content url", metavar="CONTENT") + parser.add_option("-V", "--video-url", dest="video", + help="video url", metavar="VIDEO") + parser.add_option("-i", "--content-id", dest="content_id", + help="Content id", metavar="CONTENT_ID") + parser.add_option("-x", "--exclude", dest="exclude", + help="file containing the id to exclude", metavar="EXCLUDE") + parser.add_option("-C", "--color", dest="color", + help="Color code", metavar="COLOR", default="16763904") + parser.add_option("-H", "--hashtag", dest="hashtag", + help="Hashtag", metavar="HASHTAG", default="enmi") + parser.add_option("-D", "--duration", dest="duration", type="int", + help="Duration", metavar="DURATION", default=None) + parser.add_option("-n", "--name", dest="name", + help="Cutting name", metavar="NAME", default=u"Tweets") + parser.add_option("-R", "--replace", dest="replace", action="store_true", + help="Replace tweet ensemble", metavar="REPLACE", default=False) + parser.add_option("-l", "--log", dest="logfile", + help="log to file", metavar="LOG", default="stderr") + parser.add_option("-v", dest="verbose", action="count", + help="verbose", metavar="VERBOSE", default=0) + parser.add_option("-q", dest="quiet", action="count", + help="quiet", metavar="QUIET", default=0) + parser.add_option("-L", dest="listconf", + help="file containing the list of file to process", metavar="LIST", default=0) + + + + (options, args) = parser.parse_args() + + logging_config = {} + + if options.logfile == "stdout": + logging_config["stream"] = sys.stdout + elif options.logfile == "stderr": + logging_config["stream"] = sys.stderr + else: + logging_config["filename"] = options.logfile + + logging_config["level"] = max(logging.NOTSET, min(logging.CRITICAL, logging.WARNING - 10 * options.verbose + 10 * options.quiet)) + + logging.basicConfig(**logging_config) + + logging.debug("OPTIONS : " + repr(options)) + + + engine = create_engine('sqlite:///'+options.database, echo=((options.verbose-options.quiet)>0)) + Session = sessionmaker() + + conn = engine.connect() + try : + session = Session(bind=conn) + try : + + metadata = MetaData(bind=conn) + tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY']) + #mapper(TweetExclude, tweet_exclude_table) + metadata.create_all() + + if options.exclude and os.path.exists(options.exclude): + with open(options.exclude, 'r+') as f: + tei = tweet_exclude_table.insert() + for line in f: + conn.execute(tei.values(id=long(line.strip()))) + + if options.listconf: + + parameters = [] + confdoc = etree.parse(options.listconf) + for node in confdoc.xpath("/twitter_export/file"): + params = {} + for snode in node: + if snode.tag == "path": + params['content_file'] = snode.text + elif snode.tag == "start_date": + params['start_date'] = snode.text + elif snode.tag == "end_date": + params['end_date'] = snode.text + elif snode.tag == "duration": + params['duration'] = int(snode.text) + parameters.append(params) + else: + parameters = [{ + 'start_date': options.start_date, + 'end_date' : options.end_date, + 'duration' : options.duration, + 'content_file' : otions.content_file + + }] + + for params in parameters: + + logging.debug("PARAMETERS " + repr(params)) + + start_date_str = params.get("start_date",None) + end_date_str = params.get("end_date", None) + duration = params.get("duration", None) + content_file = params.get("content_file", None) + + + start_date = parse_date(start_date_str) + ts = time.mktime(start_date.timetuple()) + + if end_date_str: + end_date = parse_date(end_date_str) + te = time.mktime(end_date.timetuple()) + else: + te = ts + duration + end_date = start_date + datetime.timedelta(seconds=duration) + + + query_res = session.query(Tweet).join(EntityHashtag).join(Hashtag).filter(~Tweet.id.in_(select([tweet_exclude_table.c.id]))).filter(Hashtag.text.contains(options.hashtag)).filter(Tweet.created_at >= start_date).filter(Tweet.created_at <= end_date).all() + + #hashtag = u"%#"+unicode(options.hashtag)+u"%" + + #cursor.execute("select tt.id, tt.text, tt.created_at_ts, tu.name, tu.screen_name from tweet_tweet as tt join tweet_user as tu on tt.user = tu.rowid where text like ? and tt.created_at_ts >= ? and tt.created_at_ts <= ? and tt.id not in (select id from tweet_exclude) order by tt.created_at_ts asc;", (hashtag,ts,te)); + + root = None + ensemble_parent = None + + if content_file and os.path.exists(content_file): + + doc = etree.parse(content_file) + root = doc.getroot() + + ensemble_parent = root.xpath("//ensembles")[0] + + else: + root = etree.Element(u"iri") + + project = etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())}) + + medias = etree.SubElement(root, u"medias") + media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""}) + + annotations = etree.SubElement(root, u"annotations") + content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)}) + ensemble_parent = content + + if options.replace: + for ens in ensemble_parent.iterchildren(tag=u"ensemble"): + if ens.get("id","").startswith("tweet_"): + ensemble_parent.remove(ens) + + ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter pour ENMI 2009"}) + decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"}) + + etree.SubElement(decoupage, u"title").text = unicode(options.name) + etree.SubElement(decoupage, u"abstract").text = unicode(options.name) + + elements = etree.SubElement(decoupage, u"elements") + + for tw in query_res: + tweet_ts_dt = tw.created_at + tweet_ts = int(time.mktime(tweet_ts_dt.timetuple())) + tweet_ts_rel = (tweet_ts-ts) * 1000 + username = None + if tw.user is not None: + username = tw.user.name + if not username: + username = "anon." + element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":u""}) + etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text) + etree.SubElement(element, u"abstract").text = unicode(tw.text) + + tags_node = etree.SubElement(element, u"tags") + + for entity in tw.entity_list: + if entity.type == u'entity_hashtag': + etree.SubElement(tags_node,u"tag").text = entity.hashtag.text + + if content_file and os.path.exists(content_file): + output = open(content_file, "w") + else: + output = open(options.filename, "w") + + output.write(etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)) + output.flush() + output.close() + + finally: + session.close() + finally: + conn.close() diff -r b7f4b0554ef8 -r bb44692e09ee script/iri_tweet/models.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/iri_tweet/models.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,297 @@ +from sqlalchemy import Boolean, Table, Column, BigInteger, \ + Integer, String, MetaData, ForeignKey, DateTime +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import relationship, backref, sessionmaker +import datetime +import email.utils +import simplejson + + +Base = declarative_base() + +CONSUMER_KEY = "54ThDZhpEjokcMgHJOMnQA" +CONSUMER_SECRET = "wUoL9UL2T87tfc97R0Dff2EaqRzpJ5XGdmaN2XK3udA" +ACCESS_TOKEN_KEY= "47312923-LiNTtz0I18YXMVIrFeTuhmH7bOvYsK6p3Ln2Dc" +ACCESS_TOKEN_SECRET = "r3LoXVcjImNAElUpWqTu2SG2xCdWFHkva7xeQoncA" + +def adapt_date(date_str): + ts = email.utils.parsedate_tz(date_str) + return datetime.datetime(*ts[0:7]) + +def adapt_json(obj): + if obj is None: + return None + else: + return simplejson.dumps(obj) + +class Entity(Base): + __tablename__ = "tweet_entity" + id = Column(Integer, primary_key = True) + tweet_id = Column(BigInteger, ForeignKey('tweet_tweet.id')) + #tweet = relationship(Tweet, primaryjoin = tweet_id == Tweet.id) + type = Column(String) + indice_start = Column(Integer) + indice_end = Column(Integer) + __mapper_args__ = {'polymorphic_on': type} + + def __init__(self, **kwargs): + for key, value in kwargs.items(): + if hasattr(self,key): + setattr(self,key,value) + + +class Tweet(Base): + __tablename__ = 'tweet_tweet' + + id = Column(BigInteger, primary_key=True, autoincrement=False) + id_str = Column(String) + contributors = Column(String) + coordinates = Column(String) + created_at = Column(DateTime) + favorited = Column(Boolean) + geo = Column(String) + in_reply_to_screen_name = Column(String) + in_reply_to_status_id = Column(BigInteger) + in_reply_to_status_id_str = Column(String) + in_reply_to_user_id = Column(Integer) + in_reply_to_user_id_str = Column(String) + place = Column(String) + retweet_count = Column(Integer) + retweeted = Column(Boolean) + source = Column(String) + text = Column(String) + truncated = Column(Boolean) + user_id = Column(Integer, ForeignKey('tweet_user.id')) + original_json = Column(String) + entity_list = relationship(Entity, backref='tweet') + + #user = relationship(User, primaryjoin=user_id == User.id) + + def __init__(self, **kwargs): + for key, value in kwargs.items(): + if hasattr(self,key): + setattr(self,key,value) + + +class User(Base): + __tablename__ = "tweet_user" + + id = Column(Integer, primary_key = True, autoincrement=False) + id_str= Column(String) + contributors_enabled= Column(Boolean) + created_at= Column(DateTime) + description= Column(String) + favourites_count = Column(Integer) + follow_request_sent = Column(Boolean) + followers_count = Column(Integer) + following = Column(String) + friends_count = Column(Integer) + geo_enabled= Column(Boolean) + is_translator= Column(Boolean) + lang = Column(String) + listed_count = Column(Integer) + location= Column(String) + name = Column(String) + notifications = Column(String) + profile_background_color= Column(String) + profile_background_image_url= Column(String) + profile_background_tile= Column(Boolean) + profile_image_url= Column(String) + profile_link_color= Column(String) + profile_sidebar_border_color= Column(String) + profile_sidebar_fill_color= Column(String) + profile_text_color= Column(String) + profile_use_background_image= Column(Boolean) + protected= Column(Boolean) + screen_name= Column(String) + show_all_inline_media= Column(Boolean) + statuses_count = Column(Integer) + time_zone= Column(String) + url= Column(String) + utc_offset = Column(Integer) + verified= Column(Boolean) + tweets = relationship(Tweet, backref='user') + + def __init__(self, **kwargs): + for key, value in kwargs.items(): + if hasattr(self,key): + setattr(self,key,value) + + + +class Hashtag(Base): + __tablename__ = "tweet_hashtag" + id = Column(Integer, primary_key=True) + text = Column(String, unique = True) + def __init__(self, **kwargs): + for key, value in kwargs.items(): + if hasattr(self,key): + setattr(self,key,value) + + + +class Url(Base): + __tablename__ = "tweet_url" + id = Column(Integer, primary_key=True) + url = Column(String, unique=True) + expanded_url = Column(String) + def __init__(self, **kwargs): + for key, value in kwargs.items(): + if hasattr(self,key): + setattr(self,key,value) + + + +class EntityHashtag(Entity): + __tablename__ = "tweet_entity_hashtag" + __mapper_args__ = {'polymorphic_identity': 'entity_hashtag'} + id = Column(Integer, ForeignKey('tweet_entity.id'), primary_key=True) + hashtag_id = Column(Integer, ForeignKey("tweet_hashtag.id")) + hashtag = relationship(Hashtag, primaryjoin=hashtag_id == Hashtag.id) + def __init__(self, **kwargs): + super(EntityHashtag, self).__init__(**kwargs) + for key, value in kwargs.items(): + if hasattr(self,key): + setattr(self,key,value) + + +class EntityUrl(Entity): + __tablename__ = "tweet_entity_url" + __mapper_args__ = {'polymorphic_identity': 'entity_url'} + id = Column(Integer, ForeignKey('tweet_entity.id'), primary_key=True) + url_id = Column(Integer, ForeignKey("tweet_url.id")) + url = relationship(Url, primaryjoin=url_id == Url.id) + def __init__(self, **kwargs): + super(EntityUrl, self).__init__(**kwargs) + for key, value in kwargs.items(): + if hasattr(self,key): + setattr(self,key,value) + +class EntityUser(Entity): + __tablename__ = "tweet_entity_user" + __mapper_args__ = {'polymorphic_identity': 'entity_user'} + id = Column(Integer, ForeignKey('tweet_entity.id'), primary_key=True) + user_id = Column(Integer, ForeignKey('tweet_user.id')) + user = relationship(User, primaryjoin=user_id == User.id) + + def __init__(self, **kwargs): + super(EntityUser, self).__init__(**kwargs) + for key, value in kwargs.items(): + if hasattr(self,key): + setattr(self,key,value) + +rest_tweet_tweet = { + u'iso_language_code': 'unicode', + u'text': 'unicode', + u'from_user_id_str': 'unicode', + u'profile_image_url': 'unicode', + u'to_user_id_str': 'NoneType', + u'created_at': 'unicode', + u'source': 'unicode', + u'to_user': 'unicode', + u'id_str': 'unicode', + u'from_user': 'unicode', + u'place': {u'type': 'unicode', u'id': 'unicode', u'full_name': 'unicode'}, + u'from_user_id': 'int', + u'to_user_id': 'NoneType', + u'geo': 'NoneType', + u'id': 'int', + u'metadata': {u'result_type': 'unicode'} +} + +tweet_tweet = { + 'contributors': None, + 'coordinates': None, + 'created_at': 'date', + 'entities': "tweet_entity", + 'favorited': "bool", + 'geo': None, + 'id': "long", + 'id_str': "string", + 'in_reply_to_screen_name': "string", + 'in_reply_to_status_id': "long", + 'in_reply_to_status_id_str': "string", + 'in_reply_to_user_id': "int", + 'in_reply_to_user_id_str': "string", + 'place': "string", + 'retweet_count': "int", + 'retweeted': "bool", + 'source': "string", + 'text': "string", + 'truncated': "bool", + 'user': "tweet_user" +} +tweet_user = { + 'contributors_enabled': 'bool', + 'created_at': 'str', + 'description': 'str', + 'favourites_count': 'int', + 'follow_request_sent': None, + 'followers_count': 'int', + 'following': None, + 'friends_count': 'int', + 'geo_enabled': 'bool', + 'id': 'int', + 'id_str': 'str', + 'is_translator': 'bool', + 'lang': 'str', + 'listed_count': 'int', + 'location': 'str', + 'name': 'str', + 'notifications': 'NoneType', + 'profile_background_color': 'str', + 'profile_background_image_url': 'str', + 'profile_background_tile': 'bool', + 'profile_image_url': 'str', + 'profile_link_color': 'str', + 'profile_sidebar_border_color': 'str', + 'profile_sidebar_fill_color': 'str', + 'profile_text_color': 'str', + 'profile_use_background_image': 'bool', + 'protected': 'bool', + 'screen_name': 'str', + 'show_all_inline_media': 'bool', + 'statuses_count': 'int', + 'time_zone': 'str', + 'url': 'str', + 'utc_offset': 'int', + 'verified': 'bool', +} + + +tweet_entity_hashtag = { + 'hashtag' : 'tweet_hashtag', + 'indice_start' : 'int', + 'indice_end' : 'int', + 'tweet':'tweet_tweet' +} + +tweet_entity_url = { + 'url' : 'tweet_url', + 'indice_start' : 'int', + 'indice_end' : 'int', + 'tweet':'tweet_tweet' +} + +tweet_entity_user = { + 'user' : 'tweet_user', + 'indice_start' : 'int', + 'indice_end' : 'int', + 'tweet':'tweet_tweet' +} + +#id int +#id_str str +#indices list +#name str +#screen_name str + +tweet_hashtag = { + "text": "string" +} + +tweet_url = { + "url": "string", + "expanded_url" : "string", +} + diff -r b7f4b0554ef8 -r bb44692e09ee script/iri_tweet/utils.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/iri_tweet/utils.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,240 @@ +import email.utils +import logging +from models import * +import datetime +import twitter +import twitter_text + + +def parse_date(date_str): + ts = email.utils.parsedate_tz(date_str) + return datetime.datetime(*ts[0:7]) + + +fields_adapter = { + 'stream': { + "tweet": { + "created_at" : adapt_date, + "coordinates" : adapt_json, + "place" : adapt_json, + "geo" : adapt_json, +# "original_json" : adapt_json, + }, + "user": { + "created_at" : adapt_date, + }, + }, + 'rest': { + "tweet" : { + "place" : adapt_json, + "geo" : adapt_json, + "created_at" : adapt_date, +# "original_json" : adapt_json, + }, + }, +} + +# +# adapt fields, return a copy of the field_dict with adapted fields +# +def adapt_fields(fields_dict, adapter_mapping): + def adapt_one_field(field, value): + if field in adapter_mapping and adapter_mapping[field] is not None: + return adapter_mapping[field](value) + else: + return value + return dict([(str(k),adapt_one_field(k,v)) for k,v in fields_dict.items()]) + +def get_user(user_dict, session): + + logging.debug("Get user : " + repr(user_dict)) + + user_id = user_dict.get("id",None) + user_name = user_dict.get("screen_name", user_dict.get("name", None)) + + if user_id is None and user_name is None: + return None + + if user_id: + user = session.query(User).filter(User.id == user_id).first() + else: + user = session.query(User).filter(User.screen_name == user_name).first() + + if user is not None: + return user + + user_created_at = user_dict.get("created_at", None) + + if user_created_at is None: + t = twitter.Twitter(auth=twitter.OAuth(ACCESS_TOKEN_KEY, ACCESS_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET)) + try: + if user_id: + user_dict = t.users.show(user_id=user_id) + else: + user_dict = t.users.show(screen_name=user_name) + except Exception as e: + logging.info("get_user : TWITTER ERROR : " + repr(e)) + logging.info("get_user : TWITTER ERROR : " + str(e)) + + user_dict = adapt_fields(user_dict, fields_adapter["stream"]["user"]) + if "id" not in user_dict: + return None + + user = User(**user_dict) + + session.add(user) + session.flush() + + return user + # if not, if needed get info from twitter + # create user + # return it + +def process_entity(ind, ind_type, tweet, session): + + logging.debug("Process_entity : " + repr(ind) + " : " + repr(ind_type)) + + entity_dict = { + "indice_start": ind["indices"][0], + "indice_end" : ind["indices"][1], + "tweet_id" : tweet.id, + "tweet" : tweet + } + + def process_hashtags(): + text = ind.get("text", ind.get("hashtag", None)) + if text is None: + return None + hashtag = session.query(Hashtag).filter(Hashtag.text == text).first() + if not hashtag: + ind["text"] = text + hashtag = Hashtag(**ind) + session.add(hashtag) + session.flush() + entity_dict['hashtag'] = hashtag + entity_dict['hashtag_id'] = hashtag.id + entity = EntityHashtag(**entity_dict) + return entity + + def process_user_mentions(): + user_mention = get_user(ind, session) + if user_mention is None: + entity_dict['user'] = None + entity_dict['user_id'] = None + else: + entity_dict['user'] = user_mention + entity_dict['user_id'] = user_mention.id + entity = EntityUser(**entity_dict) + return entity + + def process_urls(): + url = session.query(Url).filter(Url.url == ind["url"]).first() + if url is None: + url = Url(**ind) + session.add(url) + session.flush() + entity_dict['url'] = url + entity_dict['url_id'] = url.id + entity = EntityUrl(**entity_dict) + return entity + + #{'': lambda } + entity = { + 'hashtags': process_hashtags, + 'user_mentions' : process_user_mentions, + 'urls' : process_urls + }[ind_type]() + + logging.debug("Process_entity entity_dict: " + repr(entity_dict)) + if entity: + session.add(entity) + + + +def from_twitter_rest(ts, jsontxt, session): + + tweet_nb = session.query(Tweet).filter(Tweet.id == ts["id"]).count() + if tweet_nb > 0: + return + + tweet_fields = { + 'created_at': ts["created_at"], + 'favorited': False, + 'id': ts["id"], + 'id_str': ts["id_str"], + #'in_reply_to_screen_name': ts["to_user"], + 'in_reply_to_user_id': ts["to_user_id"], + 'in_reply_to_user_id_str': ts["to_user_id_str"], + #'place': ts["place"], + 'source': ts["source"], + 'text': ts["text"], + 'truncated': False, + 'original_json' : jsontxt, + } + + #user + + user_fields = { + 'id' : ts['from_user_id'], + 'id_str' : ts['from_user_id_str'], + 'lang' : ts['iso_language_code'], + 'profile_image_url' : ts["profile_image_url"], + 'screen_name' : ts["from_user"], + } + + user = get_user(user_fields, session) + if user is None: + log.warning("USER not found " + repr(user_fields)) + tweet_fields["user"] = None + tweet_fields["user_id"] = None + else: + tweet_fields["user"] = user + tweet_fields["user_id"] = user.id + + tweet_fields = adapt_fields(tweet_fields, fields_adapter["rest"]["tweet"]) + tweet = Tweet(**tweet_fields) + session.add(tweet) + + text = tweet.text + + extractor = twitter_text.Extractor(text) + + for ind in extractor.extract_hashtags_with_indices(): + process_entity(ind, "hashtags", tweet, session) + + for ind in extractor.extract_mentioned_screen_names_with_indices(): + process_entity(ind, "user_mentions", tweet, session) + + for ind in extractor.extract_urls_with_indices(): + process_entity(ind, "urls", tweet, session) + + + + +def from_twitter_stream(ts, jsontxt, session): + + tweet_nb = session.query(Tweet).filter(Tweet.id == ts["id"]).count() + if tweet_nb > 0: + return + + ts_copy = adapt_fields(ts, fields_adapter["stream"]["tweet"]) + + # get or create user + user = get_user(ts["user"], session) + if user is None: + log.warning("USER not found " + repr(ts["user"])) + ts_copy["user"] = None + ts_copy["user_id"] = None + else: + ts_copy["user"] = user + ts_copy["user_id"] = ts_copy["user"].id + ts_copy["original_json"] = jsontxt + + tweet = Tweet(**ts_copy) + session.add(tweet) + session.flush() + + # get entities + for ind_type, entity_list in ts["entities"].items(): + for ind in entity_list: + process_entity(ind, ind_type, tweet, session) diff -r b7f4b0554ef8 -r bb44692e09ee script/rest/enmi_profile.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/rest/enmi_profile.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,134 @@ +import twython +from sqlite3 import * +import datetime, time +import email.utils +from optparse import OptionParser +import os.path +import os +import sys +import simplejson + + +#options filename rpp page total_pages start_date end_date + + + +def adapt_datetime(ts): + return time.mktime(ts.timetuple()) + +def adapt_geo(geo): + return simplejson.dumps(geo) + +def convert_geo(s): + return simplejson.loads(s) + + +register_adapter(datetime.datetime, adapt_datetime) +register_converter("geo", convert_geo) + +columns_tweet = [u'favorited', u'truncated', u'text', u'created_at', u'source', u'in_reply_to_status_id', u'in_reply_to_screen_name', u'in_reply_to_user_id', u'geo', u'id', u'user'] +columns_user = [u'id', u'verified', u'profile_sidebar_fill_color', u'profile_text_color', u'followers_count', u'protected', u'location', u'profile_background_color', u'utc_offset', u'statuses_count', u'description', u'friends_count', u'profile_link_color', u'profile_image_url', u'notifications', u'geo_enabled', u'profile_background_image_url', u'screen_name', u'profile_background_tile', u'favourites_count', u'name', u'url', u'created_at', u'time_zone', u'profile_sidebar_border_color', u'following'] + +def processDate(entry): + ts = email.utils.parsedate(entry["created_at"]) + entry["created_at_ts"] = datetime.datetime.fromtimestamp(time.mktime(ts)) + +def processPage(page, cursor, debug): + for entry in page: + if debug: + print "ENTRY : " + repr(entry) + curs.execute("select id from tweet_tweet where id = ?", (entry["id"],)) + res = curs.fetchone() + if res: + continue + + entry_user = entry["user"] + processDate(entry_user) + cursor.execute("insert into tweet_user ("+",".join(entry_user.keys())+") values (:"+",:".join(entry_user.keys())+");", entry_user); + new_id = cursor.lastrowid + processDate(entry) + entry["user"] = new_id + if entry["geo"]: + entry["geo"] = adapt_geo(entry["geo"]) + new_id = cursor.execute("insert into tweet_tweet ("+",".join(entry.keys())+") values (:"+",:".join(entry.keys())+");", entry); + + +if __name__ == "__main__" : + + parser = OptionParser() + parser.add_option("-f", "--file", dest="filename", + help="write tweet to FILE", metavar="FILE", default="enmi2010_twitter_rest.db") + parser.add_option("-r", "--rpp", dest="rpp", + help="Results per page", metavar="RESULT_PER_PAGE", default=200, type='int') + parser.add_option("-p", "--page", dest="page", + help="page result", metavar="PAGE", default=1, type='int') + parser.add_option("-t", "--total-page", dest="total_page", + help="Total page number", metavar="TOTAL_PAGE", default=16, type='int') + parser.add_option("-s", "--screenname", dest="screen_name", + help="Twitter screen name", metavar="SCREEN_NAME") + parser.add_option("-u", "--user", dest="username", + help="Twitter user", metavar="USER", default=None) + parser.add_option("-w", "--password", dest="password", + help="Twitter password", metavar="PASSWORD", default=None) + parser.add_option("-n", "--new", dest="new", action="store_true", + help="new database", default=False) + parser.add_option("-d", "--debug", dest="debug", action="store_true", + help="debug", default=False) + + + + (options, args) = parser.parse_args() + + if options.debug: + print "OPTIONS : " + print repr(options) + + if options.screen_name is None: + print "No Screen name. Exiting" + sys.exit() + + if options.new and os.path.exists(options.filename): + os.remove(options.filename) + + conn = connect(options.filename) + conn.row_factory = Row + curs = conn.cursor() + + curs.execute("create table if not exists tweet_user ("+ ",".join(columns_user) +", created_at_ts integer);") + + curs.execute("create table if not exists tweet_tweet ("+ ",".join(columns_tweet) +", created_at_ts integer);") + curs.execute("create index if not exists id_index on tweet_tweet (id asc);"); + + curs.execute("select count(*) from tweet_tweet;") + res = curs.fetchone() + + old_total = res[0] + + twitter = twython.setup(username=options.username, password=options.password, headers="IRI enmi (python urllib)") + twitter = twython.Twython(twitter_token = "54ThDZhpEjokcMgHJOMnQA", twitter_secret = "wUoL9UL2T87tfc97R0Dff2EaqRzpJ5XGdmaN2XK3udA") + + search_results = None + page = options.page-1 + + while (page < options.total_page and ( search_results is None or len(search_results) > 0)): + page += 1 + try: + search_results = twitter.getUserTimeline(screen_name=options.screen_name, count=options.rpp, page=page) + except twython.TwythonError, (e): + print "NAME : "+ options.screen_name + " ERROR : " + repr(e.msg) + break + print "NAME : "+ options.screen_name +" PAGE : " + repr(page) + " tweet: " + repr(len(search_results)) + " (total page : " + unicode(options.total_page) + " : rpp : "+unicode(options.rpp)+")" + processPage(search_results, curs, options.debug) + + conn.commit() + + curs.execute("select count(*) from tweet_tweet;") + res = curs.fetchone() + + total = res[0] + + print "Tweet for " + options.screen_name + " : " + unicode(total - old_total) +", Tweet total : " + repr(total) + + conn.close() + + diff -r b7f4b0554ef8 -r bb44692e09ee script/rest/export_twitter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/rest/export_twitter.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,156 @@ +#!/usr/bin/env python +# coding=utf-8 + +from sqlite3 import * +import datetime, time +import email.utils +from optparse import OptionParser +import os.path +import os +import sys +from lxml import etree +import uuid +import re + +def parse_date(date_str): + ts = email.utils.parsedate_tz(date_str) + return time.mktime(ts[0:9]) - 60 * ts[9] + +def adapt_datetime(ts): + return time.mktime(ts.timetuple()) + +def adapt_geo(geo): + return simplejson.dumps(geo) + +def convert_geo(s): + return simplejson.loads(s) + + +register_adapter(datetime.datetime, adapt_datetime) +register_converter("geo", convert_geo) + +columns_tweet = [u'favorited', u'truncated', u'text', u'created_at', u'source', u'in_reply_to_status_id', u'in_reply_to_screen_name', u'in_reply_to_user_id', u'geo', u'id', u'user'] +columns_user = [u'id', u'verified', u'profile_sidebar_fill_color', u'profile_text_color', u'followers_count', u'protected', u'location', u'profile_background_color', u'utc_offset', u'statuses_count', u'description', u'friends_count', u'profile_link_color', u'profile_image_url', u'notifications', u'geo_enabled', u'profile_background_image_url', u'screen_name', u'profile_background_tile', u'favourites_count', u'name', u'url', u'created_at', u'time_zone', u'profile_sidebar_border_color', u'following'] + + +if __name__ == "__main__" : + + parser = OptionParser() + parser.add_option("-f", "--file", dest="filename", + help="write export to file", metavar="FILE", default="project_enmi.ldt") + parser.add_option("-d", "--database", dest="database", + help="Input database", metavar="DATABASE") + parser.add_option("-s", "--start-date", dest="start_date", + help="start date", metavar="START_DATE") + parser.add_option("-e", "--end-date", dest="end_date", + help="end date", metavar="END_DATE") + parser.add_option("-I", "--content-file", dest="content_file", + help="Content file", metavar="CONTENT_FILE") + parser.add_option("-c", "--content", dest="content", + help="Content url", metavar="CONTENT") + parser.add_option("-v", "--video-url", dest="video", + help="video url", metavar="VIDEO") + parser.add_option("-i", "--content-id", dest="content_id", + help="Content id", metavar="CONTENT_ID") + parser.add_option("-x", "--exclude", dest="exclude", + help="file containing the id to exclude", metavar="EXCLUDE") + parser.add_option("-C", "--color", dest="color", + help="Color code", metavar="COLOR", default="16763904") + parser.add_option("-H", "--hashtag", dest="hashtag", + help="Hashtag", metavar="HASHTAG", default="enmi09") + parser.add_option("-D", "--duration", dest="duration", type="int", + help="Duration", metavar="DURATION", default=None) + parser.add_option("-n", "--name", dest="name", + help="Cuttting name", metavar="NAME", default=u"Tweets") + parser.add_option("-R", "--replace", dest="replace", action="store_true", + help="Replace tweet ensemble", metavar="REPLACE", default=False) + + + + (options, args) = parser.parse_args() + + + ts = int(parse_date(options.start_date)) + + if options.end_date: + te = int(parse_date(options.end_date)) + else: + te = ts + options.duration + + conn = connect(options.database) + conn.row_factory = Row + cursor = conn.cursor() + + cursor.execute("create temporary table tweet_exclude (id)") + + if options.exclude and os.path.exists(options.exclude): + f = open(options.exclude, 'r+') + for line in f: + cursor.execute("insert into tweet_exclude (id) values (?)", (int(line.strip()),)) + f.close() + + hashtag = u"%#"+unicode(options.hashtag)+u"%" + cursor.execute("select tt.id, tt.text, tt.created_at_ts, tu.name, tu.screen_name from tweet_tweet as tt join tweet_user as tu on tt.user = tu.rowid where text like ? and tt.created_at_ts >= ? and tt.created_at_ts <= ? and tt.id not in (select id from tweet_exclude) order by tt.created_at_ts asc;", (hashtag,ts,te)); + + root = None + ensemble_parent = None + + if options.content_file and os.path.exists(options.content_file): + + doc = etree.parse(options.content_file) + root = doc.getroot() + + ensemble_parent = root.xpath("//ensembles")[0] + + else: + root = etree.Element(u"iri") + + project = etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())}) + + medias = etree.SubElement(root, u"medias") + media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""}) + + annotations = etree.SubElement(root, u"annotations") + content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)}) + ensemble_parent = content + + if options.replace: + for ens in ensemble_parent.iterchildren(tag=u"ensemble"): + if ens.get("id","").startswith("tweet_"): + ensemble_parent.remove(ens) + + ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter pour ENMI 2009"}) + decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"}) + + etree.SubElement(decoupage, u"title").text = unicode(options.name) + etree.SubElement(decoupage, u"abstract").text = unicode(options.name) + + elements = etree.SubElement(decoupage, u"elements") + + for res in cursor: + tweet_ts = int(res["created_at_ts"]) + tweet_ts_dt = datetime.datetime.fromtimestamp(tweet_ts) + tweet_ts_rel = (tweet_ts-ts) * 1000 + element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(res["id"]), u"color":unicode(options.color), u"author":unicode(res["name"]), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":u""}) + etree.SubElement(element, u"title").text = unicode(res["name"]) + u": " + unicode(res["text"]) + etree.SubElement(element, u"abstract").text = unicode(res["text"]) + + tags = {} + for m in re.finditer(u"\#(\\w+)",res["text"], re.U): + tags[m.group(1)] = "" + + tags_node = etree.SubElement(element, u"tags") + + for t in tags.keys(): + etree.SubElement(tags_node,u"tag").text = t + + if options.content_file and os.path.exists(options.content_file): + output = open(options.content_file, "w") + else: + output = open(options.filename, "w") + + output.write(etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)) + output.flush() + output.close() + + diff -r b7f4b0554ef8 -r bb44692e09ee script/rest/getscreennames.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/rest/getscreennames.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,36 @@ +from sqlite3 import * +import datetime, time +import email.utils +from optparse import OptionParser +import os.path +import os +import sys +import simplejson +import re + +if __name__ == "__main__" : + + parser = OptionParser() + + (options, args) = parser.parse_args() + + conn = connect(args[0]) + conn.row_factory = Row + curs = conn.cursor() + + names = {} + + curs.execute("select tt.text as text from tweet_tweet as tt left join tweet_user as tu on tt.user = tu.rowid where tt.text like \"%ENMI09%\" order by tt.created_at_ts asc;") + + regexp = re.compile("\@(\w+)") + + for row in curs: + text = row["text"] + for m in regexp.finditer(text): + names[m.group(1)]=m.group(1) + + + print repr(names.keys()) + print repr(len(names.keys())) + + diff -r b7f4b0554ef8 -r bb44692e09ee script/rest/process_iri.awk --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/rest/process_iri.awk Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,11 @@ +{ + cmd = "ls -rc " $5"/*.iri | head -n1"; + cmd | getline RES ; + close(cmd); + cmd = "python ../export_twitter.py -d ../enmi2009_twitter_profile.db -x ../exclude.txt -s \""$2"\" -D "$4" -R -I "RES + print cmd; + system(cmd); + cmd = "scp "RES" web.iri.centrepompidou.fr:/iridata/www/amateur/nouveaumonde/static/media/ldt/"$5"/"; + print(cmd); + system(cmd); +} diff -r b7f4b0554ef8 -r bb44692e09ee script/rest/reinit_iri.awk --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/rest/reinit_iri.awk Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,6 @@ +{ + cmd = "ls -rc " $5"/*.iri | head -n1"; + cmd | getline RES ; + close(cmd); + system( "cp " RES".old "RES); +} diff -r b7f4b0554ef8 -r bb44692e09ee script/rest/search_enmi.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/rest/search_enmi.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,39 @@ +import sqlite3 +import twython + +def get_option(): + + parser = OptionParser() + + parser.add_option("-l", "--log", dest="logfile", + help="log to file", metavar="LOG", default="stderr") + parser.add_option("-v", dest="verbose", action="count", + help="verbose", metavar="VERBOSE", default=0) + parser.add_option("-q", dest="quiet", action="count", + help="quiet", metavar="QUIET", default=0) + parser.add_option("-r", "--request", dest="request", + help="twitter request", metavar="REQUEST", default=0) + #add request token + #add + + return parser.parse_args() + +if __name__ == "__main__": + + twitter = twython.Twython() + conn = sqlite3.connect('enmi2010_twitter_rest.db') + try: + conn.row_factory = sqlite3.Row + curs = conn.cursor() + curs.execute("create table if not exists tweet_tweet (json);") + conn.commit() + + results = twitter.searchTwitter(q="#enmi", rpp="50") + for tweet in results["results"]: + print tweet + curs.execute("insert into tweet_tweet (json) values (:json);", {"json":unicode(tweet)}) + conn.commit() + finally: + conn.close() + + diff -r b7f4b0554ef8 -r bb44692e09ee script/stream/recorder.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/stream/recorder.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,76 @@ +import time + +from getpass import getpass +from textwrap import TextWrapper + +import tweepy +import webbrowser + +CONSUMER_KEY = "54ThDZhpEjokcMgHJOMnQA" +CONSUMER_SECRET = "wUoL9UL2T87tfc97R0Dff2EaqRzpJ5XGdmaN2XK3udA" + +class StreamWatcherListener(tweepy.StreamListener): + + status_wrapper = TextWrapper(width=60, initial_indent=' ', subsequent_indent=' ') + + def on_status(self, status): + try: + print self.status_wrapper.fill(status.text) + print '\n %s %s via %s\n' % (status.author.screen_name, status.created_at, status.source) + except: + # Catch any unicode errors while printing to console + # and just ignore them to avoid breaking application. + pass + + def on_error(self, status_code): + print 'An error has occured! Status code = %s' % status_code + return True # keep stream alive + + def on_timeout(self): + print 'Snoozing Zzzzzz' + + + +def main(): + + auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET) + auth_url = auth.get_authorization_url() + print 'Please authorize: ' + auth_url + webbrowser.open(auth_url) + + # Prompt for login credentials and setup stream object + verifier = raw_input('PIN: ').strip() + auth.get_access_token(verifier) + stream = tweepy.Stream(auth, StreamWatcherListener(), timeout=None) + + # Prompt for mode of streaming + valid_modes = ['sample', 'filter'] + while True: + mode = raw_input('Mode? [sample/filter] ') + if mode in valid_modes: + break + print 'Invalid mode! Try again.' + + if mode == 'sample': + stream.sample() + + elif mode == 'filter': + follow_list = raw_input('Users to follow (comma separated): ').strip() + track_list = raw_input('Keywords to track (comma seperated): ').strip() + if follow_list: + follow_list = [u for u in follow_list.split(',')] + else: + follow_list = None + if track_list: + track_list = [k for k in track_list.split(',')] + else: + track_list = None + + stream.filter(follow_list, track_list) + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + print '\nGoodbye!' diff -r b7f4b0554ef8 -r bb44692e09ee script/stream/recorder_tweetstream.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/stream/recorder_tweetstream.py Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,125 @@ +import tweetstream +from getpass import getpass +import socket +socket._fileobject.default_bufsize = 0 +from sqlite3 import * +from optparse import OptionParser +import os + + +#columns_tweet = [u'favorited', u'truncated', u'text', u'created_at', u'source', u'in_reply_to_status_id', u'in_reply_to_screen_name', u'in_reply_to_user_id', u'geo', u'id', u'user'] +columns_tweet = [u'user', u'favorited', u'contributors', u'truncated', u'text', u'created_at', u'retweeted', u'in_reply_to_status_id_str', u'coordinates', u'in_reply_to_user_id_str', u'entities', u'in_reply_to_status_id', u'place', u'in_reply_to_user_id', u'id', u'in_reply_to_screen_name', u'retweet_count', u'geo', u'id_str', u'source'] +#columns_user = [u'id', u'verified', u'profile_sidebar_fill_color', u'profile_text_color', u'followers_count', u'protected', u'location', u'profile_background_color', u'utc_offset', u'statuses_count', u'description', u'friends_count', u'profile_link_color', u'profile_image_url', u'notifications', u'geo_enabled', u'profile_background_image_url', u'screen_name', u'profile_background_tile', u'favourites_count', u'name', u'url', u'created_at', u'time_zone', u'profile_sidebar_border_color', u'following'] +columns_user = [u'follow_request_sent', u'profile_use_background_image', u'id', u'verified', u'profile_sidebar_fill_color', u'profile_text_color', u'followers_count', u'protected', u'location', u'profile_background_color', u'id_str', u'utc_offset', u'statuses_count', u'description', u'friends_count', u'profile_link_color', u'profile_image_url', u'notifications', u'show_all_inline_media', u'geo_enabled', u'profile_background_image_url', u'name', u'lang', u'following', u'profile_background_tile', u'favourites_count', u'screen_name', u'url', u'created_at', u'contributors_enabled', u'time_zone', u'profile_sidebar_border_color', u'is_translator', u'listed_count'] +#just put it in a sqlite3 tqble + + +class ReconnectingTweetStream(tweetstream.TrackStream): + """TweetStream class that automatically tries to reconnect if the + connecting goes down. Reconnecting, and waiting for reconnecting, is + blocking. + + :param username: See :TweetStream: + + :param password: See :TweetStream: + + :keyword url: See :TweetStream: + + :keyword reconnects: Number of reconnects before a ConnectionError is + raised. Default is 3 + + :error_cb: Optional callable that will be called just before trying to + reconnect. The callback will be called with a single argument, the + exception that caused the reconnect attempt. Default is None + + :retry_wait: Time to wait before reconnecting in seconds. Default is 5 + + """ + + def __init__(self, user, password, keywords, url="track", reconnects=3, error_cb=None, retry_wait=5, **kwargs): + self.max_reconnects = reconnects + self.retry_wait = retry_wait + self._reconnects = 0 + self._error_cb = error_cb + super(ReconnectingTweetStream,self).__init__(user, password, keywords, url, **kwargs) + + def next(self): + while True: + try: + return super(ReconnectingTweetStream,self).next() + except tweetstream.ConnectionError, e: + self._reconnects += 1 + if self._reconnects > self.max_reconnects: + raise ConnectionError("Too many retries") + + # Note: error_cb is not called on the last error since we + # raise a ConnectionError instead + if callable(self._error_cb): + self._error_cb(e) + + time.sleep(self.retry_wait) + # Don't listen to auth error, since we can't reasonably reconnect + # when we get one. + + + +def process_tweet(tweet, cursor, debug): + print tweet + cursor.execute("insert into tweet_tweet (json) values (:json);", {"json":unicode(tweet)}); + +def main(username, password, track, curs, debug, reconnects): + + username = username or raw_input('Twitter username: ') + password = password or getpass('Twitter password: ') + + track_list = track or raw_input('Keywords to track (comma seperated): ').strip() + track_list = [k for k in track_list.split(',')] + + stream = ReconnectingTweetStream(username, password, track_list, reconnects=reconnects) + try: + for tweet in stream: + process_tweet(tweet, curs, debug) + finally: + stream.close() + +if __name__ == '__main__': + + parser = OptionParser() + parser.add_option("-f", "--file", dest="filename", + help="write tweet to FILE", metavar="FILE", default="enmi2010_twitter.db") + parser.add_option("-u", "--user", dest="username", + help="Twitter user", metavar="USER", default=None) + parser.add_option("-w", "--password", dest="password", + help="Twitter password", metavar="PASSWORD", default=None) + parser.add_option("-t", "--track", dest="track", + help="Twitter track", metavar="TRACK") + parser.add_option("-n", "--new", dest="new", action="store_true", + help="new database", default=False) + parser.add_option("-d", "--debug", dest="debug", action="store_true", + help="debug", default=False) + parser.add_option("-r", "--reconnects", dest="reconnects", + help="Reconnects", metavar="RECONNECTS", default=10, type='int') + + + (options, args) = parser.parse_args() + + if options.debug: + print "OPTIONS : " + print repr(options) + + if options.new and os.path.exists(options.filename): + os.remove(options.filename) + + conn = connect(options.filename) + try: + conn.row_factory = Row + curs = conn.cursor() + + curs.execute("create table if not exists tweet_tweet (json);") + + try: + main(options.username, options.password, options.track, curs, options.debug, options.reconnects) + except KeyboardInterrupt: + print '\nGoodbye!' + finally: + conn.close() diff -r b7f4b0554ef8 -r bb44692e09ee script/virtualenv/res/credential.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/script/virtualenv/res/credential.txt Tue Jan 11 11:17:17 2011 +0100 @@ -0,0 +1,20 @@ +Consumer key +54ThDZhpEjokcMgHJOMnQA + +Consumer secret +wUoL9UL2T87tfc97R0Dff2EaqRzpJ5XGdmaN2XK3udA + +access_tokens: +47312923-LiNTtz0I18YXMVIrFeTuhmH7bOvYsK6p3Ln2Dc + +access_secret: +r3LoXVcjImNAElUpWqTu2SG2xCdWFHkva7xeQoncA + +Request token URL +http://twitter.com/oauth/request_token + +Access token URL +http://twitter.com/oauth/access_token + +Authorize URL +http://twitter.com/oauth/authorize \ No newline at end of file diff -r b7f4b0554ef8 -r bb44692e09ee script/virtualenv/res/httplib2-0.6.0.tar.gz Binary file script/virtualenv/res/httplib2-0.6.0.tar.gz has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/virtualenv/res/lxml-2.2.7.tar.gz Binary file script/virtualenv/res/lxml-2.2.7.tar.gz has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/virtualenv/res/python-oauth2.tar.gz Binary file script/virtualenv/res/python-oauth2.tar.gz has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/virtualenv/res/pytz-2010o.tar.gz Binary file script/virtualenv/res/pytz-2010o.tar.gz has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/virtualenv/res/simplejson-2.1.2.tar.gz Binary file script/virtualenv/res/simplejson-2.1.2.tar.gz has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/virtualenv/res/sqlalchemy-default.tar.gz Binary file script/virtualenv/res/sqlalchemy-default.tar.gz has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/virtualenv/res/tweetstream.tar.gz Binary file script/virtualenv/res/tweetstream.tar.gz has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/virtualenv/res/twitter-1.4.2.tar.gz Binary file script/virtualenv/res/twitter-1.4.2.tar.gz has changed diff -r b7f4b0554ef8 -r bb44692e09ee script/virtualenv/res/twitter-text.tar.gz Binary file script/virtualenv/res/twitter-text.tar.gz has changed