code cleaning and reorganisation for scripts
author Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
Sat, 07 Jan 2012 16:12:44 +0100
changeset 464 b9243ade95e2
parent 463 d3b86c65c980
child 465 297156c2efbe
code cleaning and reorganisation for scripts
script/lib/iri_tweet/create_twitter_export_conf.py
script/lib/iri_tweet/export_twitter_alchemy.py
script/lib/iri_tweet/utils.py
script/rest/search_twitter.py
script/stream/get_stats.py
script/stream/recorder_tweetstream.py
script/utils/create_twitter_export_conf.py
script/utils/export_twitter_alchemy.py
script/utils/get_stats.py
script/utils/merge_tweets.py
script/virtualenv/res/python-dateutil-1.5.tar.gz
--- a/script/lib/iri_tweet/create_twitter_export_conf.py	Tue Dec 20 16:26:34 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,43 +0,0 @@
-from lxml import etree
-from optparse import OptionParser #@UnresolvedImport
-
-def get_options():
-
-    parser = OptionParser()
-
-    parser.add_option("-f", "--file", dest="outputfile",
-                      help="destination filename", metavar="FILE", default="twitter_export_conf.xml")
-    parser.add_option("-i", "--input", dest="inputfile", 
-                      help="inputfile", metavar="INPUT", default=None)
-
-    return parser.parse_args()
-
-if __name__ == "__main__":
-    (options, args) = get_options()
-    
-    dest_filename = options.outputfile
-    
-    path_list = []
-    if options.inputfile is None:
-        path_list = args
-    else:
-        with open(options.inputfile, 'r') as fi:
-            path_list = fi
-    
-    
-    root = etree.Element("twitter_export")
-    
-    
-    for path in path_list:
-        
-        iri_doc = etree.parse(path)
-        media_nodes = iri_doc.xpath("/iri/body/medias/media[@id='video']/video")
-        duration = int(media_nodes[0].get("dur"))/1000
-        
-        file_elem = etree.SubElement(root, "file")
-        etree.SubElement(file_elem, "path").text = path
-        etree.SubElement(file_elem, "start_date")
-        etree.SubElement(file_elem, "duration").text = unicode(duration)
-         
-    tree = etree.ElementTree(root)
-    tree.write(dest_filename, encoding="utf-8", pretty_print=True, xml_declaration=True)
\ No newline at end of file
--- a/script/lib/iri_tweet/export_twitter_alchemy.py	Tue Dec 20 16:26:34 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,361 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-
-from lxml import etree
-from models import setup_database
-from optparse import OptionParser #@UnresolvedImport
-from sqlalchemy import Table, Column, BigInteger
-from utils import (parse_date, set_logging_options, set_logging, get_filter_query, 
-    get_logger)
-import anyjson
-import datetime
-import httplib2
-import os.path
-import re
-import sys
-import time
-import uuid #@UnresolvedImport
-
-#class TweetExclude(object):
-#    def __init__(self, id):
-#        self.id = id
-#        
-#    def __repr__(self):
-#        return "<TweetExclude(id=%d)>" % (self.id)
-
-
-def parse_polemics(tw, extended_mode):
-    """
-    parse polemics in text and return a list of polemic code. None if not polemic found
-    """
-    polemics = {} 
-    for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
-        pol_link = {
-            '++' : u'OK',
-            '--' : u'KO',
-            '??' : u'Q',
-            '==' : u'REF'}[m.group(1)]
-        polemics[pol_link] = pol_link
-    
-    if extended_mode:
-        if "?" in tw.text:
-            polemics["Q"] = "Q"
-        
-        for entity in tw.entity_list:
-            if entity.type == "entity_url":
-                polemics["REF"] = "REF" 
-    
-    if len(polemics) > 0:
-        return polemics.keys()
-    else:
-        return None
-
-def get_options():
-    parser = OptionParser()
-    parser.add_option("-f", "--file", dest="filename",
-                      help="write export to file", metavar="FILE", default="project.ldt")
-    parser.add_option("-d", "--database", dest="database",
-                      help="Input database", metavar="DATABASE")
-    parser.add_option("-s", "--start-date", dest="start_date",
-                      help="start date", metavar="START_DATE", default=None)
-    parser.add_option("-e", "--end-date", dest="end_date",
-                      help="end date", metavar="END_DATE", default=None)
-    parser.add_option("-I", "--content-file", dest="content_file",
-                      help="Content file", metavar="CONTENT_FILE")
-    parser.add_option("-c", "--content", dest="content",
-                      help="Content url", metavar="CONTENT")
-    parser.add_option("-V", "--video-url", dest="video",
-                      help="video url", metavar="VIDEO")
-    parser.add_option("-i", "--content-id", dest="content_id",
-                      help="Content id", metavar="CONTENT_ID")
-    parser.add_option("-x", "--exclude", dest="exclude",
-                      help="file containing the id to exclude", metavar="EXCLUDE")
-    parser.add_option("-C", "--color", dest="color",
-                      help="Color code", metavar="COLOR", default="16763904")
-    parser.add_option("-H", "--hashtag", dest="hashtag",
-                      help="Hashtag", metavar="HASHTAG", default=[], action="append")                      
-    parser.add_option("-D", "--duration", dest="duration", type="int",
-                      help="Duration", metavar="DURATION", default=None)
-    parser.add_option("-n", "--name", dest="name",
-                      help="Cutting name", metavar="NAME", default=u"Tweets")
-    parser.add_option("-R", "--replace", dest="replace", action="store_true",
-                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
-    parser.add_option("-m", "--merge", dest="merge", action="store_true",
-                      help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
-    parser.add_option("-L", "--list-conf", dest="listconf",
-                      help="list of file to process", metavar="LIST_CONF", default=None)
-    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
-                      help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
-    parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
-                      help="A list of user screen name", metavar="USER_WHITELIST",default=None)
-    
-    
-    set_logging_options(parser)
-
-    
-    return parser.parse_args() + (parser,)
-
-
-if __name__ == "__main__" :
-
-    (options, args, parser) = get_options()
-    
-    set_logging(options)
-        
-    get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
-    
-    if len(sys.argv) == 1 or options.database is None:
-        parser.print_help()
-        sys.exit(1)
-    
-    conn_str = options.database.strip()
-    if not re.match("^\w+://.+", conn_str):
-        conn_str = 'sqlite:///' + conn_str
-
-    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
-    conn = None
-    try :
-        conn = engine.connect()    
-        session = None
-        try :
-            session = Session(bind=conn)         
-            tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
-            #mapper(TweetExclude, tweet_exclude_table)
-            metadata.create_all(bind=conn, tables=[tweet_exclude_table])
-            
-            if options.exclude and os.path.exists(options.exclude):
-                with open(options.exclude, 'r+') as f:
-                    tei = tweet_exclude_table.insert()
-                    for line in f:
-                        conn.execute(tei.values(id=long(line.strip())))
-            user_whitelist_file = options.user_whitelist
-            user_whitelist = None
-            
-            if options.listconf:
-                
-                parameters = []
-                confdoc = etree.parse(options.listconf)
-                for node in confdoc.xpath("/twitter_export/file"):
-                    params = {}
-                    for snode in node:
-                        if snode.tag == "path":
-                            params['content_file'] = snode.text
-                        elif snode.tag == "start_date":
-                            params['start_date'] = snode.text
-                        elif snode.tag == "end_date":
-                            params['end_date'] = snode.text
-                        elif snode.tag == "duration":
-                            params['duration'] = int(snode.text)
-                        elif snode.tag == "hashtags":
-                            params['hashtags'] = [snode.text]
-                    if options.hashtag or 'hashtags' not in params :
-                        params['hashtags'] = options.hashtag
-                    parameters.append(params)
-            else:                        
-                parameters = [{
-                    'start_date': options.start_date,
-                    'end_date' : options.end_date,
-                    'duration' : options.duration,
-                    'content_file' : options.content_file,
-                    'hashtags' : options.hashtag
-                }]
-            
-            for params in parameters:
-                
-                get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
-                
-                start_date_str = params.get("start_date",None)
-                end_date_str = params.get("end_date", None)
-                duration = params.get("duration", None)
-                content_file = params.get("content_file", None)
-                hashtags = params.get('hashtags', [])
-                  
-                if user_whitelist_file:
-                    with open(user_whitelist_file, 'r+') as f:
-                        user_whitelist = list(set([s.strip() for s in f]))
-                
-                start_date = None
-                ts = None
-                if start_date_str:
-                    start_date = parse_date(start_date_str) 
-                    ts = time.mktime(start_date.timetuple())
-            
-                end_date = None
-                if end_date_str:
-                    end_date = parse_date(end_date_str)
-                elif start_date and duration:
-                    end_date = start_date + datetime.timedelta(seconds=duration)
-                
-                query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
-                    
-                query_res = query.all()
-                                 
-                root = None
-                ensemble_parent = None
-                
-                #to do : analyse situation ldt or iri ? filename set or not ?
-                
-                if content_file and content_file.find("http") == 0:
-                    
-                    get_logger().debug("url : " + content_file) #@UndefinedVariable
-                    
-                    h = httplib2.Http()
-                    resp, content = h.request(content_file)
-                    
-                    get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
-                    
-                    project = anyjson.deserialize(content)
-                    root = etree.fromstring(project["ldt"])
-                
-                elif content_file and os.path.exists(content_file):
-
-                    doc = etree.parse(content_file)
-                    root = doc.getroot()
-                    
-                
-                if root is None:
-                
-                    root = etree.Element(u"iri")
-                        
-                    project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
-                
-                    medias = etree.SubElement(root, u"medias")
-                    media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
-                    
-                    annotations = etree.SubElement(root, u"annotations")    
-                    content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
-                    ensemble_parent = content
-                    
-                
-                if ensemble_parent is None:
-                    file_type = None
-                    for node in root:
-                        if node.tag == "project":
-                            file_type = "ldt"
-                            break
-                        elif node.tag == "head":
-                            file_type = "iri"
-                            break
-                    
-                    if file_type == "ldt":
-                        media_nodes = root.xpath("//media")
-                        if len(media_nodes) > 0:
-                            media = media_nodes[0]
-                        annotations_node = root.find(u"annotations")
-                        if annotations_node is None:
-                            annotations_node = etree.SubElement(root, u"annotations")
-                        content_node = annotations_node.find(u"content")
-                        if content_node is None:
-                            content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
-                        ensemble_parent = content_node
-                    elif file_type == "iri":
-                        body_node = root.find(u"body")
-                        if body_node is None:
-                            body_node = etree.SubElement(root, u"body")
-                        ensembles_node = body_node.find(u"ensembles")
-                        if ensembles_node is None:
-                            ensembles_node = etree.SubElement(body_node, u"ensembles")
-                        ensemble_parent = ensembles_node
-                    
-                    
-                if ensemble_parent is None:
-                    get_logger().error("Can not process file") #@UndefinedVariable
-                    sys.exit()
-            
-                if options.replace:
-                    for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
-                        if ens.get("id","").startswith("tweet_"):
-                            ensemble_parent.remove(ens)
-                
-                ensemble = None
-                elements = None
-                
-                if options.merge:
-                    ensemble = ensemble_parent.find(u"ensemble")
-                    if ensemble is not None:
-                        elements = ensemble.find(u".//elements")                
-                    
-                if ensemble is None or elements is None:
-                    ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
-                    decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
-                
-                    etree.SubElement(decoupage, u"title").text = unicode(options.name)
-                    etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
-                
-                    elements = etree.SubElement(decoupage, u"elements")
-
-                
-                for tw in query_res:
-                    tweet_ts_dt = tw.created_at
-                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
-                    if ts is None:
-                        ts = tweet_ts
-                    tweet_ts_rel = (tweet_ts-ts) * 1000
-                    username = None
-                    profile_url = ""
-                    if tw.user is not None:
-                        username = tw.user.name                    
-                        profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
-                    if not username:
-                        username = "anon."
-                    
-                    element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
-                    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
-                    etree.SubElement(element, u"abstract").text = unicode(tw.text)
-            
-                    tags_node = etree.SubElement(element, u"tags")
-                    
-                    for entity in tw.entity_list:
-                        if entity.type == u'entity_hashtag': 
-                            etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
-                            
-                    meta_element = etree.SubElement(element, u'meta')
-                    
-                    polemics_list = parse_polemics(tw, options.extended_mode)
-                    if polemics_list:
-                        polemics_element = etree.Element(u'polemics')
-                        for pol in polemics_list:
-                            etree.SubElement(polemics_element, u'polemic').text = pol
-                        meta_element.append(polemics_element)
-
-                    etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
-                    
-                # sort by tc in
-                if options.merge :
-                    # remove all elements and put them in a array
-                    # sort them with tc
-                    #put them back
-                    elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
-                    
-                    
-                
-                
-                output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)  
-                
-                if content_file and content_file.find("http") == 0:
-                    
-                    project["ldt"] = output_data
-                    body = anyjson.serialize(project)
-                    get_logger().debug("write http " + content_file) #@UndefinedVariable
-                    get_logger().debug("write http " + repr(body)) #@UndefinedVariable
-                    h = httplib2.Http()
-                    resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
-                    get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
-                else:
-                    if content_file and os.path.exists(content_file):
-                        dest_file_name = content_file 
-                    else:
-                        dest_file_name = options.filename
-            
-                    get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
-                    output = open(dest_file_name, "w")
-                    output.write(output_data)
-                    output.flush()
-                    output.close()
-                
-        finally:
-            if session:
-                session.close()
-    finally:
-        if conn:
-            conn.close()
--- a/script/lib/iri_tweet/utils.py	Tue Dec 20 16:26:34 2011 +0100
+++ b/script/lib/iri_tweet/utils.py	Sat Jan 07 16:12:44 2012 +0100
@@ -10,6 +10,7 @@
 import logging
 import os.path
 import sys
+import math
 import twitter.oauth #@UnresolvedImport
 import twitter.oauth_dance #@UnresolvedImport
 import twitter_text #@UnresolvedImport
@@ -171,7 +172,7 @@
 
 class TwitterProcessor(object):
     
-    def __init__(self, json_dict, json_txt, source_id, session, access_token=None, token_filename=None):
+    def __init__(self, json_dict, json_txt, source_id, session, access_token=None, token_filename=None, user_query_twitter=False):
 
         if json_dict is None and json_txt is None:
             raise TwitterProcessorException("No json")
@@ -194,10 +195,11 @@
         self.token_filename = token_filename
         self.access_token = access_token
         self.obj_buffer = ObjectsBuffer()
+        self.user_query_twitter = user_query_twitter  
         
 
 
-    def __get_user(self, user_dict, do_merge, query_twitter = False):
+    def __get_user(self, user_dict, do_merge):
         get_logger().debug("Get user : " + repr(user_dict)) #@UndefinedVariable
         
         user_dict = adapt_fields(user_dict, fields_adapter["stream"]["user"])
@@ -243,7 +245,7 @@
     
         user_created_at = user_dict.get("created_at", None)
         
-        if user_created_at is None and query_twitter:
+        if user_created_at is None and self.user_query_twitter:
             
             if self.access_token is not None:
                 acess_token_key, access_token_secret = self.access_token
@@ -333,7 +335,7 @@
             return EntityHashtag, entity_dict             
         
         def process_user_mentions():
-            user_mention = self.__get_user(ind, False, False)
+            user_mention = self.__get_user(ind, False)
             if user_mention is None:
                 entity_dict['user_id'] = None
             else:
@@ -598,3 +600,17 @@
             raise
         except:
             self.handleError(record)
+
+def show_progress(current_line, total_line, label, width):
+
+    percent = (float(current_line) / float(total_line)) * 100.0
+
+    marks = math.floor(width * (percent / 100.0))
+    spaces = math.floor(width - marks)
+
+    loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']'
+    
+    sys.stdout.write(u"%s %d%% %d/%d - %s\r" % (loader, percent, current_line - 1, total_line - 1, label[:50].rjust(50))) # the -1 takes the header line into account
+    if percent >= 100:
+        sys.stdout.write("\n")
+    sys.stdout.flush()
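
The new show_progress helper renders a one-line text progress bar on stdout, as used below by merge_tweets.py. A minimal usage sketch, assuming the package is importable (the item list is illustrative):

    from iri_tweet.utils import show_progress

    items = [u"first tweet", u"second tweet", u"third tweet"]  # illustrative payload
    for i, label in enumerate(items):
        # current_line is 1-based; 70 is the bar width in characters
        show_progress(i + 1, len(items), u"Process: " + label, 70)
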
--- a/script/rest/search_twitter.py	Tue Dec 20 16:26:34 2011 +0100
+++ b/script/rest/search_twitter.py	Sat Jan 07 16:12:44 2012 +0100
@@ -17,8 +17,6 @@
                       help="verbose", metavar="VERBOSE", default=0)
     parser.add_option("-q", dest="quiet", action="count",
                       help="quiet", metavar="QUIET", default=0)
-    parser.add_option("-r", "--request", dest="request",
-                      help="twitter request", metavar="REQUEST", default=0)
     parser.add_option("-Q", dest="query",
                       help="query", metavar="QUERY")
     parser.add_option("-P", dest="rpp", metavar="RPP", default="50",
@@ -27,9 +25,6 @@
                       help="Token file name")
     
 
-    #add request token
-    #add 
-
     return parser.parse_args()
 
 if __name__ == "__main__":
--- a/script/stream/get_stats.py	Tue Dec 20 16:26:34 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,38 +0,0 @@
-
-import httplib2
-import anyjson
-from lxml import etree
-import sys
-import pprint
-
-def get_stats(url):
-    
-    h = httplib2.Http()
-    resp, content = h.request(url)    
-    #project = anyjson.deserialize(content)
-    root = etree.fromstring(content)
-
-    #get all annotations
-    res_xpath = root.xpath("//ensemble[starts-with(@id,'tweet_')]//element")
-    
-    total_annot = len(res_xpath)
-    total_with_polemic = 0
-    total_by_type = {}
-    
-    
-    for annot in res_xpath:
-        polemic_list = annot.xpath("meta/polemics/polemic")
-        if len(polemic_list)> 0:
-            total_with_polemic += 1
-            for polemic_item in polemic_list:
-                pol_type = polemic_item.text
-                total_by_type[pol_type] = total_by_type.get(pol_type,0) + 1
-            
-            
-    return {"total_annotations": total_annot, "total_with_polemics": total_with_polemic, "polemic_by_type": total_by_type}
-
-if __name__ == "__main__":
-    
-    pp = pprint.PrettyPrinter(indent=4, width=1)
-    
-    pp.pprint(get_stats(sys.argv[1]))
\ No newline at end of file
--- a/script/stream/recorder_tweetstream.py	Tue Dec 20 16:26:34 2011 +0100
+++ b/script/stream/recorder_tweetstream.py	Sat Jan 07 16:12:44 2012 +0100
@@ -229,7 +229,7 @@
             self.stop_event.set()
 
 
-def process_tweet(tweet, source_id, session, access_token, logger):
+def process_tweet(tweet, source_id, session, access_token, twitter_query_user, logger):
     try:
         tweet_obj = anyjson.deserialize(tweet)
         if 'text' not in tweet_obj:
@@ -241,7 +241,7 @@
             screen_name = tweet_obj['user']['screen_name']
         logger.info(u"Process_tweet from %s : %s" % (screen_name, tweet_obj['text']))
         logger.debug(u"Process_tweet :" + repr(tweet))
-        processor = utils.TwitterProcessor(tweet_obj, tweet, source_id, session, access_token, None)
+        processor = utils.TwitterProcessor(tweet_obj, tweet, source_id, session, access_token, None, twitter_query_user)
         processor.process()
     except Exception as e:
         message = u"Error %s processing tweet %s" % (repr(e), tweet)
@@ -263,6 +263,7 @@
     
     def __init__(self, session_maker, queue, options, access_token, stop_event, logger_queue, parent_pid):
         super(TweetProcess, self).__init__(session_maker, queue, options, access_token, stop_event, logger_queue, parent_pid)
+        self.twitter_query_user = options.twitter_query_user
 
 
     def do_run(self):
@@ -277,7 +278,7 @@
                 except Exception as e:
                     self.logger.debug('Process tweet exception in loop : ' + repr(e))
                     continue
-                process_tweet(tweet_txt, source_id, session, self.access_token, self.logger)
+                process_tweet(tweet_txt, source_id, session, self.access_token, self.twitter_query_user, self.logger)
                 session.commit()
         finally:
             session.rollback()
@@ -345,6 +346,9 @@
                       help="number of process.\nIf 0, only the lefovers of the database are processed.\nIf 1, no postprocessing is done on the tweets.", metavar="PROCESS_NB", default=2, type='int')
     parser.add_option("--url", dest="url",
                       help="The twitter url to connect to.", metavar="URL", default=tweetstream.FilterStream.url)
+    parser.add_option("--query-user", dest="twitter_query_user", action="store_true",
+                      help="Query twitter for users", default=False, metavar="QUERY_USER")
+
 
 
     utils.set_logging_options(parser)
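
The new --query-user flag makes each TweetProcess pass twitter_query_user=True down to utils.TwitterProcessor, which then queries Twitter for user records that lack a created_at field. A hedged invocation sketch (all other required options elided):

    python script/stream/recorder_tweetstream.py --query-user
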
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/create_twitter_export_conf.py	Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,43 @@
+from lxml import etree
+from optparse import OptionParser #@UnresolvedImport
+
+def get_options():
+
+    parser = OptionParser()
+
+    parser.add_option("-f", "--file", dest="outputfile",
+                      help="destination filename", metavar="FILE", default="twitter_export_conf.xml")
+    parser.add_option("-i", "--input", dest="inputfile", 
+                      help="inputfile", metavar="INPUT", default=None)
+
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    (options, args) = get_options()
+    
+    dest_filename = options.outputfile
+    
+    path_list = []
+    if options.inputfile is None:
+        path_list = args
+    else:
+        with open(options.inputfile, 'r') as fi:
+            path_list = fi
+    
+    
+    root = etree.Element("twitter_export")
+    
+    
+    for path in path_list:
+        
+        iri_doc = etree.parse(path)
+        media_nodes = iri_doc.xpath("/iri/body/medias/media[@id='video']/video")
+        duration = int(media_nodes[0].get("dur"))/1000
+        
+        file_elem = etree.SubElement(root, "file")
+        etree.SubElement(file_elem, "path").text = path
+        etree.SubElement(file_elem, "start_date")
+        etree.SubElement(file_elem, "duration").text = unicode(duration)
+         
+    tree = etree.ElementTree(root)
+    tree.write(dest_filename, encoding="utf-8", pretty_print=True, xml_declaration=True)
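
For reference, the configuration this script generates looks like the sketch below (the path and duration are illustrative; duration is the media "dur" attribute converted from milliseconds to seconds, and start_date is left empty to be filled in by hand):

    <?xml version='1.0' encoding='utf-8'?>
    <twitter_export>
      <file>
        <path>/path/to/content.iri</path>
        <start_date/>
        <duration>120</duration>
      </file>
    </twitter_export>
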
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/export_twitter_alchemy.py	Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,362 @@
+#!/usr/bin/env python
+# coding=utf-8
+
+from lxml import etree
+from iri_tweet.models import setup_database
+from optparse import OptionParser #@UnresolvedImport
+from sqlalchemy import Table, Column, BigInteger
+from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
+    get_logger)
+import anyjson
+import datetime
+import httplib2
+import os.path
+import re
+import sys
+import time
+import uuid #@UnresolvedImport
+from dateutil.parser import parse as parse_date
+
+#class TweetExclude(object):
+#    def __init__(self, id):
+#        self.id = id
+#        
+#    def __repr__(self):
+#        return "<TweetExclude(id=%d)>" % (self.id)
+
+
+def parse_polemics(tw, extended_mode):
+    """
+    Parse polemics in the tweet text and return a list of polemic codes, or None if no polemic is found.
+    """
+    polemics = {} 
+    for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
+        pol_link = {
+            '++' : u'OK',
+            '--' : u'KO',
+            '??' : u'Q',
+            '==' : u'REF'}[m.group(1)]
+        polemics[pol_link] = pol_link
+    
+    if extended_mode:
+        if "?" in tw.text:
+            polemics["Q"] = "Q"
+        
+        for entity in tw.entity_list:
+            if entity.type == "entity_url":
+                polemics["REF"] = "REF" 
+    
+    if len(polemics) > 0:
+        return polemics.keys()
+    else:
+        return None
+
+def get_options():
+    parser = OptionParser()
+    parser.add_option("-f", "--file", dest="filename",
+                      help="write export to file", metavar="FILE", default="project.ldt")
+    parser.add_option("-d", "--database", dest="database",
+                      help="Input database", metavar="DATABASE")
+    parser.add_option("-s", "--start-date", dest="start_date",
+                      help="start date", metavar="START_DATE", default=None)
+    parser.add_option("-e", "--end-date", dest="end_date",
+                      help="end date", metavar="END_DATE", default=None)
+    parser.add_option("-I", "--content-file", dest="content_file",
+                      help="Content file", metavar="CONTENT_FILE")
+    parser.add_option("-c", "--content", dest="content",
+                      help="Content url", metavar="CONTENT")
+    parser.add_option("-V", "--video-url", dest="video",
+                      help="video url", metavar="VIDEO")
+    parser.add_option("-i", "--content-id", dest="content_id",
+                      help="Content id", metavar="CONTENT_ID")
+    parser.add_option("-x", "--exclude", dest="exclude",
+                      help="file containing the id to exclude", metavar="EXCLUDE")
+    parser.add_option("-C", "--color", dest="color",
+                      help="Color code", metavar="COLOR", default="16763904")
+    parser.add_option("-H", "--hashtag", dest="hashtag",
+                      help="Hashtag", metavar="HASHTAG", default=[], action="append")                      
+    parser.add_option("-D", "--duration", dest="duration", type="int",
+                      help="Duration", metavar="DURATION", default=None)
+    parser.add_option("-n", "--name", dest="name",
+                      help="Cutting name", metavar="NAME", default=u"Tweets")
+    parser.add_option("-R", "--replace", dest="replace", action="store_true",
+                      help="Replace tweet ensemble", metavar="REPLACE", default=False)
+    parser.add_option("-m", "--merge", dest="merge", action="store_true",
+                      help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
+    parser.add_option("-L", "--list-conf", dest="listconf",
+                      help="list of file to process", metavar="LIST_CONF", default=None)
+    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
+                      help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
+    parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
+                      help="A list of user screen name", metavar="USER_WHITELIST",default=None)
+    
+    
+    set_logging_options(parser)
+
+    
+    return parser.parse_args() + (parser,)
+
+
+if __name__ == "__main__" :
+
+    (options, args, parser) = get_options()
+    
+    set_logging(options)
+        
+    get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
+    
+    if len(sys.argv) == 1 or options.database is None:
+        parser.print_help()
+        sys.exit(1)
+    
+    conn_str = options.database.strip()
+    if not re.match("^\w+://.+", conn_str):
+        conn_str = 'sqlite:///' + conn_str
+
+    engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
+    conn = None
+    try :
+        conn = engine.connect()    
+        session = None
+        try :
+            session = Session(bind=conn)         
+            tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
+            #mapper(TweetExclude, tweet_exclude_table)
+            metadata.create_all(bind=conn, tables=[tweet_exclude_table])
+            
+            if options.exclude and os.path.exists(options.exclude):
+                with open(options.exclude, 'r+') as f:
+                    tei = tweet_exclude_table.insert()
+                    for line in f:
+                        conn.execute(tei.values(id=long(line.strip())))
+            user_whitelist_file = options.user_whitelist
+            user_whitelist = None
+            
+            if options.listconf:
+                
+                parameters = []
+                confdoc = etree.parse(options.listconf)
+                for node in confdoc.xpath("/twitter_export/file"):
+                    params = {}
+                    for snode in node:
+                        if snode.tag == "path":
+                            params['content_file'] = snode.text
+                        elif snode.tag == "start_date":
+                            params['start_date'] = snode.text
+                        elif snode.tag == "end_date":
+                            params['end_date'] = snode.text
+                        elif snode.tag == "duration":
+                            params['duration'] = int(snode.text)
+                        elif snode.tag == "hashtags":
+                            params['hashtags'] = [snode.text]
+                    if options.hashtag or 'hashtags' not in params :
+                        params['hashtags'] = options.hashtag
+                    parameters.append(params)
+            else:                        
+                parameters = [{
+                    'start_date': options.start_date,
+                    'end_date' : options.end_date,
+                    'duration' : options.duration,
+                    'content_file' : options.content_file,
+                    'hashtags' : options.hashtag
+                }]
+            
+            for params in parameters:
+                
+                get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
+                
+                start_date_str = params.get("start_date",None)
+                end_date_str = params.get("end_date", None)
+                duration = params.get("duration", None)
+                content_file = params.get("content_file", None)
+                hashtags = params.get('hashtags', [])
+                  
+                if user_whitelist_file:
+                    with open(user_whitelist_file, 'r+') as f:
+                        user_whitelist = list(set([s.strip() for s in f]))
+                
+                start_date = None
+                ts = None
+                if start_date_str:
+                    start_date = parse_date(start_date_str) 
+                    ts = time.mktime(start_date.timetuple())
+            
+                end_date = None
+                if end_date_str:
+                    end_date = parse_date(end_date_str)
+                elif start_date and duration:
+                    end_date = start_date + datetime.timedelta(seconds=duration)
+                
+                query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
+                    
+                query_res = query.all()
+                                 
+                root = None
+                ensemble_parent = None
+                
+                #to do : analyse situation ldt or iri ? filename set or not ?
+                
+                if content_file and content_file.find("http") == 0:
+                    
+                    get_logger().debug("url : " + content_file) #@UndefinedVariable
+                    
+                    h = httplib2.Http()
+                    resp, content = h.request(content_file)
+                    
+                    get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
+                    
+                    project = anyjson.deserialize(content)
+                    root = etree.fromstring(project["ldt"])
+                
+                elif content_file and os.path.exists(content_file):
+
+                    doc = etree.parse(content_file)
+                    root = doc.getroot()
+                    
+                
+                if root is None:
+                
+                    root = etree.Element(u"iri")
+                        
+                    project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
+                
+                    medias = etree.SubElement(root, u"medias")
+                    media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
+                    
+                    annotations = etree.SubElement(root, u"annotations")    
+                    content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
+                    ensemble_parent = content
+                    
+                
+                if ensemble_parent is None:
+                    file_type = None
+                    for node in root:
+                        if node.tag == "project":
+                            file_type = "ldt"
+                            break
+                        elif node.tag == "head":
+                            file_type = "iri"
+                            break
+                    
+                    if file_type == "ldt":
+                        media_nodes = root.xpath("//media")
+                        if len(media_nodes) > 0:
+                            media = media_nodes[0]
+                        annotations_node = root.find(u"annotations")
+                        if annotations_node is None:
+                            annotations_node = etree.SubElement(root, u"annotations")
+                        content_node = annotations_node.find(u"content")
+                        if content_node is None:
+                            content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
+                        ensemble_parent = content_node
+                    elif file_type == "iri":
+                        body_node = root.find(u"body")
+                        if body_node is None:
+                            body_node = etree.SubElement(root, u"body")
+                        ensembles_node = body_node.find(u"ensembles")
+                        if ensembles_node is None:
+                            ensembles_node = etree.SubElement(body_node, u"ensembles")
+                        ensemble_parent = ensembles_node
+                    
+                    
+                if ensemble_parent is None:
+                    get_logger().error("Can not process file") #@UndefinedVariable
+                    sys.exit()
+            
+                if options.replace:
+                    for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
+                        if ens.get("id","").startswith("tweet_"):
+                            ensemble_parent.remove(ens)
+                
+                ensemble = None
+                elements = None
+                
+                if options.merge:
+                    ensemble = ensemble_parent.find(u"ensemble")
+                    if ensemble is not None:
+                        elements = ensemble.find(u".//elements")                
+                    
+                if ensemble is None or elements is None:
+                    ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
+                    decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
+                
+                    etree.SubElement(decoupage, u"title").text = unicode(options.name)
+                    etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
+                
+                    elements = etree.SubElement(decoupage, u"elements")
+
+                
+                for tw in query_res:
+                    tweet_ts_dt = tw.created_at
+                    tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
+                    if ts is None:
+                        ts = tweet_ts
+                    tweet_ts_rel = (tweet_ts-ts) * 1000
+                    username = None
+                    profile_url = ""
+                    if tw.user is not None:
+                        username = tw.user.name                    
+                        profile_url = tw.user.profile_image_url if tw.user.profile_image_url is not None else ""
+                    if not username:
+                        username = "anon."
+                    
+                    element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":unicode(profile_url)})
+                    etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
+                    etree.SubElement(element, u"abstract").text = unicode(tw.text)
+            
+                    tags_node = etree.SubElement(element, u"tags")
+                    
+                    for entity in tw.entity_list:
+                        if entity.type == u'entity_hashtag': 
+                            etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
+                            
+                    meta_element = etree.SubElement(element, u'meta')
+                    
+                    polemics_list = parse_polemics(tw, options.extended_mode)
+                    if polemics_list:
+                        polemics_element = etree.Element(u'polemics')
+                        for pol in polemics_list:
+                            etree.SubElement(polemics_element, u'polemic').text = pol
+                        meta_element.append(polemics_element)
+
+                    etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.tweet_source.original_json))
+                    
+                # sort by tc in
+                if options.merge :
+                    # remove all elements and put them in a array
+                    # sort them with tc
+                    #put them back
+                    elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
+                    
+                    
+                
+                
+                output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)  
+                
+                if content_file and content_file.find("http") == 0:
+                    
+                    project["ldt"] = output_data
+                    body = anyjson.serialize(project)
+                    get_logger().debug("write http " + content_file) #@UndefinedVariable
+                    get_logger().debug("write http " + repr(body)) #@UndefinedVariable
+                    h = httplib2.Http()
+                    resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
+                    get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
+                else:
+                    if content_file and os.path.exists(content_file):
+                        dest_file_name = content_file 
+                    else:
+                        dest_file_name = options.filename
+            
+                    get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
+                    output = open(dest_file_name, "w")
+                    output.write(output_data)
+                    output.flush()
+                    output.close()
+                
+        finally:
+            if session:
+                session.close()
+    finally:
+        if conn:
+            conn.close()
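
parse_polemics maps the inline markers ++, --, ?? and == to the codes OK, KO, Q and REF; extended mode additionally flags a "?" in the text as Q and URL entities as REF. A minimal behaviour sketch, using an illustrative stand-in for the ORM tweet object:

    class FakeTweet(object):  # illustrative stand-in for the Tweet model
        def __init__(self, text, entity_list=None):
            self.text = text
            self.entity_list = entity_list or []

    print parse_polemics(FakeTweet(u"great point ++ but see the ref =="), False)
    # -> a list containing u'OK' and u'REF' (dict key order is unspecified)
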
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/get_stats.py	Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,38 @@
+
+import httplib2
+import anyjson
+from lxml import etree
+import sys
+import pprint
+
+def get_stats(url):
+    
+    h = httplib2.Http()
+    resp, content = h.request(url)    
+    #project = anyjson.deserialize(content)
+    root = etree.fromstring(content)
+
+    #get all annotations
+    res_xpath = root.xpath("//ensemble[starts-with(@id,'tweet_')]//element")
+    
+    total_annot = len(res_xpath)
+    total_with_polemic = 0
+    total_by_type = {}
+    
+    
+    for annot in res_xpath:
+        polemic_list = annot.xpath("meta/polemics/polemic")
+        if len(polemic_list)> 0:
+            total_with_polemic += 1
+            for polemic_item in polemic_list:
+                pol_type = polemic_item.text
+                total_by_type[pol_type] = total_by_type.get(pol_type,0) + 1
+            
+            
+    return {"total_annotations": total_annot, "total_with_polemics": total_with_polemic, "polemic_by_type": total_by_type}
+
+if __name__ == "__main__":
+    
+    pp = pprint.PrettyPrinter(indent=4, width=1)
+    
+    pp.pprint(get_stats(sys.argv[1]))
\ No newline at end of file
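
get_stats fetches an ldt/iri document over HTTP and tallies the tweet annotations it contains; the script can also be run directly as python script/utils/get_stats.py <url>. A usage sketch (the URL and the counts are illustrative):

    stats = get_stats("http://ldt.example.org/project.ldt")
    # e.g. {'total_annotations': 42,
    #       'total_with_polemics': 10,
    #       'polemic_by_type': {'OK': 6, 'Q': 4}}
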
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/script/utils/merge_tweets.py	Sat Jan 07 16:12:44 2012 +0100
@@ -0,0 +1,105 @@
+#from models import setup_database
+from iri_tweet.models import setup_database, TweetSource, Tweet, TweetLog
+from iri_tweet.utils import TwitterProcessor, get_oauth_token, show_progress
+import argparse
+import sys
+import re
+import anyjson
+import math
+import codecs
+
+def get_option():
+    
+    parser = argparse.ArgumentParser(description='Merge tweets databases')
+
+    parser.add_argument("-l", "--log", dest="logfile",
+                        help="log to file", metavar="LOG", default="stderr")
+    parser.add_argument("-v", dest="verbose", action="count",
+                        help="verbose", default=0)
+    parser.add_argument("-q", dest="quiet", action="count",
+                        help="quiet", default=0)
+    parser.add_argument("--query-user", dest="query_user", action="store_true",
+                        help="Query twitter for user information",  default=False)
+    parser.add_argument("-t", dest="token_filename", metavar="TOKEN_FILENAME", default=".oauth_token",
+                      help="Token file name")
+
+    
+    parser.add_argument("source", action="store", nargs=1, type=str, metavar="SOURCE")
+    parser.add_argument("target", action="store", nargs=1, type=str, metavar="TARGET")
+    
+
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    
+    sys.stdout = codecs.getwriter(sys.stdout.encoding)(sys.stdout)
+    
+    options = get_option()
+    
+    access_token = None
+    if options.query_user:
+        access_token = get_oauth_token(options.token_filename)
+    
+    #open source
+    src_conn_str = options.source[0].strip()
+    if not re.match("^\w+://.+", src_conn_str):
+        src_conn_str = 'sqlite:///' + src_conn_str
+    tgt_conn_str = options.target[0].strip()
+    if not re.match("^\w+://.+", tgt_conn_str):
+        tgt_conn_str = 'sqlite:///' + tgt_conn_str
+
+
+    engine_src, metadata_src, Session_src = setup_database(src_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
+    engine_tgt, metadata_tgt, Session_tgt = setup_database(tgt_conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
+
+    conn_src = conn_tgt = session_src = session_tgt = None
+    
+    try:
+        #conn_src = engine_src.connect()
+        #conn_tgt = engine_tgt.connect()
+        session_src = Session_src()
+        session_tgt = Session_tgt()
+        
+        count_tw_query = Tweet.__table__.count()
+        
+        count_tw = engine_src.scalar(count_tw_query)
+        
+        if count_tw == 0:
+            print "No tweet to process : exit"
+            sys.exit()
+            
+        query_src = session_src.query(Tweet).join(TweetSource).yield_per(100)
+        added = 0
+        
+        for i,tweet in enumerate(query_src):
+            
+            tweet_count = session_tgt.query(Tweet).filter(Tweet.id == tweet.id).count()
+            
+            progress_text = u"Process: "
+            if tweet_count == 0:
+                added += 1
+                progress_text = u"Adding : "
+                tweet_source = tweet.tweet_source.original_json
+                                
+                tweet_obj = anyjson.deserialize(tweet_source)
+                if 'text' not in tweet_obj:
+                    tweet_log = TweetLog(tweet_source_id=tweet.tweet_source.id, status=TweetLog.TWEET_STATUS['NOT_TWEET'])
+                    session_tgt.add(tweet_log)
+                else:                
+                    tp = TwitterProcessor(None, tweet_source, None, session_tgt, access_token, options.token_filename, user_query_twitter=options.query_user)
+                    tp.process()
+                
+                session_tgt.flush()
+                
+            show_progress(i+1, count_tw, progress_text+tweet.text, 70)
+                            
+        session_tgt.commit()
+        print u"%d new tweet added" % (added)
+        
+    finally:
+        if session_tgt is not None: session_tgt.close()
+        if session_src is not None: session_src.close()
+        if conn_tgt is not None: conn_tgt.close()
+        if conn_src is not None: conn_src.close()
+        
+        
\ No newline at end of file
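
merge_tweets.py copies the tweets missing from the target database out of the source database, re-running TwitterProcessor on each original JSON payload. A hedged invocation sketch (the database file names are illustrative; plain paths are auto-prefixed with sqlite:///, and --query-user additionally requires an OAuth token file, .oauth_token by default):

    python script/utils/merge_tweets.py --query-user src_tweets.db tgt_tweets.db
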
Binary file script/virtualenv/res/python-dateutil-1.5.tar.gz has changed