various corrections for export tweet alchemy. Can give a project
authorYves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
Wed, 16 Jan 2013 05:04:23 +0100
changeset 763 bc29a6fbb8e8
parent 762 38ff25c1db25
child 764 67a0cee0077f
various corrections for export tweet alchemy. Can give a project
script/lib/iri_tweet/iri_tweet/utils.py
script/utils/export_twitter_alchemy.py
script/utils/merge_tweets.py
script/virtualenv/res/lib/lib_create_env.py
script/virtualenv/res/src/requests-1.1.0.tar.gz
script/virtualenv/res/src/requests-v1.0.2.tar.gz
--- a/script/lib/iri_tweet/iri_tweet/utils.py	Fri Jan 11 11:59:03 2013 +0100
+++ b/script/lib/iri_tweet/iri_tweet/utils.py	Wed Jan 16 05:04:23 2013 +0100
@@ -624,3 +624,4 @@
     writer.flush()
     
     return writer
+
--- a/script/utils/export_twitter_alchemy.py	Fri Jan 11 11:59:03 2013 +0100
+++ b/script/utils/export_twitter_alchemy.py	Wed Jan 16 05:04:23 2013 +0100
@@ -2,14 +2,14 @@
 # coding=utf-8
 
 from lxml import etree
-from iri_tweet.models import setup_database
+from iri_tweet.models import setup_database, Tweet, User
 from optparse import OptionParser #@UnresolvedImport
 from sqlalchemy import Table, Column, BigInteger
 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
     get_logger)
 import anyjson
 import datetime
-import httplib2
+import requests
 import os.path
 import re
 import sys
@@ -24,6 +24,9 @@
 #    def __repr__(self):
 #        return "<TweetExclude(id=%d)>" % (self.id)
 
+LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
+LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
+ 
 
 def parse_polemics(tw, extended_mode):
     """
@@ -87,6 +90,12 @@
                       help="list of file to process", metavar="LIST_CONF", default=None)
     parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
                       help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
+    parser.add_option("-b", "--base-url", dest="base_url",
+                      help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
+    parser.add_option("-p", "--project", dest="project_id", 
+                      help="Project id", metavar="PROJECT_ID", default=None)
+    parser.add_option("-P", "--post-param", dest="post_param", 
+                      help="Post param", metavar="POST_PARAM", default=None)        
     parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
     
@@ -127,8 +136,30 @@
             if options.exclude and os.path.exists(options.exclude):
                 with open(options.exclude, 'r+') as f:
                     tei = tweet_exclude_table.insert()
+                    ex_regexp = re.compile("(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)
                     for line in f:
-                        conn.execute(tei.values(id=long(line.strip())))
+                        res = ex_regexp.match(line.strip())
+                        if res:
+                            if res.group('field') == "id":                                
+                                conn.execute(tei.values(id=res.group('value')))
+                            else:
+                                exclude_query = session.query(Tweet)
+                                filter_obj = Tweet
+                                filter_field = res.group('field')
+                                if filter_field.startswith("user_"):
+                                    exclude_query = exclude_query.join(User)
+                                    filter_obj = User
+                                    filter_field = filter_field[len("user_"):]
+                                    
+
+                                if res.group('op') == "=":
+                                    exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field) == res.group('value'))
+                                else:
+                                    exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field).like(res.group('value')))
+                                
+                                for t in exclude_query.all():
+                                     conn.execute(tei.values(id=t.id))
+                                
             user_whitelist_file = options.user_whitelist
             user_whitelist = None
             
@@ -141,6 +172,10 @@
                     for snode in node:
                         if snode.tag == "path":
                             params['content_file'] = snode.text
+                            params['content_file_write'] = snode.text
+                        elif snode.tag == "project_id":
+                            params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
+                            params['project_id'] = snode.text
                         elif snode.tag == "start_date":
                             params['start_date'] = snode.text
                         elif snode.tag == "end_date":
@@ -152,15 +187,24 @@
                     if options.hashtag or 'hashtags' not in params :
                         params['hashtags'] = options.hashtag
                     parameters.append(params)
-            else:                        
+            else:
+                if options.project_id:
+                    content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
+                else:
+                    content_file = options.content_file                                          
                 parameters = [{
                     'start_date': options.start_date,
                     'end_date' : options.end_date,
                     'duration' : options.duration,
-                    'content_file' : options.content_file,
-                    'hashtags' : options.hashtag
+                    'content_file' : content_file,
+                    'content_file_write' : content_file,
+                    'hashtags' : options.hashtag,
+                    'project_id' : options.project_id 
                 }]
-            
+            post_param = {}
+            if options.post_param:
+                post_param = anyjson.loads(options.post_param)
+
             for params in parameters:
                 
                 get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
@@ -169,6 +213,7 @@
                 end_date_str = params.get("end_date", None)
                 duration = params.get("duration", None)
                 content_file = params.get("content_file", None)
+                content_file_write = params.get("content_file_write", None)
                 hashtags = params.get('hashtags', [])
                   
                 if user_whitelist_file:
@@ -181,15 +226,6 @@
                     start_date = parse_date(start_date_str) 
                     ts = time.mktime(start_date.timetuple())
             
-                end_date = None
-                if end_date_str:
-                    end_date = parse_date(end_date_str)
-                elif start_date and duration:
-                    end_date = start_date + datetime.timedelta(seconds=duration)
-                
-                query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
-                    
-                query_res = query.all()
                                  
                 root = None
                 ensemble_parent = None
@@ -200,19 +236,17 @@
                     
                     get_logger().debug("url : " + content_file) #@UndefinedVariable
                     
-                    h = httplib2.Http()
-                    resp, content = h.request(content_file)
-                    
-                    get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
-                    
-                    project = anyjson.deserialize(content)
+                    r = requests.get(content_file, params=post_param)                    
+                    get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
+                    project = r.json()                    
                     root = etree.fromstring(project["ldt"])
                 
                 elif content_file and os.path.exists(content_file):
 
                     doc = etree.parse(content_file)
                     root = doc.getroot()
-                    
+                
+                content_id = None    
                 
                 if root is None:
                 
@@ -227,6 +261,8 @@
                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
                     ensemble_parent = content
                     
+                    content_id = options.content_id
+                    
                 
                 if ensemble_parent is None:
                     file_type = None
@@ -249,6 +285,7 @@
                         if content_node is None:
                             content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
                         ensemble_parent = content_node
+                        content_id = content_node.get(u"id")
                     elif file_type == "iri":
                         body_node = root.find(u"body")
                         if body_node is None:
@@ -257,6 +294,7 @@
                         if ensembles_node is None:
                             ensembles_node = etree.SubElement(body_node, u"ensembles")
                         ensemble_parent = ensembles_node
+                        content_id = root.xpath("head/meta[@name='id']/@content")[0]
                     
                     
                 if ensemble_parent is None:
@@ -285,6 +323,25 @@
                 
                     elements = etree.SubElement(decoupage, u"elements")
 
+                end_date = None
+                if end_date_str:
+                    end_date = parse_date(end_date_str)
+                elif start_date and duration:
+                    end_date = start_date + datetime.timedelta(seconds=duration)
+                elif start_date and options.base_url:                    
+                    # get duration from api
+                    content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
+                    r = requests.get(content_url)
+                    duration = int(r.json()['duration'])
+                    get_logger().debug("get duration " + content_url) #@UndefinedVariable
+                    get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
+
+                    end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
+                
+                query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
+                    
+                query_res = query.all()
+
                 
                 for tw in query_res:
                     tweet_ts_dt = tw.created_at
@@ -333,21 +390,23 @@
                 
                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
                 
-                if content_file and content_file.find("http") == 0:
+                if content_file_write and content_file_write.find("http") == 0:
                     
                     project["ldt"] = output_data
-                    body = anyjson.serialize(project)
-                    get_logger().debug("write http " + content_file) #@UndefinedVariable
-                    get_logger().debug("write http " + repr(body)) #@UndefinedVariable
-                    h = httplib2.Http()
-                    resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
-                    get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
-                    if resp.status != 200:
-                        get_logger().error("Error http " + repr(resp) + " content " + content) #@UndefinedVariable
-                        raise Exception("Error writing content : %d : %s"%(resp.status, resp.reason))                        
+                    post_param = {}
+                    if options.post_param:
+                        post_param = anyjson.loads(options.post_param)
+
+                    get_logger().debug("write http " + content_file_write) #@UndefinedVariable
+                    get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
+                    get_logger().debug("write http " + repr(project)) #@UndefinedVariable
+                    r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type':'application/json'}, params=post_param);
+                    get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
+                    if r.status_code != requests.codes.ok:
+                        r.raise_for_status()
                 else:
-                    if content_file and os.path.exists(content_file):
-                        dest_file_name = content_file 
+                    if content_file_write and os.path.exists(content_file_write):
+                        dest_file_name = content_file_write
                     else:
                         dest_file_name = options.filename
             
--- a/script/utils/merge_tweets.py	Fri Jan 11 11:59:03 2013 +0100
+++ b/script/utils/merge_tweets.py	Wed Jan 16 05:04:23 2013 +0100
@@ -91,7 +91,7 @@
                 
                 session_tgt.flush()
                 
-            show_progress(i+1, count_tw, progress_text+tweet.text, 70)
+            show_progress(i+1, count_tw, repr(progress_text+tweet.text), 70)
                             
         session_tgt.commit()
         print u"%d new tweet added" % (added)
--- a/script/virtualenv/res/lib/lib_create_env.py	Fri Jan 11 11:59:03 2013 +0100
+++ b/script/virtualenv/res/lib/lib_create_env.py	Wed Jan 16 05:04:23 2013 +0100
@@ -29,7 +29,7 @@
     'TWEEPY': {'setup': 'tweepy', 'url':'https://github.com/tweepy/tweepy/archive/1.12.tar.gz', 'local':"tweepy-1.12.tar.gz", 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
     'TWITTER': {'setup': 'twitter', 'url':'http://pypi.python.org/packages/source/t/twitter/twitter-1.9.0.tar.gz', 'local':"twitter-1.9.0.tar.gz", 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
     'TWITTER-TEXT': {'setup': 'twitter-text', 'url':'https://github.com/dryan/twitter-text-py/archive/master.tar.gz', 'local':"twitter-text-1.0.4.tar.gz", 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
-    'REQUESTS': {'setup': 'requests', 'url':'https://github.com/kennethreitz/requests/archive/v1.0.2.tar.gz', 'local':'requests-v1.0.2.tar.gz', 'install' : {'method':'pip', 'option_str': None, 'dict_extra_env': None}},
+    'REQUESTS': {'setup': 'requests', 'url':'https://github.com/kennethreitz/requests/archive/v1.1.0.tar.gz', 'local':'requests-v1.1.0.tar.gz', 'install' : {'method':'pip', 'option_str': None, 'dict_extra_env': None}},
 }
 
 if system_str == 'Windows':
Binary file script/virtualenv/res/src/requests-1.1.0.tar.gz has changed
Binary file script/virtualenv/res/src/requests-v1.0.2.tar.gz has changed