script/utils/export_twitter_alchemy.py
changeset 764 67a0cee0077f
parent 763 bc29a6fbb8e8
child 886 1e110b03ae96
--- a/script/utils/export_twitter_alchemy.py	Wed Jan 16 05:04:23 2013 +0100
+++ b/script/utils/export_twitter_alchemy.py	Wed Jan 16 18:25:10 2013 +0100
@@ -4,7 +4,8 @@
 from lxml import etree
 from iri_tweet.models import setup_database, Tweet, User
 from optparse import OptionParser #@UnresolvedImport
-from sqlalchemy import Table, Column, BigInteger
+from sqlalchemy import Table, Column, BigInteger, event, bindparam
+from sqlalchemy.sql import select, func
 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
     get_logger)
 import anyjson
@@ -26,7 +27,13 @@
 
 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
- 
+
+def re_fn(expr, item):
+    """Callback registered as the SQLite 'regexp' function: True if expr matches item (case-insensitive search), False otherwise."""
+    res = re.search(expr, item, re.I) if item is not None else None  # a NULL column value arrives as None; treat it as "no match" instead of letting re raise TypeError
+    if res:
+        get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
+    return res is not None
 
 def parse_polemics(tw, extended_mode):
     """
@@ -125,7 +132,10 @@
     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
     conn = None
     try :
-        conn = engine.connect()    
+        conn = engine.connect()
+        @event.listens_for(conn, "begin")
+        def do_begin(conn):
+            conn.connection.create_function('regexp', 2, re_fn)    
         session = None
         try :
             session = Session(bind=conn)         
@@ -146,19 +156,21 @@
                                 exclude_query = session.query(Tweet)
                                 filter_obj = Tweet
                                 filter_field = res.group('field')
-                                if filter_field.startswith("user_"):
-                                    exclude_query = exclude_query.join(User)
+                                if filter_field.startswith("user__"):
+                                    exclude_query = exclude_query.outerjoin(User, Tweet.user_id==User.id)
                                     filter_obj = User
-                                    filter_field = filter_field[len("user_"):]
-                                    
+                                    filter_field = filter_field[len("user__"):]                                    
 
                                 if res.group('op') == "=":
-                                    exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field) == res.group('value'))
+                                    exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value'))
                                 else:
-                                    exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field).like(res.group('value')))
+                                    exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).op('regexp')(res.group('value')))
                                 
+                                test_query = select([func.count()]).where(tweet_exclude_table.c.id==bindparam('t_id'))
                                 for t in exclude_query.all():
-                                     conn.execute(tei.values(id=t.id))
+                                    get_logger().debug("t : " + repr(t))
+                                    if conn.execute(test_query, t_id=t.id).fetchone()[0] == 0:
+                                        conn.execute(tei.values(id=t.id))
                                 
             user_whitelist_file = options.user_whitelist
             user_whitelist = None
@@ -175,6 +187,7 @@
                             params['content_file_write'] = snode.text
                         elif snode.tag == "project_id":
                             params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
+                            params['content_file_write'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
                             params['project_id'] = snode.text
                         elif snode.tag == "start_date":
                             params['start_date'] = snode.text
@@ -237,9 +250,10 @@
                     get_logger().debug("url : " + content_file) #@UndefinedVariable
                     
                     r = requests.get(content_file, params=post_param)                    
-                    get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
-                    project = r.json()                    
-                    root = etree.fromstring(project["ldt"])
+                    #get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
+                    project = r.json()
+                    text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
+                    root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
                 
                 elif content_file and os.path.exists(content_file):