script/utils/export_twitter_alchemy.py
changeset 764 67a0cee0077f
parent 763 bc29a6fbb8e8
child 886 1e110b03ae96
equal deleted inserted replaced
763:bc29a6fbb8e8 764:67a0cee0077f
     2 # coding=utf-8
     2 # coding=utf-8
     3 
     3 
     4 from lxml import etree
     4 from lxml import etree
     5 from iri_tweet.models import setup_database, Tweet, User
     5 from iri_tweet.models import setup_database, Tweet, User
     6 from optparse import OptionParser #@UnresolvedImport
     6 from optparse import OptionParser #@UnresolvedImport
     7 from sqlalchemy import Table, Column, BigInteger
     7 from sqlalchemy import Table, Column, BigInteger, event, bindparam
       
     8 from sqlalchemy.sql import select, func
     8 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
     9 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
     9     get_logger)
    10     get_logger)
    10 import anyjson
    11 import anyjson
    11 import datetime
    12 import datetime
    12 import requests
    13 import requests
    24 #    def __repr__(self):
    25 #    def __repr__(self):
    25 #        return "<TweetExclude(id=%d)>" % (self.id)
    26 #        return "<TweetExclude(id=%d)>" % (self.id)
    26 
    27 
    27 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
    28 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
    28 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
    29 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
    29  
    30 
       
    31 def re_fn(expr, item):    
       
    32     reg = re.compile(expr, re.I)
       
    33     res = reg.search(item)
       
    34     if res:
       
    35         get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
       
    36     return res is not None 
    30 
    37 
    31 def parse_polemics(tw, extended_mode):
    38 def parse_polemics(tw, extended_mode):
    32     """
    39     """
    33     parse polemics in text and return a list of polemic code. None if not polemic found
    40     parse polemics in text and return a list of polemic code. None if not polemic found
    34     """
    41     """
   123         conn_str = 'sqlite:///' + conn_str
   130         conn_str = 'sqlite:///' + conn_str
   124 
   131 
   125     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
   132     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
   126     conn = None
   133     conn = None
   127     try :
   134     try :
   128         conn = engine.connect()    
   135         conn = engine.connect()
       
   136         @event.listens_for(conn, "begin")
       
   137         def do_begin(conn):
       
   138             conn.connection.create_function('regexp', 2, re_fn)    
   129         session = None
   139         session = None
   130         try :
   140         try :
   131             session = Session(bind=conn)         
   141             session = Session(bind=conn)         
   132             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
   142             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
   133             #mapper(TweetExclude, tweet_exclude_table)
   143             #mapper(TweetExclude, tweet_exclude_table)
   144                                 conn.execute(tei.values(id=res.group('value')))
   154                                 conn.execute(tei.values(id=res.group('value')))
   145                             else:
   155                             else:
   146                                 exclude_query = session.query(Tweet)
   156                                 exclude_query = session.query(Tweet)
   147                                 filter_obj = Tweet
   157                                 filter_obj = Tweet
   148                                 filter_field = res.group('field')
   158                                 filter_field = res.group('field')
   149                                 if filter_field.startswith("user_"):
   159                                 if filter_field.startswith("user__"):
   150                                     exclude_query = exclude_query.join(User)
   160                                     exclude_query = exclude_query.outerjoin(User, Tweet.user_id==User.id)
   151                                     filter_obj = User
   161                                     filter_obj = User
   152                                     filter_field = filter_field[len("user_"):]
   162                                     filter_field = filter_field[len("user__"):]                                    
   153                                     
       
   154 
   163 
   155                                 if res.group('op') == "=":
   164                                 if res.group('op') == "=":
   156                                     exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field) == res.group('value'))
   165                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value'))
   157                                 else:
   166                                 else:
   158                                     exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field).like(res.group('value')))
   167                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).op('regexp')(res.group('value')))
   159                                 
   168                                 
       
   169                                 test_query = select([func.count()]).where(tweet_exclude_table.c.id==bindparam('t_id'))
   160                                 for t in exclude_query.all():
   170                                 for t in exclude_query.all():
   161                                      conn.execute(tei.values(id=t.id))
   171                                     get_logger().debug("t : " + repr(t))
       
   172                                     if conn.execute(test_query, t_id=t.id).fetchone()[0] == 0:
       
   173                                         conn.execute(tei.values(id=t.id))
   162                                 
   174                                 
   163             user_whitelist_file = options.user_whitelist
   175             user_whitelist_file = options.user_whitelist
   164             user_whitelist = None
   176             user_whitelist = None
   165             
   177             
   166             if options.listconf:
   178             if options.listconf:
   173                         if snode.tag == "path":
   185                         if snode.tag == "path":
   174                             params['content_file'] = snode.text
   186                             params['content_file'] = snode.text
   175                             params['content_file_write'] = snode.text
   187                             params['content_file_write'] = snode.text
   176                         elif snode.tag == "project_id":
   188                         elif snode.tag == "project_id":
   177                             params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
   189                             params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
       
   190                             params['content_file_write'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
   178                             params['project_id'] = snode.text
   191                             params['project_id'] = snode.text
   179                         elif snode.tag == "start_date":
   192                         elif snode.tag == "start_date":
   180                             params['start_date'] = snode.text
   193                             params['start_date'] = snode.text
   181                         elif snode.tag == "end_date":
   194                         elif snode.tag == "end_date":
   182                             params['end_date'] = snode.text
   195                             params['end_date'] = snode.text
   235                 if content_file and content_file.find("http") == 0:
   248                 if content_file and content_file.find("http") == 0:
   236                     
   249                     
   237                     get_logger().debug("url : " + content_file) #@UndefinedVariable
   250                     get_logger().debug("url : " + content_file) #@UndefinedVariable
   238                     
   251                     
   239                     r = requests.get(content_file, params=post_param)                    
   252                     r = requests.get(content_file, params=post_param)                    
   240                     get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
   253                     #get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
   241                     project = r.json()                    
   254                     project = r.json()
   242                     root = etree.fromstring(project["ldt"])
   255                     text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
       
   256                     root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
   243                 
   257                 
   244                 elif content_file and os.path.exists(content_file):
   258                 elif content_file and os.path.exists(content_file):
   245 
   259 
   246                     doc = etree.parse(content_file)
   260                     doc = etree.parse(content_file)
   247                     root = doc.getroot()
   261                     root = doc.getroot()