script/utils/export_twitter_alchemy.py
changeset 886 1e110b03ae96
parent 693 2ef837069108
parent 764 67a0cee0077f
child 888 6fc6637d8403
equal deleted inserted replaced
885:2251fb41dbc7 886:1e110b03ae96
     1 #!/usr/bin/env python
     1 #!/usr/bin/env python
     2 # coding=utf-8
     2 # coding=utf-8
     3 
     3 
     4 from lxml import etree
     4 from lxml import etree
     5 from iri_tweet.models import setup_database
     5 from iri_tweet.models import setup_database, Tweet, User
     6 from optparse import OptionParser #@UnresolvedImport
     6 from optparse import OptionParser #@UnresolvedImport
     7 from sqlalchemy import Table, Column, BigInteger
     7 from sqlalchemy import Table, Column, BigInteger, event, bindparam
       
     8 from sqlalchemy.sql import select, func
     8 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
     9 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
     9     get_logger)
    10     get_logger)
    10 import anyjson
    11 import anyjson
    11 import datetime
    12 import datetime
    12 import httplib2
    13 import requests
    13 import os.path
    14 import os.path
    14 import re
    15 import re
    15 import sys
    16 import sys
    16 import time
    17 import time
    17 import uuid #@UnresolvedImport
    18 import uuid #@UnresolvedImport
    22 #        self.id = id
    23 #        self.id = id
    23 #        
    24 #        
    24 #    def __repr__(self):
    25 #    def __repr__(self):
    25 #        return "<TweetExclude(id=%d)>" % (self.id)
    26 #        return "<TweetExclude(id=%d)>" % (self.id)
    26 
    27 
       
    28 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
       
    29 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
       
    30 
       
    31 def re_fn(expr, item):    
       
    32     reg = re.compile(expr, re.I)
       
    33     res = reg.search(item)
       
    34     if res:
       
    35         get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
       
    36     return res is not None 
    27 
    37 
    28 def parse_polemics(tw, extended_mode):
    38 def parse_polemics(tw, extended_mode):
    29     """
    39     """
    30     parse polemics in text and return a list of polemic code. None if not polemic found
    40     parse polemics in text and return a list of polemic code. None if not polemic found
    31     """
    41     """
    85                       help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
    95                       help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
    86     parser.add_option("-L", "--list-conf", dest="listconf",
    96     parser.add_option("-L", "--list-conf", dest="listconf",
    87                       help="list of file to process", metavar="LIST_CONF", default=None)
    97                       help="list of file to process", metavar="LIST_CONF", default=None)
    88     parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
    98     parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
    89                       help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
    99                       help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
       
   100     parser.add_option("-b", "--base-url", dest="base_url",
       
   101                       help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
       
   102     parser.add_option("-p", "--project", dest="project_id", 
       
   103                       help="Project id", metavar="PROJECT_ID", default=None)
       
   104     parser.add_option("-P", "--post-param", dest="post_param", 
       
   105                       help="Post param", metavar="POST_PARAM", default=None)        
    90     parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
   106     parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
    91                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
   107                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
    92     
   108     
    93     
   109     
    94     set_logging_options(parser)
   110     set_logging_options(parser)
   114         conn_str = 'sqlite:///' + conn_str
   130         conn_str = 'sqlite:///' + conn_str
   115 
   131 
   116     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
   132     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
   117     conn = None
   133     conn = None
   118     try :
   134     try :
   119         conn = engine.connect()    
   135         conn = engine.connect()
       
   136         @event.listens_for(conn, "begin")
       
   137         def do_begin(conn):
       
   138             conn.connection.create_function('regexp', 2, re_fn)    
   120         session = None
   139         session = None
   121         try :
   140         try :
   122             session = Session(bind=conn)         
   141             session = Session(bind=conn)         
   123             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
   142             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
   124             #mapper(TweetExclude, tweet_exclude_table)
   143             #mapper(TweetExclude, tweet_exclude_table)
   125             metadata.create_all(bind=conn, tables=[tweet_exclude_table])
   144             metadata.create_all(bind=conn, tables=[tweet_exclude_table])
   126             
   145             
   127             if options.exclude and os.path.exists(options.exclude):
   146             if options.exclude and os.path.exists(options.exclude):
   128                 with open(options.exclude, 'r+') as f:
   147                 with open(options.exclude, 'r+') as f:
   129                     tei = tweet_exclude_table.insert()
   148                     tei = tweet_exclude_table.insert()
       
   149                     ex_regexp = re.compile("(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)
   130                     for line in f:
   150                     for line in f:
   131                         conn.execute(tei.values(id=long(line.strip())))
   151                         res = ex_regexp.match(line.strip())
       
   152                         if res:
       
   153                             if res.group('field') == "id":                                
       
   154                                 conn.execute(tei.values(id=res.group('value')))
       
   155                             else:
       
   156                                 exclude_query = session.query(Tweet)
       
   157                                 filter_obj = Tweet
       
   158                                 filter_field = res.group('field')
       
   159                                 if filter_field.startswith("user__"):
       
   160                                     exclude_query = exclude_query.outerjoin(User, Tweet.user_id==User.id)
       
   161                                     filter_obj = User
       
   162                                     filter_field = filter_field[len("user__"):]                                    
       
   163 
       
   164                                 if res.group('op') == "=":
       
   165                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value'))
       
   166                                 else:
       
   167                                     exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).op('regexp')(res.group('value')))
       
   168                                 
       
   169                                 test_query = select([func.count()]).where(tweet_exclude_table.c.id==bindparam('t_id'))
       
   170                                 for t in exclude_query.all():
       
   171                                     get_logger().debug("t : " + repr(t))
       
   172                                     if conn.execute(test_query, t_id=t.id).fetchone()[0] == 0:
       
   173                                         conn.execute(tei.values(id=t.id))
       
   174                                 
   132             user_whitelist_file = options.user_whitelist
   175             user_whitelist_file = options.user_whitelist
   133             user_whitelist = None
   176             user_whitelist = None
   134             
   177             
   135             if options.listconf:
   178             if options.listconf:
   136                 
   179                 
   139                 for node in confdoc.xpath("/twitter_export/file"):
   182                 for node in confdoc.xpath("/twitter_export/file"):
   140                     params = {}
   183                     params = {}
   141                     for snode in node:
   184                     for snode in node:
   142                         if snode.tag == "path":
   185                         if snode.tag == "path":
   143                             params['content_file'] = snode.text
   186                             params['content_file'] = snode.text
       
   187                             params['content_file_write'] = snode.text
       
   188                         elif snode.tag == "project_id":
       
   189                             params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
       
   190                             params['content_file_write'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
       
   191                             params['project_id'] = snode.text
   144                         elif snode.tag == "start_date":
   192                         elif snode.tag == "start_date":
   145                             params['start_date'] = snode.text
   193                             params['start_date'] = snode.text
   146                         elif snode.tag == "end_date":
   194                         elif snode.tag == "end_date":
   147                             params['end_date'] = snode.text
   195                             params['end_date'] = snode.text
   148                         elif snode.tag == "duration":
   196                         elif snode.tag == "duration":
   150                         elif snode.tag == "hashtags":
   198                         elif snode.tag == "hashtags":
   151                             params['hashtags'] = [snode.text]
   199                             params['hashtags'] = [snode.text]
   152                     if options.hashtag or 'hashtags' not in params :
   200                     if options.hashtag or 'hashtags' not in params :
   153                         params['hashtags'] = options.hashtag
   201                         params['hashtags'] = options.hashtag
   154                     parameters.append(params)
   202                     parameters.append(params)
   155             else:                        
   203             else:
       
   204                 if options.project_id:
       
   205                     content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
       
   206                 else:
       
   207                     content_file = options.content_file                                          
   156                 parameters = [{
   208                 parameters = [{
   157                     'start_date': options.start_date,
   209                     'start_date': options.start_date,
   158                     'end_date' : options.end_date,
   210                     'end_date' : options.end_date,
   159                     'duration' : options.duration,
   211                     'duration' : options.duration,
   160                     'content_file' : options.content_file,
   212                     'content_file' : content_file,
   161                     'hashtags' : options.hashtag
   213                     'content_file_write' : content_file,
       
   214                     'hashtags' : options.hashtag,
       
   215                     'project_id' : options.project_id 
   162                 }]
   216                 }]
   163             
   217             post_param = {}
       
   218             if options.post_param:
       
   219                 post_param = anyjson.loads(options.post_param)
       
   220 
   164             for params in parameters:
   221             for params in parameters:
   165                 
   222                 
   166                 get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
   223                 get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
   167                 
   224                 
   168                 start_date_str = params.get("start_date",None)
   225                 start_date_str = params.get("start_date",None)
   169                 end_date_str = params.get("end_date", None)
   226                 end_date_str = params.get("end_date", None)
   170                 duration = params.get("duration", None)
   227                 duration = params.get("duration", None)
   171                 content_file = params.get("content_file", None)
   228                 content_file = params.get("content_file", None)
       
   229                 content_file_write = params.get("content_file_write", None)
   172                 hashtags = params.get('hashtags', [])
   230                 hashtags = params.get('hashtags', [])
   173                   
   231                   
   174                 if user_whitelist_file:
   232                 if user_whitelist_file:
   175                     with open(user_whitelist_file, 'r+') as f:
   233                     with open(user_whitelist_file, 'r+') as f:
   176                         user_whitelist = list(set([s.strip() for s in f]))
   234                         user_whitelist = list(set([s.strip() for s in f]))
   179                 ts = None
   237                 ts = None
   180                 if start_date_str:
   238                 if start_date_str:
   181                     start_date = parse_date(start_date_str) 
   239                     start_date = parse_date(start_date_str) 
   182                     ts = time.mktime(start_date.timetuple())
   240                     ts = time.mktime(start_date.timetuple())
   183             
   241             
   184                 end_date = None
       
   185                 if end_date_str:
       
   186                     end_date = parse_date(end_date_str)
       
   187                 elif start_date and duration:
       
   188                     end_date = start_date + datetime.timedelta(seconds=duration)
       
   189                 
       
   190                 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
       
   191                     
       
   192                 query_res = query.all()
       
   193                                  
   242                                  
   194                 root = None
   243                 root = None
   195                 ensemble_parent = None
   244                 ensemble_parent = None
   196                 
   245                 
   197                 #to do : analyse situation ldt or iri ? filename set or not ?
   246                 #to do : analyse situation ldt or iri ? filename set or not ?
   198                 
   247                 
   199                 if content_file and content_file.find("http") == 0:
   248                 if content_file and content_file.find("http") == 0:
   200                     
   249                     
   201                     get_logger().debug("url : " + content_file) #@UndefinedVariable
   250                     get_logger().debug("url : " + content_file) #@UndefinedVariable
   202                     
   251                     
   203                     h = httplib2.Http()
   252                     r = requests.get(content_file, params=post_param)                    
   204                     resp, content = h.request(content_file)
   253                     #get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
   205                     
   254                     project = r.json()
   206                     get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
   255                     text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
   207                     
   256                     root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
   208                     project = anyjson.deserialize(content)
       
   209                     root = etree.fromstring(project["ldt"])
       
   210                 
   257                 
   211                 elif content_file and os.path.exists(content_file):
   258                 elif content_file and os.path.exists(content_file):
   212 
   259 
   213                     doc = etree.parse(content_file)
   260                     doc = etree.parse(content_file)
   214                     root = doc.getroot()
   261                     root = doc.getroot()
   215                     
   262                 
       
   263                 content_id = None    
   216                 
   264                 
   217                 if root is None:
   265                 if root is None:
   218                 
   266                 
   219                     root = etree.Element(u"iri")
   267                     root = etree.Element(u"iri")
   220                         
   268                         
   224                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   272                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   225                     
   273                     
   226                     annotations = etree.SubElement(root, u"annotations")    
   274                     annotations = etree.SubElement(root, u"annotations")    
   227                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
   275                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
   228                     ensemble_parent = content
   276                     ensemble_parent = content
       
   277                     
       
   278                     content_id = options.content_id
   229                     
   279                     
   230                 
   280                 
   231                 if ensemble_parent is None:
   281                 if ensemble_parent is None:
   232                     file_type = None
   282                     file_type = None
   233                     for node in root:
   283                     for node in root:
   247                             annotations_node = etree.SubElement(root, u"annotations")
   297                             annotations_node = etree.SubElement(root, u"annotations")
   248                         content_node = annotations_node.find(u"content")
   298                         content_node = annotations_node.find(u"content")
   249                         if content_node is None:
   299                         if content_node is None:
   250                             content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
   300                             content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
   251                         ensemble_parent = content_node
   301                         ensemble_parent = content_node
       
   302                         content_id = content_node.get(u"id")
   252                     elif file_type == "iri":
   303                     elif file_type == "iri":
   253                         body_node = root.find(u"body")
   304                         body_node = root.find(u"body")
   254                         if body_node is None:
   305                         if body_node is None:
   255                             body_node = etree.SubElement(root, u"body")
   306                             body_node = etree.SubElement(root, u"body")
   256                         ensembles_node = body_node.find(u"ensembles")
   307                         ensembles_node = body_node.find(u"ensembles")
   257                         if ensembles_node is None:
   308                         if ensembles_node is None:
   258                             ensembles_node = etree.SubElement(body_node, u"ensembles")
   309                             ensembles_node = etree.SubElement(body_node, u"ensembles")
   259                         ensemble_parent = ensembles_node
   310                         ensemble_parent = ensembles_node
       
   311                         content_id = root.xpath("head/meta[@name='id']/@content")[0]
   260                     
   312                     
   261                     
   313                     
   262                 if ensemble_parent is None:
   314                 if ensemble_parent is None:
   263                     get_logger().error("Can not process file") #@UndefinedVariable
   315                     get_logger().error("Can not process file") #@UndefinedVariable
   264                     sys.exit()
   316                     sys.exit()
   282                 
   334                 
   283                     etree.SubElement(decoupage, u"title").text = unicode(options.name)
   335                     etree.SubElement(decoupage, u"title").text = unicode(options.name)
   284                     etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
   336                     etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
   285                 
   337                 
   286                     elements = etree.SubElement(decoupage, u"elements")
   338                     elements = etree.SubElement(decoupage, u"elements")
       
   339 
       
   340                 end_date = None
       
   341                 if end_date_str:
       
   342                     end_date = parse_date(end_date_str)
       
   343                 elif start_date and duration:
       
   344                     end_date = start_date + datetime.timedelta(seconds=duration)
       
   345                 elif start_date and options.base_url:                    
       
   346                     # get duration from api
       
   347                     content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
       
   348                     r = requests.get(content_url)
       
   349                     duration = int(r.json()['duration'])
       
   350                     get_logger().debug("get duration " + content_url) #@UndefinedVariable
       
   351                     get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
       
   352 
       
   353                     end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
       
   354                 
       
   355                 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
       
   356                     
       
   357                 query_res = query.all()
   287 
   358 
   288                 
   359                 
   289                 for tw in query_res:
   360                 for tw in query_res:
   290                     tweet_ts_dt = tw.created_at
   361                     tweet_ts_dt = tw.created_at
   291                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
   362                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
   331                     
   402                     
   332                 
   403                 
   333                 
   404                 
   334                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
   405                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
   335                 
   406                 
   336                 if content_file and content_file.find("http") == 0:
   407                 if content_file_write and content_file_write.find("http") == 0:
   337                     
   408                     
   338                     project["ldt"] = output_data
   409                     project["ldt"] = output_data
   339                     body = anyjson.serialize(project)
   410                     post_param = {}
   340                     get_logger().debug("write http " + content_file) #@UndefinedVariable
   411                     if options.post_param:
   341                     get_logger().debug("write http " + repr(body)) #@UndefinedVariable
   412                         post_param = anyjson.loads(options.post_param)
   342                     h = httplib2.Http()
   413 
   343                     resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
   414                     get_logger().debug("write http " + content_file_write) #@UndefinedVariable
   344                     get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
   415                     get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
   345                     if resp.status != 200:
   416                     get_logger().debug("write http " + repr(project)) #@UndefinedVariable
   346                         get_logger().error("Error http " + repr(resp) + " content " + content) #@UndefinedVariable
   417                     r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type':'application/json'}, params=post_param);
   347                         raise Exception("Error writing content : %d : %s"%(resp.status, resp.reason))                        
   418                     get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
       
   419                     if r.status_code != requests.codes.ok:
       
   420                         r.raise_for_status()
   348                 else:
   421                 else:
   349                     if content_file and os.path.exists(content_file):
   422                     if content_file_write and os.path.exists(content_file_write):
   350                         dest_file_name = content_file 
   423                         dest_file_name = content_file_write
   351                     else:
   424                     else:
   352                         dest_file_name = options.filename
   425                         dest_file_name = options.filename
   353             
   426             
   354                     get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
   427                     get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
   355                     output = open(dest_file_name, "w")
   428                     output = open(dest_file_name, "w")