script/utils/export_twitter_alchemy.py
changeset 763 bc29a6fbb8e8
parent 487 323b5f770fa0
child 764 67a0cee0077f
equal deleted inserted replaced
762:38ff25c1db25 763:bc29a6fbb8e8
     1 #!/usr/bin/env python
     1 #!/usr/bin/env python
     2 # coding=utf-8
     2 # coding=utf-8
     3 
     3 
     4 from lxml import etree
     4 from lxml import etree
     5 from iri_tweet.models import setup_database
     5 from iri_tweet.models import setup_database, Tweet, User
     6 from optparse import OptionParser #@UnresolvedImport
     6 from optparse import OptionParser #@UnresolvedImport
     7 from sqlalchemy import Table, Column, BigInteger
     7 from sqlalchemy import Table, Column, BigInteger
     8 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
     8 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
     9     get_logger)
     9     get_logger)
    10 import anyjson
    10 import anyjson
    11 import datetime
    11 import datetime
    12 import httplib2
    12 import requests
    13 import os.path
    13 import os.path
    14 import re
    14 import re
    15 import sys
    15 import sys
    16 import time
    16 import time
    17 import uuid #@UnresolvedImport
    17 import uuid #@UnresolvedImport
    22 #        self.id = id
    22 #        self.id = id
    23 #        
    23 #        
    24 #    def __repr__(self):
    24 #    def __repr__(self):
    25 #        return "<TweetExclude(id=%d)>" % (self.id)
    25 #        return "<TweetExclude(id=%d)>" % (self.id)
    26 
    26 
       
    27 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
       
    28 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
       
    29  
    27 
    30 
    28 def parse_polemics(tw, extended_mode):
    31 def parse_polemics(tw, extended_mode):
    29     """
    32     """
    30     parse polemics in text and return a list of polemic code. None if not polemic found
    33     parse polemics in text and return a list of polemic code. None if not polemic found
    31     """
    34     """
    85                       help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
    88                       help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
    86     parser.add_option("-L", "--list-conf", dest="listconf",
    89     parser.add_option("-L", "--list-conf", dest="listconf",
    87                       help="list of file to process", metavar="LIST_CONF", default=None)
    90                       help="list of file to process", metavar="LIST_CONF", default=None)
    88     parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
    91     parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
    89                       help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
    92                       help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
       
    93     parser.add_option("-b", "--base-url", dest="base_url",
       
    94                       help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
       
    95     parser.add_option("-p", "--project", dest="project_id", 
       
    96                       help="Project id", metavar="PROJECT_ID", default=None)
       
    97     parser.add_option("-P", "--post-param", dest="post_param", 
       
    98                       help="Post param", metavar="POST_PARAM", default=None)        
    90     parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
    99     parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
    91                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
   100                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
    92     
   101     
    93     
   102     
    94     set_logging_options(parser)
   103     set_logging_options(parser)
   125             metadata.create_all(bind=conn, tables=[tweet_exclude_table])
   134             metadata.create_all(bind=conn, tables=[tweet_exclude_table])
   126             
   135             
   127             if options.exclude and os.path.exists(options.exclude):
   136             if options.exclude and os.path.exists(options.exclude):
   128                 with open(options.exclude, 'r+') as f:
   137                 with open(options.exclude, 'r+') as f:
   129                     tei = tweet_exclude_table.insert()
   138                     tei = tweet_exclude_table.insert()
       
   139                     ex_regexp = re.compile("(?P<field>\w+)(?P<op>[~=])(?P<value>.+)", re.I)
   130                     for line in f:
   140                     for line in f:
   131                         conn.execute(tei.values(id=long(line.strip())))
   141                         res = ex_regexp.match(line.strip())
       
   142                         if res:
       
   143                             if res.group('field') == "id":                                
       
   144                                 conn.execute(tei.values(id=res.group('value')))
       
   145                             else:
       
   146                                 exclude_query = session.query(Tweet)
       
   147                                 filter_obj = Tweet
       
   148                                 filter_field = res.group('field')
       
   149                                 if filter_field.startswith("user_"):
       
   150                                     exclude_query = exclude_query.join(User)
       
   151                                     filter_obj = User
       
   152                                     filter_field = filter_field[len("user_"):]
       
   153                                     
       
   154 
       
   155                                 if res.group('op') == "=":
       
   156                                     exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field) == res.group('value'))
       
   157                                 else:
       
   158                                     exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field).like(res.group('value')))
       
   159                                 
       
   160                                 for t in exclude_query.all():
       
   161                                      conn.execute(tei.values(id=t.id))
       
   162                                 
   132             user_whitelist_file = options.user_whitelist
   163             user_whitelist_file = options.user_whitelist
   133             user_whitelist = None
   164             user_whitelist = None
   134             
   165             
   135             if options.listconf:
   166             if options.listconf:
   136                 
   167                 
   139                 for node in confdoc.xpath("/twitter_export/file"):
   170                 for node in confdoc.xpath("/twitter_export/file"):
   140                     params = {}
   171                     params = {}
   141                     for snode in node:
   172                     for snode in node:
   142                         if snode.tag == "path":
   173                         if snode.tag == "path":
   143                             params['content_file'] = snode.text
   174                             params['content_file'] = snode.text
       
   175                             params['content_file_write'] = snode.text
       
   176                         elif snode.tag == "project_id":
       
   177                             params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
       
   178                             params['project_id'] = snode.text
   144                         elif snode.tag == "start_date":
   179                         elif snode.tag == "start_date":
   145                             params['start_date'] = snode.text
   180                             params['start_date'] = snode.text
   146                         elif snode.tag == "end_date":
   181                         elif snode.tag == "end_date":
   147                             params['end_date'] = snode.text
   182                             params['end_date'] = snode.text
   148                         elif snode.tag == "duration":
   183                         elif snode.tag == "duration":
   150                         elif snode.tag == "hashtags":
   185                         elif snode.tag == "hashtags":
   151                             params['hashtags'] = [snode.text]
   186                             params['hashtags'] = [snode.text]
   152                     if options.hashtag or 'hashtags' not in params :
   187                     if options.hashtag or 'hashtags' not in params :
   153                         params['hashtags'] = options.hashtag
   188                         params['hashtags'] = options.hashtag
   154                     parameters.append(params)
   189                     parameters.append(params)
   155             else:                        
   190             else:
       
   191                 if options.project_id:
       
   192                     content_file = options.base_url + LDT_PROJECT_REST_API_PATH + options.project_id + "/?format=json"
       
   193                 else:
       
   194                     content_file = options.content_file                                          
   156                 parameters = [{
   195                 parameters = [{
   157                     'start_date': options.start_date,
   196                     'start_date': options.start_date,
   158                     'end_date' : options.end_date,
   197                     'end_date' : options.end_date,
   159                     'duration' : options.duration,
   198                     'duration' : options.duration,
   160                     'content_file' : options.content_file,
   199                     'content_file' : content_file,
   161                     'hashtags' : options.hashtag
   200                     'content_file_write' : content_file,
       
   201                     'hashtags' : options.hashtag,
       
   202                     'project_id' : options.project_id 
   162                 }]
   203                 }]
   163             
   204             post_param = {}
       
   205             if options.post_param:
       
   206                 post_param = anyjson.loads(options.post_param)
       
   207 
   164             for params in parameters:
   208             for params in parameters:
   165                 
   209                 
   166                 get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
   210                 get_logger().debug("PARAMETERS " + repr(params)) #@UndefinedVariable
   167                 
   211                 
   168                 start_date_str = params.get("start_date",None)
   212                 start_date_str = params.get("start_date",None)
   169                 end_date_str = params.get("end_date", None)
   213                 end_date_str = params.get("end_date", None)
   170                 duration = params.get("duration", None)
   214                 duration = params.get("duration", None)
   171                 content_file = params.get("content_file", None)
   215                 content_file = params.get("content_file", None)
       
   216                 content_file_write = params.get("content_file_write", None)
   172                 hashtags = params.get('hashtags', [])
   217                 hashtags = params.get('hashtags', [])
   173                   
   218                   
   174                 if user_whitelist_file:
   219                 if user_whitelist_file:
   175                     with open(user_whitelist_file, 'r+') as f:
   220                     with open(user_whitelist_file, 'r+') as f:
   176                         user_whitelist = list(set([s.strip() for s in f]))
   221                         user_whitelist = list(set([s.strip() for s in f]))
   179                 ts = None
   224                 ts = None
   180                 if start_date_str:
   225                 if start_date_str:
   181                     start_date = parse_date(start_date_str) 
   226                     start_date = parse_date(start_date_str) 
   182                     ts = time.mktime(start_date.timetuple())
   227                     ts = time.mktime(start_date.timetuple())
   183             
   228             
   184                 end_date = None
       
   185                 if end_date_str:
       
   186                     end_date = parse_date(end_date_str)
       
   187                 elif start_date and duration:
       
   188                     end_date = start_date + datetime.timedelta(seconds=duration)
       
   189                 
       
   190                 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
       
   191                     
       
   192                 query_res = query.all()
       
   193                                  
   229                                  
   194                 root = None
   230                 root = None
   195                 ensemble_parent = None
   231                 ensemble_parent = None
   196                 
   232                 
   197                 #to do : analyse situation ldt or iri ? filename set or not ?
   233                 #to do : analyse situation ldt or iri ? filename set or not ?
   198                 
   234                 
   199                 if content_file and content_file.find("http") == 0:
   235                 if content_file and content_file.find("http") == 0:
   200                     
   236                     
   201                     get_logger().debug("url : " + content_file) #@UndefinedVariable
   237                     get_logger().debug("url : " + content_file) #@UndefinedVariable
   202                     
   238                     
   203                     h = httplib2.Http()
   239                     r = requests.get(content_file, params=post_param)                    
   204                     resp, content = h.request(content_file)
   240                     get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
   205                     
   241                     project = r.json()                    
   206                     get_logger().debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
       
   207                     
       
   208                     project = anyjson.deserialize(content)
       
   209                     root = etree.fromstring(project["ldt"])
   242                     root = etree.fromstring(project["ldt"])
   210                 
   243                 
   211                 elif content_file and os.path.exists(content_file):
   244                 elif content_file and os.path.exists(content_file):
   212 
   245 
   213                     doc = etree.parse(content_file)
   246                     doc = etree.parse(content_file)
   214                     root = doc.getroot()
   247                     root = doc.getroot()
   215                     
   248                 
       
   249                 content_id = None    
   216                 
   250                 
   217                 if root is None:
   251                 if root is None:
   218                 
   252                 
   219                     root = etree.Element(u"iri")
   253                     root = etree.Element(u"iri")
   220                         
   254                         
   224                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   258                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   225                     
   259                     
   226                     annotations = etree.SubElement(root, u"annotations")    
   260                     annotations = etree.SubElement(root, u"annotations")    
   227                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
   261                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
   228                     ensemble_parent = content
   262                     ensemble_parent = content
       
   263                     
       
   264                     content_id = options.content_id
   229                     
   265                     
   230                 
   266                 
   231                 if ensemble_parent is None:
   267                 if ensemble_parent is None:
   232                     file_type = None
   268                     file_type = None
   233                     for node in root:
   269                     for node in root:
   247                             annotations_node = etree.SubElement(root, u"annotations")
   283                             annotations_node = etree.SubElement(root, u"annotations")
   248                         content_node = annotations_node.find(u"content")
   284                         content_node = annotations_node.find(u"content")
   249                         if content_node is None:
   285                         if content_node is None:
   250                             content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
   286                             content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
   251                         ensemble_parent = content_node
   287                         ensemble_parent = content_node
       
   288                         content_id = content_node.get(u"id")
   252                     elif file_type == "iri":
   289                     elif file_type == "iri":
   253                         body_node = root.find(u"body")
   290                         body_node = root.find(u"body")
   254                         if body_node is None:
   291                         if body_node is None:
   255                             body_node = etree.SubElement(root, u"body")
   292                             body_node = etree.SubElement(root, u"body")
   256                         ensembles_node = body_node.find(u"ensembles")
   293                         ensembles_node = body_node.find(u"ensembles")
   257                         if ensembles_node is None:
   294                         if ensembles_node is None:
   258                             ensembles_node = etree.SubElement(body_node, u"ensembles")
   295                             ensembles_node = etree.SubElement(body_node, u"ensembles")
   259                         ensemble_parent = ensembles_node
   296                         ensemble_parent = ensembles_node
       
   297                         content_id = root.xpath("head/meta[@name='id']/@content")[0]
   260                     
   298                     
   261                     
   299                     
   262                 if ensemble_parent is None:
   300                 if ensemble_parent is None:
   263                     get_logger().error("Can not process file") #@UndefinedVariable
   301                     get_logger().error("Can not process file") #@UndefinedVariable
   264                     sys.exit()
   302                     sys.exit()
   282                 
   320                 
   283                     etree.SubElement(decoupage, u"title").text = unicode(options.name)
   321                     etree.SubElement(decoupage, u"title").text = unicode(options.name)
   284                     etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
   322                     etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
   285                 
   323                 
   286                     elements = etree.SubElement(decoupage, u"elements")
   324                     elements = etree.SubElement(decoupage, u"elements")
       
   325 
       
   326                 end_date = None
       
   327                 if end_date_str:
       
   328                     end_date = parse_date(end_date_str)
       
   329                 elif start_date and duration:
       
   330                     end_date = start_date + datetime.timedelta(seconds=duration)
       
   331                 elif start_date and options.base_url:                    
       
   332                     # get duration from api
       
   333                     content_url = options.base_url + LDT_CONTENT_REST_API_PATH + content_id + "/?format=json"
       
   334                     r = requests.get(content_url)
       
   335                     duration = int(r.json()['duration'])
       
   336                     get_logger().debug("get duration " + content_url) #@UndefinedVariable
       
   337                     get_logger().debug("get duration " + repr(duration)) #@UndefinedVariable
       
   338 
       
   339                     end_date = start_date + datetime.timedelta(seconds=int(duration/1000))
       
   340                 
       
   341                 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
       
   342                     
       
   343                 query_res = query.all()
   287 
   344 
   288                 
   345                 
   289                 for tw in query_res:
   346                 for tw in query_res:
   290                     tweet_ts_dt = tw.created_at
   347                     tweet_ts_dt = tw.created_at
   291                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
   348                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
   331                     
   388                     
   332                 
   389                 
   333                 
   390                 
   334                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
   391                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
   335                 
   392                 
   336                 if content_file and content_file.find("http") == 0:
   393                 if content_file_write and content_file_write.find("http") == 0:
   337                     
   394                     
   338                     project["ldt"] = output_data
   395                     project["ldt"] = output_data
   339                     body = anyjson.serialize(project)
   396                     post_param = {}
   340                     get_logger().debug("write http " + content_file) #@UndefinedVariable
   397                     if options.post_param:
   341                     get_logger().debug("write http " + repr(body)) #@UndefinedVariable
   398                         post_param = anyjson.loads(options.post_param)
   342                     h = httplib2.Http()
   399 
   343                     resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
   400                     get_logger().debug("write http " + content_file_write) #@UndefinedVariable
   344                     get_logger().debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
   401                     get_logger().debug("write http " + repr(post_param)) #@UndefinedVariable
   345                     if resp.status != 200:
   402                     get_logger().debug("write http " + repr(project)) #@UndefinedVariable
   346                         get_logger().error("Error http " + repr(resp) + " content " + content) #@UndefinedVariable
   403                     r = requests.put(content_file_write, data=anyjson.dumps(project), headers={'content-type':'application/json'}, params=post_param);
   347                         raise Exception("Error writing content : %d : %s"%(resp.status, resp.reason))                        
   404                     get_logger().debug("write http " + repr(r) + " content " + r.text) #@UndefinedVariable
       
   405                     if r.status_code != requests.codes.ok:
       
   406                         r.raise_for_status()
   348                 else:
   407                 else:
   349                     if content_file and os.path.exists(content_file):
   408                     if content_file_write and os.path.exists(content_file_write):
   350                         dest_file_name = content_file 
   409                         dest_file_name = content_file_write
   351                     else:
   410                     else:
   352                         dest_file_name = options.filename
   411                         dest_file_name = options.filename
   353             
   412             
   354                     get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
   413                     get_logger().debug("WRITE : " + dest_file_name) #@UndefinedVariable
   355                     output = open(dest_file_name, "w")
   414                     output = open(dest_file_name, "w")