script/utils/export_twitter_alchemy.py
changeset 957 e4d0094f097b
parent 888 6fc6637d8403
child 1023 7d87ba8cc268
equal deleted inserted replaced
956:883d0724ffd0 957:e4d0094f097b
     1 #!/usr/bin/env python
     1 #!/usr/bin/env python
     2 # coding=utf-8
     2 # coding=utf-8
     3 
     3 
     4 from lxml import etree
     4 from lxml import etree
     5 from iri_tweet.models import setup_database, Tweet, User
     5 from iri_tweet.models import setup_database, Tweet, User
     6 from optparse import OptionParser #@UnresolvedImport
       
     7 from sqlalchemy import Table, Column, BigInteger, event, bindparam
     6 from sqlalchemy import Table, Column, BigInteger, event, bindparam
     8 from sqlalchemy.sql import select, func
     7 from sqlalchemy.sql import select, func
     9 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
     8 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, 
    10     get_logger)
     9     get_logger)
       
    10 import argparse
    11 import anyjson
    11 import anyjson
    12 import datetime
    12 import datetime
    13 import requests
    13 import requests
    14 import os.path
    14 import os.path
    15 import re
    15 import re
    60         return polemics.keys()
    60         return polemics.keys()
    61     else:
    61     else:
    62         return None
    62         return None
    63 
    63 
    64 def get_options():
    64 def get_options():
    65     parser = OptionParser()
    65     
    66     parser.add_option("-f", "--file", dest="filename",
    66     usage = "usage: %(prog)s [options]"
       
    67     
       
    68     parser = argparse.ArgumentParser(usage)
       
    69     
       
    70     parser.add_argument("-f", "--file", dest="filename",
    67                       help="write export to file", metavar="FILE", default="project.ldt")
    71                       help="write export to file", metavar="FILE", default="project.ldt")
    68     parser.add_option("-d", "--database", dest="database",
    72     parser.add_argument("-d", "--database", dest="database",
    69                       help="Input database", metavar="DATABASE")
    73                       help="Input database", metavar="DATABASE")
    70     parser.add_option("-s", "--start-date", dest="start_date",
    74     parser.add_argument("-s", "--start-date", dest="start_date",
    71                       help="start date", metavar="START_DATE", default=None)
    75                       help="start date", metavar="START_DATE", default=None)
    72     parser.add_option("-e", "--end-date", dest="end_date",
    76     parser.add_argument("-e", "--end-date", dest="end_date",
    73                       help="end date", metavar="END_DATE", default=None)
    77                       help="end date", metavar="END_DATE", default=None)
    74     parser.add_option("-I", "--content-file", dest="content_file",
    78     parser.add_argument("-I", "--content-file", dest="content_file",
    75                       help="Content file", metavar="CONTENT_FILE")
    79                       help="Content file", metavar="CONTENT_FILE")
    76     parser.add_option("-c", "--content", dest="content",
    80     parser.add_argument("-c", "--content", dest="content",
    77                       help="Content url", metavar="CONTENT")
    81                       help="Content url", metavar="CONTENT")
    78     parser.add_option("-V", "--video-url", dest="video",
    82     parser.add_argument("-V", "--video-url", dest="video",
    79                       help="video url", metavar="VIDEO")
    83                       help="video url", metavar="VIDEO")
    80     parser.add_option("-i", "--content-id", dest="content_id",
    84     parser.add_argument("-i", "--content-id", dest="content_id",
    81                       help="Content id", metavar="CONTENT_ID")
    85                       help="Content id", metavar="CONTENT_ID")
    82     parser.add_option("-x", "--exclude", dest="exclude",
    86     parser.add_argument("-x", "--exclude", dest="exclude",
    83                       help="file containing the id to exclude", metavar="EXCLUDE")
    87                       help="file containing the id to exclude", metavar="EXCLUDE")
    84     parser.add_option("-C", "--color", dest="color",
    88     parser.add_argument("-C", "--color", dest="color",
    85                       help="Color code", metavar="COLOR", default="16763904")
    89                       help="Color code", metavar="COLOR", default="16763904")
    86     parser.add_option("-H", "--hashtag", dest="hashtag",
    90     parser.add_argument("-H", "--hashtag", dest="hashtag",
    87                       help="Hashtag", metavar="HASHTAG", default=[], action="append")                      
    91                       help="Hashtag", metavar="HASHTAG", default=[], action="append")                      
    88     parser.add_option("-D", "--duration", dest="duration", type="int",
    92     parser.add_argument("-D", "--duration", dest="duration", type=int,
    89                       help="Duration", metavar="DURATION", default=None)
    93                       help="Duration", metavar="DURATION", default=None)
    90     parser.add_option("-n", "--name", dest="name",
    94     parser.add_argument("-n", "--name", dest="name",
    91                       help="Cutting name", metavar="NAME", default=u"Tweets")
    95                       help="Cutting name", metavar="NAME", default=u"Tweets")
    92     parser.add_option("-R", "--replace", dest="replace", action="store_true",
    96     parser.add_argument("-R", "--replace", dest="replace", action="store_true",
    93                       help="Replace tweet ensemble", metavar="REPLACE", default=False)
    97                       help="Replace tweet ensemble", default=False)
    94     parser.add_option("-m", "--merge", dest="merge", action="store_true",
    98     parser.add_argument("-m", "--merge", dest="merge", action="store_true",
    95                       help="merge tweet ensemble, choose the first ensemble", metavar="MERGE", default=False)
    99                       help="merge tweet ensemble, choose the first ensemble", default=False)
    96     parser.add_option("-L", "--list-conf", dest="listconf",
   100     parser.add_argument("-L", "--list-conf", dest="listconf",
    97                       help="list of file to process", metavar="LIST_CONF", default=None)
   101                       help="list of file to process", metavar="LIST_CONF", default=None)
    98     parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
   102     parser.add_argument("-E", "--extended", dest="extended_mode", action="store_true",
    99                       help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
   103                       help="Trigger polemic extended mode", default=False)
   100     parser.add_option("-b", "--base-url", dest="base_url",
   104     parser.add_argument("-b", "--base-url", dest="base_url",
   101                       help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
   105                       help="base URL of the platform", metavar="BASE_URL", default="http://ldt.iri.centrepompidou.fr/ldtplatform/")
   102     parser.add_option("-p", "--project", dest="project_id", 
   106     parser.add_argument("-p", "--project", dest="project_id", 
   103                       help="Project id", metavar="PROJECT_ID", default=None)
   107                       help="Project id", metavar="PROJECT_ID", default=None)
   104     parser.add_option("-P", "--post-param", dest="post_param", 
   108     parser.add_argument("-P", "--post-param", dest="post_param", 
   105                       help="Post param", metavar="POST_PARAM", default=None)        
   109                       help="Post param", metavar="POST_PARAM", default=None)        
   106     parser.add_option("--user-whitelist", dest="user_whitelist", action="store",
   110     parser.add_argument("--user-whitelist", dest="user_whitelist", action="store",
   107                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
   111                       help="A list of user screen name", metavar="USER_WHITELIST",default=None)
   108     
   112     
   109     
   113     
   110     set_logging_options(parser)
   114     set_logging_options(parser)
   111 
   115 
   112     
   116     
   113     return parser.parse_args() + (parser,)
   117     return (parser.parse_args(), parser)
   114 
   118 
   115 
   119 
   116 if __name__ == "__main__" :
   120 if __name__ == "__main__" :
   117 
   121 
   118     (options, args, parser) = get_options()
   122     (options, parser) = get_options()
   119     
   123     
   120     set_logging(options)
   124     set_logging(options)
   121         
   125         
   122     get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
   126     get_logger().debug("OPTIONS : " + repr(options)) #@UndefinedVariable
   123     
   127     
   248                 if content_file and content_file.find("http") == 0:
   252                 if content_file and content_file.find("http") == 0:
   249                     
   253                     
   250                     get_logger().debug("url : " + content_file) #@UndefinedVariable
   254                     get_logger().debug("url : " + content_file) #@UndefinedVariable
   251                     
   255                     
   252                     r = requests.get(content_file, params=post_param)                    
   256                     r = requests.get(content_file, params=post_param)                    
   253                     #get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
   257                     get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable                    
   254                     project = r.json()
   258                     project = r.json()
   255                     text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
   259                     text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
   256                     root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
   260                     root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
   257                 
   261                 
   258                 elif content_file and os.path.exists(content_file):
   262                 elif content_file and os.path.exists(content_file):
   298                         content_node = annotations_node.find(u"content")
   302                         content_node = annotations_node.find(u"content")
   299                         if content_node is None:
   303                         if content_node is None:
   300                             content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
   304                             content_node = etree.SubElement(annotations_node,u"content", id=media.get(u"id"))
   301                         ensemble_parent = content_node
   305                         ensemble_parent = content_node
   302                         content_id = content_node.get(u"id")
   306                         content_id = content_node.get(u"id")
       
   307                         display_nodes = root.xpath("//displays/display/content[@id='%s']" % content_id)
       
   308                         if len(display_nodes) == 0:
       
   309                             get_logger().info("No display node found. Will not update display")
       
   310                             display_content_node = None
       
   311                         else:
       
   312                             display_content_node = display_nodes[0] 
       
   313                         
   303                     elif file_type == "iri":
   314                     elif file_type == "iri":
   304                         body_node = root.find(u"body")
   315                         body_node = root.find(u"body")
   305                         if body_node is None:
   316                         if body_node is None:
   306                             body_node = etree.SubElement(root, u"body")
   317                             body_node = etree.SubElement(root, u"body")
   307                         ensembles_node = body_node.find(u"ensembles")
   318                         ensembles_node = body_node.find(u"ensembles")
   308                         if ensembles_node is None:
   319                         if ensembles_node is None:
   309                             ensembles_node = etree.SubElement(body_node, u"ensembles")
   320                             ensembles_node = etree.SubElement(body_node, u"ensembles")
   310                         ensemble_parent = ensembles_node
   321                         ensemble_parent = ensembles_node
   311                         content_id = root.xpath("head/meta[@name='id']/@content")[0]
   322                         content_id = root.xpath("head/meta[@name='id']/@content")[0]
       
   323                         display_content_node = None
   312                     
   324                     
   313                     
   325                     
   314                 if ensemble_parent is None:
   326                 if ensemble_parent is None:
   315                     get_logger().error("Can not process file") #@UndefinedVariable
   327                     get_logger().error("Can not process file") #@UndefinedVariable
   316                     sys.exit()
   328                     sys.exit()
   317             
   329             
   318                 if options.replace:
   330                 if options.replace:
   319                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
   331                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
   320                         if ens.get("id","").startswith("tweet_"):
   332                         ens_id = ens.get("id","") 
       
   333                         if ens_id.startswith("tweet_"):                            
   321                             ensemble_parent.remove(ens)
   334                             ensemble_parent.remove(ens)
       
   335                             # remove in display nodes
       
   336                             if display_content_node is not None:
       
   337                                 for cut_display in display_content_node.iterchildren():
       
   338                                     if cut_display.get('idens','') == ens_id:
       
   339                                         display_content_node.remove(cut_display)
   322                 
   340                 
   323                 ensemble = None
   341                 ensemble = None
   324                 elements = None
   342                 elements = None
   325                 
   343                 
   326                 if options.merge:
   344                 if options.merge:
   327                     ensemble = ensemble_parent.find(u"ensemble")
   345                     for ens in ensemble_parent.findall(u"ensemble"):
   328                     if ensemble is not None:
   346                         if ens.get('id',"").startswith("tweet_"):
   329                         elements = ensemble.find(u".//elements")                
   347                             ensemble = ens
       
   348                             break
       
   349                     if ensemble is not None:                            
       
   350                         elements = ensemble.find(u".//elements")
       
   351                         decoupage = ensemble.find(u"decoupage")
   330                     
   352                     
   331                 if ensemble is None or elements is None:
   353                 if ensemble is None or elements is None:
   332                     ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
   354                     ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
   333                     decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
   355                     decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
   334                 
   356                 
   335                     etree.SubElement(decoupage, u"title").text = unicode(options.name)
   357                     etree.SubElement(decoupage, u"title").text = unicode(options.name)
   336                     etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
   358                     etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
   337                 
   359                 
   338                     elements = etree.SubElement(decoupage, u"elements")
   360                     elements = etree.SubElement(decoupage, u"elements")
       
   361 
       
   362                 ensemble_id = ensemble.get('id', '')                
       
   363                 decoupage_id = decoupage.get('id', '') if decoupage is not None else None
   339 
   364 
   340                 end_date = None
   365                 end_date = None
   341                 if end_date_str:
   366                 if end_date_str:
   342                     end_date = parse_date(end_date_str)
   367                     end_date = parse_date(end_date_str)
   343                 elif start_date and duration:
   368                 elif start_date and duration:
   397                     # remove all elements and put them in a array
   422                     # remove all elements and put them in a array
   398                     # sort them with tc
   423                     # sort them with tc
   399                     #put them back
   424                     #put them back
   400                     elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
   425                     elements[:] = sorted(elements,key=lambda n: int(n.get('begin')))
   401                     
   426                     
   402                     
   427                 #add to display node    
   403                 
   428                 if display_content_node is not None:
       
   429                     display_dec = None
       
   430                     for dec in display_content_node.iterchildren(tag=u"decoupage"):
       
   431                         if dec.get('idens','') == ensemble_id and dec.get('id', '') == decoupage_id:
       
   432                             display_dec = dec
       
   433                             break
       
   434                     if display_dec is None and ensemble_id and decoupage_id:
       
   435                         etree.SubElement(display_content_node, u"decoupage", attrib={'idens': ensemble_id, 'id': decoupage_id, 'tagsSelect':''})
   404                 
   436                 
   405                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
   437                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=False, xml_declaration=True)  
   406                 
   438                 
   407                 if content_file_write and content_file_write.find("http") == 0:
   439                 if content_file_write and content_file_write.find("http") == 0:
   408                     
   440                     
   409                     project["ldt"] = output_data
   441                     project["ldt"] = output_data
       
   442                     project['owner'] = project['owner'].replace('%7E','~')
       
   443                     project['contents'] = [c_url.replace('%7E','~') for c_url in project['contents']]
       
   444                           
   410                     post_param = {}
   445                     post_param = {}
   411                     if options.post_param:
   446                     if options.post_param:
   412                         post_param = anyjson.loads(options.post_param)
   447                         post_param = anyjson.loads(options.post_param)
   413 
   448 
   414                     get_logger().debug("write http " + content_file_write) #@UndefinedVariable
   449                     get_logger().debug("write http " + content_file_write) #@UndefinedVariable