script/iri_tweet/export_twitter_alchemy.py
changeset 9 bb44692e09ee
child 11 54d7f1486ac4
equal deleted inserted replaced
8:b7f4b0554ef8 9:bb44692e09ee
       
     1 #!/usr/bin/env python
       
     2 # coding=utf-8
       
     3 
       
     4 from lxml import etree
       
     5 from models import *
       
     6 from optparse import OptionParser
       
     7 from sqlalchemy import Table, Column, Integer, BigInteger, String, MetaData, \
       
     8     ForeignKey, create_engine
       
     9 from sqlalchemy.orm import sessionmaker, mapper
       
    10 from sqlalchemy.sql import select
       
    11 import datetime
       
    12 import time
       
    13 import email.utils
       
    14 import logging
       
    15 import os
       
    16 import os.path
       
    17 import re
       
    18 import sys
       
    19 import uuid
       
    20 
       
    21 #class TweetExclude(object):
       
    22 #    def __init__(self, id):
       
    23 #        self.id = id
       
    24 #        
       
    25 #    def __repr__(self):
       
    26 #        return "<TweetExclude(id=%d)>" % (self.id)
       
    27 
       
    28 def parse_date(date_str):
       
    29     ts = email.utils.parsedate_tz(date_str)
       
    30     return datetime.datetime(*ts[0:7])
       
    31 
       
    32 
       
    33 if __name__ == "__main__" :
       
    34 
       
    35     parser = OptionParser()
       
    36     parser.add_option("-f", "--file", dest="filename",
       
    37                       help="write export to file", metavar="FILE", default="project_enmi.ldt")
       
    38     parser.add_option("-d", "--database", dest="database",
       
    39                       help="Input database", metavar="DATABASE")
       
    40     parser.add_option("-s", "--start-date", dest="start_date",
       
    41                       help="start date", metavar="START_DATE")
       
    42     parser.add_option("-e", "--end-date", dest="end_date",
       
    43                       help="end date", metavar="END_DATE")
       
    44     parser.add_option("-I", "--content-file", dest="content_file",
       
    45                       help="Content file", metavar="CONTENT_FILE")
       
    46     parser.add_option("-c", "--content", dest="content",
       
    47                       help="Content url", metavar="CONTENT")
       
    48     parser.add_option("-V", "--video-url", dest="video",
       
    49                       help="video url", metavar="VIDEO")
       
    50     parser.add_option("-i", "--content-id", dest="content_id",
       
    51                       help="Content id", metavar="CONTENT_ID")
       
    52     parser.add_option("-x", "--exclude", dest="exclude",
       
    53                       help="file containing the id to exclude", metavar="EXCLUDE")
       
    54     parser.add_option("-C", "--color", dest="color",
       
    55                       help="Color code", metavar="COLOR", default="16763904")
       
    56     parser.add_option("-H", "--hashtag", dest="hashtag",
       
    57                       help="Hashtag", metavar="HASHTAG", default="enmi")                      
       
    58     parser.add_option("-D", "--duration", dest="duration", type="int",
       
    59                       help="Duration", metavar="DURATION", default=None)
       
    60     parser.add_option("-n", "--name", dest="name",
       
    61                       help="Cutting name", metavar="NAME", default=u"Tweets")
       
    62     parser.add_option("-R", "--replace", dest="replace", action="store_true",
       
    63                       help="Replace tweet ensemble", metavar="REPLACE", default=False)
       
    64     parser.add_option("-l", "--log", dest="logfile",
       
    65                       help="log to file", metavar="LOG", default="stderr")
       
    66     parser.add_option("-v", dest="verbose", action="count",
       
    67                       help="verbose", metavar="VERBOSE", default=0)
       
    68     parser.add_option("-q", dest="quiet", action="count",
       
    69                       help="quiet", metavar="QUIET", default=0)
       
    70     parser.add_option("-L", dest="listconf", 
       
    71                       help="file containing the list of file to process", metavar="LIST", default=0)
       
    72 
       
    73 
       
    74     
       
    75     (options, args) = parser.parse_args()
       
    76     
       
    77     logging_config = {}
       
    78     
       
    79     if options.logfile == "stdout":
       
    80         logging_config["stream"] = sys.stdout
       
    81     elif options.logfile == "stderr":
       
    82         logging_config["stream"] = sys.stderr
       
    83     else:
       
    84         logging_config["filename"] = options.logfile
       
    85 
       
    86     logging_config["level"] = max(logging.NOTSET, min(logging.CRITICAL, logging.WARNING - 10 * options.verbose + 10 * options.quiet))
       
    87     
       
    88     logging.basicConfig(**logging_config)
       
    89     
       
    90     logging.debug("OPTIONS : " + repr(options))
       
    91 
       
    92         
       
    93     engine = create_engine('sqlite:///'+options.database, echo=((options.verbose-options.quiet)>0))
       
    94     Session = sessionmaker()
       
    95 
       
    96     conn = engine.connect()
       
    97     try :
       
    98         session = Session(bind=conn)
       
    99         try : 
       
   100         
       
   101             metadata = MetaData(bind=conn)
       
   102             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
       
   103             #mapper(TweetExclude, tweet_exclude_table)
       
   104             metadata.create_all()
       
   105             
       
   106             if options.exclude and os.path.exists(options.exclude):
       
   107                 with open(options.exclude, 'r+') as f:
       
   108                     tei = tweet_exclude_table.insert()
       
   109                     for line in f:
       
   110                         conn.execute(tei.values(id=long(line.strip())))
       
   111 
       
   112             if options.listconf:
       
   113                 
       
   114                 parameters = []
       
   115                 confdoc = etree.parse(options.listconf)
       
   116                 for node in confdoc.xpath("/twitter_export/file"):
       
   117                     params = {}
       
   118                     for snode in node:
       
   119                         if snode.tag == "path":
       
   120                             params['content_file'] = snode.text
       
   121                         elif snode.tag == "start_date":
       
   122                             params['start_date'] = snode.text
       
   123                         elif snode.tag == "end_date":
       
   124                             params['end_date'] = snode.text
       
   125                         elif snode.tag == "duration":
       
   126                             params['duration'] = int(snode.text)
       
   127                     parameters.append(params)
       
   128             else:                        
       
   129                 parameters = [{
       
   130                     'start_date': options.start_date,
       
   131                     'end_date' : options.end_date,
       
   132                     'duration' : options.duration,
       
   133                     'content_file' : otions.content_file
       
   134                     
       
   135                 }]
       
   136             
       
   137             for params in parameters:
       
   138                 
       
   139                 logging.debug("PARAMETERS " + repr(params))
       
   140                 
       
   141                 start_date_str = params.get("start_date",None)
       
   142                 end_date_str = params.get("end_date", None)
       
   143                 duration = params.get("duration", None)
       
   144                 content_file = params.get("content_file", None)
       
   145                 
       
   146                 
       
   147                 start_date = parse_date(start_date_str) 
       
   148                 ts = time.mktime(start_date.timetuple())
       
   149             
       
   150                 if end_date_str:
       
   151                     end_date = parse_date(end_date_str)
       
   152                     te = time.mktime(end_date.timetuple())
       
   153                 else:
       
   154                     te = ts + duration
       
   155                     end_date = start_date + datetime.timedelta(seconds=duration)
       
   156         
       
   157             
       
   158                 query_res = session.query(Tweet).join(EntityHashtag).join(Hashtag).filter(~Tweet.id.in_(select([tweet_exclude_table.c.id]))).filter(Hashtag.text.contains(options.hashtag)).filter(Tweet.created_at >=  start_date).filter(Tweet.created_at <=  end_date).all()
       
   159                  
       
   160                 #hashtag = u"%#"+unicode(options.hashtag)+u"%"
       
   161                 
       
   162                 #cursor.execute("select tt.id, tt.text, tt.created_at_ts, tu.name, tu.screen_name from tweet_tweet as tt join tweet_user as tu on tt.user = tu.rowid where text like ? and tt.created_at_ts >= ? and tt.created_at_ts <= ? and tt.id not in (select id from tweet_exclude) order by tt.created_at_ts asc;", (hashtag,ts,te));
       
   163                 
       
   164                 root = None
       
   165                 ensemble_parent = None
       
   166                 
       
   167                 if content_file and os.path.exists(content_file):
       
   168             
       
   169                     doc = etree.parse(content_file)
       
   170                     root = doc.getroot()
       
   171                     
       
   172                     ensemble_parent = root.xpath("//ensembles")[0]
       
   173                 
       
   174                 else:
       
   175                     root = etree.Element(u"iri")
       
   176                         
       
   177                     project = etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
       
   178                 
       
   179                     medias = etree.SubElement(root, u"medias")
       
   180                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
       
   181                     
       
   182                     annotations = etree.SubElement(root, u"annotations")    
       
   183                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
       
   184                     ensemble_parent = content
       
   185             
       
   186                 if options.replace:
       
   187                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
       
   188                         if ens.get("id","").startswith("tweet_"):
       
   189                             ensemble_parent.remove(ens)
       
   190             
       
   191                 ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter pour ENMI 2009"})
       
   192                 decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
       
   193                 
       
   194                 etree.SubElement(decoupage, u"title").text = unicode(options.name)
       
   195                 etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
       
   196                 
       
   197                 elements = etree.SubElement(decoupage, u"elements")
       
   198                 
       
   199                 for tw in query_res:
       
   200                     tweet_ts_dt = tw.created_at
       
   201                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
       
   202                     tweet_ts_rel = (tweet_ts-ts) * 1000
       
   203                     username = None
       
   204                     if tw.user is not None:
       
   205                         username = tw.user.name
       
   206                     if not username:
       
   207                         username = "anon."
       
   208                     element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":u""})
       
   209                     etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
       
   210                     etree.SubElement(element, u"abstract").text = unicode(tw.text)
       
   211             
       
   212                     tags_node = etree.SubElement(element, u"tags")
       
   213                     
       
   214                     for entity in tw.entity_list:
       
   215                         if entity.type == u'entity_hashtag': 
       
   216                             etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
       
   217                 
       
   218                 if content_file and os.path.exists(content_file):
       
   219                     output = open(content_file, "w")
       
   220                 else:
       
   221                     output = open(options.filename, "w")
       
   222             
       
   223                 output.write(etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True))
       
   224                 output.flush()
       
   225                 output.close()
       
   226                 
       
   227         finally:
       
   228             session.close()
       
   229     finally:
       
   230         conn.close()