script/lib/iri_tweet/export_twitter_alchemy.py
changeset 12 4daf47fcf792
parent 11 54d7f1486ac4
child 21 8003bcd8d9a2
equal deleted inserted replaced
11:54d7f1486ac4 12:4daf47fcf792
       
     1 #!/usr/bin/env python
       
     2 # coding=utf-8
       
     3 
       
     4 from lxml import etree
       
     5 from models import *
       
     6 from optparse import OptionParser
       
     7 from sqlalchemy import Table, Column, Integer, BigInteger, String, MetaData, \
       
     8     ForeignKey
       
     9 from sqlalchemy.orm import sessionmaker, mapper
       
    10 from sqlalchemy.sql import select
       
    11 import datetime
       
    12 import email.utils
       
    13 import logging
       
    14 import os
       
    15 import os.path
       
    16 import re
       
    17 import sys
       
    18 import time
       
    19 import uuid
       
    20 
       
    21 #class TweetExclude(object):
       
    22 #    def __init__(self, id):
       
    23 #        self.id = id
       
    24 #        
       
    25 #    def __repr__(self):
       
    26 #        return "<TweetExclude(id=%d)>" % (self.id)
       
    27 
       
    28 def parse_date(date_str):
       
    29     ts = email.utils.parsedate_tz(date_str)
       
    30     return datetime.datetime(*ts[0:7])
       
    31 
       
    32 def get_options():
       
    33     parser = OptionParser()
       
    34     parser.add_option("-f", "--file", dest="filename",
       
    35                       help="write export to file", metavar="FILE", default="project_enmi.ldt")
       
    36     parser.add_option("-d", "--database", dest="database",
       
    37                       help="Input database", metavar="DATABASE")
       
    38     parser.add_option("-s", "--start-date", dest="start_date",
       
    39                       help="start date", metavar="START_DATE")
       
    40     parser.add_option("-e", "--end-date", dest="end_date",
       
    41                       help="end date", metavar="END_DATE")
       
    42     parser.add_option("-I", "--content-file", dest="content_file",
       
    43                       help="Content file", metavar="CONTENT_FILE")
       
    44     parser.add_option("-c", "--content", dest="content",
       
    45                       help="Content url", metavar="CONTENT")
       
    46     parser.add_option("-V", "--video-url", dest="video",
       
    47                       help="video url", metavar="VIDEO")
       
    48     parser.add_option("-i", "--content-id", dest="content_id",
       
    49                       help="Content id", metavar="CONTENT_ID")
       
    50     parser.add_option("-x", "--exclude", dest="exclude",
       
    51                       help="file containing the id to exclude", metavar="EXCLUDE")
       
    52     parser.add_option("-C", "--color", dest="color",
       
    53                       help="Color code", metavar="COLOR", default="16763904")
       
    54     parser.add_option("-H", "--hashtag", dest="hashtag",
       
    55                       help="Hashtag", metavar="HASHTAG", default="enmi")                      
       
    56     parser.add_option("-D", "--duration", dest="duration", type="int",
       
    57                       help="Duration", metavar="DURATION", default=None)
       
    58     parser.add_option("-n", "--name", dest="name",
       
    59                       help="Cutting name", metavar="NAME", default=u"Tweets")
       
    60     parser.add_option("-R", "--replace", dest="replace", action="store_true",
       
    61                       help="Replace tweet ensemble", metavar="REPLACE", default=False)
       
    62     parser.add_option("-l", "--log", dest="logfile",
       
    63                       help="log to file", metavar="LOG", default="stderr")
       
    64     
       
    65     set_logging_options(parser)
       
    66 
       
    67     
       
    68     return parser.parse_args()
       
    69 
       
    70 
       
    71 if __name__ == "__main__" :
       
    72 
       
    73     (options, args) = get_options()
       
    74         
       
    75     set_logging(options)
       
    76         
       
    77     logging.debug("OPTIONS : " + repr(options))
       
    78 
       
    79     engine, metadata = setup_database('sqlite:///'+options.database, echo=((options.verbose-options.quiet)>0), create_all = False)        
       
    80     
       
    81     Session = sessionmaker()
       
    82     conn = engine.connect()
       
    83     try :
       
    84         session = Session(bind=conn)
       
    85         try : 
       
    86         
       
    87             metadata = MetaData(bind=conn)
       
    88             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
       
    89             #mapper(TweetExclude, tweet_exclude_table)
       
    90             metadata.create_all()
       
    91             
       
    92             if options.exclude and os.path.exists(options.exclude):
       
    93                 with open(options.exclude, 'r+') as f:
       
    94                     tei = tweet_exclude_table.insert()
       
    95                     for line in f:
       
    96                         conn.execute(tei.values(id=long(line.strip())))
       
    97 
       
    98             if options.listconf:
       
    99                 
       
   100                 parameters = []
       
   101                 confdoc = etree.parse(options.listconf)
       
   102                 for node in confdoc.xpath("/twitter_export/file"):
       
   103                     params = {}
       
   104                     for snode in node:
       
   105                         if snode.tag == "path":
       
   106                             params['content_file'] = snode.text
       
   107                         elif snode.tag == "start_date":
       
   108                             params['start_date'] = snode.text
       
   109                         elif snode.tag == "end_date":
       
   110                             params['end_date'] = snode.text
       
   111                         elif snode.tag == "duration":
       
   112                             params['duration'] = int(snode.text)
       
   113                     parameters.append(params)
       
   114             else:                        
       
   115                 parameters = [{
       
   116                     'start_date': options.start_date,
       
   117                     'end_date' : options.end_date,
       
   118                     'duration' : options.duration,
       
   119                     'content_file' : otions.content_file
       
   120                     
       
   121                 }]
       
   122             
       
   123             for params in parameters:
       
   124                 
       
   125                 logging.debug("PARAMETERS " + repr(params))
       
   126                 
       
   127                 start_date_str = params.get("start_date",None)
       
   128                 end_date_str = params.get("end_date", None)
       
   129                 duration = params.get("duration", None)
       
   130                 content_file = params.get("content_file", None)
       
   131                 
       
   132                 
       
   133                 start_date = parse_date(start_date_str) 
       
   134                 ts = time.mktime(start_date.timetuple())
       
   135             
       
   136                 if end_date_str:
       
   137                     end_date = parse_date(end_date_str)
       
   138                     te = time.mktime(end_date.timetuple())
       
   139                 else:
       
   140                     te = ts + duration
       
   141                     end_date = start_date + datetime.timedelta(seconds=duration)
       
   142         
       
   143             
       
   144                 query_res = session.query(Tweet).join(EntityHashtag).join(Hashtag).filter(~Tweet.id.in_(select([tweet_exclude_table.c.id]))).filter(Hashtag.text.contains(options.hashtag)).filter(Tweet.created_at >=  start_date).filter(Tweet.created_at <=  end_date).all()
       
   145                  
       
   146                 #hashtag = u"%#"+unicode(options.hashtag)+u"%"
       
   147                 
       
   148                 #cursor.execute("select tt.id, tt.text, tt.created_at_ts, tu.name, tu.screen_name from tweet_tweet as tt join tweet_user as tu on tt.user = tu.rowid where text like ? and tt.created_at_ts >= ? and tt.created_at_ts <= ? and tt.id not in (select id from tweet_exclude) order by tt.created_at_ts asc;", (hashtag,ts,te));
       
   149                 
       
   150                 root = None
       
   151                 ensemble_parent = None
       
   152                 
       
   153                 if content_file and os.path.exists(content_file):
       
   154             
       
   155                     doc = etree.parse(content_file)
       
   156                     root = doc.getroot()
       
   157                     
       
   158                     ensemble_parent = root.xpath("//ensembles")[0]
       
   159                 
       
   160                 else:
       
   161                     root = etree.Element(u"iri")
       
   162                         
       
   163                     project = etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
       
   164                 
       
   165                     medias = etree.SubElement(root, u"medias")
       
   166                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
       
   167                     
       
   168                     annotations = etree.SubElement(root, u"annotations")    
       
   169                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
       
   170                     ensemble_parent = content
       
   171             
       
   172                 if options.replace:
       
   173                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
       
   174                         if ens.get("id","").startswith("tweet_"):
       
   175                             ensemble_parent.remove(ens)
       
   176             
       
   177                 ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter pour ENMI 2009"})
       
   178                 decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
       
   179                 
       
   180                 etree.SubElement(decoupage, u"title").text = unicode(options.name)
       
   181                 etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
       
   182                 
       
   183                 elements = etree.SubElement(decoupage, u"elements")
       
   184                 
       
   185                 for tw in query_res:
       
   186                     tweet_ts_dt = tw.created_at
       
   187                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
       
   188                     tweet_ts_rel = (tweet_ts-ts) * 1000
       
   189                     username = None
       
   190                     if tw.user is not None:
       
   191                         username = tw.user.name
       
   192                     if not username:
       
   193                         username = "anon."
       
   194                     element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(tw.id), u"color":unicode(options.color), u"author":unicode(username), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":u""})
       
   195                     etree.SubElement(element, u"title").text = unicode(username) + u": " + unicode(tw.text)
       
   196                     etree.SubElement(element, u"abstract").text = unicode(tw.text)
       
   197             
       
   198                     tags_node = etree.SubElement(element, u"tags")
       
   199                     
       
   200                     for entity in tw.entity_list:
       
   201                         if entity.type == u'entity_hashtag': 
       
   202                             etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
       
   203                 
       
   204                 if content_file and os.path.exists(content_file):
       
   205                     output = open(content_file, "w")
       
   206                 else:
       
   207                     output = open(options.filename, "w")
       
   208             
       
   209                 output.write(etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True))
       
   210                 output.flush()
       
   211                 output.close()
       
   212                 
       
   213         finally:
       
   214             session.close()
       
   215     finally:
       
   216         conn.close()