script/rest/export_twitter.py
changeset 9 bb44692e09ee
child 888 6fc6637d8403
equal deleted inserted replaced
8:b7f4b0554ef8 9:bb44692e09ee
       
     1 #!/usr/bin/env python
       
     2 # coding=utf-8
       
     3 
       
     4 from sqlite3 import *
       
     5 import datetime, time
       
     6 import email.utils
       
     7 from optparse import OptionParser
       
     8 import os.path
       
     9 import os
       
    10 import sys
       
    11 from lxml import etree
       
    12 import uuid
       
    13 import re
       
    14 
       
    15 def parse_date(date_str):
       
    16     ts = email.utils.parsedate_tz(date_str)
       
    17     return time.mktime(ts[0:9]) - 60 * ts[9]
       
    18 
       
    19 def adapt_datetime(ts):
       
    20     return time.mktime(ts.timetuple())
       
    21     
       
    22 def adapt_geo(geo):
       
    23 	return simplejson.dumps(geo)
       
    24 	
       
    25 def convert_geo(s):
       
    26 	return simplejson.loads(s)
       
    27 
       
    28 
       
    29 register_adapter(datetime.datetime, adapt_datetime)
       
    30 register_converter("geo", convert_geo)
       
    31 
       
    32 columns_tweet = [u'favorited', u'truncated', u'text', u'created_at', u'source', u'in_reply_to_status_id', u'in_reply_to_screen_name', u'in_reply_to_user_id', u'geo', u'id', u'user']
       
    33 columns_user = [u'id', u'verified', u'profile_sidebar_fill_color', u'profile_text_color', u'followers_count', u'protected', u'location', u'profile_background_color', u'utc_offset', u'statuses_count', u'description', u'friends_count', u'profile_link_color', u'profile_image_url', u'notifications', u'geo_enabled', u'profile_background_image_url', u'screen_name', u'profile_background_tile', u'favourites_count', u'name', u'url', u'created_at', u'time_zone', u'profile_sidebar_border_color', u'following']
       
    34 
       
    35 
       
    36 if __name__ == "__main__" :
       
    37 
       
    38     parser = OptionParser()
       
    39     parser.add_option("-f", "--file", dest="filename",
       
    40                       help="write export to file", metavar="FILE", default="project_enmi.ldt")
       
    41     parser.add_option("-d", "--database", dest="database",
       
    42                       help="Input database", metavar="DATABASE")
       
    43     parser.add_option("-s", "--start-date", dest="start_date",
       
    44                       help="start date", metavar="START_DATE")
       
    45     parser.add_option("-e", "--end-date", dest="end_date",
       
    46                       help="end date", metavar="END_DATE")
       
    47     parser.add_option("-I", "--content-file", dest="content_file",
       
    48                       help="Content file", metavar="CONTENT_FILE")
       
    49     parser.add_option("-c", "--content", dest="content",
       
    50                       help="Content url", metavar="CONTENT")
       
    51     parser.add_option("-v", "--video-url", dest="video",
       
    52                       help="video url", metavar="VIDEO")
       
    53     parser.add_option("-i", "--content-id", dest="content_id",
       
    54                       help="Content id", metavar="CONTENT_ID")
       
    55     parser.add_option("-x", "--exclude", dest="exclude",
       
    56                       help="file containing the id to exclude", metavar="EXCLUDE")
       
    57     parser.add_option("-C", "--color", dest="color",
       
    58                       help="Color code", metavar="COLOR", default="16763904")
       
    59     parser.add_option("-H", "--hashtag", dest="hashtag",
       
    60                       help="Hashtag", metavar="HASHTAG", default="enmi09")                      
       
    61     parser.add_option("-D", "--duration", dest="duration", type="int",
       
    62                       help="Duration", metavar="DURATION", default=None)
       
    63     parser.add_option("-n", "--name", dest="name",
       
    64                       help="Cuttting name", metavar="NAME", default=u"Tweets")
       
    65     parser.add_option("-R", "--replace", dest="replace", action="store_true",
       
    66                       help="Replace tweet ensemble", metavar="REPLACE", default=False)
       
    67                 
       
    68                       
       
    69     
       
    70     (options, args) = parser.parse_args()
       
    71     
       
    72         
       
    73     ts = int(parse_date(options.start_date))
       
    74 
       
    75     if options.end_date:
       
    76     	te = int(parse_date(options.end_date))
       
    77     else:
       
    78         te = ts + options.duration
       
    79     
       
    80     conn = connect(options.database)
       
    81     conn.row_factory = Row
       
    82     cursor = conn.cursor()
       
    83 
       
    84     cursor.execute("create temporary table tweet_exclude (id)")
       
    85 
       
    86     if options.exclude and os.path.exists(options.exclude):
       
    87         f = open(options.exclude, 'r+')
       
    88         for line in f:
       
    89             cursor.execute("insert into tweet_exclude (id) values (?)", (int(line.strip()),))
       
    90         f.close()
       
    91 
       
    92     hashtag = u"%#"+unicode(options.hashtag)+u"%"
       
    93     cursor.execute("select tt.id, tt.text, tt.created_at_ts, tu.name, tu.screen_name from tweet_tweet as tt join tweet_user as tu on tt.user = tu.rowid where text like ? and tt.created_at_ts >= ? and tt.created_at_ts <= ? and tt.id not in (select id from tweet_exclude) order by tt.created_at_ts asc;", (hashtag,ts,te));
       
    94     
       
    95     root = None
       
    96     ensemble_parent = None
       
    97     
       
    98     if options.content_file and os.path.exists(options.content_file):
       
    99 
       
   100         doc = etree.parse(options.content_file)
       
   101         root = doc.getroot()
       
   102         
       
   103         ensemble_parent = root.xpath("//ensembles")[0]
       
   104     
       
   105     else:
       
   106         root = etree.Element(u"iri")
       
   107             
       
   108         project = etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
       
   109     
       
   110         medias = etree.SubElement(root, u"medias")
       
   111         media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
       
   112         
       
   113         annotations = etree.SubElement(root, u"annotations")    
       
   114         content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
       
   115         ensemble_parent = content
       
   116 
       
   117     if options.replace:
       
   118         for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
       
   119             if ens.get("id","").startswith("tweet_"):
       
   120                 ensemble_parent.remove(ens)
       
   121 
       
   122     ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter pour ENMI 2009"})
       
   123     decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
       
   124     
       
   125     etree.SubElement(decoupage, u"title").text = unicode(options.name)
       
   126     etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
       
   127     
       
   128     elements = etree.SubElement(decoupage, u"elements")
       
   129     
       
   130     for res in cursor:
       
   131         tweet_ts = int(res["created_at_ts"])
       
   132         tweet_ts_dt = datetime.datetime.fromtimestamp(tweet_ts)
       
   133         tweet_ts_rel = (tweet_ts-ts) * 1000
       
   134         element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(res["id"]), u"color":unicode(options.color), u"author":unicode(res["name"]), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":u""})
       
   135         etree.SubElement(element, u"title").text = unicode(res["name"]) + u": " + unicode(res["text"])
       
   136         etree.SubElement(element, u"abstract").text = unicode(res["text"])
       
   137 
       
   138         tags = {}
       
   139         for m in re.finditer(u"\#(\\w+)",res["text"], re.U):
       
   140             tags[m.group(1)] = ""
       
   141 
       
   142         tags_node = etree.SubElement(element, u"tags")
       
   143         
       
   144         for t in tags.keys():
       
   145             etree.SubElement(tags_node,u"tag").text = t
       
   146     
       
   147     if options.content_file and os.path.exists(options.content_file):
       
   148         output = open(options.content_file, "w")
       
   149     else:
       
   150         output = open(options.filename, "w")
       
   151 
       
   152     output.write(etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True))
       
   153     output.flush()
       
   154     output.close()
       
   155     
       
   156