script/rest/export_twitter.py
author ymh <ymh.work@gmail.com>
Sat, 19 Oct 2024 01:42:20 +0200
changeset 1565 b1d408b2381d
parent 957 e4d0094f097b
permissions -rw-r--r--
upgrade metadataplayer

#!/usr/bin/env python
# coding=utf-8

from sqlite3 import register_adapter, register_converter, connect, Row
import datetime, time
import email.utils
from optparse import OptionParser
import os.path
from lxml import etree
import uuid
import re
import simplejson

def parse_date(date_str):
    """Convert an RFC 2822 style date string to seconds since the Unix epoch.

    The string is parsed with ``email.utils.parsedate_tz`` and converted
    with ``email.utils.mktime_tz``, which applies the timezone offset found
    in the string (expressed in seconds) against UTC.  The previous
    hand-rolled formula multiplied that offset by 60 and interpreted the
    date fields in the machine's local timezone, so the result was wrong
    for any non-zero offset and depended on where the script was run.

    NOTE(review): if timestamps stored elsewhere were produced with the old
    local-time formula, confirm they are true UTC epochs before comparing.

    :param date_str: date such as ``"Sat, 19 Oct 2024 01:42:20 +0200"``.
    :returns: the timestamp as a float (callers truncate it with ``int``).
    :raises ValueError: if ``date_str`` cannot be parsed as a date.
    """
    ts = email.utils.parsedate_tz(date_str)
    if ts is None:
        # parsedate_tz signals failure by returning None rather than raising.
        raise ValueError("unable to parse date: %r" % (date_str,))
    # mktime_tz falls back to local time when the string carries no timezone.
    return float(email.utils.mktime_tz(ts))

def adapt_datetime(ts):
    """Turn a ``datetime.datetime`` into a float timestamp for sqlite3.

    The value goes through ``timetuple()`` and ``time.mktime``, so it is
    interpreted as local time and any microseconds are discarded.
    """
    local_fields = ts.timetuple()
    return time.mktime(local_fields)
    
def adapt_geo(geo):
    """Serialize ``geo`` to its JSON text form with ``simplejson.dumps``."""
    serialized = simplejson.dumps(geo)
    return serialized

def convert_geo(s):
    """Decode the JSON text ``s`` back into a Python value with ``simplejson.loads``."""
    decoded = simplejson.loads(s)
    return decoded


# Module-level sqlite3 hooks: datetimes bound as query parameters are stored
# as timestamps, and values of declared type "geo" are decoded from JSON.
# NOTE(review): sqlite3 only applies converters on connections opened with
# detect_types set -- confirm that wherever "geo" columns are read.
register_adapter(datetime.datetime, adapt_datetime)
register_converter("geo", convert_geo)

# Column names of the tweet table.
columns_tweet = [
    u'favorited',
    u'truncated',
    u'text',
    u'created_at',
    u'source',
    u'in_reply_to_status_id',
    u'in_reply_to_screen_name',
    u'in_reply_to_user_id',
    u'geo',
    u'id',
    u'user',
]

# Column names of the user table.
columns_user = [
    u'id',
    u'verified',
    u'profile_sidebar_fill_color',
    u'profile_text_color',
    u'followers_count',
    u'protected',
    u'location',
    u'profile_background_color',
    u'utc_offset',
    u'statuses_count',
    u'description',
    u'friends_count',
    u'profile_link_color',
    u'profile_image_url',
    u'notifications',
    u'geo_enabled',
    u'profile_background_image_url',
    u'screen_name',
    u'profile_background_tile',
    u'favourites_count',
    u'name',
    u'url',
    u'created_at',
    u'time_zone',
    u'profile_sidebar_border_color',
    u'following',
]


if __name__ == "__main__" :
    # Command-line entry point.
    #
    # Reads the tweets carrying a given hashtag, within a time window, from
    # the sqlite database and writes them as a new "ensemble"/"decoupage"
    # pair into an XML project.  The project is either an existing content
    # file (-I), updated in place, or a freshly built document written to
    # the export file (-f).

    parser = OptionParser()
    parser.add_option("-f", "--file", dest="filename",
                      help="write export to file", metavar="FILE", default="project_enmi.ldt")
    parser.add_option("-d", "--database", dest="database",
                      help="Input database", metavar="DATABASE")
    parser.add_option("-s", "--start-date", dest="start_date",
                      help="start date", metavar="START_DATE")
    parser.add_option("-e", "--end-date", dest="end_date",
                      help="end date", metavar="END_DATE")
    parser.add_option("-I", "--content-file", dest="content_file",
                      help="Content file", metavar="CONTENT_FILE")
    parser.add_option("-c", "--content", dest="content",
                      help="Content url", metavar="CONTENT")
    parser.add_option("-v", "--video-url", dest="video",
                      help="video url", metavar="VIDEO")
    parser.add_option("-i", "--content-id", dest="content_id",
                      help="Content id", metavar="CONTENT_ID")
    parser.add_option("-x", "--exclude", dest="exclude",
                      help="file containing the id to exclude", metavar="EXCLUDE")
    parser.add_option("-C", "--color", dest="color",
                      help="Color code", metavar="COLOR", default="16763904")
    parser.add_option("-H", "--hashtag", dest="hashtag",
                      help="Hashtag", metavar="HASHTAG", default="enmi09")
    parser.add_option("-D", "--duration", dest="duration", type="int",
                      help="Duration", metavar="DURATION", default=None)
    parser.add_option("-n", "--name", dest="name",
                      help="Cutting name", metavar="NAME", default=u"Tweets")
    parser.add_option("-R", "--replace", dest="replace", action="store_true",
                      help="Replace tweet ensemble", metavar="REPLACE", default=False)

    (options, args) = parser.parse_args()

    # Fail early with a usage message instead of an opaque TypeError later.
    if not options.database:
        parser.error("an input database is required (-d/--database)")
    if not options.start_date:
        parser.error("a start date is required (-s/--start-date)")
    if not options.end_date and options.duration is None:
        parser.error("either an end date (-e/--end-date) or a duration (-D/--duration) is required")

    # Time window [ts, te], in seconds since the epoch.
    ts = int(parse_date(options.start_date))

    if options.end_date:
        te = int(parse_date(options.end_date))
    else:
        # The duration is added directly to the start timestamp (seconds).
        te = ts + options.duration

    # Evaluated once so the input and output decisions below cannot diverge.
    use_content_file = bool(options.content_file and os.path.exists(options.content_file))

    conn = connect(options.database)
    conn.row_factory = Row
    cursor = conn.cursor()

    # Tweet ids listed in the exclude file are filtered out by the query.
    cursor.execute("create temporary table tweet_exclude (id)")

    if options.exclude and os.path.exists(options.exclude):
        # Read-only access is enough; one integer tweet id is expected per line.
        f = open(options.exclude, 'r')
        try:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                cursor.execute("insert into tweet_exclude (id) values (?)", (int(line),))
        finally:
            f.close()

    hashtag = u"%#"+unicode(options.hashtag)+u"%"
    cursor.execute("select tt.id, tt.text, tt.created_at_ts, tu.name, tu.screen_name from tweet_tweet as tt join tweet_user as tu on tt.user = tu.rowid where text like ? and tt.created_at_ts >= ? and tt.created_at_ts <= ? and tt.id not in (select id from tweet_exclude) order by tt.created_at_ts asc;", (hashtag,ts,te))

    root = None
    ensemble_parent = None

    if use_content_file:
        # Update an existing project: new ensembles go under <ensembles>.
        doc = etree.parse(options.content_file)
        root = doc.getroot()

        ensembles_nodes = root.xpath("//ensembles")
        if not ensembles_nodes:
            parser.error("content file %s has no <ensembles> element" % options.content_file)
        ensemble_parent = ensembles_nodes[0]

    else:
        # Build a minimal new project around the given content/video.
        root = etree.Element(u"iri")

        etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})

        medias = etree.SubElement(root, u"medias")
        etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})

        annotations = etree.SubElement(root, u"annotations")
        content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
        ensemble_parent = content

    if options.replace:
        # Iterate over a snapshot: removing children while walking the live
        # iterator can skip siblings.
        for ens in list(ensemble_parent.iterchildren(tag=u"ensemble")):
            ensid = ens.get("id","")
            if ensid.startswith("tweet_"):
                ensemble_parent.remove(ens)
                # remove from displays too; the id is passed as an XPath
                # variable so quotes in it cannot break the expression
                for display_decoupage_node in root.xpath("displays/display/content/decoupage[@idens=$ensid]", ensid=ensid):
                    display_decoupage_node.getparent().remove(display_decoupage_node)

    ensemble_id = u"tweet_" + unicode(uuid.uuid4())
    ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id": ensemble_id, u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
    decoupage_id = unicode(uuid.uuid4())
    decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": decoupage_id, u"author": u"IRI Web"})

    etree.SubElement(decoupage, u"title").text = unicode(options.name)
    etree.SubElement(decoupage, u"abstract").text = unicode(options.name)

    elements = etree.SubElement(decoupage, u"elements")

    # Compiled once; matches "#" followed by a run of word characters.
    hashtag_re = re.compile(u"#(\\w+)", re.U)

    for res in cursor:
        tweet_ts = int(res["created_at_ts"])
        tweet_ts_dt = datetime.datetime.fromtimestamp(tweet_ts)
        # Offset of the tweet from the start of the window, in milliseconds.
        tweet_ts_rel = (tweet_ts-ts) * 1000
        element = etree.SubElement(elements, u"element" , {u"id":unicode(uuid.uuid4())+u"-"+unicode(res["id"]), u"color":unicode(options.color), u"author":unicode(res["name"]), u"date":unicode(tweet_ts_dt.strftime("%Y/%m/%d")), u"begin": unicode(tweet_ts_rel), u"dur":u"0", u"src":u""})
        etree.SubElement(element, u"title").text = unicode(res["name"]) + u": " + unicode(res["text"])
        etree.SubElement(element, u"abstract").text = unicode(res["text"])

        # One <tag> per distinct hashtag, in order of first appearance.
        tags_node = etree.SubElement(element, u"tags")
        seen_tags = set()
        for m in hashtag_re.finditer(res["text"]):
            tag = m.group(1)
            if tag not in seen_tags:
                seen_tags.add(tag)
                etree.SubElement(tags_node,u"tag").text = tag

    # All rows have been consumed; release the database.
    cursor.close()
    conn.close()

    # TODO: add this new decoupage to the display nodes. The previous code
    # only ran an unused lookup keyed on a variable that was undefined unless
    # --replace had removed an ensemble, so it crashed before writing output.

    # Serialize before opening the target so a serialization error cannot
    # leave a truncated file behind.
    xml_bytes = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)

    if use_content_file:
        output_path = options.content_file
    else:
        output_path = options.filename

    # The serialized document is encoded bytes, hence binary mode.
    output = open(output_path, "wb")
    try:
        output.write(xml_bytes)
    finally:
        output.close()