script/lib/iri_tweet/export_twitter_alchemy.py
changeset 289 a5eff8f2b81d
parent 275 483cc4e35193
child 379 083320a74eb2
equal deleted inserted replaced
288:4c870c767d3e 289:a5eff8f2b81d
     1 #!/usr/bin/env python
     1 #!/usr/bin/env python
     2 # coding=utf-8
     2 # coding=utf-8
     3 
     3 
     4 from lxml import etree
     4 from lxml import etree
       
     5 from models import setup_database
     5 from optparse import OptionParser #@UnresolvedImport
     6 from optparse import OptionParser #@UnresolvedImport
     6 from sqlalchemy import Table, Column, BigInteger, MetaData
     7 from sqlalchemy import Table, Column, BigInteger
     7 from sqlalchemy.orm import sessionmaker
     8 from utils import (parse_date, set_logging_options, set_logging, get_filter_query, 
     8 from utils import parse_date, set_logging_options, set_logging, get_filter_query, get_logger
     9     get_logger)
     9 from models import setup_database
    10 import anyjson
    10 import datetime
    11 import datetime
       
    12 import httplib2
    11 import os.path
    13 import os.path
    12 import re
    14 import re
    13 import sys
    15 import sys
    14 import time
    16 import time
    15 import uuid #@UnresolvedImport
    17 import uuid #@UnresolvedImport
    16 import httplib2
       
    17 import anyjson
       
    18 
    18 
    19 #class TweetExclude(object):
    19 #class TweetExclude(object):
    20 #    def __init__(self, id):
    20 #    def __init__(self, id):
    21 #        self.id = id
    21 #        self.id = id
    22 #        
    22 #        
    55     parser.add_option("-f", "--file", dest="filename",
    55     parser.add_option("-f", "--file", dest="filename",
    56                       help="write export to file", metavar="FILE", default="project.ldt")
    56                       help="write export to file", metavar="FILE", default="project.ldt")
    57     parser.add_option("-d", "--database", dest="database",
    57     parser.add_option("-d", "--database", dest="database",
    58                       help="Input database", metavar="DATABASE")
    58                       help="Input database", metavar="DATABASE")
    59     parser.add_option("-s", "--start-date", dest="start_date",
    59     parser.add_option("-s", "--start-date", dest="start_date",
    60                       help="start date", metavar="START_DATE")
    60                       help="start date", metavar="START_DATE", default=None)
    61     parser.add_option("-e", "--end-date", dest="end_date",
    61     parser.add_option("-e", "--end-date", dest="end_date",
    62                       help="end date", metavar="END_DATE")
    62                       help="end date", metavar="END_DATE", default=None)
    63     parser.add_option("-I", "--content-file", dest="content_file",
    63     parser.add_option("-I", "--content-file", dest="content_file",
    64                       help="Content file", metavar="CONTENT_FILE")
    64                       help="Content file", metavar="CONTENT_FILE")
    65     parser.add_option("-c", "--content", dest="content",
    65     parser.add_option("-c", "--content", dest="content",
    66                       help="Content url", metavar="CONTENT")
    66                       help="Content url", metavar="CONTENT")
    67     parser.add_option("-V", "--video-url", dest="video",
    67     parser.add_option("-V", "--video-url", dest="video",
   108     
   108     
   109     conn_str = options.database.strip()
   109     conn_str = options.database.strip()
   110     if not re.match("^\w+://.+", conn_str):
   110     if not re.match("^\w+://.+", conn_str):
   111         conn_str = 'sqlite:///' + conn_str
   111         conn_str = 'sqlite:///' + conn_str
   112 
   112 
   113     engine, metadata = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
   113     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)        
   114     
   114     conn = None
   115     Session = sessionmaker()
       
   116     conn = engine.connect()
       
   117     try :
   115     try :
   118         session = Session(bind=conn)
   116         conn = engine.connect()    
   119         try : 
   117         session = None
   120         
   118         try :
   121             metadata = MetaData(bind=conn)
   119             session = Session(bind=conn)         
   122             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
   120             tweet_exclude_table = Table("tweet_exclude", metadata, Column('id', BigInteger, primary_key=True), prefixes=['TEMPORARY'])
   123             #mapper(TweetExclude, tweet_exclude_table)
   121             #mapper(TweetExclude, tweet_exclude_table)
   124             metadata.create_all()
   122             metadata.create_all(bind=conn, tables=[tweet_exclude_table])
   125             
   123             
   126             if options.exclude and os.path.exists(options.exclude):
   124             if options.exclude and os.path.exists(options.exclude):
   127                 with open(options.exclude, 'r+') as f:
   125                 with open(options.exclude, 'r+') as f:
   128                     tei = tweet_exclude_table.insert()
   126                     tei = tweet_exclude_table.insert()
   129                     for line in f:
   127                     for line in f:
   172                   
   170                   
   173                 if user_whitelist_file:
   171                 if user_whitelist_file:
   174                     with open(user_whitelist_file, 'r+') as f:
   172                     with open(user_whitelist_file, 'r+') as f:
   175                         user_whitelist = list(set([s.strip() for s in f]))
   173                         user_whitelist = list(set([s.strip() for s in f]))
   176                 
   174                 
   177                 start_date = parse_date(start_date_str) 
   175                 start_date = None
   178                 ts = time.mktime(start_date.timetuple())
   176                 ts = None
   179             
   177                 if start_date_str:
       
   178                     start_date = parse_date(start_date_str) 
       
   179                     ts = time.mktime(start_date.timetuple())
       
   180             
       
   181                 end_date = None
   180                 if end_date_str:
   182                 if end_date_str:
   181                     end_date = parse_date(end_date_str)
   183                     end_date = parse_date(end_date_str)
   182                     te = time.mktime(end_date.timetuple())
   184                 elif start_date and duration:
   183                 else:
       
   184                     te = ts + duration
       
   185                     end_date = start_date + datetime.timedelta(seconds=duration)
   185                     end_date = start_date + datetime.timedelta(seconds=duration)
   186                 
   186                 
   187                 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
   187                 query = get_filter_query(session, start_date, end_date, hashtags, tweet_exclude_table, user_whitelist)
   188                     
   188                     
   189                 query_res = query.all()
   189                 query_res = query.all()
   274                 elements = etree.SubElement(decoupage, u"elements")
   274                 elements = etree.SubElement(decoupage, u"elements")
   275                 
   275                 
   276                 for tw in query_res:
   276                 for tw in query_res:
   277                     tweet_ts_dt = tw.created_at
   277                     tweet_ts_dt = tw.created_at
   278                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
   278                     tweet_ts = int(time.mktime(tweet_ts_dt.timetuple()))
       
   279                     if ts is None:
       
   280                         ts = tweet_ts
   279                     tweet_ts_rel = (tweet_ts-ts) * 1000
   281                     tweet_ts_rel = (tweet_ts-ts) * 1000
   280                     username = None
   282                     username = None
   281                     profile_url = ""
   283                     profile_url = ""
   282                     if tw.user is not None:
   284                     if tw.user is not None:
   283                         username = tw.user.name
   285                         username = tw.user.name
   328                     output.write(output_data)
   330                     output.write(output_data)
   329                     output.flush()
   331                     output.flush()
   330                     output.close()
   332                     output.close()
   331                 
   333                 
   332         finally:
   334         finally:
   333             session.close()
   335             if session:
       
   336                 session.close()
   334     finally:
   337     finally:
   335         conn.close()
   338         if conn:
       
   339             conn.close()