# HG changeset patch # User Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com> # Date 1368185262 -7200 # Node ID 8628c590f6087f86fce560065d3fb03260c8bb81 # Parent 9c57883dbb9de688a25ed6e041db05a4fc7bc5b3 Remove old script and correct obvious script errors diff -r 9c57883dbb9d -r 8628c590f608 script/lib/iri_tweet/iri_tweet/processor.py --- a/script/lib/iri_tweet/iri_tweet/processor.py Wed May 08 01:24:19 2013 +0200 +++ b/script/lib/iri_tweet/iri_tweet/processor.py Fri May 10 13:27:42 2013 +0200 @@ -67,6 +67,9 @@ class TwitterProcessorStatus(TwitterProcessor): + def __init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=None, token_filename=None, user_query_twitter=False, logger=None): + TwitterProcessor.__init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=access_token, token_filename=token_filename, user_query_twitter=user_query_twitter, logger=logger) + def __get_user(self, user_dict, do_merge): self.logger.debug("Get user : " + repr(user_dict)) #@UndefinedVariable @@ -351,6 +354,8 @@ } } """ + def __init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=None, token_filename=None, user_query_twitter=False, logger=None): + TwitterProcessor.__init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=access_token, token_filename=token_filename, user_query_twitter=user_query_twitter, logger=logger) def process(self): @@ -382,6 +387,9 @@ } } """ + + def __init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=None, token_filename=None, user_query_twitter=False, logger=None): + TwitterProcessor.__init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=access_token, token_filename=token_filename, user_query_twitter=user_query_twitter, logger=logger) def process_source(self): up_to_status_id = self.json_dict.get("scrub_geo", {}).get("up_to_status_id", None) @@ -408,7 +416,10 @@ "track":1234 } } - """ + """ + def __init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=None, token_filename=None, user_query_twitter=False, logger=None): + TwitterProcessor.__init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=access_token, token_filename=token_filename, user_query_twitter=user_query_twitter, logger=logger) + def process_source(self): """ do nothing, just log the information @@ -428,6 +439,9 @@ } } """ + def __init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=None, token_filename=None, user_query_twitter=False, logger=None): + TwitterProcessor.__init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=access_token, token_filename=token_filename, user_query_twitter=user_query_twitter, logger=logger) + def process_source(self): """ do nothing, just log the information @@ -447,6 +461,9 @@ } } """ + def __init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=None, token_filename=None, user_query_twitter=False, logger=None): + TwitterProcessor.__init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=access_token, token_filename=token_filename, user_query_twitter=user_query_twitter, logger=logger) + def process_source(self): """ do nothing, just log the information @@ -468,6 +485,9 @@ } } """ + def __init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=None, token_filename=None, user_query_twitter=False, logger=None): + TwitterProcessor.__init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=access_token, token_filename=token_filename, user_query_twitter=user_query_twitter, logger=logger) + def process_source(self): """ do nothing, just log the information @@ -488,6 +508,9 @@ } } """ + def __init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=None, token_filename=None, user_query_twitter=False, logger=None): + TwitterProcessor.__init__(self, json_dict, json_txt, source_id, session, consumer_token, access_token=access_token, token_filename=token_filename, user_query_twitter=user_query_twitter, logger=logger) + def process_source(self): """ do nothing, just log the information diff -r 9c57883dbb9d -r 8628c590f608 script/rest/enmi_profile.py --- a/script/rest/enmi_profile.py Wed May 08 01:24:19 2013 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,134 +0,0 @@ -import twython -from sqlite3 import * -import datetime, time -import email.utils -from optparse import OptionParser -import os.path -import os -import sys -import simplejson - - -#options filename rpp page total_pages start_date end_date - - - -def adapt_datetime(ts): - return time.mktime(ts.timetuple()) - -def adapt_geo(geo): - return simplejson.dumps(geo) - -def convert_geo(s): - return simplejson.loads(s) - - -register_adapter(datetime.datetime, adapt_datetime) -register_converter("geo", convert_geo) - -columns_tweet = [u'favorited', u'truncated', u'text', u'created_at', u'source', u'in_reply_to_status_id', u'in_reply_to_screen_name', u'in_reply_to_user_id', u'geo', u'id', u'user'] -columns_user = [u'id', u'verified', u'profile_sidebar_fill_color', u'profile_text_color', u'followers_count', u'protected', u'location', u'profile_background_color', u'utc_offset', u'statuses_count', u'description', u'friends_count', u'profile_link_color', u'profile_image_url', u'notifications', u'geo_enabled', u'profile_background_image_url', u'screen_name', u'profile_background_tile', u'favourites_count', u'name', u'url', u'created_at', u'time_zone', u'profile_sidebar_border_color', u'following'] - -def processDate(entry): - ts = email.utils.parsedate(entry["created_at"]) - entry["created_at_ts"] = datetime.datetime.fromtimestamp(time.mktime(ts)) - -def processPage(page, cursor, debug): - for entry in page: - if debug: - print "ENTRY : " + repr(entry) - curs.execute("select id from tweet_tweet where id = ?", (entry["id"],)) - res = curs.fetchone() - if res: - continue - - entry_user = entry["user"] - processDate(entry_user) - cursor.execute("insert into tweet_user ("+",".join(entry_user.keys())+") values (:"+",:".join(entry_user.keys())+");", entry_user); - new_id = cursor.lastrowid - processDate(entry) - entry["user"] = new_id - if entry["geo"]: - entry["geo"] = adapt_geo(entry["geo"]) - new_id = cursor.execute("insert into tweet_tweet ("+",".join(entry.keys())+") values (:"+",:".join(entry.keys())+");", entry); - - -if __name__ == "__main__" : - - parser = OptionParser() - parser.add_option("-f", "--file", dest="filename", - help="write tweet to FILE", metavar="FILE", default="enmi2010_twitter_rest.db") - parser.add_option("-r", "--rpp", dest="rpp", - help="Results per page", metavar="RESULT_PER_PAGE", default=200, type='int') - parser.add_option("-p", "--page", dest="page", - help="page result", metavar="PAGE", default=1, type='int') - parser.add_option("-t", "--total-page", dest="total_page", - help="Total page number", metavar="TOTAL_PAGE", default=16, type='int') - parser.add_option("-s", "--screenname", dest="screen_name", - help="Twitter screen name", metavar="SCREEN_NAME") - parser.add_option("-u", "--user", dest="username", - help="Twitter user", metavar="USER", default=None) - parser.add_option("-w", "--password", dest="password", - help="Twitter password", metavar="PASSWORD", default=None) - parser.add_option("-n", "--new", dest="new", action="store_true", - help="new database", default=False) - parser.add_option("-d", "--debug", dest="debug", action="store_true", - help="debug", default=False) - - - - (options, args) = parser.parse_args() - - if options.debug: - print "OPTIONS : " - print repr(options) - - if options.screen_name is None: - print "No Screen name. Exiting" - sys.exit() - - if options.new and os.path.exists(options.filename): - os.remove(options.filename) - - conn = connect(options.filename) - conn.row_factory = Row - curs = conn.cursor() - - curs.execute("create table if not exists tweet_user ("+ ",".join(columns_user) +", created_at_ts integer);") - - curs.execute("create table if not exists tweet_tweet ("+ ",".join(columns_tweet) +", created_at_ts integer);") - curs.execute("create index if not exists id_index on tweet_tweet (id asc);"); - - curs.execute("select count(*) from tweet_tweet;") - res = curs.fetchone() - - old_total = res[0] - - twitter = twython.setup(username=options.username, password=options.password, headers="IRI enmi (python urllib)") - twitter = twython.Twython(twitter_token = "54ThDZhpEjokcMgHJOMnQA", twitter_secret = "wUoL9UL2T87tfc97R0Dff2EaqRzpJ5XGdmaN2XK3udA") - - search_results = None - page = options.page-1 - - while (page < options.total_page and ( search_results is None or len(search_results) > 0)): - page += 1 - try: - search_results = twitter.getUserTimeline(screen_name=options.screen_name, count=options.rpp, page=page) - except twython.TwythonError, (e): - print "NAME : "+ options.screen_name + " ERROR : " + repr(e.msg) - break - print "NAME : "+ options.screen_name +" PAGE : " + repr(page) + " tweet: " + repr(len(search_results)) + " (total page : " + unicode(options.total_page) + " : rpp : "+unicode(options.rpp)+")" - processPage(search_results, curs, options.debug) - - conn.commit() - - curs.execute("select count(*) from tweet_tweet;") - res = curs.fetchone() - - total = res[0] - - print "Tweet for " + options.screen_name + " : " + unicode(total - old_total) +", Tweet total : " + repr(total) - - conn.close() - - diff -r 9c57883dbb9d -r 8628c590f608 script/rest/getscreennames.py --- a/script/rest/getscreennames.py Wed May 08 01:24:19 2013 +0200 +++ b/script/rest/getscreennames.py Fri May 10 13:27:42 2013 +0200 @@ -1,11 +1,5 @@ -from sqlite3 import * -import datetime, time -import email.utils from optparse import OptionParser -import os.path -import os -import sys -import simplejson +from sqlite3 import connect, Row import re if __name__ == "__main__" : diff -r 9c57883dbb9d -r 8628c590f608 script/rest/search_twitter.py --- a/script/rest/search_twitter.py Wed May 08 01:24:19 2013 +0200 +++ b/script/rest/search_twitter.py Fri May 10 13:27:42 2013 +0200 @@ -1,10 +1,8 @@ -from iri_tweet import models, utils -from sqlalchemy.orm import sessionmaker +from iri_tweet import models, processor +from optparse import OptionParser import anyjson -import sqlite3 +import re import twitter -import re -from optparse import OptionParser def get_option(): @@ -59,7 +57,7 @@ print tweet tweet_str = anyjson.serialize(tweet) #invalidate user id - processor = utils.TwitterProcessor(tweet, tweet_str, None, session, None, options.token_filename) + processor = processor.TwitterProcessorStatus(json_dict=tweet, json_txt=tweet_str, source_id=None, session=session, consumer_token=None, access_token=None, token_filename=options.token_filename, user_query_twitter=False, logger=None) processor.process() session.flush() session.commit() diff -r 9c57883dbb9d -r 8628c590f608 script/utils/export_pad.py --- a/script/utils/export_pad.py Wed May 08 01:24:19 2013 +0200 +++ b/script/utils/export_pad.py Fri May 10 13:27:42 2013 +0200 @@ -1,23 +1,19 @@ #!/usr/bin/env python # coding=utf-8 +from dateutil.parser import parse as parse_date +from iri_tweet.utils import set_logging_options, set_logging, get_logger from lxml import etree -from iri_tweet.models import setup_database -from optparse import OptionParser #@UnresolvedImport -from sqlalchemy import Table, Column, BigInteger -from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query, - get_logger) +from optparse import OptionParser import anyjson import datetime +import functools import httplib2 import os.path -import re +import requests import sys import time -import uuid #@UnresolvedImport -from dateutil.parser import parse as parse_date -import json -import functools +import uuid class EtherpadRequestException(Exception): @@ -149,7 +145,7 @@ elif start_date and duration: end_date = start_date + datetime.timedelta(seconds=duration) - if start_date is None or ts is None: + if start_date is None or end_date is None: abort("No start date found") end_ts = None @@ -187,7 +183,7 @@ if cutting_name is None: cutting_name = "pad_%s" % pad_id - format = options.get('format','html') + output_format = options.get('format','html') ensemble_parent = None file_type = None @@ -248,10 +244,10 @@ etp_req = EtherpadRequest(base_url, api_key) - rev_count = et_req.getRevisionCount(pad_id) + rev_count = etp_req.getRevisionCount(pad_id) - version_range = range(1,rev_count+1, step) + version_range = range(1,rev_count+1, 1) #make sure that teh last version is exported if rev_count not in version_range: version_range.append(rev_count) @@ -259,12 +255,12 @@ data = None text = "" - - if format == "html": - data = etp_req.getHtml(padID=padID, rev=rev) + + if output_format == "html": + data = etp_req.getHtml(padID=pad_id, rev=rev) text = data.get("html", "") else: - data = etp_req.getText(padID=padID, rev=rev) + data = etp_req.getText(padID=pad_id, rev=rev) text = data.get("text","") pad_ts = data['timestamp'] @@ -273,9 +269,9 @@ continue if end_ts is not None and pad_ts > end_ts: - break + break - pad_dt = datetime.fromtimestamp(float(pad_ts)/1000.0) + pad_dt = datetime.datetime.fromtimestamp(float(pad_ts)/1000.0) pad_ts_rel = pad_ts - start_ts username = None @@ -293,7 +289,7 @@ etree.SubElement(element, u"abstract").text = unicode(text) meta_element = etree.SubElement(element, u'meta') - etree.SubElement(meta_element, "pad_url").text = etree.CDATA(unicode(etp_req.getPadUrl(padID))) + etree.SubElement(meta_element, "pad_url").text = etree.CDATA(unicode(etp_req.getPadUrl(pad_id))) etree.SubElement(meta_element, "revision").text = etree.CDATA(unicode(rev)) # sort by tc in diff -r 9c57883dbb9d -r 8628c590f608 script/utils/export_tweet_db.py --- a/script/utils/export_tweet_db.py Wed May 08 01:24:19 2013 +0200 +++ b/script/utils/export_tweet_db.py Fri May 10 13:27:42 2013 +0200 @@ -1,8 +1,11 @@ -from models import setup_database -from optparse import OptionParser #@UnresolvedImport -from sqlalchemy.orm import sessionmaker -from utils import set_logging_options, set_logging, TwitterProcessor, logger -import sqlite3 #@UnresolvedImport +from iri_tweet.models import setup_database +from iri_tweet.processor import TwitterProcessorStatus +from iri_tweet.utils import set_logging_options, set_logging +from optparse import OptionParser +import logging +import sqlite3 + +logger = logging.getLogger(__name__) # 'entities': "tweet_entity", @@ -33,7 +36,7 @@ fields_mapping = {} for i,res in enumerate(curs_in.execute("select json from tweet_tweet;")): logger.debug("main loop %d : %s" % (i, res[0])) #@UndefinedVariable - processor = TwitterProcessor(eval(res[0]), res[0], None, session, options.token_filename) + processor = TwitterProcessorStatus(json_dict=eval(res[0]), json_txt=res[0], source_id=None, session=session, consumer_token=None, access_token=None, token_filename=options.token_filename, user_query_twitter=False, logger=logger) processor.process() session.commit() logger.debug("main : %d tweet processed" % (i+1)) #@UndefinedVariable diff -r 9c57883dbb9d -r 8628c590f608 script/utils/get_stats.py --- a/script/utils/get_stats.py Wed May 08 01:24:19 2013 +0200 +++ b/script/utils/get_stats.py Fri May 10 13:27:42 2013 +0200 @@ -1,14 +1,13 @@ +from lxml import etree import httplib2 -import anyjson -from lxml import etree +import pprint import sys -import pprint def get_stats(url): h = httplib2.Http() - resp, content = h.request(url) + _, content = h.request(url) #project = anyjson.deserialize(content) root = etree.fromstring(content)