diff -r bc29a6fbb8e8 -r 67a0cee0077f script/utils/export_twitter_alchemy.py
--- a/script/utils/export_twitter_alchemy.py	Wed Jan 16 05:04:23 2013 +0100
+++ b/script/utils/export_twitter_alchemy.py	Wed Jan 16 18:25:10 2013 +0100
@@ -4,7 +4,8 @@
 from lxml import etree
 from iri_tweet.models import setup_database, Tweet, User
 from optparse import OptionParser #@UnresolvedImport
-from sqlalchemy import Table, Column, BigInteger
+from sqlalchemy import Table, Column, BigInteger, event, bindparam
+from sqlalchemy.sql import select, func
 from iri_tweet.utils import (set_logging_options, set_logging, get_filter_query,
     get_logger)
 import anyjson
@@ -26,7 +27,13 @@
 
 LDT_CONTENT_REST_API_PATH = "api/ldt/1.0/contents/"
 LDT_PROJECT_REST_API_PATH = "api/ldt/1.0/projects/"
-
+
+def re_fn(expr, item):
+    reg = re.compile(expr, re.I)
+    res = reg.search(item)
+    if res:
+        get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
+    return res is not None
 
 def parse_polemics(tw, extended_mode):
     """
@@ -125,7 +132,10 @@
     engine, metadata, Session = setup_database(conn_str, echo=((options.verbose-options.quiet)>0), create_all = False)
     conn = None
     try :
-        conn = engine.connect()
+        conn = engine.connect()
+        @event.listens_for(conn, "begin")
+        def do_begin(conn):
+            conn.connection.create_function('regexp', 2, re_fn)
         session = None
         try :
             session = Session(bind=conn)
@@ -146,19 +156,21 @@
                 exclude_query = session.query(Tweet)
                 filter_obj = Tweet
                 filter_field = res.group('field')
-                if filter_field.startswith("user_"):
-                    exclude_query = exclude_query.join(User)
+                if filter_field.startswith("user__"):
+                    exclude_query = exclude_query.outerjoin(User, Tweet.user_id==User.id)
                     filter_obj = User
-                    filter_field = filter_field[len("user_"):]
-
+                    filter_field = filter_field[len("user__"):]
                 if res.group('op') == "=":
-                    exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field) == res.group('value'))
+                    exclude_query = exclude_query.filter(getattr(filter_obj, filter_field) == res.group('value'))
                 else:
-                    exclude_query = session.query(Tweet).filter(getattr(filter_obj, filter_field).like(res.group('value')))
+                    exclude_query = exclude_query.filter(getattr(filter_obj, filter_field).op('regexp')(res.group('value')))
+                test_query = select([func.count()]).where(tweet_exclude_table.c.id==bindparam('t_id'))
 
                 for t in exclude_query.all():
-                    conn.execute(tei.values(id=t.id))
+                    get_logger().debug("t : " + repr(t))
+                    if conn.execute(test_query, t_id=t.id).fetchone()[0] == 0:
+                        conn.execute(tei.values(id=t.id))
 
 
                 user_whitelist_file = options.user_whitelist
                 user_whitelist = None
@@ -175,6 +187,7 @@
                     params['content_file_write'] = snode.text
                 elif snode.tag == "project_id":
                     params['content_file'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
+                    params['content_file_write'] = options.base_url + LDT_PROJECT_REST_API_PATH + snode.text + "/?format=json"
                     params['project_id'] = snode.text
                 elif snode.tag == "start_date":
                     params['start_date'] = snode.text
@@ -237,9 +250,10 @@
 
                 get_logger().debug("url : " + content_file) #@UndefinedVariable
                 r = requests.get(content_file, params=post_param)
-                get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
-                project = r.json()
-                root = etree.fromstring(project["ldt"])
+                #get_logger().debug("url response " + repr(r) + " content " + repr(r.text)) #@UndefinedVariable
+                project = r.json()
+                text_match = re.match(r"\<\?\s*xml.*?\?\>(.*)", project['ldt'], re.I|re.S)
+                root = etree.fromstring(text_match.group(1) if text_match else project['ldt'])
             elif content_file and os.path.exists(content_file):
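
Reviewer note: below is a minimal standalone sketch, not part of the patch, of the two techniques the change relies on. Table names, column names and the pattern are illustrative only, and the sketch is written against the 0.x-era SQLAlchemy API the script already uses (it would need small adjustments for recent SQLAlchemy). SQLite has no built-in REGEXP implementation, so a Python regexp() function is registered on the raw DBAPI connection from the connection's "begin" event, after which Column.op('regexp') works in filters; a count query built once with bindparam() is then executed per row to skip ids already present in the exclude table.

import re

import sqlalchemy as sa
from sqlalchemy import event, bindparam
from sqlalchemy.sql import select, func

engine = sa.create_engine("sqlite:///:memory:")
metadata = sa.MetaData()
tweets = sa.Table("tweets", metadata,
                  sa.Column("id", sa.Integer, primary_key=True),
                  sa.Column("text", sa.Text))
tweet_exclude = sa.Table("tweet_exclude", metadata,
                         sa.Column("id", sa.Integer, primary_key=True))
metadata.create_all(engine)

def re_fn(expr, item):
    # SQLite evaluates "item REGEXP expr" by calling the user function regexp(expr, item)
    return re.search(expr, item, re.I) is not None

conn = engine.connect()

@event.listens_for(conn, "begin")
def do_begin(conn):
    # conn.connection is the raw sqlite3 DBAPI connection
    conn.connection.create_function('regexp', 2, re_fn)

conn.execute(tweets.insert(), [{"id": 1, "text": "Hello world"},
                               {"id": 2, "text": "goodbye"}])

# count of rows already in the exclude table for a given id, bound per execution
test_query = select([func.count()]).where(tweet_exclude.c.id == bindparam('t_id'))

with conn.begin():  # fires the "begin" listener above, registering regexp()
    matching = select([tweets.c.id]).where(tweets.c.text.op('regexp')('^hel+o'))
    for (tweet_id,) in conn.execute(matching).fetchall():
        # only insert ids that are not already excluded
        if conn.execute(test_query, t_id=tweet_id).fetchone()[0] == 0:
            conn.execute(tweet_exclude.insert().values(id=tweet_id))

print(conn.execute(select([tweet_exclude.c.id])).fetchall())  # [(1,)]

Registering the function from the connection's "begin" event ties it to the single connection the script keeps open; the engine-level "connect" event is another common place to do the same registration.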