# Changeset summary (reviewer commentary; text before the "diff" line is not part of any hunk):
#
# 1. Adds parse_polemics(tw, extended_mode). It scans tw.text for the markers
#    "++", "--", "??" and "==" and maps them to the codes OK, KO, Q and REF.
#    Codes are collected as dict keys, so each code is reported at most once.
#    When extended_mode is true, any "?" in tw.text also yields Q, and any item
#    of tw.entity_list whose type is "entity_url" also yields REF.
#    Returns the dict keys, or None when no code was found.
# 2. Adds the -E/--extended command-line flag (dest "extended_mode",
#    store_true, default False).
# 3. Replaces the inline marker loop in the XML export code with a call to
#    parse_polemics(tw, options.extended_mode). The removed loop emitted one
#    <polemic> element per regex match (duplicates included); the new code
#    emits one element per distinct code and only builds the <polemics>
#    element when at least one code exists.
#
# NOTE(review): this patch was recovered from a whitespace-collapsed copy.
# Line breaks, indentation depth (especially in the third hunk) and the
# placement of whitespace-only lines were reconstructed to match the hunk
# line counts -- verify against the repository before applying.
diff -r 4b99898f55de -r c5d7dd0ec4e1 script/lib/iri_tweet/export_twitter_alchemy.py
--- a/script/lib/iri_tweet/export_twitter_alchemy.py	Mon Jan 31 11:47:45 2011 +0100
+++ b/script/lib/iri_tweet/export_twitter_alchemy.py	Tue Feb 01 18:49:43 2011 +0100
@@ -34,6 +34,32 @@
     ts = email.utils.parsedate_tz(date_str)
     return datetime.datetime(*ts[0:7])
 
+def parse_polemics(tw, extended_mode):
+    """
+    Parse polemics in text and return a list of polemic codes, or None if no polemic is found.
+    """
+    polemics = {}
+    for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
+        pol_link = {
+            '++' : u'OK',
+            '--' : u'KO',
+            '??' : u'Q',
+            '==' : u'REF'}[m.group(1)]
+        polemics[pol_link] = pol_link
+
+    if extended_mode:
+        if "?" in tw.text:
+            polemics["Q"] = "Q"
+
+        for entity in tw.entity_list:
+            if entity.type == "entity_url":
+                polemics["REF"] = "REF"
+
+    if len(polemics) > 0:
+        return polemics.keys()
+    else:
+        return None
+
 def get_options():
     parser = OptionParser()
     parser.add_option("-f", "--file", dest="filename",
@@ -66,6 +92,9 @@
                       help="Replace tweet ensemble", metavar="REPLACE", default=False)
     parser.add_option("-L", "--list-conf", dest="listconf",
                       help="list of file to process", metavar="LIST_CONF", default=None)
+    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
+                      help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
+
 
     set_logging_options(parser)
 
@@ -254,19 +283,13 @@
 
             meta_element = etree.SubElement(element, u'meta')
 
-            polemics_element = etree.Element(u'polemics')
-            polemic_added = False
-            for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
-                polemic_added = True
-                pol_link = {
-                    '++' : u'OK',
-                    '--' : u'KO',
-                    '??' : u'Q',
-                    '==' : u'REF'}[m.group(1)]
-                etree.SubElement(polemics_element, u'polemic').text = pol_link
-            if polemic_added:
+            polemics_list = parse_polemics(tw, options.extended_mode)
+            if polemics_list:
+                polemics_element = etree.Element(u'polemics')
+                for pol in polemics_list:
+                    etree.SubElement(polemics_element, u'polemic').text = pol
                 meta_element.append(polemics_element)
-
+            
             etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.original_json))
 
             output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)