diff -r e652022fd1f7 -r 03d2aa7b4967 script/utils/export_twitter_alchemy.py
--- a/script/utils/export_twitter_alchemy.py	Tue Nov 17 12:15:00 2015 +0100
+++ b/script/utils/export_twitter_alchemy.py	Wed Nov 18 15:39:05 2015 +0100
@@ -44,7 +44,7 @@
         get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
     return res is not None
 
-def parse_polemics(tw, extended_mode):
+def parse_polemics_1(tw, extended_mode):
     """
     parse polemics in text and return a list of polemic code. None if not polemic found
     """
@@ -70,6 +70,51 @@
     else:
         return None
 
+def parse_polemics_2(tw, extended_mode):
+    """
+    Parse polemic markers in a tweet text (annotation protocol version 2).
+
+    The markers "++", "!!", "??" and "==" found in tw.text map to the
+    polemic codes OK, KO, Q and REF respectively.
+    If extended_mode is true, a "?" anywhere in the text also yields Q and
+    an entity of type "entity_url" in tw.entity_list also yields REF.
+
+    Return the list of distinct polemic codes found, None if no polemic
+    is found.
+    """
+    # marker -> polemic code, built once instead of once per regex match
+    pol_codes = {
+        '++' : u'OK',
+        '!!' : u'KO',
+        '??' : u'Q',
+        '==' : u'REF'}
+
+    polemics = {}
+    # raw string: the backslash escapes are meant for the regex engine
+    for m in re.finditer(r"(\+\+|\!\!|\?\?|\=\=)", tw.text):
+        pol_link = pol_codes[m.group(1)]
+        polemics[pol_link] = pol_link
+
+    if extended_mode:
+        if "?" in tw.text:
+            polemics["Q"] = "Q"
+
+        for entity in tw.entity_list:
+            if entity.type == "entity_url":
+                polemics["REF"] = "REF"
+
+    if polemics:
+        # list(): keys() is only a view on Python 3, callers expect a list
+        return list(polemics)
+    else:
+        return None
+
+# value of the --annotation-protocol option -> polemics parser to use
+protocol_version_map = {
+    "1" : parse_polemics_1,
+    "2" : parse_polemics_2
+}
+
 def get_options():
     parser = argparse.ArgumentParser(description="All date should be given using iso8601 format. 
 If no timezone is used, the date is considered as UTC")
@@ -78,6 +123,9 @@
                         help="write export to file", metavar="FILE", default="project.ldt")
     parser.add_argument("-d", "--database", dest="database",
                         help="Input database", metavar="DATABASE")
+    parser.add_argument("-a", "--annotation-protocol", dest="protocol_version",
+                        help="annotation protocol version", metavar="PROTOCOL_VERSION",
+                        default="2")
     parser.add_argument("-s", "--start-date", dest="start_date",
                         help="start date", metavar="START_DATE", default=None)
     parser.add_argument("-e", "--end-date", dest="end_date",
@@ -457,6 +505,7 @@
 
                 meta_element = etree.SubElement(element, u'meta')
 
+                parse_polemics = protocol_version_map.get(options.protocol_version, parse_polemics_2)
                 polemics_list = parse_polemics(tw, options.extended_mode)
                 if polemics_list:
                     polemics_element = etree.Element(u'polemics')