script/utils/export_twitter_alchemy.py
changeset 1295 03d2aa7b4967
parent 1153 02722ce55cf8
child 1308 ef42d4f12cfc
equal deleted inserted replaced
1294:e652022fd1f7 1295:03d2aa7b4967
    42     res = reg.search(item)
    42     res = reg.search(item)
    43     if res:
    43     if res:
    44         get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
    44         get_logger().debug("re_fn : " + repr(expr) + "~" + repr(item)) #@UndefinedVariable
    45     return res is not None
    45     return res is not None
    46 
    46 
    47 def parse_polemics(tw, extended_mode):
    47 def parse_polemics_1(tw, extended_mode):
    48     """
    48     """
    49     parse polemics in text and return a list of polemic code. None if not polemic found
    49     parse polemics in text and return a list of polemic code. None if not polemic found
    50     """
    50     """
    51     polemics = {}
    51     polemics = {}
    52     for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
    52     for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
    68     if len(polemics) > 0:
    68     if len(polemics) > 0:
    69         return polemics.keys()
    69         return polemics.keys()
    70     else:
    70     else:
    71         return None
    71         return None
    72 
    72 
       
    def parse_polemics_2(tw, extended_mode):
        """
        Parse the polemic markers found in a tweet (annotation protocol version 2).

        The tweet text is scanned for the two-character markers ``++`` (OK),
        ``!!`` (KO), ``??`` (Q) and ``==`` (REF).

        When ``extended_mode`` is true, two extra rules apply:
        a single ``?`` anywhere in the text counts as ``Q``, and any item of
        ``tw.entity_list`` whose ``type`` equals ``"entity_url"`` counts as ``REF``.

        :param tw: object exposing ``text`` and, in extended mode, ``entity_list``.
        :param extended_mode: truthy to enable the extra Q / REF detection.
        :return: a real list of the distinct polemic codes, in order of first
                 appearance, or None when no polemic was found.
        """
        # Marker -> polemic code. Built once per call instead of once per match.
        marker_codes = {
            '++': u'OK',
            '!!': u'KO',
            '??': u'Q',
            '==': u'REF',
        }

        # A list (not a dict) keeps first-appearance order on every Python version
        # and is returned as-is, so callers always get a list rather than a dict view.
        found = []

        # Raw string: "\!" and "\=" are invalid escape sequences in a plain literal.
        for m in re.finditer(r"(\+\+|!!|\?\?|==)", tw.text):
            code = marker_codes[m.group(1)]
            if code not in found:
                found.append(code)

        if extended_mode:
            # A lone question mark is enough to flag a question.
            if "?" in tw.text and "Q" not in found:
                found.append("Q")

            # Any URL entity attached to the tweet counts as a reference.
            has_url = any(entity.type == "entity_url" for entity in tw.entity_list)
            if has_url and "REF" not in found:
                found.append("REF")

        return found if found else None
       
    98 
       
    # Dispatch table: annotation protocol version string -> polemic parser function.
    protocol_version_map = {
        "1": parse_polemics_1,
        "2": parse_polemics_2,
    }
       
   103 
    73 def get_options():
   104 def get_options():
    74 
   105 
    75     parser = argparse.ArgumentParser(description="All date should be given using iso8601 format. If no timezone is used, the date is considered as UTC")
   106     parser = argparse.ArgumentParser(description="All date should be given using iso8601 format. If no timezone is used, the date is considered as UTC")
    76 
   107 
    77     parser.add_argument("-f", "--file", dest="filename",
   108     parser.add_argument("-f", "--file", dest="filename",
    78                       help="write export to file", metavar="FILE", default="project.ldt")
   109                       help="write export to file", metavar="FILE", default="project.ldt")
    79     parser.add_argument("-d", "--database", dest="database",
   110     parser.add_argument("-d", "--database", dest="database",
    80                       help="Input database", metavar="DATABASE")
   111                       help="Input database", metavar="DATABASE")
       
   112     parser.add_argument("-a", "--annotation-protocol", dest="protocol_version",
       
   113                       help="annotation protocol version", metavar="PROTOCOL_VERSION",
       
   114                       default="2")
    81     parser.add_argument("-s", "--start-date", dest="start_date",
   115     parser.add_argument("-s", "--start-date", dest="start_date",
    82                       help="start date", metavar="START_DATE", default=None)
   116                       help="start date", metavar="START_DATE", default=None)
    83     parser.add_argument("-e", "--end-date", dest="end_date",
   117     parser.add_argument("-e", "--end-date", dest="end_date",
    84                       help="end date", metavar="END_DATE", default=None)
   118                       help="end date", metavar="END_DATE", default=None)
    85     parser.add_argument("-I", "--content-file", dest="content_file",
   119     parser.add_argument("-I", "--content-file", dest="content_file",
   455                         if entity.type == u'entity_hashtag':
   489                         if entity.type == u'entity_hashtag':
   456                             etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
   490                             etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
   457 
   491 
   458                     meta_element = etree.SubElement(element, u'meta')
   492                     meta_element = etree.SubElement(element, u'meta')
   459 
   493 
       
   494                     parse_polemics = protocol_version_map.get(options.protocol_version, parse_polemics_2)
   460                     polemics_list = parse_polemics(tw, options.extended_mode)
   495                     polemics_list = parse_polemics(tw, options.extended_mode)
   461                     if polemics_list:
   496                     if polemics_list:
   462                         polemics_element = etree.Element(u'polemics')
   497                         polemics_element = etree.Element(u'polemics')
   463                         for pol in polemics_list:
   498                         for pol in polemics_list:
   464                             etree.SubElement(polemics_element, u'polemic').text = pol
   499                             etree.SubElement(polemics_element, u'polemic').text = pol