script/lib/iri_tweet/export_twitter_alchemy.py
changeset 39 c5d7dd0ec4e1
parent 31 93fd53a97d6d
child 82 210dc265c70f
equal deleted inserted replaced
38:4b99898f55de 39:c5d7dd0ec4e1
    31 #        return "<TweetExclude(id=%d)>" % (self.id)
    31 #        return "<TweetExclude(id=%d)>" % (self.id)
    32 
    32 
    33 def parse_date(date_str):
    33 def parse_date(date_str):
    34     ts = email.utils.parsedate_tz(date_str)
    34     ts = email.utils.parsedate_tz(date_str)
    35     return datetime.datetime(*ts[0:7])
    35     return datetime.datetime(*ts[0:7])
       
    36 
       
    37 def parse_polemics(tw, extended_mode):
       
    38     """
       
    39     Parse polemics in the tweet text and return a list of polemic codes, or None if no polemic is found.
       
    40     """
       
    41     polemics = {} 
       
    42     for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
       
    43         pol_link = {
       
    44             '++' : u'OK',
       
    45             '--' : u'KO',
       
    46             '??' : u'Q',
       
    47             '==' : u'REF'}[m.group(1)]
       
    48         polemics[pol_link] = pol_link
       
    49     
       
    50     if extended_mode:
       
    51         if "?" in tw.text:
       
    52             polemics["Q"] = "Q"
       
    53         
       
    54         for entity in tw.entity_list:
       
    55             if entity.type == "entity_url":
       
    56                 polemics["REF"] = "REF" 
       
    57     
       
    58     if len(polemics) > 0:
       
    59         return polemics.keys()
       
    60     else:
       
    61         return None
    36 
    62 
    37 def get_options():
    63 def get_options():
    38     parser = OptionParser()
    64     parser = OptionParser()
    39     parser.add_option("-f", "--file", dest="filename",
    65     parser.add_option("-f", "--file", dest="filename",
    40                       help="write export to file", metavar="FILE", default="project_enmi.ldt")
    66                       help="write export to file", metavar="FILE", default="project_enmi.ldt")
    64                       help="Cutting name", metavar="NAME", default=u"Tweets")
    90                       help="Cutting name", metavar="NAME", default=u"Tweets")
    65     parser.add_option("-R", "--replace", dest="replace", action="store_true",
    91     parser.add_option("-R", "--replace", dest="replace", action="store_true",
    66                       help="Replace tweet ensemble", metavar="REPLACE", default=False)
    92                       help="Replace tweet ensemble", metavar="REPLACE", default=False)
    67     parser.add_option("-L", "--list-conf", dest="listconf",
    93     parser.add_option("-L", "--list-conf", dest="listconf",
    68                       help="list of file to process", metavar="LIST_CONF", default=None)
    94                       help="list of file to process", metavar="LIST_CONF", default=None)
       
    95     parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
       
    96                       help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
       
    97     
    69     
    98     
    70     set_logging_options(parser)
    99     set_logging_options(parser)
    71 
   100 
    72     
   101     
    73     return parser.parse_args()
   102     return parser.parse_args()
   252                         if entity.type == u'entity_hashtag': 
   281                         if entity.type == u'entity_hashtag': 
   253                             etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
   282                             etree.SubElement(tags_node,u"tag").text = entity.hashtag.text
   254                             
   283                             
   255                     meta_element = etree.SubElement(element, u'meta')
   284                     meta_element = etree.SubElement(element, u'meta')
   256                     
   285                     
   257                     polemics_element = etree.Element(u'polemics')
   286                     polemics_list = parse_polemics(tw, options.extended_mode)
   258                     polemic_added = False
   287                     if polemics_list:
   259                     for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
   288                         polemics_element = etree.Element(u'polemics')
   260                         polemic_added = True
   289                         for pol in polemics_list:
   261                         pol_link = {
   290                             etree.SubElement(polemics_element, u'polemic').text = pol
   262                             '++' : u'OK',
       
   263                             '--' : u'KO',
       
   264                             '??' : u'Q',
       
   265                             '==' : u'REF'}[m.group(1)]
       
   266                         etree.SubElement(polemics_element, u'polemic').text = pol_link
       
   267                     if polemic_added:
       
   268                         meta_element.append(polemics_element)
   291                         meta_element.append(polemics_element)
   269                     
   292 
   270                     etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.original_json))
   293                     etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.original_json))
   271                 
   294                 
   272                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)  
   295                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)  
   273                 
   296                 
   274                 if content_file and content_file.find("http") == 0:
   297                 if content_file and content_file.find("http") == 0: