add extended mode for tweet parsing
author Yves-Marie Haussonne <1218002+ymph@users.noreply.github.com>
Tue, 01 Feb 2011 18:49:43 +0100
changeset 39 c5d7dd0ec4e1
parent 38 4b99898f55de
child 40 49140e9ee14f
add extended mode for tweet parsing
.hgignore
script/lib/iri_tweet/export_twitter_alchemy.py
--- a/.hgignore	Mon Jan 31 11:47:45 2011 +0100
+++ b/.hgignore	Tue Feb 01 18:49:43 2011 +0100
@@ -21,3 +21,8 @@
 syntax: regexp
 \.DS_Store$
 \.pyc
+
+syntax: regexp
+^script/virtualenv/distribute-0\.6\.14\.tar\.gz$
+syntax: regexp
+^script/virtualenv/venv$
\ No newline at end of file
--- a/script/lib/iri_tweet/export_twitter_alchemy.py	Mon Jan 31 11:47:45 2011 +0100
+++ b/script/lib/iri_tweet/export_twitter_alchemy.py	Tue Feb 01 18:49:43 2011 +0100
@@ -34,6 +34,32 @@
     ts = email.utils.parsedate_tz(date_str)
     return datetime.datetime(*ts[0:7])
 
+def parse_polemics(tw, extended_mode):
+    """Return the distinct polemic codes found for tweet tw, or None when there are none.
+    Markers in tw.text map ++ to OK, -- to KO, ?? to Q and == to REF; extended_mode also flags Q and REF (see below).
+    """
+    polemics = {} 
+    for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
+        pol_link = {
+            '++' : u'OK',
+            '--' : u'KO',
+            '??' : u'Q',
+            '==' : u'REF'}[m.group(1)]
+        polemics[pol_link] = pol_link  # keyed by code, so a repeated marker is recorded once
+    
+    if extended_mode:
+        if "?" in tw.text:  # extended mode: a single "?" anywhere in the text is enough to flag Q
+            polemics["Q"] = "Q"
+        
+        for entity in tw.entity_list:
+            if entity.type == "entity_url":  # extended mode: any entity of type "entity_url" flags REF
+                polemics["REF"] = "REF" 
+    
+    if len(polemics) > 0:
+        return polemics.keys()
+    else:
+        return None
+
 def get_options():
     parser = OptionParser()
     parser.add_option("-f", "--file", dest="filename",
@@ -66,6 +92,9 @@
                       help="Replace tweet ensemble", metavar="REPLACE", default=False)
     parser.add_option("-L", "--list-conf", dest="listconf",
                       help="list of file to process", metavar="LIST_CONF", default=None)
+    parser.add_option("-E", "--extended", dest="extended_mode", action="store_true",
+                      help="Trigger polemic extended mode", metavar="EXTENDED", default=False)
+    
     
     set_logging_options(parser)
 
@@ -254,19 +283,13 @@
                             
                     meta_element = etree.SubElement(element, u'meta')
                     
-                    polemics_element = etree.Element(u'polemics')
-                    polemic_added = False
-                    for m in re.finditer("(\+\+|\-\-|\?\?|\=\=)",tw.text):
-                        polemic_added = True
-                        pol_link = {
-                            '++' : u'OK',
-                            '--' : u'KO',
-                            '??' : u'Q',
-                            '==' : u'REF'}[m.group(1)]
-                        etree.SubElement(polemics_element, u'polemic').text = pol_link
-                    if polemic_added:
+                    polemics_list = parse_polemics(tw, options.extended_mode)
+                    if polemics_list:
+                        polemics_element = etree.Element(u'polemics')
+                        for pol in polemics_list:
+                            etree.SubElement(polemics_element, u'polemic').text = pol
                         meta_element.append(polemics_element)
-                    
+
                     etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.original_json))
                 
                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)