script/lib/iri_tweet/export_twitter_alchemy.py
changeset 27 c3ea041c6cde
parent 23 2b17b26ca153
child 31 93fd53a97d6d
equal deleted inserted replaced
26:23a7bb04c6af 27:c3ea041c6cde
    17 import re
    17 import re
    18 import re
    18 import re
    19 import sys
    19 import sys
    20 import time
    20 import time
    21 import uuid
    21 import uuid
       
    22 import httplib2
       
    23 import anyjson
       
    24 import StringIO
    22 
    25 
    23 #class TweetExclude(object):
    26 #class TweetExclude(object):
    24 #    def __init__(self, id):
    27 #    def __init__(self, id):
    25 #        self.id = id
    28 #        self.id = id
    26 #        
    29 #        
   116             else:                        
   119             else:                        
   117                 parameters = [{
   120                 parameters = [{
   118                     'start_date': options.start_date,
   121                     'start_date': options.start_date,
   119                     'end_date' : options.end_date,
   122                     'end_date' : options.end_date,
   120                     'duration' : options.duration,
   123                     'duration' : options.duration,
   121                     'content_file' : otions.content_file
   124                     'content_file' : options.content_file
   122                     
   125                     
   123                 }]
   126                 }]
   124             
   127             
   125             for params in parameters:
   128             for params in parameters:
   126                 
   129                 
   146                 query_res = session.query(Tweet).join(EntityHashtag).join(Hashtag).filter(~Tweet.id.in_(select([tweet_exclude_table.c.id]))).filter(Hashtag.text.contains(options.hashtag)).filter(Tweet.created_at >=  start_date).filter(Tweet.created_at <=  end_date).all()
   149                 query_res = session.query(Tweet).join(EntityHashtag).join(Hashtag).filter(~Tweet.id.in_(select([tweet_exclude_table.c.id]))).filter(Hashtag.text.contains(options.hashtag)).filter(Tweet.created_at >=  start_date).filter(Tweet.created_at <=  end_date).all()
   147                                  
   150                                  
   148                 root = None
   151                 root = None
   149                 ensemble_parent = None
   152                 ensemble_parent = None
   150                 
   153                 
   151                 if content_file and os.path.exists(content_file):
   154                 # TODO: determine whether the file is ldt or iri, and whether a filename is set
   152             
   155                 
       
   156                 if content_file and content_file.find("http") == 0:
       
   157                     
       
   158                     logging.debug("url : " + content_file)
       
   159                     
       
   160                     h = httplib2.Http()
       
   161                     resp, content = h.request(content_file)
       
   162                     
       
   163                     logging.debug("url response " + repr(resp) + " content " + repr(content))
       
   164                     
       
   165                     project = anyjson.deserialize(content)
       
   166                     root = etree.fromstring(project["ldt"])
       
   167                 
       
   168                 elif content_file and os.path.exists(content_file):
       
   169 
   153                     doc = etree.parse(content_file)
   170                     doc = etree.parse(content_file)
   154                     root = doc.getroot()
   171                     root = doc.getroot()
   155                     
   172                     
   156                     ensemble_parent = root.xpath("//ensembles")[0]
   173                 
   157                 
   174                 if root is None:
   158                 else:
   175                 
   159                     root = etree.Element(u"iri")
   176                     root = etree.Element(u"iri")
   160                         
   177                         
   161                     project = etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
   178                     project = etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
   162                 
   179                 
   163                     medias = etree.SubElement(root, u"medias")
   180                     medias = etree.SubElement(root, u"medias")
   164                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   181                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   165                     
   182                     
   166                     annotations = etree.SubElement(root, u"annotations")    
   183                     annotations = etree.SubElement(root, u"annotations")    
   167                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
   184                     content = etree.SubElement(annotations, u"content", {u"id":unicode(options.content_id)})
   168                     ensemble_parent = content
   185                     ensemble_parent = content
       
   186                     
       
   187                 
       
   188                 if ensemble_parent is None:
       
   189                     file_type = None
       
   190                     for node in root:
       
   191                         if node.tag == "project":
       
   192                             file_type = "ldt"
       
   193                             break
       
   194                         elif node.tag == "head":
       
   195                             file_type = "iri"
       
   196                             break
       
   197                     
       
   198                     if file_type == "ldt":
       
   199                         media_nodes = root.xpath("//media")
       
   200                         if len(media_nodes) > 0:
       
   201                             media = media_nodes[0]
       
   202                             annotations_node = root.find(u"annotations")
       
   203                             if annotations_node is None:
       
   204                                 annotations_node = etree.SubElement(root, u"annotations")
       
   205                             content_node = annotations_node.find(u"content")
       
   206                             if content_node is None:
       
   207                                 content_node = etree.SubElement(annotations_node,u"content", id=media["id"])
       
   208                             ensemble_parent = content_node
       
   209                     elif file_type == "iri":
       
   210                         body_node = root.find(u"body")
       
   211                         if body_node is None:
       
   212                             body_node = etree.SubElement(root, u"body")
       
   213                         ensembles_node = body_node.find(u"ensembles")
       
   214                         if ensembles_node is None:
       
   215                             ensembles_node = etree.SubElement(body_node, u"ensembles")
       
   216                         ensemble_parent = ensembles_node
       
   217                     
       
   218                     
       
   219                 if ensemble_parent is None:
       
   220                     logging.error("Can not process file")
       
   221                     sys.exit()
   169             
   222             
   170                 if options.replace:
   223                 if options.replace:
   171                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
   224                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
   172                         if ens.get("id","").startswith("tweet_"):
   225                         if ens.get("id","").startswith("tweet_"):
   173                             ensemble_parent.remove(ens)
   226                             ensemble_parent.remove(ens)
   174             
   227             
   175                 ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter pour ENMI 2009"})
   228                 ensemble = etree.SubElement(ensemble_parent, u"ensemble", {u"id":u"tweet_" + unicode(uuid.uuid4()), u"title":u"Ensemble Twitter", u"author":u"IRI Web", u"abstract":u"Ensemble Twitter"})
   176                 decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
   229                 decoupage = etree.SubElement(ensemble, u"decoupage", {u"id": unicode(uuid.uuid4()), u"author": u"IRI Web"})
   177                 
   230                 
   178                 etree.SubElement(decoupage, u"title").text = unicode(options.name)
   231                 etree.SubElement(decoupage, u"title").text = unicode(options.name)
   179                 etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
   232                 etree.SubElement(decoupage, u"abstract").text = unicode(options.name)
   180                 
   233                 
   213                         etree.SubElement(polemics_element, u'polemic').text = pol_link
   266                         etree.SubElement(polemics_element, u'polemic').text = pol_link
   214                     if polemic_added:
   267                     if polemic_added:
   215                         meta_element.append(polemics_element)
   268                         meta_element.append(polemics_element)
   216                     
   269                     
   217                     etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.original_json))
   270                     etree.SubElement(meta_element, u"source", attrib={"url":u"http://dev.twitter.com", "mimetype":u"application/json"}).text = etree.CDATA(unicode(tw.original_json))
   218                     
   271                 
   219                 
   272                 output_data = etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True)  
   220                 if content_file and os.path.exists(content_file):
   273                 
   221                     dest_file_name = content_file 
   274                 if content_file and content_file.find("http") == 0:
       
   275                     
       
   276                     project["ldt"] = output_data
       
   277                     body = anyjson.serialize(project)
       
   278                     logging.debug("write http " + content_file)
       
   279                     logging.debug("write http " + repr(body))
       
   280                     h = httplib2.Http()
       
   281                     resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
       
   282                     logging.debug("write http " + repr(resp) + " content " + content)
   222                 else:
   283                 else:
   223                     dest_file_name = options.filename
   284                     if content_file and os.path.exists(content_file):
   224             
   285                         dest_file_name = content_file 
   225                 logging.debug("WRITE : " + dest_file_name)
   286                     else:
   226                 output = open(content_file, "w")
   287                         dest_file_name = options.filename
   227                 output.write(etree.tostring(root, encoding="utf-8", method="xml", pretty_print=True, xml_declaration=True))
   288             
   228                 output.flush()
   289                     logging.debug("WRITE : " + dest_file_name)
   229                 output.close()
   290                     output = open(content_file, "w")
       
   291                     output.write(output_data)
       
   292                     output.flush()
       
   293                     output.close()
   230                 
   294                 
   231         finally:
   295         finally:
   232             session.close()
   296             session.close()
   233     finally:
   297     finally:
   234         conn.close()
   298         conn.close()