script/lib/iri_tweet/export_twitter_alchemy.py
changeset 122 4c3a15877f80
parent 84 b1029aa40ec3
child 203 8124cde38141
equal deleted inserted replaced
121:2b794b7901d6 122:4c3a15877f80
     1 #!/usr/bin/env python
     1 #!/usr/bin/env python
     2 # coding=utf-8
     2 # coding=utf-8
     3 
     3 
     4 from lxml import etree
     4 from lxml import etree
     5 from models import *
     5 from optparse import OptionParser #@UnresolvedImport
     6 from optparse import OptionParser
     6 from sqlalchemy import Table, Column, BigInteger, MetaData
     7 from sqlalchemy import Table, Column, Integer, BigInteger, String, MetaData, \
     7 from sqlalchemy.orm import sessionmaker
     8     ForeignKey
     8 from utils import parse_date, set_logging_options, set_logging, get_filter_query
     9 from sqlalchemy.orm import sessionmaker, mapper
     9 from models import setup_database
    10 from sqlalchemy.sql import select, or_
       
    11 from utils import *
       
    12 import datetime
    10 import datetime
    13 import email.utils
       
    14 import logging
    11 import logging
    15 import os
       
    16 import os.path
    12 import os.path
    17 import re
       
    18 import re
    13 import re
    19 import sys
    14 import sys
    20 import time
    15 import time
    21 import uuid
    16 import uuid #@UnresolvedImport
    22 import httplib2
    17 import httplib2
    23 import anyjson
    18 import anyjson
    24 import StringIO
       
    25 
    19 
    26 #class TweetExclude(object):
    20 #class TweetExclude(object):
    27 #    def __init__(self, id):
    21 #    def __init__(self, id):
    28 #        self.id = id
    22 #        self.id = id
    29 #        
    23 #        
    30 #    def __repr__(self):
    24 #    def __repr__(self):
    31 #        return "<TweetExclude(id=%d)>" % (self.id)
    25 #        return "<TweetExclude(id=%d)>" % (self.id)
    32 
    26 
    33 def parse_date(date_str):
       
    34     ts = email.utils.parsedate_tz(date_str)
       
    35     return datetime.datetime(*ts[0:7])
       
    36 
    27 
    37 def parse_polemics(tw, extended_mode):
    28 def parse_polemics(tw, extended_mode):
    38     """
    29     """
    39     parse polemics in text and return a list of polemic code. None if not polemic found
    30     parse polemics in text and return a list of polemic code. None if not polemic found
    40     """
    31     """
   106 
    97 
   107     (options, args, parser) = get_options()
    98     (options, args, parser) = get_options()
   108     
    99     
   109     set_logging(options)
   100     set_logging(options)
   110         
   101         
   111     logging.debug("OPTIONS : " + repr(options))
   102     logging.debug("OPTIONS : " + repr(options)) #@UndefinedVariable
   112     
   103     
   113     if len(sys.argv) == 1 or options.database is None:
   104     if len(sys.argv) == 1 or options.database is None:
   114         parser.print_help()
   105         parser.print_help()
   115         sys.exit(1)
   106         sys.exit(1)
   116     
   107     
   162                     'hashtags' : options.hashtag
   153                     'hashtags' : options.hashtag
   163                 }]
   154                 }]
   164             
   155             
   165             for params in parameters:
   156             for params in parameters:
   166                 
   157                 
   167                 logging.debug("PARAMETERS " + repr(params))
   158                 logging.debug("PARAMETERS " + repr(params)) #@UndefinedVariable
   168                 
   159                 
   169                 start_date_str = params.get("start_date",None)
   160                 start_date_str = params.get("start_date",None)
   170                 end_date_str = params.get("end_date", None)
   161                 end_date_str = params.get("end_date", None)
   171                 duration = params.get("duration", None)
   162                 duration = params.get("duration", None)
   172                 content_file = params.get("content_file", None)
   163                 content_file = params.get("content_file", None)
   192                 
   183                 
   193                 #to do : analyse situation ldt or iri ? filename set or not ?
   184                 #to do : analyse situation ldt or iri ? filename set or not ?
   194                 
   185                 
   195                 if content_file and content_file.find("http") == 0:
   186                 if content_file and content_file.find("http") == 0:
   196                     
   187                     
   197                     logging.debug("url : " + content_file)
   188                     logging.debug("url : " + content_file) #@UndefinedVariable
   198                     
   189                     
   199                     h = httplib2.Http()
   190                     h = httplib2.Http()
   200                     resp, content = h.request(content_file)
   191                     resp, content = h.request(content_file)
   201                     
   192                     
   202                     logging.debug("url response " + repr(resp) + " content " + repr(content))
   193                     logging.debug("url response " + repr(resp) + " content " + repr(content)) #@UndefinedVariable
   203                     
   194                     
   204                     project = anyjson.deserialize(content)
   195                     project = anyjson.deserialize(content)
   205                     root = etree.fromstring(project["ldt"])
   196                     root = etree.fromstring(project["ldt"])
   206                 
   197                 
   207                 elif content_file and os.path.exists(content_file):
   198                 elif content_file and os.path.exists(content_file):
   212                 
   203                 
   213                 if root is None:
   204                 if root is None:
   214                 
   205                 
   215                     root = etree.Element(u"iri")
   206                     root = etree.Element(u"iri")
   216                         
   207                         
   217                     project = etree.SubElement(root, u"project", {u"abstract":u"Twitter comments on ENMI",u"title":u"Twitter comments on ENMI 2009", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
   208                     project = etree.SubElement(root, u"project", {u"abstract":u"Polemics Tweets",u"title":u"Polemic Tweets", u"user":u"IRI Web", u"id":unicode(uuid.uuid4())})
   218                 
   209                 
   219                     medias = etree.SubElement(root, u"medias")
   210                     medias = etree.SubElement(root, u"medias")
   220                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   211                     media = etree.SubElement(medias, u"media", {u"pict":u"", u"src":unicode(options.content), u"video":unicode(options.video), u"id":unicode(options.content_id), u"extra":u""})
   221                     
   212                     
   222                     annotations = etree.SubElement(root, u"annotations")    
   213                     annotations = etree.SubElement(root, u"annotations")    
   254                             ensembles_node = etree.SubElement(body_node, u"ensembles")
   245                             ensembles_node = etree.SubElement(body_node, u"ensembles")
   255                         ensemble_parent = ensembles_node
   246                         ensemble_parent = ensembles_node
   256                     
   247                     
   257                     
   248                     
   258                 if ensemble_parent is None:
   249                 if ensemble_parent is None:
   259                     logging.error("Can not process file")
   250                     logging.error("Can not process file") #@UndefinedVariable
   260                     sys.exit()
   251                     sys.exit()
   261             
   252             
   262                 if options.replace:
   253                 if options.replace:
   263                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
   254                     for ens in ensemble_parent.iterchildren(tag=u"ensemble"):
   264                         if ens.get("id","").startswith("tweet_"):
   255                         if ens.get("id","").startswith("tweet_"):
   309                 
   300                 
   310                 if content_file and content_file.find("http") == 0:
   301                 if content_file and content_file.find("http") == 0:
   311                     
   302                     
   312                     project["ldt"] = output_data
   303                     project["ldt"] = output_data
   313                     body = anyjson.serialize(project)
   304                     body = anyjson.serialize(project)
   314                     logging.debug("write http " + content_file)
   305                     logging.debug("write http " + content_file) #@UndefinedVariable
   315                     logging.debug("write http " + repr(body))
   306                     logging.debug("write http " + repr(body)) #@UndefinedVariable
   316                     h = httplib2.Http()
   307                     h = httplib2.Http()
   317                     resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
   308                     resp, content = h.request(content_file, "PUT", headers={'content-type':'application/json'}, body=body)
   318                     logging.debug("write http " + repr(resp) + " content " + content)
   309                     logging.debug("write http " + repr(resp) + " content " + content) #@UndefinedVariable
   319                 else:
   310                 else:
   320                     if content_file and os.path.exists(content_file):
   311                     if content_file and os.path.exists(content_file):
   321                         dest_file_name = content_file 
   312                         dest_file_name = content_file 
   322                     else:
   313                     else:
   323                         dest_file_name = options.filename
   314                         dest_file_name = options.filename
   324             
   315             
   325                     logging.debug("WRITE : " + dest_file_name)
   316                     logging.debug("WRITE : " + dest_file_name) #@UndefinedVariable
   326                     output = open(dest_file_name, "w")
   317                     output = open(dest_file_name, "w")
   327                     output.write(output_data)
   318                     output.write(output_data)
   328                     output.flush()
   319                     output.flush()
   329                     output.close()
   320                     output.close()
   330                 
   321