web/ldt/ldt_utils/contentindexer.py
changeset 5 ae8593287883
child 10 84e31387a741
equal deleted inserted replaced
4:7c994c98d1df 5:ae8593287883
       
     1 import tempfile
       
     2 import os
       
     3 import os.path
       
     4 import shutil
       
     5 from ldt.utils import zipfileext
       
     6 import urllib
       
     7 # import ldt.utils.log
       
     8 import ldt.utils.xml
       
     9 from django.conf import settings
       
    10 from models import Content
       
    11 import xml
       
    12 import xml.dom
       
    13 import xml.dom.minidom
       
    14 import xml.dom.ext
       
    15 import xml.xpath
       
    16 import fnmatch
       
    17 import Ft
       
    18 import uuid
       
    19 import shutil
       
    20 import lucene
       
    21 from ldt.ldt_utils import STORE
       
    22 from ldt.ldt_utils import ANALYZER
       
    23 
       
    24 def Property(func):
       
    25     return property(**func()) 
       
    26 
       
    27 
       
    28 class ContentIndexer(object):
       
    29         
       
    30         def __init__(self, contentList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
       
    31                 self.__contentList = contentList
       
    32                 self.__decoupage_blacklist = decoupage_blackList
       
    33                 self.__writer = writer
       
    34                     
       
    35         @Property
       
    36         def decoupage_blacklist(): #@NoSelf
       
    37             doc = """get blacklist""" #@UnusedVariable
       
    38            
       
    39             def fget(self):
       
    40                 if self.__decoupage_blacklist is None:
       
    41                     self.__decoupage_blacklist = ()
       
    42                 return self.__decoupage_blacklist
       
    43                
       
    44             def fset(self, value):
       
    45                 self.__decoupage_blacklist = value
       
    46                
       
    47             def fdel(self):
       
    48                 del self.__decoupage_blacklist
       
    49                
       
    50             return locals()
       
    51                    
       
    52         def index_all(self):
       
    53             for content in self.__contentList:
       
    54                 self.index_content(content)
       
    55                 
       
    56         def index_content(self, content):
       
    57             url =content.iri_url()
       
    58             filepath = urllib.urlopen(url)
       
    59             doc = xml.dom.minidom.parse(filepath)
       
    60             doc = Ft.Xml.Domlette.ConvertDocument(doc)
       
    61                                    
       
    62             self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
       
    63             
       
    64             con = xml.xpath.Context.Context(doc, 1, 1, None)
       
    65             res = xml.xpath.Evaluate("/iri/body/ensembles/ensemble", context=con)
       
    66 
       
    67             for ensemble in res:
       
    68                 ensembleId = ensemble.getAttributeNS("id",None)
       
    69                 
       
    70                 for decoupageNode in ensemble.childNodes:
       
    71                     if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage"  or decoupageNode.getAttributeNS("id",None) in self.decoupage_blacklist:
       
    72                         continue
       
    73                     
       
    74                     decoupId = decoupageNode.getAttributeNS("id",None)
       
    75                     res = xml.xpath.Evaluate("elements/element", decoupageNode)
       
    76                     for elementNode in res:
       
    77                         doc = lucene.Document()
       
    78                         elementId = elementNode.getAttributeNS("id",None)
       
    79                         tags = elementNode.getAttributeNS("tags",None)
       
    80                         
       
    81                         if tags is not None:                            
       
    82                             tags.replace(",", ";")
       
    83                         
       
    84                         if tags is None or len(tags) == 0:
       
    85                             tags = ""
       
    86                             restagnode = xml.xpath.Evaluate("tag/text()", elementNode)
       
    87                             for tagnode in restagnode:
       
    88                                 tags = tags + " ; " + tagnode.data
       
    89                                 
       
    90                         if tags is None or len(tags) == 0:
       
    91                             tags = ""
       
    92                             restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode)
       
    93                             for tagnode in restagnode:
       
    94                                 tags = tags + " ; " + tagnode.data                            
       
    95     
       
    96                         title = ""
       
    97                         for txtRes in xml.xpath.Evaluate("title/text()", elementNode): 
       
    98                             title = title + txtRes.data 
       
    99                 
       
   100                         abstract = ""
       
   101                         for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode): 
       
   102                             abstract = abstract + txtRes.data 
       
   103                 
       
   104                         doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
       
   105                         doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   106                         doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   107                         doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))                        
       
   108                         doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   109                         doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   110                         doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   111                         doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   112 
       
   113                         seg = Segment(content=content,
       
   114                                       iri_id=content.iri_id,
       
   115                                       ensemble_id=ensembleId,
       
   116                                       cutting_id=decoupId,
       
   117                                       element_id=elementId,
       
   118                                       tags=tags,
       
   119                                       title=title,
       
   120                                       abstract=abstract,
       
   121                                       duration=duration,
       
   122                                       author=author,
       
   123                                       start_ts=start_ts,
       
   124                                       date=date_str)
       
   125                         seg.save()
       
   126 
       
   127             
       
   128                         self.__writer.addDocument(doc)
       
   129             
       
   130             self.__writer.commit()
       
   131             
       
   132             
       
   133 class ProjectIndexer(object):
       
   134         
       
   135         def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
       
   136                 self.__projectList = projectList
       
   137                 self.__decoupage_blacklist = decoupage_blackList
       
   138                 self.__writer = writer
       
   139                 
       
   140         @Property
       
   141         def decoupage_blacklist(): #@NoSelf
       
   142             doc = """get blacklist""" #@UnusedVariable
       
   143            
       
   144             def fget(self):
       
   145                 if self.__decoupage_blacklist is None:
       
   146                     self.__decoupage_blacklist = ()
       
   147                 return self.__decoupage_blacklist
       
   148                
       
   149             def fset(self, value):
       
   150                 self.__decoupage_blacklist = value
       
   151                
       
   152             def fdel(self):
       
   153                 del self.__decoupage_blacklist
       
   154                
       
   155             return locals()
       
   156                    
       
   157         def index_all(self):
       
   158             for project in self.__projectList:
       
   159                 self.index_project(project)
       
   160  
       
   161         def index_project(self, project):
       
   162             
       
   163             # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
       
   164             doc = xml.dom.minidom.parseString(project.ldt)
       
   165             doc = Ft.Xml.Domlette.ConvertDocument(doc) 
       
   166 
       
   167             self.__writer.deleteDocuments(lucene.Term("iri_id", project.iri_id))
       
   168             
       
   169             con = xml.xpath.Context.Context(doc, 1, 1, None)
       
   170             res = xml.xpath.Evaluate("/iri/annotations/content", context=con)
       
   171 
       
   172             for content in res:
       
   173                 contentId = content.getAttributeNS("id",None)
       
   174                 
       
   175                 ensembleId = "ens_perso"
       
   176                 
       
   177                 for decoupageNode in content.childNodes:
       
   178                     # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
       
   179                     if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage"  or decoupageNode.getAttributeNS("id",None) in self.decoupage_blacklist:
       
   180                         continue
       
   181                     
       
   182                     decoupId = decoupageNode.getAttributeNS("id",None)
       
   183                     res = xml.xpath.Evaluate("elements/element", decoupageNode)
       
   184                     for elementNode in res:
       
   185                         doc = lucene.Document()
       
   186                         elementId = elementNode.getAttributeNS("id",None)
       
   187                         tags = elementNode.getAttributeNS("tags",None)
       
   188                         
       
   189                         if tags is not None:                            
       
   190                             tags.replace(",", ";")
       
   191                         
       
   192                         if tags is None or len(tags) == 0:
       
   193                             tags = ""
       
   194                             restagnode = xml.xpath.Evaluate("tag/text()", elementNode)
       
   195                             for tagnode in restagnode:
       
   196                                 tags = tags + " ; " + tagnode.data
       
   197                                 
       
   198                         if tags is None or len(tags) == 0:
       
   199                             tags = ""
       
   200                             restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode)
       
   201                             for tagnode in restagnode:
       
   202                                 tags = tags + " ; " + tagnode.data                            
       
   203     
       
   204                         title = ""
       
   205                         for txtRes in xml.xpath.Evaluate("title/text()", elementNode): 
       
   206                             title = title + txtRes.data 
       
   207                 
       
   208                         abstract = ""
       
   209                         for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode): 
       
   210                             abstract = abstract + txtRes.data 
       
   211                 
       
   212                         doc.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))              
       
   213                         doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
       
   214                         doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   215                         doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   216                         doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   217                         doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   218                         doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   219                         doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   220                         doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   221                         
       
   222                         seg = Segment(content=content,
       
   223                                       iri_id=content.iri_id,
       
   224                                       ensemble_id=ensembleId,
       
   225                                       cutting_id=decoupId,
       
   226                                       element_id=elementId,
       
   227                                       tags=tags,
       
   228                                       title=title,
       
   229                                       abstract=abstract,
       
   230                                       duration=duration,
       
   231                                       author=author,
       
   232                                       start_ts=start_ts,
       
   233                                       date=date_str)
       
   234                         seg.save()
       
   235                                     
       
   236                         self.__writer.addDocument(doc)
       
   237             
       
   238             self.__writer.commit()