web/ldt/ldt_utils/contentindexer.py
changeset 1 3a30d255c235
child 22 83b28fc0d731
equal deleted inserted replaced
0:40eddcc3d063 1:3a30d255c235
       
     1 import tempfile
       
     2 import os
       
     3 import os.path
       
     4 import shutil
       
     5 from ldt.utils import zipfileext
       
     6 import urllib
       
     7 # import ldt.utils.log
       
     8 import ldt.utils.xml
       
     9 from django.conf import settings
       
    10 from models import Content
       
    11 import fnmatch
       
    12 import uuid
       
    13 import shutil
       
    14 import lucene
       
    15 from ldt.ldt_utils import STORE
       
    16 from ldt.ldt_utils import ANALYZER
       
    17 import lxml.etree
       
    18 
       
    19 def Property(func):
       
    20     return property(**func()) 
       
    21 
       
    22 
       
    23 class ContentIndexer(object):
       
    24         
       
    25         def __init__(self, contentList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
       
    26                 self.__contentList = contentList
       
    27                 self.__decoupage_blacklist = decoupage_blackList
       
    28                 self.__writer = writer
       
    29                     
       
    30         @Property
       
    31         def decoupage_blacklist(): #@NoSelf
       
    32             doc = """get blacklist""" #@UnusedVariable
       
    33            
       
    34             def fget(self):
       
    35                 if self.__decoupage_blacklist is None:
       
    36                     self.__decoupage_blacklist = ()
       
    37                 return self.__decoupage_blacklist
       
    38                
       
    39             def fset(self, value):
       
    40                 self.__decoupage_blacklist = value
       
    41                
       
    42             def fdel(self):
       
    43                 del self.__decoupage_blacklist
       
    44                
       
    45             return locals()
       
    46                    
       
    47         def index_all(self):
       
    48             for content in self.__contentList:
       
    49                 self.index_content(content)
       
    50                 
       
    51         def index_content(self, content):
       
    52             url =content.iri_url()
       
    53             filepath = urllib.urlopen(url)
       
    54             doc = lxml.etree.fromstring(filepath) 
       
    55            
       
    56             self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
       
    57             
       
    58             res = doc.xpath("/iri/body/ensembles/ensemble")
       
    59 
       
    60             for ensemble in res:
       
    61                 ensembleId = ensemble.get(None,u"id")
       
    62                 
       
    63                 for decoupageNode in ensemble.getchildren():
       
    64                     if decoupageNode.tag != "decoupage"  or decoupageNode.get(None,u"id") in self.decoupage_blacklist:
       
    65 
       
    66                         continue
       
    67                     
       
    68                     decoupId = decoupageNode.get(None,u"id")
       
    69                     res = decoupageNode.xpath("elements/element")
       
    70                     for elementNode in res:
       
    71                         doc = lucene.Document()
       
    72                         elementId = elementNode.get(None,u"id")
       
    73                         tags = elementNode.get(None,u"tags")
       
    74                         
       
    75                         if tags is not None:                            
       
    76                             tags.replace(",", ";")
       
    77                         
       
    78                         if tags is None or len(tags) == 0:
       
    79                             tags = ""
       
    80                             restagnode = elementNode.xpath("tag/text()")
       
    81                             for tagnode in restagnode:
       
    82                                 tags = tags + " ; " + tagnode.text()
       
    83                                 
       
    84                         if tags is None or len(tags) == 0:
       
    85                             tags = ""
       
    86                             restagnode = elementNode.xpath("tags/tag/text()")
       
    87 
       
    88                             for tagnode in restagnode:
       
    89                                 tags = tags + " ; " + tagnode.text()
       
    90     
       
    91                         title = ""
       
    92                         for txtRes in elementNode.xpath("title/text()"): 
       
    93                             title = title + txtRes.text()
       
    94                 
       
    95                         abstract = ""
       
    96                         for txtRes in elementNode.xpath("abstract/text()"): 
       
    97                             abstract = abstract + txtRes.text()
       
    98                 
       
    99                         doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
       
   100                         doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   101                         doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   102                         doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))                        
       
   103                         doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   104                         doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   105                         doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   106                         doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   107 
       
   108                         seg = Segment(content=content,
       
   109                                       iri_id=content.iri_id,
       
   110                                       ensemble_id=ensembleId,
       
   111                                       cutting_id=decoupId,
       
   112                                       element_id=elementId,
       
   113                                       tags=tags,
       
   114                                       title=title,
       
   115                                       abstract=abstract,
       
   116                                       duration=duration,
       
   117                                       author=author,
       
   118                                       start_ts=start_ts,
       
   119                                       date=date_str)
       
   120                         seg.save()
       
   121 
       
   122             
       
   123                         self.__writer.addDocument(doc)
       
   124             
       
   125             self.__writer.commit()
       
   126             
       
   127             
       
   128 class ProjectIndexer(object):
       
   129         
       
   130         def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
       
   131                 self.__projectList = projectList
       
   132                 self.__decoupage_blacklist = decoupage_blackList
       
   133                 self.__writer = writer
       
   134                 
       
   135         @Property
       
   136         def decoupage_blacklist(): #@NoSelf
       
   137             doc = """get blacklist""" #@UnusedVariable
       
   138            
       
   139             def fget(self):
       
   140                 if self.__decoupage_blacklist is None:
       
   141                     self.__decoupage_blacklist = ()
       
   142                 return self.__decoupage_blacklist
       
   143                
       
   144             def fset(self, value):
       
   145                 self.__decoupage_blacklist = value
       
   146                
       
   147             def fdel(self):
       
   148                 del self.__decoupage_blacklist
       
   149                
       
   150             return locals()
       
   151                    
       
   152         def index_all(self):
       
   153             for project in self.__projectList:
       
   154                 self.index_project(project)
       
   155  
       
   156         def index_project(self, project):
       
   157             
       
   158             # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
       
   159             doc = lxml.etree.fromstring(project.ldt)
       
   160 
       
   161             self.__writer.deleteDocuments(lucene.Term("iri_id", project.iri_id))
       
   162             
       
   163             res = doc.xpath("/iri/annotations/content")
       
   164 
       
   165             for content in res:
       
   166                 contentId = content.get(None,u"id")
       
   167  
       
   168                 ensembleId = "ens_perso"
       
   169                 
       
   170                 for decoupageNode in content.getchildren():
       
   171                     # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
       
   172                     if decoupageNode.tag != "decoupage"  or decoupageNode.get(None,"id") in self.decoupage_blacklist:
       
   173                         continue
       
   174                     
       
   175                     decoupId = decoupageNode.get(None,u"id")
       
   176                     res = decoupageNode.xpath("elements/element")
       
   177                     for elementNode in res:
       
   178                         doc = lucene.Document()
       
   179                         elementId = elementNode.get(None,u"id")
       
   180                         tags = elementNode.get(None,u"tags")
       
   181                         
       
   182                         if tags is not None:                            
       
   183                             tags.replace(",", ";")
       
   184                         
       
   185                         if tags is None or len(tags) == 0:
       
   186                             tags = ""
       
   187                             restagnode = elementNode.xpath("tag/text()")
       
   188                             for tagnode in restagnode:
       
   189                                 tags = tags + " ; " + tagnode.text()
       
   190                                 
       
   191                         if tags is None or len(tags) == 0:
       
   192                             tags = ""
       
   193                             restagnode = elementNode.xpath("tags/tag/text()")
       
   194                             for tagnode in restagnode:
       
   195                                 tags = tags + " ; " + tagnode.text()                  
       
   196     
       
   197                         title = ""
       
   198                         for txtRes in elementNode.xpath("title/text()"): 
       
   199                             title = title + txtRes.text()
       
   200                 
       
   201                         abstract = ""
       
   202                         for txtRes in elementNode.xpath("abstract/text()"): 
       
   203                             abstract = abstract + txtRes.text()
       
   204                 
       
   205                         doc.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))              
       
   206                         doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
       
   207                         doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   208                         doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   209                         doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   210                         doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   211                         doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   212                         doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   213                         doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   214                         
       
   215                         seg = Segment(content=content,
       
   216                                       iri_id=content.iri_id,
       
   217                                       ensemble_id=ensembleId,
       
   218                                       cutting_id=decoupId,
       
   219                                       element_id=elementId,
       
   220                                       tags=tags,
       
   221                                       title=title,
       
   222                                       abstract=abstract,
       
   223                                       duration=duration,
       
   224                                       author=author,
       
   225                                       start_ts=start_ts,
       
   226                                       date=date_str)
       
   227                         seg.save()
       
   228                                     
       
   229                         self.__writer.addDocument(doc)
       
   230             
       
   231             self.__writer.commit()