web/ldt/ldt_utils/projectindexer.py
changeset 2 59311c28454f
parent 1 3a30d255c235
child 3 9e6b4dbefcbc
child 6 4d17de9ee64e
equal deleted inserted replaced
1:3a30d255c235 2:59311c28454f
     1 import tempfile
       
     2 import os
       
     3 import os.path
       
     4 import shutil
       
     5 import ldt.utils.xml
       
     6 from ldt import settings
       
     7 import lucene
       
     8 from ldt.ldt_utils import STORE
       
     9 from ldt.ldt_utils import ANALYZER
       
    10 import lxml.etree
       
    11 
       
    12 def Property(func):
       
    13     return property(**func()) 
       
    14 
       
    15 class ProjectIndexer(object):
       
    16     def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
       
    17         self.__projectList = projectList
       
    18         self.__decoupage_blacklist = decoupage_blackList
       
    19         self.__writer = writer
       
    20             
       
    21     @Property
       
    22     def decoupage_blacklist(): #@NoSelf
       
    23         doc = """get blacklist""" #@UnusedVariable
       
    24        
       
    25         def fget(self):
       
    26             if self.__decoupage_blacklist is None:
       
    27                 self.__decoupage_blacklist = ()
       
    28             return self.__decoupage_blacklist
       
    29            
       
    30         def fset(self, value):
       
    31             self.__decoupage_blacklist = value
       
    32            
       
    33         def fdel(self):
       
    34             del self.__decoupage_blacklist
       
    35            
       
    36         return locals()
       
    37                
       
    38     def index_all(self):
       
    39         for project in self.__projectList:
       
    40             self.index_project(project)
       
    41 
       
    42     def index_project(self, project):
       
    43         # ldt.utils.log.debug("Indexing project : "+str(project.ldt_id))
       
    44         
       
    45         ldt=project.ldt
       
    46         doc = lxml.etree.fromstring(ldt.encode( "utf-8" ))
       
    47  
       
    48         self.__writer.deleteDocuments(lucene.Term("ldt_id", project.ldt_id))
       
    49             
       
    50         res = doc.xpath("/iri/annotations/content")
       
    51         project.ldt.encode( "utf-8 " )
       
    52 
       
    53         for content in res:
       
    54             contentId = content.get("id")
       
    55  
       
    56             res =content.xpath("ensemble")
       
    57             for ensemble in res:
       
    58                 ensembleId = ensemble.get("id")
       
    59  
       
    60                 for decoupageNode in ensemble.getchildren():
       
    61                     # ldt.utils.log.debug("Indexing project decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
       
    62                     if decoupageNode.tag != "decoupage"  or decoupageNode.get("id") in self.decoupage_blacklist:
       
    63                         continue
       
    64                 
       
    65                     decoupId = decoupageNode.get("id")
       
    66                     res = decoupageNode.xpath("elements/element")
       
    67 
       
    68                     for elementNode in res:
       
    69                         doc = lucene.Document()
       
    70                         elementId = elementNode.get("id")
       
    71                         tags = elementNode.get("tags")
       
    72                         
       
    73                         if tags is not None:                            
       
    74                             tags.replace(",", ";")
       
    75                         
       
    76                         if tags is None or len(tags) == 0:
       
    77                             tags = ""
       
    78                             restagnode = elementNode.xpath("tag/text()")
       
    79                             for tagnode in restagnode:
       
    80                                 tags = tags + " ; " + tagnode.text()
       
    81                                 
       
    82                         if tags is None or len(tags) == 0:
       
    83                             tags = ""
       
    84                             restagnode = elementNode.xpath("tags/tag/text()")
       
    85                             for tagnode in restagnode:
       
    86                                 tags = tags + " ; " + tagnode.text()                          
       
    87 
       
    88                         title = ""
       
    89                         for txtRes in elementNode.xpath("title/text()"): 
       
    90                             title = title + txtRes.text()
       
    91                 
       
    92                         abstract = ""
       
    93                         for txtRes in elementNode.xpath("abstract/text()"): 
       
    94                             abstract = abstract + txtRes.text() 
       
    95                             
       
    96                         doc.add(lucene.Field("ldt_id", project.ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))              
       
    97                         doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
       
    98                         doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
    99                         doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   100                         doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   101                         doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   102                         doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   103                         doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   104                         doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
       
   105             
       
   106                         self.__writer.addDocument(doc)
       
   107     
       
   108         self.__writer.commit()