web/ldt/ldt_utils/projectindexer.py
changeset 5 ae8593287883
equal deleted inserted replaced
4:7c994c98d1df 5:ae8593287883
       
     1 import tempfile
       
     2 import os
       
     3 import os.path
       
     4 import shutil
       
     5 import ldt.utils.xml
       
     6 from ldt import settings
       
     7 import xml
       
     8 import xml.dom
       
     9 import xml.dom.minidom
       
    10 import xml.dom.ext
       
    11 import xml.xpath
       
    12 import lucene
       
    13 from ldt.ldt_utils import STORE
       
    14 from ldt.ldt_utils import ANALYZER
       
    15 
       
    16 def Property(func):
       
    17     return property(**func()) 
       
    18 
       
    19 class ProjectIndexer(object):
       
    20     def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
       
    21         self.__projectList = projectList
       
    22         self.__decoupage_blacklist = decoupage_blackList
       
    23         self.__writer = writer
       
    24             
       
    25     @Property
       
    26     def decoupage_blacklist(): #@NoSelf
       
    27         doc = """get blacklist""" #@UnusedVariable
       
    28        
       
    29         def fget(self):
       
    30             if self.__decoupage_blacklist is None:
       
    31                 self.__decoupage_blacklist = ()
       
    32             return self.__decoupage_blacklist
       
    33            
       
    34         def fset(self, value):
       
    35             self.__decoupage_blacklist = value
       
    36            
       
    37         def fdel(self):
       
    38             del self.__decoupage_blacklist
       
    39            
       
    40         return locals()
       
    41                
       
    42     def index_all(self):
       
    43         for project in self.__projectList:
       
    44             self.index_project(project)
       
    45 
       
    46     def index_project(self, project):
       
    47         # ldt.utils.log.debug("Indexing project : "+str(project.ldt_id))
       
    48         ldt=project.ldt
       
    49         doc = xml.dom.minidom.parseString(ldt.encode( "utf-8" ))
       
    50         
       
    51         self.__writer.deleteDocuments(lucene.Term("ldt_id", project.ldt_id))
       
    52             
       
    53         con = xml.xpath.Context.Context(doc, 1, 1, None)
       
    54         res = xml.xpath.Evaluate("/iri/annotations/content", context=con)
       
    55 
       
    56         for content in res:
       
    57             contentId = content.getAttribute("id")
       
    58             
       
    59             res =xml.xpath.Evaluate("ensemble", content)
       
    60             for ensemble in res:
       
    61                 ensembleId = ensemble.getAttribute("id")
       
    62                 
       
    63                 for decoupageNode in ensemble.childNodes:
       
    64                     # ldt.utils.log.debug("Indexing project decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
       
    65                     if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage"  or decoupageNode.getAttribute("id") in self.decoupage_blacklist:
       
    66                         continue
       
    67                 
       
    68                     decoupId = decoupageNode.getAttribute("id")
       
    69                     res = xml.xpath.Evaluate("elements/element", decoupageNode)
       
    70                     for elementNode in res:
       
    71                         doc = lucene.Document()
       
    72                         elementId = elementNode.getAttribute("id")
       
    73                         tags = elementNode.getAttribute("tags")
       
    74                         
       
    75                         if tags is not None:                            
       
    76                             tags.replace(",", ";")
       
    77                         
       
    78                         if tags is None or len(tags) == 0:
       
    79                             tags = ""
       
    80                             restagnode = xml.xpath.Evaluate("tag/text()", elementNode)
       
    81                             for tagnode in restagnode:
       
    82                                 tags = tags + " ; " + tagnode.data
       
    83                                 
       
    84                         if tags is None or len(tags) == 0:
       
    85                             tags = ""
       
    86                             restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode)
       
    87                             for tagnode in restagnode:
       
    88                                 tags = tags + " ; " + tagnode.data                            
       
    89 
       
    90                         title = ""
       
    91                         for txtRes in xml.xpath.Evaluate("title/text()", elementNode): 
       
    92                             title = title + txtRes.data 
       
    93                 
       
    94                         abstract = ""
       
    95                         for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode): 
       
    96                             abstract = abstract + txtRes.data 
       
    97                             
       
    98                         doc.add(lucene.Field("ldt_id", project.ldt_id, lucene.Field.Store.YES, lucene.Field.Index.UN_TOKENIZED))              
       
    99                         doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.UN_TOKENIZED))
       
   100                         doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   101                         doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   102                         doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
       
   103                         doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
       
   104                         doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
       
   105                         doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
       
   106                         doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
       
   107             
       
   108                         self.__writer.addDocument(doc)
       
   109     
       
   110         self.__writer.flush()