web/ldt/ldt_utils/projectindexer.py
author ymh <ymh.work@gmail.com>
Tue, 26 Oct 2010 14:29:57 +0200
changeset 5 5044dbe8745f
parent 1 eb9188f2ee4f
permissions -rw-r--r--
correct problem when creating content

import tempfile
import os
import os.path
import shutil
import ldt.utils.xml
from ldt import settings
import lucene
from ldt.ldt_utils import STORE
from ldt.ldt_utils import ANALYZER
import lxml.etree

def Property(func):
    """Build a ``property`` from the mapping returned by calling *func*.

    *func* is called once with no arguments; every key/value pair of the
    mapping it returns is forwarded to ``property()`` as a keyword argument
    (``fget``, ``fset``, ``fdel``, ``doc``).
    """
    accessors = func()
    return property(**accessors)

class ProjectIndexer(object):
    """Index the annotation elements of a list of projects with Lucene.

    Each ``element`` node found in a project's XML is turned into one
    ``lucene.Document`` and handed to the writer supplied at construction.
    """

    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        """Store the projects to index, the index writer and the blacklist.

        projectList -- iterable of project objects; each must expose ``ldt``
                       (the XML text) and ``ldt_id``.
        writer -- object providing ``deleteDocuments``, ``addDocument`` and
                  ``commit`` (presumably a ``lucene.IndexWriter`` -- confirm
                  against the caller).
        decoupage_blackList -- container of decoupage ids that must not be
                               indexed; ``None`` is treated as empty.
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        doc = """Ids of the decoupages skipped while indexing (never None when read).""" #@UnusedVariable

        def fget(self):
            # Normalise a missing blacklist to an empty tuple so that callers
            # can always use the ``in`` operator on the returned value.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            # NOTE: after deletion, reading the property raises AttributeError
            # until a new value is assigned.
            del self.__decoupage_blacklist

        # Only doc/fget/fset/fdel may be local names here: the whole mapping
        # is forwarded to property() by the Property decorator.
        return locals()

    def index_all(self):
        """Index every project of the list given at construction."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """Replace the index entries of *project* with freshly built ones.

        The project XML is parsed first, then every document carrying the
        project's ``ldt_id`` is deleted, one document is added per
        ``/iri/annotations/content/ensemble/decoupage/elements/element``
        node (blacklisted decoupages excluded), and the writer is committed.
        """
        tree = lxml.etree.fromstring(project.ldt.encode("utf-8"))

        # Drop what was indexed earlier for this project so that indexing the
        # same project twice does not leave duplicate documents behind.
        self.__writer.deleteDocuments(lucene.Term("ldt_id", project.ldt_id))

        blacklist = self.decoupage_blacklist

        for content in tree.xpath("/iri/annotations/content"):
            content_id = content.get("id")

            for ensemble in content.xpath("ensemble"):
                ensemble_id = ensemble.get("id")

                for decoupage in ensemble:
                    # Test the tag first: children may also be comments or
                    # processing instructions, which are never indexed.
                    if decoupage.tag != "decoupage":
                        continue
                    decoupage_id = decoupage.get("id")
                    if decoupage_id in blacklist:
                        continue

                    for element in decoupage.xpath("elements/element"):
                        self.__writer.addDocument(
                            self._build_document(
                                project.ldt_id,
                                content_id,
                                ensemble_id,
                                decoupage_id,
                                element,
                            )
                        )

        self.__writer.commit()

    @staticmethod
    def _joined_text(node, path):
        """Return the concatenation of all text results of XPath *path*."""
        # lxml returns the results of a ``.../text()`` expression as string
        # objects, so they can be joined directly.
        return "".join(node.xpath(path))

    @staticmethod
    def _element_tags(element):
        """Return the tag text of *element*, or "" when it has none.

        The ``tags`` attribute is used when present and non-empty, with
        commas turned into semicolons.  Otherwise the text of the ``tag``
        children, then of the ``tags/tag`` descendants, is used, each value
        being prefixed with " ; ".
        """
        tags = element.get("tags")
        if tags:
            # str.replace returns a new string; the result must be kept.
            return tags.replace(",", ";")

        for path in ("tag/text()", "tags/tag/text()"):
            tags = "".join(" ; " + text for text in element.xpath(path))
            if tags:
                return tags
        return ""

    def _build_document(self, ldt_id, content_id, ensemble_id, decoupage_id, element):
        """Build the ``lucene.Document`` describing one ``element`` node."""
        store = lucene.Field.Store
        index = lucene.Field.Index

        tags = self._element_tags(element)
        title = self._joined_text(element, "title/text()")
        abstract = self._joined_text(element, "abstract/text()")

        document = lucene.Document()
        # Identifiers are stored so they can be read back from search hits;
        # only ldt_id and iri_id are indexed (as single untokenised terms).
        document.add(lucene.Field("ldt_id", ldt_id, store.YES, index.NOT_ANALYZED))
        document.add(lucene.Field("iri_id", content_id, store.YES, index.NOT_ANALYZED))
        document.add(lucene.Field("ensemble_id", ensemble_id, store.YES, index.NO))
        document.add(lucene.Field("decoupage_id", decoupage_id, store.YES, index.NO))
        document.add(lucene.Field("element_id", element.get("id"), store.YES, index.NO))
        # Textual fields are analysed for search but not stored.
        document.add(lucene.Field("tags", tags, store.NO, index.ANALYZED))
        document.add(lucene.Field("title", title, store.NO, index.ANALYZED))
        document.add(lucene.Field("abstract", abstract, store.NO, index.ANALYZED))
        document.add(lucene.Field("all", " ".join([tags, title, abstract]), store.NO, index.ANALYZED))
        return document