web/ldt/ldt/projectindexer.py
author ymh <ymh.work@gmail.com>
Tue, 08 Jun 2010 01:16:35 +0200
changeset 0 ecdfc63274bf
permissions -rw-r--r--
first import

import tempfile
import os
import os.path
import shutil
import ldt.utils.xml
from ldt import settings
import xml
import xml.dom
import xml.dom.minidom
import xml.dom.ext
import xml.xpath
import lucene
from ldt.ldt import STORE
from ldt.ldt import ANALYZER

def Property(func):
    """Build a ``property`` from the keyword arguments returned by *func*.

    *func* is called once with no arguments and must return a mapping whose
    keys are accepted by the built-in ``property`` (``fget``, ``fset``,
    ``fdel``, ``doc``).  This allows the accessors to be declared as nested
    functions and handed back with ``return locals()``.
    """
    accessors = func()
    return property(**accessors)

class ProjectIndexer(object):
    """Index the annotation elements of a list of projects with Lucene.

    For each project, the XML held in ``project.ldt`` is parsed and one
    Lucene document is written per ``element`` node found under
    ``/iri/annotations/content/ensemble/decoupage/elements``.
    """

    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        """Store the projects to index and the writer that receives them.

        projectList         -- iterable of objects exposing ``ldt`` (XML
                               text) and ``ldt_id`` attributes.
        writer              -- object providing ``deleteDocuments``,
                               ``addDocument`` and ``flush``.
        decoupage_blackList -- container of decoupage ids that must not be
                               indexed; ``None`` is treated as empty.
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # Normalise a missing blacklist to an empty tuple so that
            # membership tests on it are always valid.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    def index_all(self):
        """Index every project of the list given to the constructor."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """Replace the index entries of *project* with fresh ones.

        Documents whose ``ldt_id`` term equals ``project.ldt_id`` are
        deleted, one document is added per element of every decoupage that
        is not blacklisted, and the writer is flushed at the end.
        """
        ldt_dom = xml.dom.minidom.parseString(project.ldt.encode("utf-8"))

        # Remove the documents previously indexed for this project; this
        # only happens once the XML has been parsed successfully.
        self.__writer.deleteDocuments(lucene.Term("ldt_id", project.ldt_id))

        context = xml.xpath.Context.Context(ldt_dom, 1, 1, None)
        contents = xml.xpath.Evaluate("/iri/annotations/content", context=context)

        for content in contents:
            contentId = content.getAttribute("id")

            for ensemble in xml.xpath.Evaluate("ensemble", content):
                ensembleId = ensemble.getAttribute("id")

                for decoupageNode in ensemble.childNodes:
                    if not self._is_indexable_decoupage(decoupageNode):
                        continue

                    decoupId = decoupageNode.getAttribute("id")
                    for elementNode in xml.xpath.Evaluate("elements/element", decoupageNode):
                        self.__writer.addDocument(
                            self._build_document(project.ldt_id, contentId, ensembleId, decoupId, elementNode))

        self.__writer.flush()

    def _is_indexable_decoupage(self, node):
        """Return True if *node* is a ``decoupage`` element that is not blacklisted."""
        if node.nodeType != xml.dom.Node.ELEMENT_NODE:
            return False
        if node.tagName != "decoupage":
            return False
        return node.getAttribute("id") not in self.decoupage_blacklist

    def _text_of(self, expression, node, prefix=""):
        """Concatenate the data of the text nodes matched by *expression*.

        Each piece of text is preceded by *prefix*; an empty string is
        returned when nothing matches.
        """
        return "".join([prefix + textNode.data for textNode in xml.xpath.Evaluate(expression, node)])

    def _extract_tags(self, elementNode):
        """Return the tags of *elementNode* as a single string.

        The ``tags`` attribute is used when it is not empty, with commas
        turned into semicolons.  Otherwise the text of the ``tag`` children
        is used, and failing that the text of the ``tags/tag`` descendants,
        each tag being preceded by " ; ".
        """
        tags = elementNode.getAttribute("tags")
        if tags:
            # str.replace returns a new string: the result has to be kept,
            # otherwise the substitution is silently lost.
            return tags.replace(",", ";")

        for expression in ("tag/text()", "tags/tag/text()"):
            tags = self._text_of(expression, elementNode, " ; ")
            if tags:
                return tags
        return ""

    def _build_document(self, ldtId, contentId, ensembleId, decoupId, elementNode):
        """Create the Lucene document describing *elementNode*.

        The identifiers are stored (only ``ldt_id`` and ``iri_id`` are also
        searchable, as untokenized terms); ``tags``, ``title``, ``abstract``
        and their concatenation ``all`` are tokenized but not stored.
        """
        elementId = elementNode.getAttribute("id")
        tags = self._extract_tags(elementNode)
        title = self._text_of("title/text()", elementNode)
        abstract = self._text_of("abstract/text()", elementNode)

        document = lucene.Document()
        document.add(lucene.Field("ldt_id", ldtId, lucene.Field.Store.YES, lucene.Field.Index.UN_TOKENIZED))
        document.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.UN_TOKENIZED))
        document.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
        document.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
        document.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
        document.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
        document.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
        document.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
        document.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
        return document