import tempfile
import os
import os.path
import shutil
import ldt.utils.xml
from ldt import settings
import xml
import xml.dom
import xml.dom.minidom
import xml.dom.ext
import xml.xpath
import lucene
from ldt.ldt_utils import STORE
from ldt.ldt_utils import ANALYZER
def Property(func):
    """Build a ``property`` from a function that returns its accessors.

    ``func`` is called with no arguments and must return a mapping whose
    keys are keyword arguments accepted by ``property`` (``fget``,
    ``fset``, ``fdel``, ``doc``); typically it simply returns ``locals()``.
    """
    accessors = func()
    return property(**accessors)
class ProjectIndexer(object):
    """Index the annotation elements of a list of projects with Lucene.

    For each project, the documents previously indexed under the project's
    ``ldt_id`` are deleted, then one Lucene document is added for every
    ``elements/element`` node found below
    ``/iri/annotations/content/ensemble/decoupage`` in the project's XML.
    """

    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        """Remember the projects to index and the writer to index them with.

        projectList -- iterable of projects; ``index_project`` reads the
            ``ldt`` and ``ldt_id`` attributes of each one.
        writer -- object on which ``deleteDocuments``, ``addDocument`` and
            ``flush`` are called (presumably a Lucene index writer).
        decoupage_blackList -- ids of the decoupages that must not be
            indexed; defaults to ``settings.DECOUPAGE_BLACKLIST``.
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        # The locals of this function (doc, fget, fset, fdel) are returned
        # and expanded into keyword arguments of property() by the Property
        # decorator, so no other local name may be introduced here.
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # A missing blacklist is normalised to an empty tuple so that
            # callers can always use the ``in`` operator on the result.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    def index_all(self):
        """Index every project of the list given to the constructor."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """(Re)index all the elements of one project.

        project -- object exposing ``ldt`` (the project XML as text) and
            ``ldt_id`` (the value stored in the ``ldt_id`` field).

        The writer is flushed once, after the whole project was processed.
        """
        # ldt.utils.log.debug("Indexing project : "+str(project.ldt_id))
        ldt = project.ldt
        ldt_dom = xml.dom.minidom.parseString(ldt.encode("utf-8"))

        # Remove what was indexed for this project before adding it again.
        self.__writer.deleteDocuments(lucene.Term("ldt_id", project.ldt_id))

        con = xml.xpath.Context.Context(ldt_dom, 1, 1, None)
        contents = xml.xpath.Evaluate("/iri/annotations/content", context=con)

        for content in contents:
            contentId = content.getAttribute("id")

            for ensemble in xml.xpath.Evaluate("ensemble", content):
                ensembleId = ensemble.getAttribute("id")

                for decoupageNode in ensemble.childNodes:
                    # ldt.utils.log.debug("Indexing project decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
                    # Only non-blacklisted <decoupage> elements are indexed.
                    if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage" or decoupageNode.getAttribute("id") in self.decoupage_blacklist:
                        continue

                    decoupId = decoupageNode.getAttribute("id")

                    for elementNode in xml.xpath.Evaluate("elements/element", decoupageNode):
                        lucene_doc = self._build_document(
                            project.ldt_id, contentId, ensembleId, decoupId, elementNode)
                        self.__writer.addDocument(lucene_doc)

        self.__writer.flush()

    def _joined_text(self, expression, node, prefix=""):
        """Concatenate the ``data`` of the nodes matched by ``expression``
        relative to ``node``, each one preceded by ``prefix``."""
        return "".join([prefix + found.data
                        for found in xml.xpath.Evaluate(expression, node)])

    def _element_tags(self, elementNode):
        """Return the text indexed in the ``tags`` field for an element.

        The ``tags`` attribute is used when it is not empty, with its commas
        turned into semicolons. Otherwise the text of the ``tag`` children
        is used and, if that is empty too, the text of ``tags/tag``.
        """
        tags = elementNode.getAttribute("tags")
        if tags is not None:
            # str.replace returns a new string: the result must be kept.
            tags = tags.replace(",", ";")

        if tags is None or len(tags) == 0:
            tags = self._joined_text("tag/text()", elementNode, " ; ")

        if len(tags) == 0:
            tags = self._joined_text("tags/tag/text()", elementNode, " ; ")

        return tags

    def _build_document(self, ldtId, contentId, ensembleId, decoupId, elementNode):
        """Create the Lucene document describing one ``element`` node."""
        stored = lucene.Field.Store.YES
        not_stored = lucene.Field.Store.NO
        untokenized = lucene.Field.Index.UN_TOKENIZED
        tokenized = lucene.Field.Index.TOKENIZED
        not_indexed = lucene.Field.Index.NO

        elementId = elementNode.getAttribute("id")
        tags = self._element_tags(elementNode)
        title = self._joined_text("title/text()", elementNode)
        abstract = self._joined_text("abstract/text()", elementNode)

        lucene_doc = lucene.Document()
        # Identifiers are stored; only ldt_id and iri_id are also indexed.
        lucene_doc.add(lucene.Field("ldt_id", ldtId, stored, untokenized))
        lucene_doc.add(lucene.Field("iri_id", contentId, stored, untokenized))
        lucene_doc.add(lucene.Field("ensemble_id", ensembleId, stored, not_indexed))
        lucene_doc.add(lucene.Field("decoupage_id", decoupId, stored, not_indexed))
        lucene_doc.add(lucene.Field("element_id", elementId, stored, not_indexed))
        # Text fields are tokenized for indexing but not stored.
        lucene_doc.add(lucene.Field("tags", tags, not_stored, tokenized))
        lucene_doc.add(lucene.Field("title", title, not_stored, tokenized))
        lucene_doc.add(lucene.Field("abstract", abstract, not_stored, tokenized))
        lucene_doc.add(lucene.Field("all", " ".join([tags, title, abstract]), not_stored, tokenized))
        return lucene_doc