--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/web/ldt/ldt_utils/projectindexer.py Tue Jun 08 15:44:35 2010 +0200
@@ -0,0 +1,110 @@
+import tempfile
+import os
+import os.path
+import shutil
+import ldt.utils.xml
+from ldt import settings
+import xml
+import xml.dom
+import xml.dom.minidom
+import xml.dom.ext
+import xml.xpath
+import lucene
+from ldt.ldt_utils import STORE
+from ldt.ldt_utils import ANALYZER
+
+def Property(func):
+    """Decorator building a property from the mapping returned by *func*.
+
+    *func* is called once, without arguments, and its result is expanded as
+    the keyword arguments of the builtin ``property`` (``fget``, ``fset``,
+    ``fdel``, ``doc``). Returning ``locals()`` from a function that defines
+    exactly those names is the intended usage.
+    """
+    accessors = func()
+    return property(**accessors)
+
+class ProjectIndexer(object):
+    """Index the annotation elements of LDT projects with Lucene.
+
+    For each project, every ``element`` found under
+    ``/iri/annotations/content/ensemble/decoupage/elements`` in the project's
+    LDT XML becomes one Lucene document holding the ids of its enclosing
+    project, content, ensemble and decoupage plus its searchable text
+    (tags, title, abstract).
+    """
+
+    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
+        """Keep the indexing inputs.
+
+        projectList         -- iterable of projects; each one is read through
+                               its ``ldt`` (XML text) and ``ldt_id`` attributes.
+        writer              -- object used as the Lucene index writer
+                               (``deleteDocuments``, ``addDocument`` and
+                               ``flush`` are called on it).
+        decoupage_blackList -- ids of the decoupages that must not be indexed.
+        """
+        self.__projectList = projectList
+        self.__decoupage_blacklist = decoupage_blackList
+        self.__writer = writer
+
+    # Read/write/delete property over the blacklist. The nested function only
+    # exists to produce the fget/fset/fdel/doc mapping consumed by Property,
+    # so it must not define any other local name.
+    @Property
+    def decoupage_blacklist(): #@NoSelf
+        doc = """get blacklist""" #@UnusedVariable
+
+        def fget(self):
+            # A None blacklist is replaced by an empty tuple so that callers
+            # can always run a membership test on the returned value.
+            if self.__decoupage_blacklist is None:
+                self.__decoupage_blacklist = ()
+            return self.__decoupage_blacklist
+
+        def fset(self, value):
+            self.__decoupage_blacklist = value
+
+        def fdel(self):
+            del self.__decoupage_blacklist
+
+        return locals()
+
+    def index_all(self):
+        """Index every project of the list given to the constructor."""
+        for project in self.__projectList:
+            self.index_project(project)
+
+    def index_project(self, project):
+        """(Re)index one project.
+
+        The project's XML is parsed, every document already indexed under the
+        project's ``ldt_id`` is deleted, one document is added per annotation
+        element whose decoupage is not blacklisted, and the writer is flushed.
+        """
+        ldt_dom = xml.dom.minidom.parseString(project.ldt.encode("utf-8"))
+
+        # Parsing comes first: a malformed document raises before anything
+        # is removed from the index.
+        self.__writer.deleteDocuments(lucene.Term("ldt_id", project.ldt_id))
+
+        root_context = xml.xpath.Context.Context(ldt_dom, 1, 1, None)
+        contents = xml.xpath.Evaluate("/iri/annotations/content", context=root_context)
+
+        for content in contents:
+            contentId = content.getAttribute("id")
+
+            for ensemble in xml.xpath.Evaluate("ensemble", content):
+                ensembleId = ensemble.getAttribute("id")
+
+                # Only direct children of the ensemble are considered.
+                for decoupageNode in ensemble.childNodes:
+                    if not self._is_indexable_decoupage(decoupageNode):
+                        continue
+
+                    decoupId = decoupageNode.getAttribute("id")
+                    for elementNode in xml.xpath.Evaluate("elements/element", decoupageNode):
+                        document = self._build_document(project.ldt_id, contentId, ensembleId, decoupId, elementNode)
+                        self.__writer.addDocument(document)
+
+        self.__writer.flush()
+
+    def _is_indexable_decoupage(self, node):
+        """Tell whether *node* is a ``decoupage`` element that is not blacklisted."""
+        # tagName is only read once the node is known to be an element.
+        return (node.nodeType == xml.dom.Node.ELEMENT_NODE
+                and node.tagName == "decoupage"
+                and node.getAttribute("id") not in self.decoupage_blacklist)
+
+    @staticmethod
+    def _text_of(expression, node, prefix=""):
+        """Concatenate the text nodes selected by the XPath *expression*
+        relative to *node*, each one preceded by *prefix*."""
+        return "".join([prefix + text.data for text in xml.xpath.Evaluate(expression, node)])
+
+    def _element_tags(self, elementNode):
+        """Return the tag text of an element.
+
+        The ``tags`` attribute is used when it is not empty, with its commas
+        turned into semicolons. Otherwise the text of the ``tag`` children is
+        used, and failing that the text of the ``tags/tag`` descendants; in
+        both fallbacks every tag is preceded by ``" ; "``.
+        """
+        tags = elementNode.getAttribute("tags")
+
+        if tags is not None:
+            # str.replace returns a new string, so the result must be kept;
+            # otherwise the separators are never normalised.
+            tags = tags.replace(",", ";")
+
+        for expression in ("tag/text()", "tags/tag/text()"):
+            if tags:
+                break
+            tags = self._text_of(expression, elementNode, " ; ")
+
+        return tags
+
+    def _build_document(self, ldt_id, contentId, ensembleId, decoupId, elementNode):
+        """Build the Lucene document describing one annotation element."""
+        elementId = elementNode.getAttribute("id")
+        tags = self._element_tags(elementNode)
+        title = self._text_of("title/text()", elementNode)
+        abstract = self._text_of("abstract/text()", elementNode)
+
+        store = lucene.Field.Store
+        index = lucene.Field.Index
+        document = lucene.Document()
+
+        # Stored and indexed as single untokenized terms.
+        document.add(lucene.Field("ldt_id", ldt_id, store.YES, index.UN_TOKENIZED))
+        document.add(lucene.Field("iri_id", contentId, store.YES, index.UN_TOKENIZED))
+        # Stored but not indexed.
+        document.add(lucene.Field("ensemble_id", ensembleId, store.YES, index.NO))
+        document.add(lucene.Field("decoupage_id", decoupId, store.YES, index.NO))
+        document.add(lucene.Field("element_id", elementId, store.YES, index.NO))
+        # Tokenized and indexed but not stored; "all" gathers the three texts.
+        document.add(lucene.Field("tags", tags, store.NO, index.TOKENIZED))
+        document.add(lucene.Field("title", title, store.NO, index.TOKENIZED))
+        document.add(lucene.Field("abstract", abstract, store.NO, index.TOKENIZED))
+        document.add(lucene.Field("all", " ".join([tags, title, abstract]), store.NO, index.TOKENIZED))
+
+        return document
\ No newline at end of file