"""Lucene indexing of the elements found in project LDT XML documents.

Module path: web/ldt/ldt_utils/projectindexer.py
"""
import tempfile
import os
import os.path
import shutil
import ldt.utils.xml
from ldt import settings
import lucene
from ldt.ldt_utils import STORE
from ldt.ldt_utils import ANALYZER
import lxml.etree


def Property(func):
    """Build a ``property`` from a function returning its keyword arguments.

    ``func`` is called once, without arguments, and must return a mapping
    whose keys are accepted by the builtin ``property`` (``fget``, ``fset``,
    ``fdel``, ``doc``).
    """
    return property(**func())


def _joined_text(node, path):
    """Concatenate the string results of the XPath ``path`` applied to ``node``."""
    # lxml returns the results of a ``.../text()`` expression as string
    # objects, so they can be joined directly.
    return "".join(node.xpath(path))


def _element_tags(element):
    """Return the tag text to index for ``element``.

    The ``tags`` attribute is used when it is present and non-empty, with
    commas replaced by semicolons.  Otherwise the text of the ``tag``
    children is used, and failing that the text of the ``tags/tag``
    descendants, joined with " ; ".  Returns "" when nothing is found.
    """
    tags = element.get("tags")
    if tags:
        # str.replace returns a new string; the result has to be kept.
        return tags.replace(",", ";")

    for path in ("tag/text()", "tags/tag/text()"):
        found = element.xpath(path)
        if found:
            return " ; ".join(found)
    return ""


def _element_document(project, content_id, ensemble_id, decoupage_id, element):
    """Build the ``lucene.Document`` describing one ``element`` node.

    The identifiers are stored; ``ldt_id`` and ``iri_id`` are also indexed
    without analysis.  ``tags``, ``title``, ``abstract`` and their
    concatenation ``all`` are analyzed but not stored.
    """
    store = lucene.Field.Store
    index = lucene.Field.Index

    tags = _element_tags(element)
    title = _joined_text(element, "title/text()")
    abstract = _joined_text(element, "abstract/text()")

    document = lucene.Document()
    document.add(lucene.Field("ldt_id", project.ldt_id, store.YES, index.NOT_ANALYZED))
    document.add(lucene.Field("iri_id", content_id, store.YES, index.NOT_ANALYZED))
    document.add(lucene.Field("ensemble_id", ensemble_id, store.YES, index.NO))
    document.add(lucene.Field("decoupage_id", decoupage_id, store.YES, index.NO))
    document.add(lucene.Field("element_id", element.get("id"), store.YES, index.NO))
    document.add(lucene.Field("tags", tags, store.NO, index.ANALYZED))
    document.add(lucene.Field("title", title, store.NO, index.ANALYZED))
    document.add(lucene.Field("abstract", abstract, store.NO, index.ANALYZED))
    document.add(lucene.Field("all", " ".join([tags, title, abstract]), store.NO, index.ANALYZED))
    return document


class ProjectIndexer(object):
    """Index the elements of a list of projects through an index writer.

    Parameters:
        projectList: iterable of projects; each one must expose ``ldt``
            (the XML text of the project) and ``ldt_id``.
        writer: object providing ``deleteDocuments``, ``addDocument`` and
            ``commit`` (presumably a ``lucene.IndexWriter`` - confirm with
            the callers).
        decoupage_blackList: container of ``decoupage`` ids that must not be
            indexed; defaults to ``settings.DECOUPAGE_BLACKLIST``.  ``None``
            is treated as an empty blacklist.
    """

    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        # Builder consumed by ``Property``: it returns the keyword arguments
        # passed to the builtin ``property``.
        doc = """get blacklist"""

        def fget(self):
            # Normalise a missing blacklist to an empty tuple so that
            # membership tests always work.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return dict(fget=fget, fset=fset, fdel=fdel, doc=doc)

    def index_all(self):
        """Index every project of the list given at construction time."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """Replace the indexed documents of ``project`` and commit the writer.

        The documents already indexed under ``project.ldt_id`` are deleted,
        then one document is added for each ``element`` found under
        ``/iri/annotations/content/ensemble/decoupage/elements``, skipping
        the ``decoupage`` nodes whose id is in the blacklist.
        """
        # ldt.utils.log.debug("Indexing project : "+str(project.ldt_id))
        root = lxml.etree.fromstring(project.ldt.encode("utf-8"))

        self.__writer.deleteDocuments(lucene.Term("ldt_id", project.ldt_id))

        blacklist = self.decoupage_blacklist

        for content in root.xpath("/iri/annotations/content"):
            content_id = content.get("id")

            for ensemble in content.xpath("ensemble"):
                ensemble_id = ensemble.get("id")

                for decoupage in ensemble:
                    if decoupage.tag != "decoupage":
                        continue
                    decoupage_id = decoupage.get("id")
                    if decoupage_id in blacklist:
                        continue

                    for element in decoupage.xpath("elements/element"):
                        self.__writer.addDocument(
                            _element_document(project, content_id, ensemble_id, decoupage_id, element)
                        )

        self.__writer.commit()