"""Lucene indexing of the annotation segments of contents and projects.

Origin: web/ldt_utils/ldt/contentindexer.py (added by changeset 7c994c98d1df).

Two indexers are provided:

* ``ContentIndexer`` downloads the XML description of each content
  (``content.iri_url()``) and indexes ``/iri/body/ensembles/ensemble``.
* ``ProjectIndexer`` parses the XML held in ``project.ldt`` and indexes
  ``/iri/annotations/content``.

For every ``elements/element`` node found under a non-blacklisted
``decoupage`` node, one Lucene document is added to the writer and one
``Segment`` row is saved.
"""
import tempfile
import os
import os.path
import shutil
from ldt.utils import zipfileext
import urllib
# import ldt.utils.log
import ldt.utils.xml
from django.conf import settings
# NOTE(review): ``Segment`` was used below without being imported anywhere;
# it is assumed to live next to ``Content`` in ``models`` -- confirm.
from models import Content, Segment
import xml
import xml.dom
import xml.dom.minidom
import xml.dom.ext
import xml.xpath
import fnmatch
import Ft
# ``import Ft`` alone does not guarantee that the sub-package used below
# (Ft.Xml.Domlette.ConvertDocument) is loaded, so import it explicitly.
import Ft.Xml.Domlette
import uuid
import lucene
from ldt.ldt_utils import STORE
from ldt.ldt_utils import ANALYZER


def Property(func):
    """Build a ``property`` from the accessors returned by *func*.

    *func* is called once, without arguments, and must return a mapping
    whose keys are keyword arguments accepted by ``property`` (``fget``,
    ``fset``, ``fdel``, ``doc``).  The functions decorated in this module
    simply return ``locals()``.
    """
    return property(**func())


def _attribute(node, name):
    """Return the value of the un-namespaced attribute *name* of *node*.

    The DOM signature is ``getAttributeNS(namespaceURI, localName)``; the
    previous code passed the two arguments in the opposite order, which
    asks for an attribute in namespace *name* with no local name.
    """
    return node.getAttributeNS(None, name)


def _int_attribute(node, name, default):
    """Return attribute *name* of *node* as an int, or *default*.

    *default* is returned when the attribute is missing, empty or not a
    number.  The value goes through ``float`` first so that "12.0" is
    accepted.
    """
    try:
        return int(float(_attribute(node, name)))
    except (TypeError, ValueError):
        return default


def _joined_text(node, expression):
    """Concatenate the data of all text nodes matched by *expression*."""
    return "".join([text.data for text in xml.xpath.Evaluate(expression, node)])


def _element_tags(element_node):
    """Return the tag string of *element_node*.

    The ``tags`` attribute is used when it is non-empty, with commas turned
    into semicolons.  Otherwise the text of the ``tag`` children, then of
    the ``tags/tag`` descendants, is used, each entry prefixed by " ; ".
    An empty string is returned when no tag is found.
    """
    tags = _attribute(element_node, "tags")
    if tags:
        # str.replace returns a new string; the result used to be discarded.
        return tags.replace(",", ";")

    for expression in ("tag/text()", "tags/tag/text()"):
        found = xml.xpath.Evaluate(expression, element_node)
        tags = "".join([" ; " + tag_node.data for tag_node in found])
        if tags:
            return tags
    return ""


def _iter_decoupages(parent_node, blacklist):
    """Yield the ``decoupage`` element children of *parent_node*.

    Children that are not elements, that are not named ``decoupage`` or
    whose ``id`` is in *blacklist* are skipped.
    """
    for child in parent_node.childNodes:
        if child.nodeType != xml.dom.Node.ELEMENT_NODE:
            continue
        if child.tagName != "decoupage":
            continue
        if _attribute(child, "id") in blacklist:
            continue
        yield child


def _index_element(writer, element_node, content, iri_id, ensemble_id,
                   decoupage_id, project_id=None):
    """Index one ``element`` node: save a ``Segment`` and add a document.

    writer       -- Lucene index writer receiving the document.
    element_node -- the ``element`` DOM node to index.
    content      -- object passed as ``content`` to ``Segment``.
    iri_id       -- identifier stored in the ``iri_id`` field.
    ensemble_id  -- identifier stored in the ``ensemble_id`` field.
    decoupage_id -- identifier stored in the ``decoupage_id`` field.
    project_id   -- when not None, additionally stored (first) in a
                    ``project_id`` field.
    """
    element_id = _attribute(element_node, "id")
    tags = _element_tags(element_node)
    title = _joined_text(element_node, "title/text()")
    abstract = _joined_text(element_node, "abstract/text()")

    # NOTE(review): the original code passed `duration`, `author`,
    # `start_ts` and `date_str` to Segment without ever defining them
    # (NameError on the first element).  They are read here from the
    # element's "dur", "author", "begin" and "date" attributes -- confirm
    # the attribute names, units and defaults against the XML schema and
    # the Segment model.
    author = _attribute(element_node, "author") or ""
    date_str = _attribute(element_node, "date") or ""
    start_ts = _int_attribute(element_node, "begin", -1)
    duration = _int_attribute(element_node, "dur", 0)

    store = lucene.Field.Store
    index = lucene.Field.Index
    fields = []
    if project_id is not None:
        fields.append(("project_id", project_id, store.YES, index.NOT_ANALYZED))
    fields.extend([
        ("iri_id", iri_id, store.YES, index.NOT_ANALYZED),
        ("ensemble_id", ensemble_id, store.YES, index.NO),
        ("decoupage_id", decoupage_id, store.YES, index.NO),
        ("element_id", element_id, store.YES, index.NO),
        ("tags", tags, store.NO, index.ANALYZED),
        ("title", title, store.NO, index.ANALYZED),
        ("abstract", abstract, store.NO, index.ANALYZED),
        # Catch-all field so one query can search tags, title and abstract.
        ("all", " ".join([tags, title, abstract]), store.NO, index.ANALYZED),
    ])

    document = lucene.Document()
    for name, value, stored, indexed in fields:
        document.add(lucene.Field(name, value, stored, indexed))

    segment = Segment(content=content,
                      iri_id=iri_id,
                      ensemble_id=ensemble_id,
                      cutting_id=decoupage_id,
                      element_id=element_id,
                      tags=tags,
                      title=title,
                      abstract=abstract,
                      duration=duration,
                      author=author,
                      start_ts=start_ts,
                      date=date_str)
    segment.save()

    writer.addDocument(document)


class ContentIndexer(object):
    """Index the segments described by the XML file of each content."""

    def __init__(self, contentList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        """Store the contents to index, the Lucene writer and the blacklist.

        contentList         -- iterable of objects exposing ``iri_url()``
                               and ``iri_id``.
        writer              -- Lucene index writer.
        decoupage_blackList -- ids of the ``decoupage`` nodes to skip.
        """
        self.__contentList = contentList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # A None blacklist is normalised to an empty tuple so that
            # ``in`` tests always work.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    def index_all(self):
        """Index every content of the list given to the constructor."""
        for content in self.__contentList:
            self.index_content(content)

    def index_content(self, content):
        """Re-index *content*.

        The XML document at ``content.iri_url()`` is downloaded and parsed,
        the Lucene documents whose ``iri_id`` is ``content.iri_id`` are
        deleted, every element of every non-blacklisted decoupage is
        indexed, and the writer is committed.
        """
        url = content.iri_url()
        stream = urllib.urlopen(url)
        try:
            doc = xml.dom.minidom.parse(stream)
        finally:
            # The handle used to be left open.
            stream.close()
        doc = Ft.Xml.Domlette.ConvertDocument(doc)

        self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))

        con = xml.xpath.Context.Context(doc, 1, 1, None)
        ensembles = xml.xpath.Evaluate("/iri/body/ensembles/ensemble", context=con)

        for ensemble in ensembles:
            ensemble_id = _attribute(ensemble, "id")
            for decoupage in _iter_decoupages(ensemble, self.decoupage_blacklist):
                decoupage_id = _attribute(decoupage, "id")
                for element in xml.xpath.Evaluate("elements/element", decoupage):
                    _index_element(self.__writer, element, content,
                                   content.iri_id, ensemble_id, decoupage_id)

        self.__writer.commit()


class ProjectIndexer(object):
    """Index the segments described by the ``ldt`` XML of each project."""

    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        """Store the projects to index, the Lucene writer and the blacklist.

        projectList         -- iterable of objects exposing ``ldt`` (an XML
                               string) and ``iri_id``.
        writer              -- Lucene index writer.
        decoupage_blackList -- ids of the ``decoupage`` nodes to skip.
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # A None blacklist is normalised to an empty tuple so that
            # ``in`` tests always work.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    def index_all(self):
        """Index every project of the list given to the constructor."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """Re-index *project*.

        ``project.ldt`` is parsed, the Lucene documents previously written
        for this project are deleted, every element of every non-blacklisted
        decoupage under ``/iri/annotations/content`` is indexed, and the
        writer is committed.
        """
        # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
        doc = xml.dom.minidom.parseString(project.ldt)
        doc = Ft.Xml.Domlette.ConvertDocument(doc)

        # Documents written below carry the project id in "project_id" and
        # the *content* id in "iri_id"; deleting on "iri_id" (as was done
        # before) never matched this project's own documents.
        self.__writer.deleteDocuments(lucene.Term("project_id", project.iri_id))

        con = xml.xpath.Context.Context(doc, 1, 1, None)
        content_nodes = xml.xpath.Evaluate("/iri/annotations/content", context=con)

        # Every project annotation is filed under this fixed ensemble id.
        ensemble_id = "ens_perso"

        for content_node in content_nodes:
            content_id = _attribute(content_node, "id")
            content = None

            for decoupage in _iter_decoupages(content_node, self.decoupage_blacklist):
                # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
                decoupage_id = _attribute(decoupage, "id")
                for element in xml.xpath.Evaluate("elements/element", decoupage):
                    if content is None:
                        # NOTE(review): the original code handed the DOM
                        # node itself to Segment and read ``.iri_id`` from
                        # it, which raises AttributeError.  The matching
                        # Content row is assumed to be the one whose iri_id
                        # equals the node id -- confirm.  Looked up lazily
                        # so contents without elements trigger no query.
                        content = Content.objects.get(iri_id=content_id)
                    _index_element(self.__writer, element, content,
                                   content_id, ensemble_id, decoupage_id,
                                   project_id=project.iri_id)

        self.__writer.commit()