"""Lucene indexing of IRI content files and LDT project annotations.

Reconstructed from a changeset that removed
``web/ldt/ldt_utils/contentindexer.py``.

Two indexers are provided:

* ``ContentIndexer`` fetches the IRI XML of each content object and indexes
  every ``<element>`` found under
  ``/iri/body/ensembles/ensemble/decoupage/elements``.
* ``ProjectIndexer`` parses the ``ldt`` XML of each project and indexes
  every ``<element>`` found under
  ``/iri/annotations/content/decoupage/elements``.

TODO: the original code also tried to persist a ``Segment`` row per element
(content, iri_id, ensemble_id, cutting_id, element_id, tags, title, abstract,
duration, author, start_ts, date). ``Segment`` was never imported and
``duration``, ``author``, ``start_ts`` and ``date_str`` were never defined,
so that step always raised ``NameError``. It has been left out until the
model and the source of those values are available.
"""
import tempfile
import os
import os.path
import shutil
from ldt.utils import zipfileext
import urllib
# import ldt.utils.log
import ldt.utils.xml
from django.conf import settings
from models import Content
import fnmatch
import uuid
import lucene
from ldt.ldt_utils import STORE
from ldt.ldt_utils import ANALYZER
import lxml.etree


def Property(func):
    """Build a ``property`` from the keyword mapping returned by ``func``.

    ``func`` is called with no argument and must return a mapping whose keys
    are valid ``property`` keyword arguments (``fget``, ``fset``, ``fdel``,
    ``doc``). Kept for backward compatibility; the classes below use the
    standard ``@property`` decorators instead.
    """
    return property(**func())


def _iter_decoupages(parent, blacklist):
    """Yield ``(decoupage_id, node)`` for each indexable child of ``parent``.

    Children whose tag is not ``decoupage`` and decoupages whose ``id``
    attribute is listed in ``blacklist`` are skipped.
    """
    for node in parent:
        if node.tag != "decoupage":
            continue
        decoupage_id = node.get(u"id")
        if decoupage_id in blacklist:
            continue
        yield decoupage_id, node


def _tag_texts(element_node, path):
    """Concatenate the text nodes matched by ``path``, each prefixed by ' ; '."""
    return "".join([" ; " + text for text in element_node.xpath(path)])


def _element_tags(element_node):
    """Return the tag string of an ``<element>`` node.

    The ``tags`` attribute is used when it is non-empty (commas are turned
    into semicolons). Otherwise the text of the ``tag`` children is used,
    and failing that the text of the ``tags/tag`` grandchildren.
    """
    tags = element_node.get(u"tags")
    if tags is not None:
        # str.replace returns a new string; the result must be kept.
        tags = tags.replace(",", ";")
    if not tags:
        tags = _tag_texts(element_node, "tag/text()")
    if not tags:
        tags = _tag_texts(element_node, "tags/tag/text()")
    return tags


def _build_element_document(stored_fields, element_node):
    """Build the Lucene document describing one ``<element>`` node.

    ``stored_fields`` is a sequence of ``(name, value, index_mode)`` triples
    added first, in order, as stored fields. The analyzed, non-stored
    ``tags``, ``title``, ``abstract`` and ``all`` fields are appended after
    them.
    """
    document = lucene.Document()
    for name, value, index_mode in stored_fields:
        document.add(
            lucene.Field(name, value, lucene.Field.Store.YES, index_mode))

    tags = _element_tags(element_node)
    # xpath(".../text()") already returns strings, so they are joined as is.
    title = "".join(element_node.xpath("title/text()"))
    abstract = "".join(element_node.xpath("abstract/text()"))

    searchable = (
        ("tags", tags),
        ("title", title),
        ("abstract", abstract),
        ("all", " ".join([tags, title, abstract])),
    )
    for name, value in searchable:
        document.add(lucene.Field(name, value, lucene.Field.Store.NO,
                                  lucene.Field.Index.ANALYZED))
    return document


class ContentIndexer(object):
    """Index the segments of a list of content objects with a Lucene writer.

    Each content object must provide ``iri_url()`` and ``iri_id``.
    """

    def __init__(self, contentList, writer,
                 decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Store the contents to index, the Lucene writer and the blacklist.

        ``decoupage_blackList`` holds the ids of the decoupages to ignore;
        ``None`` is treated as an empty blacklist.
        """
        self.__contentList = contentList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @property
    def decoupage_blacklist(self):
        """get blacklist"""
        if self.__decoupage_blacklist is None:
            self.__decoupage_blacklist = ()
        return self.__decoupage_blacklist

    @decoupage_blacklist.setter
    def decoupage_blacklist(self, value):
        self.__decoupage_blacklist = value

    @decoupage_blacklist.deleter
    def decoupage_blacklist(self):
        del self.__decoupage_blacklist

    def index_all(self):
        """Index every content object given at construction time."""
        for content in self.__contentList:
            self.index_content(content)

    def index_content(self, content):
        """Replace the index entries of ``content`` with fresh ones.

        The IRI file is fetched from ``content.iri_url()``, the documents
        whose ``iri_id`` term equals ``content.iri_id`` are deleted, and one
        document is added per ``<element>`` of every non-blacklisted
        decoupage. The writer is committed once at the end.
        """
        handle = urllib.urlopen(content.iri_url())
        try:
            # fromstring() needs the document itself, not the file object.
            root = lxml.etree.fromstring(handle.read())
        finally:
            handle.close()

        self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))

        no_index = lucene.Field.Index.NO
        for ensemble in root.xpath("/iri/body/ensembles/ensemble"):
            ensemble_id = ensemble.get(u"id")
            decoupages = _iter_decoupages(ensemble, self.decoupage_blacklist)
            for decoupage_id, decoupage in decoupages:
                for element_node in decoupage.xpath("elements/element"):
                    stored_fields = (
                        ("iri_id", content.iri_id,
                         lucene.Field.Index.NOT_ANALYZED),
                        ("ensemble_id", ensemble_id, no_index),
                        ("decoupage_id", decoupage_id, no_index),
                        ("element_id", element_node.get(u"id"), no_index),
                    )
                    self.__writer.addDocument(
                        _build_element_document(stored_fields, element_node))

        self.__writer.commit()


class ProjectIndexer(object):
    """Index the annotations of a list of projects with a Lucene writer.

    Each project must provide ``ldt`` (its XML document) and ``iri_id``.
    """

    def __init__(self, projectList, writer,
                 decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Store the projects to index, the Lucene writer and the blacklist.

        ``decoupage_blackList`` holds the ids of the decoupages to ignore;
        ``None`` is treated as an empty blacklist.
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @property
    def decoupage_blacklist(self):
        """get blacklist"""
        if self.__decoupage_blacklist is None:
            self.__decoupage_blacklist = ()
        return self.__decoupage_blacklist

    @decoupage_blacklist.setter
    def decoupage_blacklist(self, value):
        self.__decoupage_blacklist = value

    @decoupage_blacklist.deleter
    def decoupage_blacklist(self):
        del self.__decoupage_blacklist

    def index_all(self):
        """Index every project given at construction time."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """Replace the index entries of ``project`` with fresh ones.

        ``project.ldt`` is parsed, the documents whose ``iri_id`` term equals
        ``project.iri_id`` are deleted, and one document is added per
        ``<element>`` of every non-blacklisted decoupage of each
        ``/iri/annotations/content`` node. All documents use the fixed
        ensemble id ``ens_perso``. The writer is committed once at the end.
        """
        root = lxml.etree.fromstring(project.ldt)

        self.__writer.deleteDocuments(lucene.Term("iri_id", project.iri_id))

        no_index = lucene.Field.Index.NO
        not_analyzed = lucene.Field.Index.NOT_ANALYZED
        ensemble_id = "ens_perso"
        for content_node in root.xpath("/iri/annotations/content"):
            content_id = content_node.get(u"id")
            decoupages = _iter_decoupages(content_node,
                                          self.decoupage_blacklist)
            for decoupage_id, decoupage in decoupages:
                for element_node in decoupage.xpath("elements/element"):
                    stored_fields = (
                        ("project_id", project.iri_id, not_analyzed),
                        ("iri_id", content_id, not_analyzed),
                        ("ensemble_id", ensemble_id, no_index),
                        ("decoupage_id", decoupage_id, no_index),
                        ("element_id", element_node.get(u"id"), no_index),
                    )
                    self.__writer.addDocument(
                        _build_element_document(stored_fields, element_node))

        self.__writer.commit()