--- a/src/ldt/ldt/ldt_utils/contentindexer.py Wed May 04 12:44:51 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/contentindexer.py Fri May 06 00:06:42 2011 +0200
@@ -9,35 +9,124 @@
import urllib #@UnresolvedImport
# import ldt.utils.log
-
def Property(func):
return property(**func())
+
+
+class LdtIndexer(object):
+
+    def __init__(self, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
+        """Keep the index writer and the decoupage blacklist on the instance.
+
+        writer              -- object that indexed documents are added to
+        decoupage_blackList -- decoupage ids that must not be indexed
+                               (defaults to settings.DECOUPAGE_BLACKLIST)
+        """
+        self.__writer = writer
+        self.__decoupage_blacklist = decoupage_blackList
+    @Property
+    def decoupage_blacklist(): #@NoSelf
+        # Accessors for the blacklist property; a stored None is replaced by
+        # an empty tuple the first time the value is read.
+        def read(self):
+            if self.__decoupage_blacklist is None:
+                self.__decoupage_blacklist = ()
+            return self.__decoupage_blacklist
+
+        def write(self, value):
+            self.__decoupage_blacklist = value
+
+        def remove(self):
+            del self.__decoupage_blacklist
+
+        # Property() expands this mapping into property(**kwargs).
+        return dict(doc="""get blacklist""", fget=read, fset=write, fdel=remove)
+
+    @Property
+    def writer(): #@NoSelf
+        # Read-only property: only a getter is handed over to property().
+        def read(self):
+            return self.__writer
+
+        return {"fget": read}
-class ContentIndexer(object):
+    def index_all(self):
+        """Index everything this indexer is responsible for.
+
+        Abstract hook: the base class holds nothing to iterate over, so
+        subclasses must override this method.
+        """
+        # NotImplemented is a comparison sentinel, not an exception class;
+        # raising it fails with a TypeError instead of flagging the method
+        # as abstract.
+        raise NotImplementedError
+
+    def index_ensemble(self, ensemble, content, project=None):
+        """Index every element found under one ensemble node.
+
+        Each ``decoupage`` child of ``ensemble`` whose id is not listed in
+        ``self.decoupage_blacklist`` is scanned; every ``elements/element``
+        node below it produces one lucene document, added through
+        ``self.writer``, and one saved ``Segment``. Nothing is committed
+        here; that is left to the caller.
+
+        ensemble -- lxml element whose children are inspected
+        content  -- object providing ``iri_id``; also stored on the Segment
+        project  -- optional object providing ``ldt_id``; when falsy the
+                    ``project_id`` field is indexed as an empty string
+
+        Raises ValueError when an element's ``begin`` or ``dur`` attribute
+        is not an integer literal.
+
+        NOTE(review): a caller that found no matching Content may pass
+        ``content=None``, which fails on ``content.iri_id`` -- confirm how
+        that case should be handled.
+        """
+        ensembleId = ensemble.get(u"id", None)
+
+        # The project id is identical for every element, so resolve it once.
+        ldt_id = ""
+        if project:
+            ldt_id = project.ldt_id
+
+        for decoupageNode in ensemble.getchildren():
+            if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id", None) in self.decoupage_blacklist:
+                continue
+
+            decoupId = decoupageNode.get(u"id", None)
+            for elementNode in decoupageNode.xpath("elements/element"):
+
+                elementId = elementNode.get(u"id", None)
+                tags = elementNode.get(u"tags", None)
+
+                if tags is not None:
+                    # Strings are immutable: replace() returns the converted
+                    # copy, so it has to be assigned back to take effect.
+                    tags = tags.replace(",", ";")
+
+                # When the attribute is missing or empty, fall back to the
+                # <tag> children, then to the <tags>/<tag> children.
+                if not tags:
+                    restagnode = elementNode.xpath("tag/text()", smart_strings=False)
+                    tags = u"".join([u" ; " + tagnode for tagnode in restagnode])
+
+                if not tags:
+                    restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False)
+                    tags = u"".join([u" ; " + tagnode for tagnode in restagnode])
+
+                # Cap every individual tag at 50 characters.
+                tags = ";".join([tag[0:50] for tag in tags.split(";")])
+
+                title = reduce_text_node(elementNode, "title/text()")
+                abstract = reduce_text_node(elementNode, "abstract/text()")
+
+                author = elementNode.get("author", "")
+                start_ts = int(elementNode.get("begin", "-1"))
+                duration = int(elementNode.get("dur", "-1"))
+                date_str = elementNode.get("date", "")
+
+                doc = lucene.Document()
+                doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED))
+                doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
+                doc.add(lucene.Field("project_id", ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
+                doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+
+                seg = Segment(content=content,
+                              iri_id=content.iri_id,
+                              ensemble_id=ensembleId,
+                              cutting_id=decoupId,
+                              element_id=elementId,
+                              tags=tags,
+                              title=title,
+                              abstract=abstract,
+                              duration=duration,
+                              author=author,
+                              start_ts=start_ts,
+                              date=date_str,
+                              project_obj=project)
+                # The database row is saved before the document reaches the index.
+                seg.save()
+                self.writer.addDocument(doc)
+
+
+
+class ContentIndexer(LdtIndexer):
def __init__(self, contentList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
- self.__contentList = contentList
- self.__decoupage_blacklist = decoupage_blackList
- self.__writer = writer
-
- @Property
- def decoupage_blacklist(): #@NoSelf
- doc = """get blacklist""" #@UnusedVariable
-
- def fget(self):
- if self.__decoupage_blacklist is None:
- self.__decoupage_blacklist = ()
- return self.__decoupage_blacklist
-
- def fset(self, value):
- self.__decoupage_blacklist = value
-
- def fdel(self):
- del self.__decoupage_blacklist
-
- return locals()
-
+ super(ContentIndexer, self).__init__(writer, decoupage_blackList)
+ self.__contentList = contentList
+
def index_all(self):
for content in self.__contentList:
self.index_content(content)
@@ -47,104 +136,23 @@
filepath = urllib.urlopen(url)
doc = lxml.etree.parse(filepath) #@UndefinedVariable
- self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
+ self.writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable
res = doc.xpath("/iri/body/ensembles/ensemble")
- for ensemble in res:
- ensembleId = ensemble.get(u"id", None)
-
- for decoupageNode in ensemble.getchildren():
- if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id", None) in self.decoupage_blacklist:
-
- continue
-
- decoupId = decoupageNode.get(u"id", None)
- res = decoupageNode.xpath("elements/element")
- for elementNode in res:
- doc = lucene.Document()
- elementId = elementNode.get(u"id", None)
- tags = elementNode.get(u"tags", None)
-
- if tags is not None:
- tags.replace(",", ";")
-
- if tags is None or len(tags) == 0:
- tags = u""
- restagnode = elementNode.xpath("tag/text()", smart_strings=False)
- for tagnode in restagnode:
- tags = tags + u" ; " + tagnode
-
- if tags is None or len(tags) == 0:
- tags = u""
- restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False)
-
- for tagnode in restagnode:
- tags = tags + u" ; " + tagnode
-
- title = reduce_text_node(elementNode, "title/text()")
- abstract = reduce_text_node(elementNode, "abstract/text()")
-
- author = elementNode.get("author", "")
- start_ts = int(elementNode.get("begin", "-1"))
- duration = int(elementNode.get("dur", "-1"))
- date_str = elementNode.get("date", "")
-
-
- doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
- doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
- doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
- doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
- doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-
- seg = Segment(content=content,
- iri_id=content.iri_id,
- ensemble_id=ensembleId,
- cutting_id=decoupId,
- element_id=elementId,
- tags=tags,
- title=title,
- abstract=abstract,
- duration=duration,
- author=author,
- start_ts=start_ts,
- date=date_str)
- seg.save()
-
+ for ensemble in res:
+ self.index_ensemble(ensemble, content)
- self.__writer.addDocument(doc)
-
- self.__writer.commit()
+ self.writer.commit()
-class ProjectIndexer(object):
+class ProjectIndexer(LdtIndexer):
def __init__(self, projectList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
- self.__projectList = projectList
- self.__decoupage_blacklist = decoupage_blackList
- self.__writer = writer
-
- @Property
- def decoupage_blacklist(): #@NoSelf
- doc = """get blacklist""" #@UnusedVariable
-
- def fget(self):
- if self.__decoupage_blacklist is None:
- self.__decoupage_blacklist = ()
- return self.__decoupage_blacklist
-
- def fset(self, value):
- self.__decoupage_blacklist = value
-
- def fdel(self):
- del self.__decoupage_blacklist
-
- return locals()
-
+ super(ProjectIndexer, self).__init__(writer, decoupage_blackList)
+ self.__projectList = projectList
+
def index_all(self):
for project in self.__projectList:
self.index_project(project)
@@ -154,8 +162,8 @@
# pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
doc = lxml.etree.fromstring(project.ldt) #@UndefinedVariable
- self.__writer.deleteDocuments(lucene.Term("project_id", project.ldt_id))
- Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete()
+ self.writer.deleteDocuments(lucene.Term("project_id", project.ldt_id))
+ Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete() #@UndefinedVariable
res = doc.xpath("/iri/annotations/content")
@@ -163,79 +171,14 @@
contentId = content.get(u"id", None)
content_obj = None
- clist = Content.objects.filter(iri_id = contentId)
+ clist = Content.objects.filter(iri_id = contentId) #@UndefinedVariable
if len(clist) > 0:
content_obj = clist[0]
- for ensembleNode in content.getchildren():
- ensembleId = ensembleNode.get(u"id",None)
-
- for decoupageNode in ensembleNode.getchildren():
- # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
- if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id", None) in self.decoupage_blacklist:
- continue
-
- decoupId = decoupageNode.get(u"id", None)
- res = decoupageNode.xpath("elements/element")
- for elementNode in res:
- doc = lucene.Document()
- elementId = elementNode.get(u"id", None)
- tags = elementNode.get(u"tags", None)
-
- if tags is not None:
- tags.replace(",", ";")
-
- if tags is None or len(tags) == 0:
- tags = u""
- restagnode = elementNode.xpath("tag/text()", smart_strings=False)
- for tagnode in restagnode:
- tags = tags + u" ; " + tagnode
-
- if tags is None or len(tags) == 0:
- tags = u""
- restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False)
-
- for tagnode in restagnode:
- tags = tags + u" ; " + tagnode
-
- title = reduce_text_node(elementNode, "title/text()")
- abstract = reduce_text_node(elementNode, "abstract/text()")
-
- author = elementNode.get("author", "")
- start_ts = int(elementNode.get("begin", "-1"))
- duration = int(elementNode.get("dur", "-1"))
- date_str = elementNode.get("date", "")
-
-
- doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED))
- doc.add(lucene.Field("project_id", project.ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
- doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
- doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
- doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
- doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
- doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-
- seg = Segment(content=content_obj,
- iri_id=contentId,
- ensemble_id=ensembleId,
- cutting_id=decoupId,
- element_id=elementId,
- tags=tags,
- title=title,
- abstract=abstract,
- duration=duration,
- author=author,
- start_ts=start_ts,
- date=date_str,
- project_obj = project)
- seg.save()
-
- self.__writer.addDocument(doc)
+ for ensemble in content.getchildren():
+ self.index_ensemble(ensemble, content_obj, project)
- self.__writer.commit()
+ self.writer.commit()
@receiver(post_save, sender=Project)
def index_project(sender, **kwargs):
@@ -243,7 +186,7 @@
writer = ldt.indexation.get_writer()
if instance.state != 2:
writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id))
- Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete()
+ Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() #@UndefinedVariable
else:
projectIndexer = ProjectIndexer([instance], writer)
projectIndexer.index_all()