from django.conf import settings
from ldt.ldt_utils.models import Segment
from ldt.ldt_utils.utils import reduce_text_node
import lucene
import lxml.etree
import urllib #@UnresolvedImport
# import ldt.utils.log
def Property(func):
    """Build a ``property`` from a function that returns its accessors.

    ``func`` is called once, without arguments, and must return a mapping
    whose keys are keyword arguments accepted by the built-in ``property``
    (``fget``, ``fset``, ``fdel``, ``doc``).  The usual idiom is to define
    the accessors as nested functions and finish with ``return locals()``.
    """
    accessors = func()
    return property(**accessors)
class ContentIndexer(object):
    """Index the annotation elements of a list of contents.

    For each content, the XML document served at ``content.iri_url()`` is
    parsed; every ``element`` of every non-blacklisted ``decoupage`` is
    added to the Lucene index through the writer and saved as a
    ``Segment``.
    """

    def __init__(self, contentList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Store the contents to index, the index writer and the blacklist.

        contentList -- iterable of contents; each one must provide
            ``iri_url()`` and ``iri_id``.
        writer -- index writer; ``deleteDocuments``, ``addDocument`` and
            ``commit`` are called on it.
        decoupage_blackList -- container of decoupage ids to skip
            (``None`` is treated as an empty blacklist).
        """
        self.__contentList = contentList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        doc = """Ids of the decoupages that must not be indexed.""" #@UnusedVariable

        def fget(self):
            # Normalise a missing blacklist so that ``in`` tests always work.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    @staticmethod
    def _extract_tags(elementNode):
        """Return the tag string of an element node.

        The ``tags`` attribute is used when it is non-empty, with commas
        turned into semicolons.  Otherwise the text of the ``tag`` children
        is used, and failing that the text of the ``tags/tag`` descendants;
        each tag is then prefixed with ``" ; "``.  Returns ``u""`` when no
        tag is found at all.
        """
        tags = elementNode.get(u"tags", None)
        if tags:
            # str.replace returns a new string: the result must be kept.
            return tags.replace(",", ";")
        for path in ("tag/text()", "tags/tag/text()"):
            found = elementNode.xpath(path, smart_strings=False)
            if found:
                return u"".join(u" ; " + text for text in found)
        return u""

    def index_all(self):
        """Index every content of the list given at construction."""
        for content in self.__contentList:
            self.index_content(content)

    def index_content(self, content):
        """(Re)index all the elements of one content.

        The documents already indexed under ``content.iri_id`` are deleted
        first, then one Lucene document and one ``Segment`` are created per
        element.  The writer is committed once, after all elements have
        been added.

        Raises ValueError when a ``begin`` or ``dur`` attribute is not an
        integer literal.
        """
        source = urllib.urlopen(content.iri_url())
        try:
            tree = lxml.etree.parse(source) #@UndefinedVariable
        finally:
            # The document is fully parsed at this point; release the handle.
            source.close()

        self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))

        for ensemble in tree.xpath("/iri/body/ensembles/ensemble"):
            ensembleId = ensemble.get(u"id", None)

            for decoupageNode in ensemble.getchildren():
                decoupId = decoupageNode.get(u"id", None)
                if decoupageNode.tag != "decoupage" or decoupId in self.decoupage_blacklist:
                    continue

                for elementNode in decoupageNode.xpath("elements/element"):
                    elementId = elementNode.get(u"id", None)
                    tags = self._extract_tags(elementNode)
                    title = reduce_text_node(elementNode, "title/text()")
                    abstract = reduce_text_node(elementNode, "abstract/text()")

                    author = elementNode.get("author", "")
                    start_ts = int(elementNode.get("begin", "-1"))
                    duration = int(elementNode.get("dur", "-1"))
                    date_str = elementNode.get("date", "")

                    lucene_doc = lucene.Document()
                    # Identifiers are stored so that hits can be mapped back
                    # to their element; only iri_id is searchable.
                    lucene_doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
                    lucene_doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    lucene_doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    lucene_doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    # Text fields are analyzed for full-text search, not stored.
                    lucene_doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    lucene_doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    lucene_doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    lucene_doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))

                    seg = Segment(content=content,
                                  iri_id=content.iri_id,
                                  ensemble_id=ensembleId,
                                  cutting_id=decoupId,
                                  element_id=elementId,
                                  tags=tags,
                                  title=title,
                                  abstract=abstract,
                                  duration=duration,
                                  author=author,
                                  start_ts=start_ts,
                                  date=date_str)
                    seg.save()
                    self.__writer.addDocument(lucene_doc)

        self.__writer.commit()
class ProjectIndexer(object):
    """Index the annotation elements of a list of projects.

    For each project, the XML held in ``project.ldt`` is parsed; every
    ``element`` of every non-blacklisted ``decoupage`` found under
    ``/iri/annotations/content`` is added to the Lucene index through the
    writer and saved as a ``Segment``.
    """

    def __init__(self, projectList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Store the projects to index, the index writer and the blacklist.

        projectList -- iterable of projects; each one must provide ``ldt``
            (the XML as a string) and ``iri_id``.
        writer -- index writer; ``deleteDocuments``, ``addDocument`` and
            ``commit`` are called on it.
        decoupage_blackList -- container of decoupage ids to skip
            (``None`` is treated as an empty blacklist).
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        doc = """Ids of the decoupages that must not be indexed.""" #@UnusedVariable

        def fget(self):
            # Normalise a missing blacklist so that ``in`` tests always work.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    @staticmethod
    def _extract_tags(elementNode):
        """Return the tag string of an element node.

        The ``tags`` attribute is used when it is non-empty, with commas
        turned into semicolons.  Otherwise the text of the ``tag`` children
        is used, and failing that the text of the ``tags/tag`` descendants;
        each tag is then prefixed with ``" ; "``.  Returns ``u""`` when no
        tag is found at all.
        """
        tags = elementNode.get(u"tags", None)
        if tags:
            # str.replace returns a new string: the result must be kept.
            return tags.replace(",", ";")
        for path in ("tag/text()", "tags/tag/text()"):
            # XPath text() results are already strings; they have no
            # .text() method.
            found = elementNode.xpath(path, smart_strings=False)
            if found:
                return u"".join(u" ; " + text for text in found)
        return u""

    def index_all(self):
        """Index every project of the list given at construction."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """(Re)index all the elements of one project.

        The documents already indexed under ``project.iri_id`` are deleted
        first, then one Lucene document and one ``Segment`` are created per
        element.  The writer is committed once, after all elements have
        been added.

        Raises ValueError when a ``begin`` or ``dur`` attribute is not an
        integer literal.
        """
        tree = lxml.etree.fromstring(project.ldt) #@UndefinedVariable

        self.__writer.deleteDocuments(lucene.Term("iri_id", project.iri_id))

        # Project annotations all go to a single, fixed ensemble id.
        ensembleId = "ens_perso"

        for content in tree.xpath("/iri/annotations/content"):
            contentId = content.get(u"id", None)

            for decoupageNode in content.getchildren():
                decoupId = decoupageNode.get(u"id", None)
                if decoupageNode.tag != "decoupage" or decoupId in self.decoupage_blacklist:
                    continue

                for elementNode in decoupageNode.xpath("elements/element"):
                    elementId = elementNode.get(u"id", None)
                    tags = self._extract_tags(elementNode)
                    # Same helper call as ContentIndexer uses for these fields.
                    title = reduce_text_node(elementNode, "title/text()")
                    abstract = reduce_text_node(elementNode, "abstract/text()")

                    author = elementNode.get("author", "")
                    start_ts = int(elementNode.get("begin", "-1"))
                    duration = int(elementNode.get("dur", "-1"))
                    date_str = elementNode.get("date", "")

                    lucene_doc = lucene.Document()
                    # Identifiers are stored so that hits can be mapped back
                    # to their element; only project_id and iri_id are
                    # searchable.
                    lucene_doc.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
                    lucene_doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
                    lucene_doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    lucene_doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    lucene_doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    # Text fields are analyzed for full-text search, not stored.
                    lucene_doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    lucene_doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    lucene_doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    lucene_doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))

                    # NOTE(review): ``content`` is the <content> XML node
                    # here, whereas ContentIndexer passes the content object
                    # itself to Segment -- confirm what Segment expects and
                    # look up the matching object from contentId if needed.
                    seg = Segment(content=content,
                                  # The XML node has no ``iri_id`` attribute;
                                  # use the id read from it, as for the index.
                                  iri_id=contentId,
                                  ensemble_id=ensembleId,
                                  cutting_id=decoupId,
                                  element_id=elementId,
                                  tags=tags,
                                  title=title,
                                  abstract=abstract,
                                  duration=duration,
                                  author=author,
                                  start_ts=start_ts,
                                  date=date_str)
                    seg.save()
                    self.__writer.addDocument(lucene_doc)

        self.__writer.commit()