src/ldt/ldt/ldt_utils/contentindexer.py
author ymh <ymh.work@gmail.com>
Fri, 06 May 2011 01:37:04 +0200
changeset 97 10f69a5bd9e1
parent 93 184314818a3c
child 99 483a30ff6e15
permissions -rw-r--r--
correct propagation of project id on indexation

from django.conf import settings
from django.db.models.signals import post_save
from django.dispatch import receiver
from ldt.ldt_utils.models import Segment, Content, Project
from ldt.ldt_utils.utils import reduce_text_node
import ldt.indexation
import lucene
import lxml.etree
import urllib #@UnresolvedImport
# import ldt.utils.log

def Property(func):
    """Build a ``property`` from the accessors returned by *func*.

    *func* is called with no arguments and must return a mapping whose keys
    are keyword arguments accepted by the built-in ``property`` (``fget``,
    ``fset``, ``fdel``, ``doc``). It is intended to be used as a decorator on
    a function that defines its accessors locally and returns ``locals()``.
    """
    accessors = func()
    return property(**accessors)


class LdtIndexer(object):
    """Base class that indexes the annotations of an ensemble.

    For every indexed element a Lucene document is added to ``writer`` and a
    ``Segment`` row is saved. Subclasses implement ``index_all`` to decide
    what gets indexed.
    """

    def __init__(self, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Store the index writer and the ids of the decoupages to ignore.

        writer -- object on which ``addDocument`` is called for each element.
        decoupage_blackList -- container of decoupage ids that must not be
            indexed; ``None`` is treated as an empty blacklist.
        """
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @property
    def decoupage_blacklist(self):
        """get blacklist"""
        # A None blacklist is replaced by an empty tuple so that membership
        # tests in index_ensemble always work.
        if self.__decoupage_blacklist is None:
            self.__decoupage_blacklist = ()
        return self.__decoupage_blacklist

    @decoupage_blacklist.setter
    def decoupage_blacklist(self, value):
        self.__decoupage_blacklist = value

    @decoupage_blacklist.deleter
    def decoupage_blacklist(self):
        del self.__decoupage_blacklist

    @property
    def writer(self):
        """Index writer given at construction time (read-only)."""
        return self.__writer

    def index_all(self):
        """Index everything this indexer is responsible for.

        Must be overridden by subclasses.
        """
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produces a TypeError instead of the intended error.
        raise NotImplementedError

    def _extract_tags(self, element_node):
        """Return the ";"-separated tag string of an element node.

        The ``tags`` attribute is used when it is non-empty (commas are
        converted to ";"). Otherwise the text of the ``tag`` children is
        used, and if that yields nothing, the text of ``tags/tag``. Each tag
        is truncated to 50 characters.
        """
        tags = element_node.get(u"tags", None)
        if tags:
            # str.replace returns a new string: the result must be kept,
            # otherwise comma separated tags are treated as a single tag.
            tags = tags.replace(",", ";")
        else:
            tags = u""
            for path in ("tag/text()", "tags/tag/text()"):
                tag_texts = element_node.xpath(path, smart_strings=False)
                # Every tag is prefixed by the separator, including the
                # first one (kept as in the historical index content).
                tags = u"".join([u" ; " + tag_text for tag_text in tag_texts])
                if tags:
                    break

        return ";".join([tag[0:50] for tag in tags.split(";")])

    def index_ensemble(self, ensemble, content, project=None):
        """Index every element of every decoupage found in ``ensemble``.

        ensemble -- lxml element whose ``decoupage`` children are indexed.
            Children with another tag, or whose id is in the blacklist, are
            skipped.
        content -- object providing ``iri_id``; it is also stored on the
            created ``Segment``.
        project -- optional object providing ``ldt_id``; when it is not
            given, the project id is indexed as an empty string.
        """
        ensemble_id = ensemble.get(u"id", None)

        ldt_id = ""
        if project:
            ldt_id = project.ldt_id

        for decoupage_node in ensemble:
            if decoupage_node.tag != "decoupage":
                continue

            decoup_id = decoupage_node.get(u"id", None)
            if decoup_id in self.decoupage_blacklist:
                continue

            for element_node in decoupage_node.xpath("elements/element"):
                element_id = element_node.get(u"id", None)
                tags = self._extract_tags(element_node)

                title = reduce_text_node(element_node, "title/text()")
                abstract = reduce_text_node(element_node, "abstract/text()")

                author = element_node.get("author", "")
                # Values are parsed as floats first and then truncated to int.
                start_ts = int(float(element_node.get("begin", "-1")))
                duration = int(float(element_node.get("dur", "0")))
                date_str = element_node.get("date", "")

                store = lucene.Field.Store
                index = lucene.Field.Index

                doc = lucene.Document()
                doc.add(lucene.Field("type_doc", "annotation", store.NO, index.NOT_ANALYZED))
                doc.add(lucene.Field("iri_id", content.iri_id, store.YES, index.NOT_ANALYZED))
                doc.add(lucene.Field("project_id", ldt_id, store.YES, index.NOT_ANALYZED))
                # The three ids below are stored but not searchable.
                doc.add(lucene.Field("ensemble_id", ensemble_id, store.YES, index.NO))
                doc.add(lucene.Field("decoupage_id", decoup_id, store.YES, index.NO))
                doc.add(lucene.Field("element_id", element_id, store.YES, index.NO))
                # The text fields are searchable but not stored.
                doc.add(lucene.Field("tags", tags, store.NO, index.ANALYZED))
                doc.add(lucene.Field("title", title, store.NO, index.ANALYZED))
                doc.add(lucene.Field("abstract", abstract, store.NO, index.ANALYZED))
                doc.add(lucene.Field("all", " ".join([tags, title, abstract]), store.NO, index.ANALYZED))

                seg = Segment(content=content,
                              iri_id=content.iri_id,
                              ensemble_id=ensemble_id,
                              cutting_id=decoup_id,
                              element_id=element_id,
                              tags=tags,
                              title=title,
                              abstract=abstract,
                              duration=duration,
                              author=author,
                              start_ts=start_ts,
                              date=date_str,
                              project_obj=project,
                              project_id=ldt_id)
                seg.save()
                self.writer.addDocument(doc)



class ContentIndexer(LdtIndexer):
    """Indexer that (re)indexes the annotations of a list of contents."""

    def __init__(self, contentList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Keep the contents to index; other arguments go to LdtIndexer."""
        super(ContentIndexer, self).__init__(writer, decoupage_blackList)
        self.__contentList = contentList

    def index_all(self):
        """Index every content given at construction time."""
        for content in self.__contentList:
            self.index_content(content)

    def index_content(self, content):
        """Replace the index entries of ``content`` with fresh ones.

        The document found at ``content.iri_url()`` is parsed, the Lucene
        documents and ``Segment`` rows matching ``content.iri_id`` are
        deleted, every ``/iri/body/ensembles/ensemble`` node is indexed and
        the writer is committed.
        """
        url = content.iri_url()
        stream = urllib.urlopen(url)
        try:
            doc = lxml.etree.parse(stream) #@UndefinedVariable
        finally:
            # Release the connection/file handle even when parsing fails.
            stream.close()

        # Previous entries are removed only once the new document has been
        # parsed successfully.
        self.writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
        Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable

        for ensemble in doc.xpath("/iri/body/ensembles/ensemble"):
            self.index_ensemble(ensemble, content)

        self.writer.commit()
            
            
class ProjectIndexer(LdtIndexer):
    """Indexer that (re)indexes the annotations of a list of projects."""

    def __init__(self, projectList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Keep the projects to index; other arguments go to LdtIndexer."""
        super(ProjectIndexer, self).__init__(writer, decoupage_blackList)
        self.__projectList = projectList

    def index_all(self):
        """Index every project given at construction time."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """Replace the index entries of ``project`` with fresh ones.

        ``project.ldt`` is parsed as XML, the Lucene documents and
        ``Segment`` rows matching ``project.ldt_id`` are deleted, the
        children of every ``/iri/annotations/content`` node are indexed and
        the writer is committed. Content nodes whose id matches no
        ``Content`` object are skipped with a warning.
        """
        import logging

        doc = lxml.etree.fromstring(project.ldt) #@UndefinedVariable

        self.writer.deleteDocuments(lucene.Term("project_id", project.ldt_id))
        Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete() #@UndefinedVariable

        for content_node in doc.xpath("/iri/annotations/content"):
            content_id = content_node.get(u"id", None)

            matches = Content.objects.filter(iri_id=content_id) #@UndefinedVariable
            if len(matches) == 0:
                # index_ensemble reads content.iri_id: indexing without a
                # Content object would fail half-way through, after the old
                # entries have been deleted and before the commit.
                logging.getLogger(__name__).warning(
                    "No content found for iri_id %r, annotations not indexed", content_id)
                continue
            content_obj = matches[0]

            for ensemble in content_node.getchildren():
                self.index_ensemble(ensemble, content_obj, project)

        self.writer.commit()

@receiver(post_save, sender=Project)
def index_project(sender, **kwargs):
    """Keep the index in sync each time a ``Project`` is saved.

    A project whose ``state`` is 2 is (re)indexed; for any other state its
    index entries and ``Segment`` rows are removed.
    NOTE(review): 2 is presumably the "published" state -- confirm against
    the Project model.
    """
    instance = kwargs['instance']
    writer = ldt.indexation.get_writer()
    if instance.state != 2:
        writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id))
        Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() #@UndefinedVariable
        # Commit the deletion, as the indexers do after their own changes;
        # otherwise the removal stays pending in the writer.
        writer.commit()
    else:
        projectIndexer = ProjectIndexer([instance], writer)
        projectIndexer.index_all()