web/ldt/ldt_utils/contentindexer.py
author ymh <ymh.work@gmail.com>
Tue, 26 Oct 2010 16:25:19 +0200
changeset 7 2b0de7414b92
parent 1 eb9188f2ee4f
permissions -rw-r--r--
correct write content error

import tempfile
import os
import os.path
import shutil
from ldt.utils import zipfileext
import urllib
# import ldt.utils.log
import ldt.utils.xml
from django.conf import settings
from models import Content
import fnmatch
import uuid
import shutil
import lucene
from ldt.ldt_utils import STORE
from ldt.ldt_utils import ANALYZER
import lxml.etree

def Property(func):
    """Decorator that builds a property from an accessor factory.

    ``func`` is called once, with no arguments, and must return a mapping
    whose keys are valid ``property`` keyword arguments (``fget``, ``fset``,
    ``fdel``, ``doc``). The property built from that mapping is returned.
    """
    accessors = func()
    return property(**accessors)


class ContentIndexer(object):
    """Index the annotation elements of content IRI files.

    For each content object, the XML document published at
    ``content.iri_url()`` is parsed and every
    ``ensemble/decoupage/elements/element`` node becomes one Lucene document
    (written through ``writer``) and one ``Segment`` row.
    """

    def __init__(self, contentList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        """Store the indexing inputs.

        contentList -- iterable of content objects; each one must provide
                       ``iri_url()`` and ``iri_id``.
        writer -- index writer used for ``deleteDocuments``, ``addDocument``
                  and ``commit``.
        decoupage_blackList -- ids of the decoupages that must not be indexed.
        """
        self.__contentList = contentList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        # The locals of this function (doc, fget, fset, fdel) are handed to
        # property() by the Property decorator.
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # A missing blacklist is normalised to an empty tuple so that
            # callers can always use the ``in`` operator on it.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    def index_all(self):
        """Index every content object given to the constructor."""
        for content in self.__contentList:
            self.index_content(content)

    @staticmethod
    def _join_text(node, path):
        """Concatenate the strings matched by the XPath *path* under *node*.

        XPath ``text()`` results are already strings, so they are joined
        directly.
        """
        return "".join(node.xpath(path))

    @staticmethod
    def _read_tags(elementNode):
        """Return the tag string of an element.

        The ``tags`` attribute wins when it is non-empty (commas become
        semicolons). Otherwise the text of the ``tag`` children, then of the
        ``tags/tag`` children, is used, each value prefixed with " ; ".
        """
        tags = elementNode.get(u"tags")
        if tags:
            # str.replace returns a new string; the result must be kept.
            return tags.replace(",", ";")
        for path in ("tag/text()", "tags/tag/text()"):
            tags = "".join([" ; " + text for text in elementNode.xpath(path)])
            if tags:
                return tags
        return ""

    @staticmethod
    def _int_attribute(node, name, default):
        """Return attribute *name* of *node* as an int, or *default*.

        *default* is returned when the attribute is missing or not numeric.
        """
        raw = node.get(name)
        if raw is None:
            return default
        try:
            return int(float(raw))
        except ValueError:
            return default

    def index_content(self, content):
        """Re-index one content object.

        The IRI document is downloaded and parsed first, then the Lucene
        documents previously stored for ``content.iri_id`` are deleted and
        one document plus one ``Segment`` row is written per element of each
        non-blacklisted decoupage. The writer is committed at the end.
        """
        # The module only imports Content at the top of the file.
        # NOTE(review): Segment is assumed to be defined next to Content in
        # the models module - confirm.
        from models import Segment

        # urlopen returns a file-like object, so it is parsed with
        # etree.parse (fromstring expects the document text) and closed.
        stream = urllib.urlopen(content.iri_url())
        try:
            tree = lxml.etree.parse(stream)
        finally:
            stream.close()

        self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
        # NOTE(review): Segment rows saved by a previous run are not removed
        # here, unlike the Lucene documents - confirm whether they should be.

        blacklist = self.decoupage_blacklist

        for ensemble in tree.xpath("/iri/body/ensembles/ensemble"):
            ensembleId = ensemble.get(u"id")

            for decoupageNode in ensemble:
                if decoupageNode.tag != "decoupage":
                    continue
                decoupId = decoupageNode.get(u"id")
                if decoupId in blacklist:
                    continue

                for elementNode in decoupageNode.xpath("elements/element"):
                    elementId = elementNode.get(u"id")
                    tags = self._read_tags(elementNode)
                    title = self._join_text(elementNode, "title/text()")
                    abstract = self._join_text(elementNode, "abstract/text()")

                    # NOTE(review): these four values were used without ever
                    # being defined. They are read from the element's
                    # author/begin/dur/date attributes, which is assumed to
                    # be the element schema - confirm.
                    author = elementNode.get(u"author", "")
                    start_ts = self._int_attribute(elementNode, u"begin", -1)
                    duration = self._int_attribute(elementNode, u"dur", 0)
                    date_str = elementNode.get(u"date", "")

                    document = lucene.Document()
                    document.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
                    document.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    document.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    document.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    document.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    document.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    document.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    document.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))

                    seg = Segment(content=content,
                                  iri_id=content.iri_id,
                                  ensemble_id=ensembleId,
                                  cutting_id=decoupId,
                                  element_id=elementId,
                                  tags=tags,
                                  title=title,
                                  abstract=abstract,
                                  duration=duration,
                                  author=author,
                                  start_ts=start_ts,
                                  date=date_str)
                    seg.save()

                    self.__writer.addDocument(document)

        self.__writer.commit()
            
            
class ProjectIndexer(object):
    """Index the annotation elements stored in project LDT documents.

    For each project, the XML held in ``project.ldt`` is parsed and every
    ``content/decoupage/elements/element`` node becomes one Lucene document
    (written through ``writer``) and, when the matching ``Content`` row is
    found, one ``Segment`` row.
    """

    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        """Store the indexing inputs.

        projectList -- iterable of project objects; each one must provide
                       ``ldt`` (the XML text) and ``iri_id``.
        writer -- index writer used for ``deleteDocuments``, ``addDocument``
                  and ``commit``.
        decoupage_blackList -- ids of the decoupages that must not be indexed.
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        # The locals of this function (doc, fget, fset, fdel) are handed to
        # property() by the Property decorator.
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # A missing blacklist is normalised to an empty tuple so that
            # callers can always use the ``in`` operator on it.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    def index_all(self):
        """Index every project given to the constructor."""
        for project in self.__projectList:
            self.index_project(project)

    @staticmethod
    def _join_text(node, path):
        """Concatenate the strings matched by the XPath *path* under *node*.

        XPath ``text()`` results are already strings, so they are joined
        directly.
        """
        return "".join(node.xpath(path))

    @staticmethod
    def _read_tags(elementNode):
        """Return the tag string of an element.

        The ``tags`` attribute wins when it is non-empty (commas become
        semicolons). Otherwise the text of the ``tag`` children, then of the
        ``tags/tag`` children, is used, each value prefixed with " ; ".
        """
        tags = elementNode.get(u"tags")
        if tags:
            # str.replace returns a new string; the result must be kept.
            return tags.replace(",", ";")
        for path in ("tag/text()", "tags/tag/text()"):
            tags = "".join([" ; " + text for text in elementNode.xpath(path)])
            if tags:
                return tags
        return ""

    @staticmethod
    def _int_attribute(node, name, default):
        """Return attribute *name* of *node* as an int, or *default*.

        *default* is returned when the attribute is missing or not numeric.
        """
        raw = node.get(name)
        if raw is None:
            return default
        try:
            return int(float(raw))
        except ValueError:
            return default

    def index_project(self, project):
        """Re-index one project.

        The Lucene documents previously written for the project are deleted,
        then one document is written per element of each non-blacklisted
        decoupage found under ``/iri/annotations/content``. The writer is
        committed at the end.
        """
        # The module only imports Content at the top of the file.
        # NOTE(review): Segment is assumed to be defined next to Content in
        # the models module - confirm.
        from models import Segment

        # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
        tree = lxml.etree.fromstring(project.ldt)

        # Documents written below carry the project id in "project_id"
        # ("iri_id" holds the content id), so stale entries have to be
        # matched on "project_id" to actually be removed.
        self.__writer.deleteDocuments(lucene.Term("project_id", project.iri_id))
        # NOTE(review): Segment rows saved by a previous run are not removed
        # here, unlike the Lucene documents - confirm whether they should be.

        blacklist = self.decoupage_blacklist
        ensembleId = "ens_perso"

        for contentNode in tree.xpath("/iri/annotations/content"):
            contentId = contentNode.get(u"id")

            # contentNode is an XML element, not a model instance, so the
            # Content row is looked up from its id for the Segment below.
            # NOTE(review): assumes Content is a Django model queryable on
            # iri_id - confirm.
            matches = Content.objects.filter(iri_id=contentId)
            content_obj = matches[0] if matches else None

            for decoupageNode in contentNode:
                # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
                if decoupageNode.tag != "decoupage":
                    continue
                decoupId = decoupageNode.get(u"id")
                if decoupId in blacklist:
                    continue

                for elementNode in decoupageNode.xpath("elements/element"):
                    elementId = elementNode.get(u"id")
                    tags = self._read_tags(elementNode)
                    title = self._join_text(elementNode, "title/text()")
                    abstract = self._join_text(elementNode, "abstract/text()")

                    document = lucene.Document()
                    document.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
                    document.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
                    document.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    document.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    document.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    document.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    document.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    document.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    document.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))

                    # NOTE(review): without a matching Content row no Segment
                    # is saved; the Lucene document is still written. Confirm
                    # this is the wanted behaviour.
                    if content_obj is not None:
                        # NOTE(review): these four values were used without
                        # ever being defined. They are read from the
                        # element's author/begin/dur/date attributes, which
                        # is assumed to be the element schema - confirm.
                        author = elementNode.get(u"author", "")
                        start_ts = self._int_attribute(elementNode, u"begin", -1)
                        duration = self._int_attribute(elementNode, u"dur", 0)
                        date_str = elementNode.get(u"date", "")

                        seg = Segment(content=content_obj,
                                      iri_id=contentId,
                                      ensemble_id=ensembleId,
                                      cutting_id=decoupId,
                                      element_id=elementId,
                                      tags=tags,
                                      title=title,
                                      abstract=abstract,
                                      duration=duration,
                                      author=author,
                                      start_ts=start_ts,
                                      date=date_str)
                        seg.save()

                    self.__writer.addDocument(document)

        self.__writer.commit()