web/ldt/ldt_utils/contentindexer.py
author ymh <ymh.work@gmail.com>
Mon, 13 Dec 2010 23:55:19 +0100
changeset 22 83b28fc0d731
parent 1 3a30d255c235
permissions -rw-r--r--
improve on ldt test framework start migration for text test

import tempfile
import os
import os.path
import shutil
from ldt.utils import zipfileext
import urllib
import ldt.utils.xml
from django.conf import settings
from models import Content
import fnmatch
import uuid
import shutil
import lucene
from ldt.ldt_utils import STORE
from ldt.ldt_utils import ANALYZER
import lxml.etree

def Property(func):
    """Build a ``property`` from the mapping returned by ``func``.

    ``func`` is called once with no arguments and must return a mapping whose
    keys are keyword arguments accepted by ``property`` (``fget``, ``fset``,
    ``fdel``, ``doc``), typically the ``locals()`` of a function defining them.
    """
    accessors = func()
    return property(**accessors)


class ContentIndexer(object):
    """Index the elements of content IRI documents with Lucene.

    For each content, the XML document found at ``content.iri_url()`` is
    parsed, the Lucene documents already indexed under the content's
    ``iri_id`` are deleted, and one Lucene document plus one ``Segment`` is
    written for every ``elements/element`` node of every ``decoupage`` that
    is not blacklisted.
    """

    def __init__(self, contentList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        """Store the indexing parameters.

        contentList: iterable of content objects; ``iri_url()`` and
            ``iri_id`` are read from each of them.
        writer: Lucene index writer; ``deleteDocuments``, ``addDocument``
            and ``commit`` are called on it.
        decoupage_blackList: ids of the decoupages to skip; defaults to
            ``settings.DECOUPAGE_BLACKLIST``.
        """
        self.__contentList = contentList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        # Every local defined here is handed to property() through locals(),
        # so only doc/fget/fset/fdel may be bound in this function.
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # A missing blacklist is normalised to an empty tuple so callers
            # can always run a membership test against it.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    @staticmethod
    def _join_texts(texts, prefix=""):
        """Concatenate the strings in ``texts``, each preceded by ``prefix``."""
        return "".join([prefix + text for text in texts])

    def _element_tags(self, elementNode):
        """Return the tag string of an element.

        The ``tags`` attribute is used when it is non-empty (commas become
        semicolons); otherwise the text of the ``tag`` children, then of the
        ``tags/tag`` descendants, is used, each tag preceded by " ; ".
        """
        tags = elementNode.get(u"tags")
        if tags is not None:
            # str.replace returns a new string; the result must be kept.
            tags = tags.replace(",", ";")
        if not tags:
            tags = self._join_texts(elementNode.xpath("tag/text()"), " ; ")
        if not tags:
            tags = self._join_texts(elementNode.xpath("tags/tag/text()"), " ; ")
        return tags

    def index_all(self):
        """Index every content of the list given to the constructor."""
        for content in self.__contentList:
            self.index_content(content)

    def index_content(self, content):
        """(Re)index all the elements of one content.

        content: object providing ``iri_url()`` (location of the IRI XML
            document) and ``iri_id``.

        The documents previously indexed under ``content.iri_id`` are deleted
        first, and the writer is committed once at the end.
        """
        # Function-scope import: the module only imports Content at the top.
        # NOTE(review): assumes Segment is defined in the same models module
        # as Content -- confirm.
        from models import Segment

        url = content.iri_url()
        stream = urllib.urlopen(url)
        try:
            # urlopen returns a file-like object, which parse() accepts
            # (fromstring() only takes the document text itself).
            xmldoc = lxml.etree.parse(stream)
        finally:
            stream.close()

        self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))

        blacklist = self.decoupage_blacklist

        for ensemble in xmldoc.xpath("/iri/body/ensembles/ensemble"):
            ensembleId = ensemble.get(u"id")

            for decoupageNode in ensemble:
                if decoupageNode.tag != "decoupage":
                    continue
                decoupId = decoupageNode.get(u"id")
                if decoupId in blacklist:
                    continue

                for elementNode in decoupageNode.xpath("elements/element"):
                    elementId = elementNode.get(u"id")
                    tags = self._element_tags(elementNode)
                    # text() XPath results are already strings.
                    title = self._join_texts(elementNode.xpath("title/text()"))
                    abstract = self._join_texts(elementNode.xpath("abstract/text()"))

                    # NOTE(review): the attribute names ("author", "begin",
                    # "dur", "date") and the integer conversion are
                    # assumptions about the IRI element format and the
                    # Segment fields -- confirm.
                    author = elementNode.get(u"author", u"")
                    start_ts = int(elementNode.get(u"begin", u"-1"))
                    duration = int(elementNode.get(u"dur", u"-1"))
                    date_str = elementNode.get(u"date", u"")

                    ludoc = lucene.Document()
                    ludoc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
                    ludoc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    ludoc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    ludoc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    ludoc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    ludoc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    ludoc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    ludoc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))

                    seg = Segment(content=content,
                                  iri_id=content.iri_id,
                                  ensemble_id=ensembleId,
                                  cutting_id=decoupId,
                                  element_id=elementId,
                                  tags=tags,
                                  title=title,
                                  abstract=abstract,
                                  duration=duration,
                                  author=author,
                                  start_ts=start_ts,
                                  date=date_str)
                    seg.save()

                    self.__writer.addDocument(ludoc)

        self.__writer.commit()
            
            
class ProjectIndexer(object):
    """Index the elements of project LDT documents with Lucene.

    For each project, the XML held in ``project.ldt`` is parsed, the Lucene
    documents already indexed for the project are deleted, and one Lucene
    document plus one ``Segment`` is written for every ``elements/element``
    node of every non-blacklisted ``decoupage`` found under
    ``/iri/annotations/content``.
    """

    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        """Store the indexing parameters.

        projectList: iterable of project objects; ``ldt`` and ``iri_id`` are
            read from each of them.
        writer: Lucene index writer; ``deleteDocuments``, ``addDocument``
            and ``commit`` are called on it.
        decoupage_blackList: ids of the decoupages to skip; defaults to
            ``settings.DECOUPAGE_BLACKLIST``.
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        # Every local defined here is handed to property() through locals(),
        # so only doc/fget/fset/fdel may be bound in this function.
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # A missing blacklist is normalised to an empty tuple so callers
            # can always run a membership test against it.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    @staticmethod
    def _join_texts(texts, prefix=""):
        """Concatenate the strings in ``texts``, each preceded by ``prefix``."""
        return "".join([prefix + text for text in texts])

    def _element_tags(self, elementNode):
        """Return the tag string of an element.

        The ``tags`` attribute is used when it is non-empty (commas become
        semicolons); otherwise the text of the ``tag`` children, then of the
        ``tags/tag`` descendants, is used, each tag preceded by " ; ".
        """
        tags = elementNode.get(u"tags")
        if tags is not None:
            # str.replace returns a new string; the result must be kept.
            tags = tags.replace(",", ";")
        if not tags:
            tags = self._join_texts(elementNode.xpath("tag/text()"), " ; ")
        if not tags:
            tags = self._join_texts(elementNode.xpath("tags/tag/text()"), " ; ")
        return tags

    def index_all(self):
        """Index every project of the list given to the constructor."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """(Re)index all the elements of one project.

        project: object providing ``ldt`` (the project XML as a string) and
            ``iri_id``.

        The documents previously indexed for the project are deleted first,
        and the writer is committed once at the end.
        """
        # Function-scope import so the block does not depend on the module
        # header. NOTE(review): assumes Segment is defined in the same models
        # module as Content -- confirm.
        from models import Content, Segment

        xmldoc = lxml.etree.fromstring(project.ldt)

        # Project documents carry project.iri_id in their "project_id" field
        # ("iri_id" holds the content id), so that is the field to purge on.
        self.__writer.deleteDocuments(lucene.Term("project_id", project.iri_id))

        blacklist = self.decoupage_blacklist

        for contentNode in xmldoc.xpath("/iri/annotations/content"):
            contentId = contentNode.get(u"id")
            # Content row attached to the segments; fetched lazily so content
            # nodes without any indexable element trigger no lookup.
            contentObj = None

            ensembleId = "ens_perso"

            for decoupageNode in contentNode:
                if decoupageNode.tag != "decoupage":
                    continue
                decoupId = decoupageNode.get(u"id")
                if decoupId in blacklist:
                    continue

                for elementNode in decoupageNode.xpath("elements/element"):
                    elementId = elementNode.get(u"id")
                    tags = self._element_tags(elementNode)
                    # text() XPath results are already strings.
                    title = self._join_texts(elementNode.xpath("title/text()"))
                    abstract = self._join_texts(elementNode.xpath("abstract/text()"))

                    # NOTE(review): the attribute names ("author", "begin",
                    # "dur", "date") and the integer conversion are
                    # assumptions about the LDT element format and the
                    # Segment fields -- confirm.
                    author = elementNode.get(u"author", u"")
                    start_ts = int(elementNode.get(u"begin", u"-1"))
                    duration = int(elementNode.get(u"dur", u"-1"))
                    date_str = elementNode.get(u"date", u"")

                    if contentObj is None:
                        # NOTE(review): assumes Content is a Django model
                        # that can be looked up by iri_id -- confirm.
                        contentObj = Content.objects.get(iri_id=contentId)

                    ludoc = lucene.Document()
                    ludoc.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
                    ludoc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
                    ludoc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    ludoc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    ludoc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                    ludoc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    ludoc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    ludoc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
                    ludoc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))

                    seg = Segment(content=contentObj,
                                  iri_id=contentId,
                                  ensemble_id=ensembleId,
                                  cutting_id=decoupId,
                                  element_id=elementId,
                                  tags=tags,
                                  title=title,
                                  abstract=abstract,
                                  duration=duration,
                                  author=author,
                                  start_ts=start_ts,
                                  date=date_str)
                    seg.save()

                    self.__writer.addDocument(ludoc)

        self.__writer.commit()