web/ldt/ldt_utils/contentindexer.py
changeset 2 59311c28454f
parent 1 3a30d255c235
child 3 9e6b4dbefcbc
child 6 4d17de9ee64e
--- a/web/ldt/ldt_utils/contentindexer.py	Sun Nov 14 20:25:22 2010 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,231 +0,0 @@
-import tempfile
-import os
-import os.path
-import shutil
-from ldt.utils import zipfileext
-import urllib
-# import ldt.utils.log
-import ldt.utils.xml
-from django.conf import settings
-from models import Content
-import fnmatch
-import uuid
-import shutil
-import lucene
-from ldt.ldt_utils import STORE
-from ldt.ldt_utils import ANALYZER
-import lxml.etree
-
-def Property(func):
-    """Build a property from the mapping returned by calling func.
-
-    func takes no argument and returns a dict whose keys are keyword
-    arguments of the built-in property() (fget, fset, fdel, doc);
-    it is typically written as a function ending with ``return locals()``.
-    """
-    accessors = func()
-    return property(**accessors)
-
-
-class ContentIndexer(object):
-    """Index the elements of IRI content files into Lucene.
-
-    Every ``element`` found under
-    ``/iri/body/ensembles/ensemble/decoupage/elements`` yields one Lucene
-    document and one saved ``Segment``.
-    """
-
-    def __init__(self, contentList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
-        """Store the contents to index, the index writer and the blacklist.
-
-        contentList -- iterable of objects providing iri_url() and iri_id
-        writer -- object providing deleteDocuments(), addDocument() and
-            commit()
-        decoupage_blackList -- ids of the decoupages to skip (may be None)
-        """
-        self.__contentList = contentList
-        self.__decoupage_blacklist = decoupage_blackList
-        self.__writer = writer
-
-    @Property
-    def decoupage_blacklist(): #@NoSelf
-        doc = """get blacklist""" #@UnusedVariable
-
-        def fget(self):
-            # A None blacklist is normalised to an empty tuple so that
-            # membership tests always work.
-            if self.__decoupage_blacklist is None:
-                self.__decoupage_blacklist = ()
-            return self.__decoupage_blacklist
-
-        def fset(self, value):
-            self.__decoupage_blacklist = value
-
-        def fdel(self):
-            del self.__decoupage_blacklist
-
-        return locals()
-
-    def index_all(self):
-        """Index every content given to the constructor, in order."""
-        for content in self.__contentList:
-            self.index_content(content)
-
-    @staticmethod
-    def _joined_text(node, path, separator=""):
-        """Join the strings selected on node by an XPath text() query."""
-        # lxml returns text() matches as strings, so they are joined as is.
-        return separator.join(node.xpath(path))
-
-    @staticmethod
-    def _as_int(value, default=-1):
-        """Convert a value such as "1200" or "1200.0" to int, else default."""
-        try:
-            return int(float(value))
-        except (TypeError, ValueError):
-            return default
-
-    def _element_tags(self, element):
-        """Return the tags of element as one ';'-separated string."""
-        # str.replace returns a new string: the result has to be kept.
-        tags = (element.get("tags") or "").replace(",", ";")
-        # Without a usable attribute, fall back on <tag> children, then on
-        # <tags>/<tag> children.
-        for path in ("tag/text()", "tags/tag/text()"):
-            if tags:
-                break
-            tags = self._joined_text(element, path, " ; ")
-        return tags
-
-    def _index_element(self, content, ensemble_id, decoupage_id, element):
-        """Save the Segment and add the Lucene document of one element."""
-        # Segment is not imported at module level.
-        # NOTE(review): assumes the models module defines Segment -- confirm.
-        from models import Segment
-
-        element_id = element.get("id", "")
-        tags = self._element_tags(element)
-        title = self._joined_text(element, "title/text()")
-        abstract = self._joined_text(element, "abstract/text()")
-
-        field_cls = lucene.Field
-        keyword = (field_cls.Store.YES, field_cls.Index.NOT_ANALYZED)
-        stored = (field_cls.Store.YES, field_cls.Index.NO)
-        text = (field_cls.Store.NO, field_cls.Index.ANALYZED)
-
-        document = lucene.Document()
-        for name, value, (store, index) in (
-                ("iri_id", content.iri_id, keyword),
-                ("ensemble_id", ensemble_id, stored),
-                ("decoupage_id", decoupage_id, stored),
-                ("element_id", element_id, stored),
-                ("tags", tags, text),
-                ("title", title, text),
-                ("abstract", abstract, text),
-                ("all", " ".join([tags, title, abstract]), text)):
-            document.add(field_cls(name, value, store, index))
-
-        # NOTE(review): duration, author, start_ts and date_str were never
-        # assigned in the previous code. They are presumably the "dur",
-        # "author", "begin" and "date" attributes of the element, and the
-        # two time values are presumably stored as integers -- confirm
-        # against real .iri data and the Segment model.
-        segment = Segment(content=content,
-                          iri_id=content.iri_id,
-                          ensemble_id=ensemble_id,
-                          cutting_id=decoupage_id,
-                          element_id=element_id,
-                          tags=tags,
-                          title=title,
-                          abstract=abstract,
-                          duration=self._as_int(element.get("dur")),
-                          author=element.get("author", ""),
-                          start_ts=self._as_int(element.get("begin")),
-                          date=element.get("date", ""))
-        segment.save()
-
-        self.__writer.addDocument(document)
-
-    def index_content(self, content):
-        """Replace the index entries of one content with its elements.
-
-        The XML is downloaded from content.iri_url(). Documents whose
-        "iri_id" term equals content.iri_id are deleted first, then one
-        document is added per element of every non-blacklisted decoupage,
-        and the writer is committed once at the end.
-        """
-        handle = urllib.urlopen(content.iri_url())
-        try:
-            # fromstring() parses a string, not the file-like response.
-            root = lxml.etree.fromstring(handle.read())
-        finally:
-            handle.close()
-
-        self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
-
-        for ensemble in root.xpath("/iri/body/ensembles/ensemble"):
-            # lxml's get() takes (key, default); there is no namespace
-            # argument, so get(None, u"id") never read the "id" attribute.
-            ensemble_id = ensemble.get("id", "")
-            for decoupage in ensemble.findall("decoupage"):
-                decoupage_id = decoupage.get("id", "")
-                if decoupage_id in self.decoupage_blacklist:
-                    continue
-                for element in decoupage.xpath("elements/element"):
-                    self._index_element(content, ensemble_id, decoupage_id, element)
-
-        self.__writer.commit()
-            
-            
-class ProjectIndexer(object):
-    """Index the elements of project annotations into Lucene.
-
-    Every ``element`` found under
-    ``/iri/annotations/content/decoupage/elements`` of a project's XML
-    yields one Lucene document and one saved ``Segment``.
-    """
-
-    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
-        """Store the projects to index, the index writer and the blacklist.
-
-        projectList -- iterable of objects providing ldt (XML text) and
-            iri_id
-        writer -- object providing deleteDocuments(), addDocument() and
-            commit()
-        decoupage_blackList -- ids of the decoupages to skip (may be None)
-        """
-        self.__projectList = projectList
-        self.__decoupage_blacklist = decoupage_blackList
-        self.__writer = writer
-
-    @Property
-    def decoupage_blacklist(): #@NoSelf
-        doc = """get blacklist""" #@UnusedVariable
-
-        def fget(self):
-            # A None blacklist is normalised to an empty tuple so that
-            # membership tests always work.
-            if self.__decoupage_blacklist is None:
-                self.__decoupage_blacklist = ()
-            return self.__decoupage_blacklist
-
-        def fset(self, value):
-            self.__decoupage_blacklist = value
-
-        def fdel(self):
-            del self.__decoupage_blacklist
-
-        return locals()
-
-    def index_all(self):
-        """Index every project given to the constructor, in order."""
-        for project in self.__projectList:
-            self.index_project(project)
-
-    @staticmethod
-    def _joined_text(node, path, separator=""):
-        """Join the strings selected on node by an XPath text() query."""
-        # lxml returns text() matches as strings, so they are joined as is.
-        return separator.join(node.xpath(path))
-
-    @staticmethod
-    def _as_int(value, default=-1):
-        """Convert a value such as "1200" or "1200.0" to int, else default."""
-        try:
-            return int(float(value))
-        except (TypeError, ValueError):
-            return default
-
-    def _element_tags(self, element):
-        """Return the tags of element as one ';'-separated string."""
-        # str.replace returns a new string: the result has to be kept.
-        tags = (element.get("tags") or "").replace(",", ";")
-        # Without a usable attribute, fall back on <tag> children, then on
-        # <tags>/<tag> children.
-        for path in ("tag/text()", "tags/tag/text()"):
-            if tags:
-                break
-            tags = self._joined_text(element, path, " ; ")
-        return tags
-
-    def _index_element(self, project, content, content_id, ensemble_id,
-                       decoupage_id, element):
-        """Save the Segment and add the Lucene document of one element."""
-        # Segment is not imported at module level.
-        # NOTE(review): assumes the models module defines Segment -- confirm.
-        from models import Segment
-
-        element_id = element.get("id", "")
-        tags = self._element_tags(element)
-        title = self._joined_text(element, "title/text()")
-        abstract = self._joined_text(element, "abstract/text()")
-
-        field_cls = lucene.Field
-        keyword = (field_cls.Store.YES, field_cls.Index.NOT_ANALYZED)
-        stored = (field_cls.Store.YES, field_cls.Index.NO)
-        text = (field_cls.Store.NO, field_cls.Index.ANALYZED)
-
-        document = lucene.Document()
-        for name, value, (store, index) in (
-                ("project_id", project.iri_id, keyword),
-                ("iri_id", content_id, keyword),
-                ("ensemble_id", ensemble_id, stored),
-                ("decoupage_id", decoupage_id, stored),
-                ("element_id", element_id, stored),
-                ("tags", tags, text),
-                ("title", title, text),
-                ("abstract", abstract, text),
-                ("all", " ".join([tags, title, abstract]), text)):
-            document.add(field_cls(name, value, store, index))
-
-        # NOTE(review): duration, author, start_ts and date_str were never
-        # assigned in the previous code. They are presumably the "dur",
-        # "author", "begin" and "date" attributes of the element, and the
-        # two time values are presumably stored as integers -- confirm
-        # against real project data and the Segment model.
-        segment = Segment(content=content,
-                          iri_id=content_id,
-                          ensemble_id=ensemble_id,
-                          cutting_id=decoupage_id,
-                          element_id=element_id,
-                          tags=tags,
-                          title=title,
-                          abstract=abstract,
-                          duration=self._as_int(element.get("dur")),
-                          author=element.get("author", ""),
-                          start_ts=self._as_int(element.get("begin")),
-                          date=element.get("date", ""))
-        segment.save()
-
-        self.__writer.addDocument(document)
-
-    def index_project(self, project):
-        """Replace the index entries of one project with its elements.
-
-        The XML is read from project.ldt. Documents whose "project_id" term
-        equals project.iri_id are deleted first, then one document is added
-        per element of every non-blacklisted decoupage, and the writer is
-        committed once at the end.
-        """
-        root = lxml.etree.fromstring(project.ldt)
-
-        # Project documents carry the project id in "project_id"; their
-        # "iri_id" field holds the content id, so deleting on "iri_id" did
-        # not remove the documents written by a previous run.
-        self.__writer.deleteDocuments(lucene.Term("project_id", project.iri_id))
-
-        # All project annotations are filed under this fixed ensemble id.
-        ensemble_id = "ens_perso"
-
-        for content_node in root.xpath("/iri/annotations/content"):
-            # lxml's get() takes (key, default); there is no namespace
-            # argument, so get(None, u"id") never read the "id" attribute.
-            content_id = content_node.get("id", "")
-            # Segment was previously given the XML node itself, which has no
-            # iri_id attribute.
-            # NOTE(review): assumes Content is a Django model looked up by
-            # iri_id; get() raises if no such content exists -- confirm.
-            content = Content.objects.get(iri_id=content_id)
-
-            for decoupage in content_node.findall("decoupage"):
-                decoupage_id = decoupage.get("id", "")
-                if decoupage_id in self.decoupage_blacklist:
-                    continue
-                for element in decoupage.xpath("elements/element"):
-                    self._index_element(project, content, content_id,
-                                        ensemble_id, decoupage_id, element)
-
-        self.__writer.commit()