web/ldt/ldt_utils/contentindexer.py
changeset 5 ae8593287883
child 10 84e31387a741
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/web/ldt/ldt_utils/contentindexer.py	Tue Jun 08 15:44:35 2010 +0200
@@ -0,0 +1,238 @@
+import tempfile
+import os
+import os.path
+import shutil
+from ldt.utils import zipfileext
+import urllib
+# import ldt.utils.log
+import ldt.utils.xml
+from django.conf import settings
+from models import Content
+import xml
+import xml.dom
+import xml.dom.minidom
+import xml.dom.ext
+import xml.xpath
+import fnmatch
+import Ft
+import uuid
+import shutil
+import lucene
+from ldt.ldt_utils import STORE
+from ldt.ldt_utils import ANALYZER
+
+def Property(func):
+    return property(**func()) 
+
+
+class ContentIndexer(object):
+        
+        def __init__(self, contentList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
+                self.__contentList = contentList
+                self.__decoupage_blacklist = decoupage_blackList
+                self.__writer = writer
+                    
+        @Property
+        def decoupage_blacklist(): #@NoSelf
+            doc = """get blacklist""" #@UnusedVariable
+           
+            def fget(self):
+                if self.__decoupage_blacklist is None:
+                    self.__decoupage_blacklist = ()
+                return self.__decoupage_blacklist
+               
+            def fset(self, value):
+                self.__decoupage_blacklist = value
+               
+            def fdel(self):
+                del self.__decoupage_blacklist
+               
+            return locals()
+                   
+        def index_all(self):
+            for content in self.__contentList:
+                self.index_content(content)
+                
+        def index_content(self, content):
+            url =content.iri_url()
+            filepath = urllib.urlopen(url)
+            doc = xml.dom.minidom.parse(filepath)
+            doc = Ft.Xml.Domlette.ConvertDocument(doc)
+                                   
+            self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
+            
+            con = xml.xpath.Context.Context(doc, 1, 1, None)
+            res = xml.xpath.Evaluate("/iri/body/ensembles/ensemble", context=con)
+
+            for ensemble in res:
+                ensembleId = ensemble.getAttributeNS("id",None)
+                
+                for decoupageNode in ensemble.childNodes:
+                    if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage"  or decoupageNode.getAttributeNS("id",None) in self.decoupage_blacklist:
+                        continue
+                    
+                    decoupId = decoupageNode.getAttributeNS("id",None)
+                    res = xml.xpath.Evaluate("elements/element", decoupageNode)
+                    for elementNode in res:
+                        doc = lucene.Document()
+                        elementId = elementNode.getAttributeNS("id",None)
+                        tags = elementNode.getAttributeNS("tags",None)
+                        
+                        if tags is not None:                            
+                            tags.replace(",", ";")
+                        
+                        if tags is None or len(tags) == 0:
+                            tags = ""
+                            restagnode = xml.xpath.Evaluate("tag/text()", elementNode)
+                            for tagnode in restagnode:
+                                tags = tags + " ; " + tagnode.data
+                                
+                        if tags is None or len(tags) == 0:
+                            tags = ""
+                            restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode)
+                            for tagnode in restagnode:
+                                tags = tags + " ; " + tagnode.data                            
+    
+                        title = ""
+                        for txtRes in xml.xpath.Evaluate("title/text()", elementNode): 
+                            title = title + txtRes.data 
+                
+                        abstract = ""
+                        for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode): 
+                            abstract = abstract + txtRes.data 
+                
+                        doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
+                        doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                        doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                        doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))                        
+                        doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                        doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                        doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                        doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+
+                        seg = Segment(content=content,
+                                      iri_id=content.iri_id,
+                                      ensemble_id=ensembleId,
+                                      cutting_id=decoupId,
+                                      element_id=elementId,
+                                      tags=tags,
+                                      title=title,
+                                      abstract=abstract,
+                                      duration=duration,
+                                      author=author,
+                                      start_ts=start_ts,
+                                      date=date_str)
+                        seg.save()
+
+            
+                        self.__writer.addDocument(doc)
+            
+            self.__writer.commit()
+            
+            
+class ProjectIndexer(object):
+        
+        def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
+                self.__projectList = projectList
+                self.__decoupage_blacklist = decoupage_blackList
+                self.__writer = writer
+                
+        @Property
+        def decoupage_blacklist(): #@NoSelf
+            doc = """get blacklist""" #@UnusedVariable
+           
+            def fget(self):
+                if self.__decoupage_blacklist is None:
+                    self.__decoupage_blacklist = ()
+                return self.__decoupage_blacklist
+               
+            def fset(self, value):
+                self.__decoupage_blacklist = value
+               
+            def fdel(self):
+                del self.__decoupage_blacklist
+               
+            return locals()
+                   
+        def index_all(self):
+            for project in self.__projectList:
+                self.index_project(project)
+ 
+        def index_project(self, project):
+            
+            # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
+            doc = xml.dom.minidom.parseString(project.ldt)
+            doc = Ft.Xml.Domlette.ConvertDocument(doc) 
+
+            self.__writer.deleteDocuments(lucene.Term("iri_id", project.iri_id))
+            
+            con = xml.xpath.Context.Context(doc, 1, 1, None)
+            res = xml.xpath.Evaluate("/iri/annotations/content", context=con)
+
+            for content in res:
+                contentId = content.getAttributeNS("id",None)
+                
+                ensembleId = "ens_perso"
+                
+                for decoupageNode in content.childNodes:
+                    # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
+                    if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage"  or decoupageNode.getAttributeNS("id",None) in self.decoupage_blacklist:
+                        continue
+                    
+                    decoupId = decoupageNode.getAttributeNS("id",None)
+                    res = xml.xpath.Evaluate("elements/element", decoupageNode)
+                    for elementNode in res:
+                        doc = lucene.Document()
+                        elementId = elementNode.getAttributeNS("id",None)
+                        tags = elementNode.getAttributeNS("tags",None)
+                        
+                        if tags is not None:                            
+                            tags.replace(",", ";")
+                        
+                        if tags is None or len(tags) == 0:
+                            tags = ""
+                            restagnode = xml.xpath.Evaluate("tag/text()", elementNode)
+                            for tagnode in restagnode:
+                                tags = tags + " ; " + tagnode.data
+                                
+                        if tags is None or len(tags) == 0:
+                            tags = ""
+                            restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode)
+                            for tagnode in restagnode:
+                                tags = tags + " ; " + tagnode.data                            
+    
+                        title = ""
+                        for txtRes in xml.xpath.Evaluate("title/text()", elementNode): 
+                            title = title + txtRes.data 
+                
+                        abstract = ""
+                        for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode): 
+                            abstract = abstract + txtRes.data 
+                
+                        doc.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))              
+                        doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
+                        doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                        doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                        doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                        doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                        doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                        doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                        doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                        
+                        seg = Segment(content=content,
+                                      iri_id=content.iri_id,
+                                      ensemble_id=ensembleId,
+                                      cutting_id=decoupId,
+                                      element_id=elementId,
+                                      tags=tags,
+                                      title=title,
+                                      abstract=abstract,
+                                      duration=duration,
+                                      author=author,
+                                      start_ts=start_ts,
+                                      date=date_str)
+                        seg.save()
+                                    
+                        self.__writer.addDocument(doc)
+            
+            self.__writer.commit()