improve indexation, limit tags to 50 characters, improve wsgi
author ymh <ymh.work@gmail.com>
Fri, 06 May 2011 00:06:42 +0200
changeset 90 4ddb88f103ad
parent 89 95cbac192438
child 91 a8d8a87786b4
improve indexation, limit tags to 50 characters, improve wsgi
src/ldt/ldt/__init__.py
src/ldt/ldt/ldt_utils/contentindexer.py
src/ldt/ldt/ldt_utils/forms.py
web/ldtplatform/modwsgi.wsgi
--- a/src/ldt/ldt/__init__.py	Wed May 04 12:44:51 2011 +0200
+++ b/src/ldt/ldt/__init__.py	Fri May 06 00:06:42 2011 +0200
@@ -1,4 +1,4 @@
-VERSION = (0, 6, 0, "final", 0)
+VERSION = (0, 7, 0, "final", 0)
 
 
 def get_version():
--- a/src/ldt/ldt/ldt_utils/contentindexer.py	Wed May 04 12:44:51 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/contentindexer.py	Fri May 06 00:06:42 2011 +0200
@@ -9,35 +9,124 @@
 import urllib #@UnresolvedImport
 # import ldt.utils.log
 
-
 def Property(func):
     return property(**func()) 
+
+
+class LdtIndexer(object):
+    
+    def __init__(self, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
+            self.__decoupage_blacklist = decoupage_blackList
+            self.__writer = writer
         
+    @Property
+    def decoupage_blacklist(): #@NoSelf
+        doc = """get blacklist""" #@UnusedVariable
+       
+        def fget(self):
+            if self.__decoupage_blacklist is None:
+                self.__decoupage_blacklist = ()
+            return self.__decoupage_blacklist
+           
+        def fset(self, value):
+            self.__decoupage_blacklist = value
+           
+        def fdel(self):
+            del self.__decoupage_blacklist
+           
+        return locals()
+    
+    @Property
+    def writer(): #@NoSelf
+        def fget(self):
+            return self.__writer
+        return locals()
 
-class ContentIndexer(object):
+    def index_all(self):
+        raise NotImplemented
+    
+    def index_ensemble(self, ensemble, content, project=None):
+        ensembleId = ensemble.get(u"id", None)
+        
+        for decoupageNode in ensemble.getchildren():
+            if decoupageNode.tag != "decoupage"  or decoupageNode.get(u"id", None) in self.decoupage_blacklist:
+                continue
+            
+            decoupId = decoupageNode.get(u"id", None)
+            res = decoupageNode.xpath("elements/element")
+            for elementNode in res:
+                
+                elementId = elementNode.get(u"id", None)
+                tags = elementNode.get(u"tags", None)
+                
+                if tags is not None:                            
+                    tags.replace(",", ";")
+                
+                if tags is None or len(tags) == 0:
+                    tags = u""
+                    restagnode = elementNode.xpath("tag/text()", smart_strings=False)
+                    for tagnode in restagnode:
+                        tags = tags + u" ; " + tagnode
+                        
+                if tags is None or len(tags) == 0:
+                    tags = u""
+                    restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False)
+
+                    for tagnode in restagnode:
+                        tags = tags + u" ; " + tagnode
+                
+                if tags is None:
+                    tags = ""
+                tags = ";".join([tag[0:50] for tag in tags.split(";")])
+                
+
+                title = reduce_text_node(elementNode, "title/text()")                
+                abstract = reduce_text_node(elementNode, "abstract/text()")
+                
+                author = elementNode.get("author", "")
+                start_ts = int(elementNode.get("begin", "-1"))
+                duration = int(elementNode.get("dur", "-1"))
+                date_str = elementNode.get("date", "")
+                ldt_id = ""
+                if project:
+                    ldt_id = project.ldt_id
+
+                doc = lucene.Document()
+                doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED))        
+                doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
+                doc.add(lucene.Field("project_id", ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
+                doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))                        
+                doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+
+                seg = Segment(content=content,
+                              iri_id=content.iri_id,
+                              ensemble_id=ensembleId,
+                              cutting_id=decoupId,
+                              element_id=elementId,
+                              tags=tags,
+                              title=title,
+                              abstract=abstract,
+                              duration=duration,
+                              author=author,
+                              start_ts=start_ts,
+                              date=date_str,
+                              project_obj=project)
+                seg.save()
+                self.writer.addDocument(doc)
+
+
+
+class ContentIndexer(LdtIndexer):
         
         def __init__(self, contentList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
-                self.__contentList = contentList
-                self.__decoupage_blacklist = decoupage_blackList
-                self.__writer = writer
-                    
-        @Property
-        def decoupage_blacklist(): #@NoSelf
-            doc = """get blacklist""" #@UnusedVariable
-           
-            def fget(self):
-                if self.__decoupage_blacklist is None:
-                    self.__decoupage_blacklist = ()
-                return self.__decoupage_blacklist
-               
-            def fset(self, value):
-                self.__decoupage_blacklist = value
-               
-            def fdel(self):
-                del self.__decoupage_blacklist
-               
-            return locals()
-                   
+            super(ContentIndexer, self).__init__(writer, decoupage_blackList)
+            self.__contentList = contentList
+                                       
         def index_all(self):
             for content in self.__contentList:
                 self.index_content(content)
@@ -47,104 +136,23 @@
             filepath = urllib.urlopen(url)
             doc = lxml.etree.parse(filepath) #@UndefinedVariable
            
-            self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
+            self.writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
             Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable
             
             res = doc.xpath("/iri/body/ensembles/ensemble")
 
-            for ensemble in res:
-                ensembleId = ensemble.get(u"id", None)
-                
-                for decoupageNode in ensemble.getchildren():
-                    if decoupageNode.tag != "decoupage"  or decoupageNode.get(u"id", None) in self.decoupage_blacklist:
-
-                        continue
-                    
-                    decoupId = decoupageNode.get(u"id", None)
-                    res = decoupageNode.xpath("elements/element")
-                    for elementNode in res:
-                        doc = lucene.Document()
-                        elementId = elementNode.get(u"id", None)
-                        tags = elementNode.get(u"tags", None)
-                        
-                        if tags is not None:                            
-                            tags.replace(",", ";")
-                        
-                        if tags is None or len(tags) == 0:
-                            tags = u""
-                            restagnode = elementNode.xpath("tag/text()", smart_strings=False)
-                            for tagnode in restagnode:
-                                tags = tags + u" ; " + tagnode
-                                
-                        if tags is None or len(tags) == 0:
-                            tags = u""
-                            restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False)
-
-                            for tagnode in restagnode:
-                                tags = tags + u" ; " + tagnode
-    
-                        title = reduce_text_node(elementNode, "title/text()")                
-                        abstract = reduce_text_node(elementNode, "abstract/text()")
-                        
-                        author = elementNode.get("author", "")
-                        start_ts = int(elementNode.get("begin", "-1"))
-                        duration = int(elementNode.get("dur", "-1"))
-                        date_str = elementNode.get("date", "")
-
-                
-                        doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
-                        doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                        doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                        doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))                        
-                        doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-                        doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-                        doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-                        doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-
-                        seg = Segment(content=content,
-                                      iri_id=content.iri_id,
-                                      ensemble_id=ensembleId,
-                                      cutting_id=decoupId,
-                                      element_id=elementId,
-                                      tags=tags,
-                                      title=title,
-                                      abstract=abstract,
-                                      duration=duration,
-                                      author=author,
-                                      start_ts=start_ts,
-                                      date=date_str)
-                        seg.save()
-
+            for ensemble in res:                
+                self.index_ensemble(ensemble, content)
             
-                        self.__writer.addDocument(doc)
-            
-            self.__writer.commit()
+            self.writer.commit()
             
             
-class ProjectIndexer(object):
+class ProjectIndexer(LdtIndexer):
         
         def __init__(self, projectList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
-                self.__projectList = projectList
-                self.__decoupage_blacklist = decoupage_blackList
-                self.__writer = writer
-                
-        @Property
-        def decoupage_blacklist(): #@NoSelf
-            doc = """get blacklist""" #@UnusedVariable
-           
-            def fget(self):
-                if self.__decoupage_blacklist is None:
-                    self.__decoupage_blacklist = ()
-                return self.__decoupage_blacklist
-               
-            def fset(self, value):
-                self.__decoupage_blacklist = value
-               
-            def fdel(self):
-                del self.__decoupage_blacklist
-               
-            return locals()
-                   
+            super(ProjectIndexer, self).__init__(writer, decoupage_blackList)                
+            self.__projectList = projectList
+                                   
         def index_all(self):
             for project in self.__projectList:
                 self.index_project(project)
@@ -154,8 +162,8 @@
             # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
             doc = lxml.etree.fromstring(project.ldt) #@UndefinedVariable
 
-            self.__writer.deleteDocuments(lucene.Term("project_id", project.ldt_id))
-            Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete()
+            self.writer.deleteDocuments(lucene.Term("project_id", project.ldt_id))
+            Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete() #@UndefinedVariable
             
             res = doc.xpath("/iri/annotations/content")
 
@@ -163,79 +171,14 @@
                 contentId = content.get(u"id", None)
                 content_obj = None
                 
-                clist = Content.objects.filter(iri_id = contentId)
+                clist = Content.objects.filter(iri_id = contentId) #@UndefinedVariable
                 if len(clist) > 0:
                     content_obj = clist[0]
  
-                for ensembleNode in content.getchildren():
-                    ensembleId = ensembleNode.get(u"id",None)
-                
-                    for decoupageNode in ensembleNode.getchildren():
-                        # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
-                        if decoupageNode.tag != "decoupage"  or decoupageNode.get(u"id", None) in self.decoupage_blacklist:
-                            continue
-                        
-                        decoupId = decoupageNode.get(u"id", None)
-                        res = decoupageNode.xpath("elements/element")
-                        for elementNode in res:
-                            doc = lucene.Document()
-                            elementId = elementNode.get(u"id", None)
-                            tags = elementNode.get(u"tags", None)
-                            
-                            if tags is not None:                            
-                                tags.replace(",", ";")
-                            
-                            if tags is None or len(tags) == 0:
-                                tags = u""
-                                restagnode = elementNode.xpath("tag/text()", smart_strings=False)
-                                for tagnode in restagnode:
-                                    tags = tags + u" ; " + tagnode
-                                    
-                            if tags is None or len(tags) == 0:
-                                tags = u""
-                                restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False)
-    
-                                for tagnode in restagnode:
-                                    tags = tags + u" ; " + tagnode
-        
-                            title = reduce_text_node(elementNode, "title/text()")                
-                            abstract = reduce_text_node(elementNode, "abstract/text()")
-                            
-                            author = elementNode.get("author", "")
-                            start_ts = int(elementNode.get("begin", "-1"))
-                            duration = int(elementNode.get("dur", "-1"))
-                            date_str = elementNode.get("date", "")
-    
-                    
-                            doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED))
-                            doc.add(lucene.Field("project_id", project.ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))              
-                            doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
-                            doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                            doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                            doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                            doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-                            doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-                            doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-                            doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-                            
-                            seg = Segment(content=content_obj,
-                                          iri_id=contentId,
-                                          ensemble_id=ensembleId,
-                                          cutting_id=decoupId,
-                                          element_id=elementId,
-                                          tags=tags,
-                                          title=title,
-                                          abstract=abstract,
-                                          duration=duration,
-                                          author=author,
-                                          start_ts=start_ts,
-                                          date=date_str,
-                                          project_obj = project)
-                            seg.save()
-                                        
-                            self.__writer.addDocument(doc)
+                for ensemble in content.getchildren():
+                    self.index_ensemble(ensemble, content_obj, project)
             
-            self.__writer.commit()
+            self.writer.commit()
 
 @receiver(post_save, sender=Project)
 def index_project(sender, **kwargs):
@@ -243,7 +186,7 @@
     writer = ldt.indexation.get_writer()
     if instance.state != 2:
         writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id))
-        Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete()
+        Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() #@UndefinedVariable
     else:
         projectIndexer = ProjectIndexer([instance], writer)
         projectIndexer.index_all()
--- a/src/ldt/ldt/ldt_utils/forms.py	Wed May 04 12:44:51 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/forms.py	Fri May 06 00:06:42 2011 +0200
@@ -11,15 +11,15 @@
     
 class LdtAddForm(forms.ModelForm):
     title = forms.CharField()
-    contents = forms.ModelMultipleChoiceField(Content.objects.all())
+    contents = forms.ModelMultipleChoiceField(Content.objects.all()) #@UndefinedVariable
     # owner = forms.ModelChoiceField(Author.objects.all())
     class Meta:
         model = Project
         exclude = ("ldt_id", "ldt", "created_by", "changed_by", "creation_date", "modification_date", "state", "owner")   
 
 class ReindexForm(forms.Form):
-    contents = forms.ModelMultipleChoiceField(Content.objects.all())
-    index_projects = forms.BooleanField(initial=False)
+    contents = forms.ModelMultipleChoiceField(Content.objects.all()) #@UndefinedVariable
+    index_projects = forms.BooleanField(required=False, initial=False)
 
 class SearchForm(forms.Form):
     search = forms.CharField()
--- a/web/ldtplatform/modwsgi.wsgi	Wed May 04 12:44:51 2011 +0200
+++ b/web/ldtplatform/modwsgi.wsgi	Fri May 06 00:06:42 2011 +0200
@@ -1,21 +1,26 @@
 import os, sys, site
 
 def application(environ, start_response):
-    os.environ['DJANGO_SETTINGS_MODULE'] = environ['DJANGO_SETTINGS_MODULE']
-
-    prev_sys_path = list(sys.path)
-
-    sys.path.append(environ['PROJECT_PATH'])
-    for path in environ.get('PYTHON_PATH',"").split(os.pathsep):
-        if path:
-            site.addsitedir(path)
-
-    new_sys_path = [] 
-    for item in list(sys.path): 
-        if item not in prev_sys_path: 
-            new_sys_path.append(item) 
-            sys.path.remove(item)
-    sys.path[:0] = new_sys_path 
+    
+    global g_env_set
+    
+    if 'g_env_set' not in globals() or not g_env_set:
+        os.environ['DJANGO_SETTINGS_MODULE'] = environ['DJANGO_SETTINGS_MODULE']
+    
+        prev_sys_path = list(sys.path)
+    
+        sys.path.append(environ['PROJECT_PATH'])
+        for path in environ.get('PYTHON_PATH',"").split(os.pathsep):
+            if path:
+                site.addsitedir(path)
+    
+        new_sys_path = [] 
+        for item in list(sys.path): 
+            if item not in prev_sys_path and item not in new_sys_path: 
+                new_sys_path.append(item) 
+                sys.path.remove(item)
+        sys.path[:0] = new_sys_path
+        g_env_set = True 
 
     import django.core.handlers.wsgi