improve indexation
author ymh <ymh.work@gmail.com>
Sun, 01 May 2011 03:30:40 +0200
changeset 77:7923feb2e362
parent 76:a29face9a74b
child 78:083916488cd5
improve indexation
src/ldt/ldt/__init__.py
src/ldt/ldt/indexation/__init__.py
src/ldt/ldt/ldt_utils/__init__.py
src/ldt/ldt/ldt_utils/admin.py
src/ldt/ldt/ldt_utils/contentindexer.py
src/ldt/ldt/ldt_utils/forms.py
src/ldt/ldt/ldt_utils/models.py
src/ldt/ldt/ldt_utils/templates/admin/ldt_utils/app_action.html
src/ldt/ldt/ldt_utils/utils.py
src/ldt/ldt/ldt_utils/views.py
src/ldt/ldt/text/__init__.py
src/ldt/ldt/text/annotindexer.py
src/ldt/ldt/text/models.py
src/ldt/ldt/text/utils.py
--- a/src/ldt/ldt/__init__.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/__init__.py	Sun May 01 03:30:40 2011 +0200
@@ -14,3 +14,6 @@
 
 
 __version__ = get_version()
+
+#initialize 
+from ldt.ldt_utils import contentindexer
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/ldt/ldt/indexation/__init__.py	Sun May 01 03:30:40 2011 +0200
@@ -0,0 +1,15 @@
+from django.conf import settings
+import lucene
+
+lucene.initVM(lucene.CLASSPATH)
+
+STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH))
+ANALYZER = lucene.PerFieldAnalyzerWrapper(lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
+ANALYZER.addAnalyzer("tags", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
+ANALYZER.addAnalyzer("title", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
+ANALYZER.addAnalyzer("abstract", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
+ANALYZER.addAnalyzer("all", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
+
+def get_writer():
+    return lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED)
+
--- a/src/ldt/ldt/ldt_utils/__init__.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/__init__.py	Sun May 01 03:30:40 2011 +0200
@@ -1,15 +1,2 @@
-import lucene
-from django.conf import settings
-
-lucene.initVM(lucene.CLASSPATH)
-
-STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH))
-ANALYZER = lucene.PerFieldAnalyzerWrapper(lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
-ANALYZER.addAnalyzer("tags", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
-ANALYZER.addAnalyzer("title", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
-ANALYZER.addAnalyzer("abstract", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
-ANALYZER.addAnalyzer("all", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
-
-
 VERSION = (0, 1)
 VERSION_STR = unicode(".".join(map(lambda i:"%02d" % (i,), VERSION)))
--- a/src/ldt/ldt/ldt_utils/admin.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/admin.py	Sun May 01 03:30:40 2011 +0200
@@ -2,11 +2,11 @@
 from django.contrib import admin
 from django.shortcuts import render_to_response
 from django.template import RequestContext
-from ldt.ldt_utils import STORE, ANALYZER
-from ldt.ldt_utils.contentindexer import ContentIndexer
+from ldt.ldt_utils.contentindexer import ContentIndexer, ProjectIndexer
 from ldt.ldt_utils.fileimport import FileImport, FileImportError
 from ldt.ldt_utils.forms import LdtImportForm, ReindexForm
 from ldt.ldt_utils.models import Content, Project, Media, Author
+import ldt.indexation
 import lucene
 
 
@@ -48,10 +48,16 @@
             form = ReindexForm(request.POST)
             if form.is_valid():    
                 # try:
-                writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED)
+                writer = ldt.indexation.get_writer()
                 contentList = form.cleaned_data["contents"]
                 indexer = ContentIndexer(contentList, writer)
                 indexer.index_all()
+                
+                index_projects = form.cleaned_data["index_projects"]
+                if index_projects:
+                    projectList = Project.objects.filter(contents__in=contentList).distinct() #filter(contents__in=contentList) @UndefinedVariable
+                    indexer = ProjectIndexer(projectList, writer)
+                    indexer.index_all()
 
                 writer.close()
                 message = "Indexation ok : " + repr(form.cleaned_data["contents"]) 
--- a/src/ldt/ldt/ldt_utils/contentindexer.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/contentindexer.py	Sun May 01 03:30:40 2011 +0200
@@ -1,11 +1,15 @@
 from django.conf import settings
-from ldt.ldt_utils.models import Segment
+from django.db.models.signals import post_save
+from django.dispatch import receiver
+from ldt.ldt_utils.models import Segment, Content, Project
 from ldt.ldt_utils.utils import reduce_text_node
+import ldt.indexation
 import lucene
 import lxml.etree
 import urllib #@UnresolvedImport
 # import ldt.utils.log
 
+
 def Property(func):
     return property(**func()) 
         
@@ -44,6 +48,7 @@
             doc = lxml.etree.parse(filepath) #@UndefinedVariable
            
             self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
+            Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable
             
             res = doc.xpath("/iri/body/ensembles/ensemble")
 
@@ -149,80 +154,98 @@
             # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
             doc = lxml.etree.fromstring(project.ldt) #@UndefinedVariable
 
-            self.__writer.deleteDocuments(lucene.Term("iri_id", project.iri_id))
+            self.__writer.deleteDocuments(lucene.Term("project_id", project.ldt_id))
+            Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete()
             
             res = doc.xpath("/iri/annotations/content")
 
             for content in res:
                 contentId = content.get(u"id", None)
- 
-                ensembleId = "ens_perso"
+                content_obj = None
                 
-                for decoupageNode in content.getchildren():
-                    # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
-                    if decoupageNode.tag != "decoupage"  or decoupageNode.get(u"id", None) in self.decoupage_blacklist:
-                        continue
-                    
-                    decoupId = decoupageNode.get(u"id", None)
-                    res = decoupageNode.xpath("elements/element")
-                    for elementNode in res:
-                        doc = lucene.Document()
-                        elementId = elementNode.get(u"id", None)
-                        tags = elementNode.get(u"tags", None)
-                        
-                        if tags is not None:                            
-                            tags.replace(",", ";")
-                        
-                        if tags is None or len(tags) == 0:
-                            tags = ""
-                            restagnode = elementNode.xpath("tag/text()")
-                            for tagnode in restagnode:
-                                tags = tags + " ; " + tagnode.text()
-                                
-                        if tags is None or len(tags) == 0:
-                            tags = ""
-                            restagnode = elementNode.xpath("tags/tag/text()")
-                            for tagnode in restagnode:
-                                tags = tags + " ; " + tagnode.text()                  
-    
-                        title = reduce_text_node("")
-                        for txtRes in elementNode.xpath("title/text()"): 
-                            title = title + txtRes.text()
+                clist = Content.objects.filter(iri_id = contentId)
+                if len(clist) > 0:
+                    content_obj = clist[0]
+ 
+                for ensembleNode in content.getchildren():
+                    ensembleId = ensembleNode.get(u"id",None)
                 
-                        abstract = ""
-                        for txtRes in elementNode.xpath("abstract/text()"): 
-                            abstract = abstract + txtRes.text()
-
-                        author = elementNode.get("author", "")
-                        start_ts = int(elementNode.get("begin", "-1"))
-                        duration = int(elementNode.get("dur", "-1"))
-                        date_str = elementNode.get("date", "")
-
-                
-                        doc.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))              
-                        doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
-                        doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                        doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                        doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                        doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-                        doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-                        doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-                        doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                    for decoupageNode in ensembleNode.getchildren():
+                        # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
+                        if decoupageNode.tag != "decoupage"  or decoupageNode.get(u"id", None) in self.decoupage_blacklist:
+                            continue
                         
-                        seg = Segment(content=content,
-                                      iri_id=content.iri_id,
-                                      ensemble_id=ensembleId,
-                                      cutting_id=decoupId,
-                                      element_id=elementId,
-                                      tags=tags,
-                                      title=title,
-                                      abstract=abstract,
-                                      duration=duration,
-                                      author=author,
-                                      start_ts=start_ts,
-                                      date=date_str)
-                        seg.save()
+                        decoupId = decoupageNode.get(u"id", None)
+                        res = decoupageNode.xpath("elements/element")
+                        for elementNode in res:
+                            doc = lucene.Document()
+                            elementId = elementNode.get(u"id", None)
+                            tags = elementNode.get(u"tags", None)
+                            
+                            if tags is not None:                            
+                                tags.replace(",", ";")
+                            
+                            if tags is None or len(tags) == 0:
+                                tags = u""
+                                restagnode = elementNode.xpath("tag/text()", smart_strings=False)
+                                for tagnode in restagnode:
+                                    tags = tags + u" ; " + tagnode
                                     
-                        self.__writer.addDocument(doc)
+                            if tags is None or len(tags) == 0:
+                                tags = u""
+                                restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False)
+    
+                                for tagnode in restagnode:
+                                    tags = tags + u" ; " + tagnode
+        
+                            title = reduce_text_node(elementNode, "title/text()")                
+                            abstract = reduce_text_node(elementNode, "abstract/text()")
+                            
+                            author = elementNode.get("author", "")
+                            start_ts = int(elementNode.get("begin", "-1"))
+                            duration = int(elementNode.get("dur", "-1"))
+                            date_str = elementNode.get("date", "")
+    
+                    
+                            doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED))
+                            doc.add(lucene.Field("project_id", project.ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))              
+                            doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
+                            doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                            doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                            doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                            doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                            doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                            doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                            doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                            
+                            seg = Segment(content=content_obj,
+                                          iri_id=contentId,
+                                          ensemble_id=ensembleId,
+                                          cutting_id=decoupId,
+                                          element_id=elementId,
+                                          tags=tags,
+                                          title=title,
+                                          abstract=abstract,
+                                          duration=duration,
+                                          author=author,
+                                          start_ts=start_ts,
+                                          date=date_str,
+                                          project_obj = project)
+                            seg.save()
+                                        
+                            self.__writer.addDocument(doc)
             
             self.__writer.commit()
+
+@receiver(post_save, sender=Project)
+def index_project(sender, **kwargs):
+    instance = kwargs['instance']
+    writer = ldt.indexation.get_writer()
+    if instance.state != 2:
+        writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id))
+        Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete()
+    else:
+        projectIndexer = ProjectIndexer([instance], writer)
+        projectIndexer.index_all()
+        
+
--- a/src/ldt/ldt/ldt_utils/forms.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/forms.py	Sun May 01 03:30:40 2011 +0200
@@ -11,7 +11,7 @@
     
 class LdtAddForm(forms.ModelForm):
     title = forms.CharField()
-    # contents = forms.ModelMultipleChoiceField(Content.objects.all())
+    contents = forms.ModelMultipleChoiceField(Content.objects.all())
     # owner = forms.ModelChoiceField(Author.objects.all())
     class Meta:
         model = Project
@@ -19,6 +19,7 @@
 
 class ReindexForm(forms.Form):
     contents = forms.ModelMultipleChoiceField(Content.objects.all())
+    index_projects = forms.BooleanField(initial=False)
 
 class SearchForm(forms.Form):
     search = forms.CharField()
--- a/src/ldt/ldt/ldt_utils/models.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/models.py	Sun May 01 03:30:40 2011 +0200
@@ -3,7 +3,7 @@
 from django.db import models
 from django.utils.translation import ugettext_lazy as _
 from ldt.core.models import Document, Owner
-from ldt.ldt_utils import STORE, ANALYZER
+import ldt.indexation
 from utils import (create_ldt, copy_ldt, create_empty_iri, update_iri, 
     generate_uuid)
 import lucene
@@ -131,7 +131,7 @@
     
     def delete(self):
         super(Content, self).delete()
-        writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED)
+        writer = ldt.indexation.get_writer()
         writer.deleteDocuments(lucene.Term("iri_id", self.iri_id))
         writer.commit()
         
@@ -161,7 +161,7 @@
     #TODO: better manage the change in .iri name and error scenario (save in temp file + rename
     def save(self, *args, **kwargs):
         
-        self.sync_iri_file()        
+        #self.sync_iri_file()        
         # update it 
         super(Content, self).save(*args, **kwargs)
     
--- a/src/ldt/ldt/ldt_utils/templates/admin/ldt_utils/app_action.html	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/templates/admin/ldt_utils/app_action.html	Sun May 01 03:30:40 2011 +0200
@@ -1,6 +1,6 @@
 {% extends "admin/base_site.html" %}
 {% load i18n %}
 {% block breadcrumbs %}
-<div class="breadcrumbs"><a href="{% url admin:index %}"> {% trans "Home" %}</a> &rsaquo; <a href="{% url admin:app_list 'ldt' %}">ldt</a> &rsaquo; {{ current_action }}</div>
+<div class="breadcrumbs"><a href="{% url admin:index %}"> {% trans "Home" %}</a> &rsaquo; <a href="{% url admin:app_list 'ldt_utils' %}">ldt_utils</a> &rsaquo; {{ current_action }}</div>
 {% endblock %}
 
--- a/src/ldt/ldt/ldt_utils/utils.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/utils.py	Sun May 01 03:30:40 2011 +0200
@@ -1,5 +1,5 @@
 from django.conf import settings
-from ldt.ldt_utils import STORE
+from ldt.indexation import STORE
 import datetime
 import django.core.urlresolvers
 import lucene
--- a/src/ldt/ldt/ldt_utils/views.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/ldt_utils/views.py	Sun May 01 03:30:40 2011 +0200
@@ -269,7 +269,6 @@
             Project.create_project(title=form.cleaned_data['title'], user=user, contents=form.cleaned_data['contents'])
             form_status = "saved"
             contents = []
-            #return HttpResponseRedirect(reverse("ldt.ldt_utils.views.list_ldt"))
     else:
         form = LdtAddForm()
         contents = Content.objects.all() #@UndefinedVariable
--- a/src/ldt/ldt/text/__init__.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/text/__init__.py	Sun May 01 03:30:40 2011 +0200
@@ -1,16 +1,2 @@
-import lucene
-from django.conf import settings
-
-lucene.initVM(lucene.CLASSPATH)
-
-STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH))
-ANALYZER = lucene.PerFieldAnalyzerWrapper(lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
-ANALYZER.addAnalyzer("tags", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
-ANALYZER.addAnalyzer("title", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
-ANALYZER.addAnalyzer("abstract", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
-ANALYZER.addAnalyzer("all", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
-ANALYZER.addAnalyzer("type_doc", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
-
-
 VERSION = (1, 0)
-VERSION_STR = unicode(".".join(map(lambda i:"%01d" % (i,), VERSION)))
+VERSION_STR = unicode(".".join(map(lambda i:"%01d" % (i,), VERSION)))
\ No newline at end of file
--- a/src/ldt/ldt/text/annotindexer.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/text/annotindexer.py	Sun May 01 03:30:40 2011 +0200
@@ -28,7 +28,7 @@
             for tag in annottags:
                 tags += tag + ";" 
         
-        doc.add(lucene.Field("type_doc", "text-annotation", lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))              
+        doc.add(lucene.Field("type_doc", "text-annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED))              
         doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
         doc.add(lucene.Field("title", annotation.title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
         doc.add(lucene.Field("abstract", annotation.description, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
--- a/src/ldt/ldt/text/models.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/text/models.py	Sun May 01 03:30:40 2011 +0200
@@ -1,9 +1,9 @@
 from annotindexer import AnnotIndexer
 from django.db import models
 from django.utils.translation import ugettext_lazy as _
-from ldt.ldt_utils import STORE, ANALYZER
 from tagging.models import Tag
 from utils import generate_uuid
+import ldt.indexation
 import lucene
 import lxml
 import tagging.fields
@@ -126,13 +126,13 @@
     def delete(self):
         super(Annotation, self).delete()
         lucene.getVMEnv().attachCurrentThread()
-        writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED)
+        writer = ldt.indexation.get_writer()
         writer.deleteDocuments(lucene.Term("external_id", self.external_id))
         writer.close()
 
     def index_annot(self):
         lucene.getVMEnv().attachCurrentThread()
-        writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED)
+        writer = ldt.indexation.get_writer()
         annotl = [self, ]
         indexer = AnnotIndexer(annotl, writer)
         indexer.index_all()
@@ -140,7 +140,7 @@
 
     def update_index(self):
         lucene.getVMEnv().attachCurrentThread()
-        writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED)
+        writer = ldt.indexation.get_writer()
         writer.deleteDocuments(lucene.Term("external_id", self.external_id))
         writer.close()
         self.index_annot()
--- a/src/ldt/ldt/text/utils.py	Sun May 01 03:10:32 2011 +0200
+++ b/src/ldt/ldt/text/utils.py	Sun May 01 03:30:40 2011 +0200
@@ -1,5 +1,5 @@
 from django.conf import settings
-from ldt.ldt_utils import STORE
+from ldt.indexation import STORE
 import lucene
 import uuid