# HG changeset patch # User ymh # Date 1304213440 -7200 # Node ID 7923feb2e362828146c1cb1db7b3d699b77f5bd4 # Parent a29face9a74b9c4935a0039fff7c0d6e07baf822 improve indexation diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/__init__.py --- a/src/ldt/ldt/__init__.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/__init__.py Sun May 01 03:30:40 2011 +0200 @@ -14,3 +14,6 @@ __version__ = get_version() + +#initialize +from ldt.ldt_utils import contentindexer diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/indexation/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ldt/ldt/indexation/__init__.py Sun May 01 03:30:40 2011 +0200 @@ -0,0 +1,15 @@ +from django.conf import settings +import lucene + +lucene.initVM(lucene.CLASSPATH) + +STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH)) +ANALYZER = lucene.PerFieldAnalyzerWrapper(lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)) +ANALYZER.addAnalyzer("tags", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) +ANALYZER.addAnalyzer("title", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) +ANALYZER.addAnalyzer("abstract", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) +ANALYZER.addAnalyzer("all", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) + +def get_writer(): + return lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) + diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/ldt_utils/__init__.py --- a/src/ldt/ldt/ldt_utils/__init__.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/ldt_utils/__init__.py Sun May 01 03:30:40 2011 +0200 @@ -1,15 +1,2 @@ -import lucene -from django.conf import settings - -lucene.initVM(lucene.CLASSPATH) - -STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH)) -ANALYZER = lucene.PerFieldAnalyzerWrapper(lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("tags", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("title", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("abstract", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("all", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) - - VERSION = (0, 1) VERSION_STR = unicode(".".join(map(lambda i:"%02d" % (i,), VERSION))) diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/ldt_utils/admin.py --- a/src/ldt/ldt/ldt_utils/admin.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/ldt_utils/admin.py Sun May 01 03:30:40 2011 +0200 @@ -2,11 +2,11 @@ from django.contrib import admin from django.shortcuts import render_to_response from django.template import RequestContext -from ldt.ldt_utils import STORE, ANALYZER -from ldt.ldt_utils.contentindexer import ContentIndexer +from ldt.ldt_utils.contentindexer import ContentIndexer, ProjectIndexer from ldt.ldt_utils.fileimport import FileImport, FileImportError from ldt.ldt_utils.forms import LdtImportForm, ReindexForm from ldt.ldt_utils.models import Content, Project, Media, Author +import ldt.indexation import lucene @@ -48,10 +48,16 @@ form = ReindexForm(request.POST) if form.is_valid(): # try: - writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) + writer = ldt.indexation.get_writer() contentList = form.cleaned_data["contents"] indexer = ContentIndexer(contentList, writer) indexer.index_all() + + index_projects = form.cleaned_data["index_projects"] + if index_projects: + projectList = Project.objects.filter(contents__in=contentList).distinct() #filter(contents__in=contentList) @UndefinedVariable + indexer = ProjectIndexer(projectList, writer) + indexer.index_all() writer.close() message = "Indexation ok : " + repr(form.cleaned_data["contents"]) diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/ldt_utils/contentindexer.py --- a/src/ldt/ldt/ldt_utils/contentindexer.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/ldt_utils/contentindexer.py Sun May 01 03:30:40 2011 +0200 @@ -1,11 +1,15 @@ from django.conf import settings -from ldt.ldt_utils.models import Segment +from django.db.models.signals import post_save +from django.dispatch import receiver +from ldt.ldt_utils.models import Segment, Content, Project from ldt.ldt_utils.utils import reduce_text_node +import ldt.indexation import lucene import lxml.etree import urllib #@UnresolvedImport # import ldt.utils.log + def Property(func): return property(**func()) @@ -44,6 +48,7 @@ doc = lxml.etree.parse(filepath) #@UndefinedVariable self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id)) + Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable res = doc.xpath("/iri/body/ensembles/ensemble") @@ -149,80 +154,98 @@ # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id)) doc = lxml.etree.fromstring(project.ldt) #@UndefinedVariable - self.__writer.deleteDocuments(lucene.Term("iri_id", project.iri_id)) + self.__writer.deleteDocuments(lucene.Term("project_id", project.ldt_id)) + Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete() res = doc.xpath("/iri/annotations/content") for content in res: contentId = content.get(u"id", None) - - ensembleId = "ens_perso" + content_obj = None - for decoupageNode in content.getchildren(): - # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist)) - if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id", None) in self.decoupage_blacklist: - continue - - decoupId = decoupageNode.get(u"id", None) - res = decoupageNode.xpath("elements/element") - for elementNode in res: - doc = lucene.Document() - elementId = elementNode.get(u"id", None) - tags = elementNode.get(u"tags", None) - - if tags is not None: - tags.replace(",", ";") - - if tags is None or len(tags) == 0: - tags = "" - restagnode = elementNode.xpath("tag/text()") - for tagnode in restagnode: - tags = tags + " ; " + tagnode.text() - - if tags is None or len(tags) == 0: - tags = "" - restagnode = elementNode.xpath("tags/tag/text()") - for tagnode in restagnode: - tags = tags + " ; " + tagnode.text() - - title = reduce_text_node("") - for txtRes in elementNode.xpath("title/text()"): - title = title + txtRes.text() + clist = Content.objects.filter(iri_id = contentId) + if len(clist) > 0: + content_obj = clist[0] + + for ensembleNode in content.getchildren(): + ensembleId = ensembleNode.get(u"id",None) - abstract = "" - for txtRes in elementNode.xpath("abstract/text()"): - abstract = abstract + txtRes.text() - - author = elementNode.get("author", "") - start_ts = int(elementNode.get("begin", "-1")) - duration = int(elementNode.get("dur", "-1")) - date_str = elementNode.get("date", "") - - - doc.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + for decoupageNode in ensembleNode.getchildren(): + # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist)) + if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id", None) in self.decoupage_blacklist: + continue - seg = Segment(content=content, - iri_id=content.iri_id, - ensemble_id=ensembleId, - cutting_id=decoupId, - element_id=elementId, - tags=tags, - title=title, - abstract=abstract, - duration=duration, - author=author, - start_ts=start_ts, - date=date_str) - seg.save() + decoupId = decoupageNode.get(u"id", None) + res = decoupageNode.xpath("elements/element") + for elementNode in res: + doc = lucene.Document() + elementId = elementNode.get(u"id", None) + tags = elementNode.get(u"tags", None) + + if tags is not None: + tags.replace(",", ";") + + if tags is None or len(tags) == 0: + tags = u"" + restagnode = elementNode.xpath("tag/text()", smart_strings=False) + for tagnode in restagnode: + tags = tags + u" ; " + tagnode - self.__writer.addDocument(doc) + if tags is None or len(tags) == 0: + tags = u"" + restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False) + + for tagnode in restagnode: + tags = tags + u" ; " + tagnode + + title = reduce_text_node(elementNode, "title/text()") + abstract = reduce_text_node(elementNode, "abstract/text()") + + author = elementNode.get("author", "") + start_ts = int(elementNode.get("begin", "-1")) + duration = int(elementNode.get("dur", "-1")) + date_str = elementNode.get("date", "") + + + doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)) + doc.add(lucene.Field("project_id", project.ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) + doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) + doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO)) + doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO)) + doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO)) + doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + + seg = Segment(content=content_obj, + iri_id=contentId, + ensemble_id=ensembleId, + cutting_id=decoupId, + element_id=elementId, + tags=tags, + title=title, + abstract=abstract, + duration=duration, + author=author, + start_ts=start_ts, + date=date_str, + project_obj = project) + seg.save() + + self.__writer.addDocument(doc) self.__writer.commit() + +@receiver(post_save, sender=Project) +def index_project(sender, **kwargs): + instance = kwargs['instance'] + writer = ldt.indexation.get_writer() + if instance.state != 2: + writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id)) + Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() + else: + projectIndexer = ProjectIndexer([instance], writer) + projectIndexer.index_all() + + diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/ldt_utils/forms.py --- a/src/ldt/ldt/ldt_utils/forms.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/ldt_utils/forms.py Sun May 01 03:30:40 2011 +0200 @@ -11,7 +11,7 @@ class LdtAddForm(forms.ModelForm): title = forms.CharField() - # contents = forms.ModelMultipleChoiceField(Content.objects.all()) + contents = forms.ModelMultipleChoiceField(Content.objects.all()) # owner = forms.ModelChoiceField(Author.objects.all()) class Meta: model = Project @@ -19,6 +19,7 @@ class ReindexForm(forms.Form): contents = forms.ModelMultipleChoiceField(Content.objects.all()) + index_projects = forms.BooleanField(initial=False) class SearchForm(forms.Form): search = forms.CharField() diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/ldt_utils/models.py --- a/src/ldt/ldt/ldt_utils/models.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/ldt_utils/models.py Sun May 01 03:30:40 2011 +0200 @@ -3,7 +3,7 @@ from django.db import models from django.utils.translation import ugettext_lazy as _ from ldt.core.models import Document, Owner -from ldt.ldt_utils import STORE, ANALYZER +import ldt.indexation from utils import (create_ldt, copy_ldt, create_empty_iri, update_iri, generate_uuid) import lucene @@ -131,7 +131,7 @@ def delete(self): super(Content, self).delete() - writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) + writer = ldt.indexation.get_writer() writer.deleteDocuments(lucene.Term("iri_id", self.iri_id)) writer.commit() @@ -161,7 +161,7 @@ #TODO: better manage the change in .iri name and error scenario (save in temp file + rename def save(self, *args, **kwargs): - self.sync_iri_file() + #self.sync_iri_file() # update it super(Content, self).save(*args, **kwargs) diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/ldt_utils/templates/admin/ldt_utils/app_action.html --- a/src/ldt/ldt/ldt_utils/templates/admin/ldt_utils/app_action.html Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/ldt_utils/templates/admin/ldt_utils/app_action.html Sun May 01 03:30:40 2011 +0200 @@ -1,6 +1,6 @@ {% extends "admin/base_site.html" %} {% load i18n %} {% block breadcrumbs %} - + {% endblock %} diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/ldt_utils/utils.py --- a/src/ldt/ldt/ldt_utils/utils.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/ldt_utils/utils.py Sun May 01 03:30:40 2011 +0200 @@ -1,5 +1,5 @@ from django.conf import settings -from ldt.ldt_utils import STORE +from ldt.indexation import STORE import datetime import django.core.urlresolvers import lucene diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/ldt_utils/views.py --- a/src/ldt/ldt/ldt_utils/views.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/ldt_utils/views.py Sun May 01 03:30:40 2011 +0200 @@ -269,7 +269,6 @@ Project.create_project(title=form.cleaned_data['title'], user=user, contents=form.cleaned_data['contents']) form_status = "saved" contents = [] - #return HttpResponseRedirect(reverse("ldt.ldt_utils.views.list_ldt")) else: form = LdtAddForm() contents = Content.objects.all() #@UndefinedVariable diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/text/__init__.py --- a/src/ldt/ldt/text/__init__.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/text/__init__.py Sun May 01 03:30:40 2011 +0200 @@ -1,16 +1,2 @@ -import lucene -from django.conf import settings - -lucene.initVM(lucene.CLASSPATH) - -STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH)) -ANALYZER = lucene.PerFieldAnalyzerWrapper(lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("tags", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("title", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("abstract", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("all", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("type_doc", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) - - VERSION = (1, 0) -VERSION_STR = unicode(".".join(map(lambda i:"%01d" % (i,), VERSION))) +VERSION_STR = unicode(".".join(map(lambda i:"%01d" % (i,), VERSION))) \ No newline at end of file diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/text/annotindexer.py --- a/src/ldt/ldt/text/annotindexer.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/text/annotindexer.py Sun May 01 03:30:40 2011 +0200 @@ -28,7 +28,7 @@ for tag in annottags: tags += tag + ";" - doc.add(lucene.Field("type_doc", "text-annotation", lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("type_doc", "text-annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)) doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) doc.add(lucene.Field("title", annotation.title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) doc.add(lucene.Field("abstract", annotation.description, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/text/models.py --- a/src/ldt/ldt/text/models.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/text/models.py Sun May 01 03:30:40 2011 +0200 @@ -1,9 +1,9 @@ from annotindexer import AnnotIndexer from django.db import models from django.utils.translation import ugettext_lazy as _ -from ldt.ldt_utils import STORE, ANALYZER from tagging.models import Tag from utils import generate_uuid +import ldt.indexation import lucene import lxml import tagging.fields @@ -126,13 +126,13 @@ def delete(self): super(Annotation, self).delete() lucene.getVMEnv().attachCurrentThread() - writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) + writer = ldt.indexation.get_writer() writer.deleteDocuments(lucene.Term("external_id", self.external_id)) writer.close() def index_annot(self): lucene.getVMEnv().attachCurrentThread() - writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) + writer = ldt.indexation.get_writer() annotl = [self, ] indexer = AnnotIndexer(annotl, writer) indexer.index_all() @@ -140,7 +140,7 @@ def update_index(self): lucene.getVMEnv().attachCurrentThread() - writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) + writer = ldt.indexation.get_writer() writer.deleteDocuments(lucene.Term("external_id", self.external_id)) writer.close() self.index_annot() diff -r a29face9a74b -r 7923feb2e362 src/ldt/ldt/text/utils.py --- a/src/ldt/ldt/text/utils.py Sun May 01 03:10:32 2011 +0200 +++ b/src/ldt/ldt/text/utils.py Sun May 01 03:30:40 2011 +0200 @@ -1,5 +1,5 @@ from django.conf import settings -from ldt.ldt_utils import STORE +from ldt.indexation import STORE import lucene import uuid