centralise les appel à lucene
authorymh <ymh.work@gmail.com>
Fri, 20 Jul 2012 12:40:08 +0200
changeset 716 31dc2726ca51
parent 715 f21459554182
child 717 d66abfc1cb87
centralise les appel à lucene
src/ldt/ldt/indexation/__init__.py
src/ldt/ldt/ldt_utils/contentindexer.py
src/ldt/ldt/ldt_utils/views/workspace.py
src/ldt/ldt/text/annotindexer.py
src/ldt/ldt/text/models.py
src/ldt/ldt/text/utils.py
--- a/src/ldt/ldt/indexation/__init__.py	Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/indexation/__init__.py	Fri Jul 20 12:40:08 2012 +0200
@@ -36,7 +36,7 @@
                 ids['title'] = title
                 ids['tags'] = tags
                 ids['score'] = score
-                ids['lucene_id'] = i.doc
+                ids['indexation_id'] = i.doc
                 ids['begin'] = begin
                 ids['duration'] = duration
                 contexts.append(ids)     
@@ -57,6 +57,29 @@
             indexSearcher.close()
     return hits.scoreDocs
 
+def get_result_text(field, query):
+    """Run *query* against *field* and return the matching documents.
+
+    Each hit is returned as a dict with the stored ``external_id`` and
+    ``title`` fields.  The searcher is always closed, even when parsing
+    or searching raises.
+    """
+    indexSearcher = lucene.IndexSearcher(STORE)
+    try:
+        queryParser = lucene.QueryParser(lucene.Version.LUCENE_30, field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_30))
+        queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND)
+        queryObj = queryParser.parse(query)
+        hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER)
+
+        res = []
+        for hit in hits.scoreDocs:
+            doc = indexSearcher.doc(hit.doc)
+            res.append({"external_id": doc.get("external_id"), "title": doc.get("title")})
+        return res
+    finally:
+        indexSearcher.close()
+
+
 def highlight_documents(results_list, query, field):
     searcher = get_searcher()
     try:
@@ -67,7 +84,7 @@
         
         for project in results_list:
             for segment in project['list']:
-                lucene_doc = searcher.doc(segment.lucene_id)
+                lucene_doc = searcher.doc(segment.indexation_id)
                 segment.context = get_highlighted_text(lucene_doc, analyzer, highlighter, 'abstract')
                 tags = get_highlighted_text(lucene_doc, analyzer, highlighter, 'tags')
                 segment.title = get_highlighted_text(lucene_doc, analyzer, highlighter, 'title')
@@ -103,14 +120,45 @@
     queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND)
     return queryParser
 
-def delete_document(field, value):
-    writer = get_writer()
+def delete_document(field, value, writer=None):
+    # Commit/close only a writer created here: a caller-supplied writer
+    # must stay open so it can be reused (e.g. delete then add_document).
+    own_writer = writer is None
+    if own_writer:
+        writer = get_writer()
     try:
         writer.deleteDocuments(lucene.Term(field, value))
-        writer.commit()
+        if own_writer:
+            writer.commit()
     finally:
-        writer.close()
+        if own_writer:
+            writer.close()
 
+def add_document(values, writer=None):
+    # Build one Lucene document from *values* and add it to the index.
+    # *values* is a sequence of (name, value, stored, index_mode) tuples:
+    # stored is a bool, index_mode one of "NOT_ANALYZED", "ANALYZED", "NO".
+    own_writer = writer is None
+    if own_writer:
+        writer = get_writer()
+    try:
+        doc = lucene.Document()
+        for name, value, stored, index_mode in values:
+            store = lucene.Field.Store.YES if stored else lucene.Field.Store.NO
+            index_analyse = {
+                "NOT_ANALYZED": lucene.Field.Index.NOT_ANALYZED,
+                # callers pass "ANALYZED"; keep "ANALYSED" for compatibility
+                "ANALYZED": lucene.Field.Index.ANALYZED,
+                "ANALYSED": lucene.Field.Index.ANALYZED,
+                "NO": lucene.Field.Index.NO,
+            }[index_mode]
+            doc.add(lucene.Field(name, value, store, index_analyse))
+        writer.addDocument(doc)
+        if own_writer:
+            writer.commit()
+    finally:
+        if own_writer:
+            writer.close()
+
+
     
     
 
--- a/src/ldt/ldt/ldt_utils/contentindexer.py	Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/ldt_utils/contentindexer.py	Fri Jul 20 12:40:08 2012 +0200
@@ -5,7 +5,6 @@
 from ldt.ldt_utils.utils import reduce_text_node
 from ldt.ldt_utils.stat import update_stat_project
 import ldt.indexation
-import lucene
 import lxml.etree
 import urllib #@UnresolvedImport
 # import ldt.utils.log
@@ -93,21 +92,6 @@
                 if project:
                     ldt_id = project.ldt_id
 
-                doc = lucene.Document()
-                doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED))        
-                doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
-                doc.add(lucene.Field("project_id", ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
-                doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
-                doc.add(lucene.Field("tags", tags, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
-                doc.add(lucene.Field("title", title, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
-                doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
-                doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
-                doc.add(lucene.Field("begin", str(start_ts), lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
-                doc.add(lucene.Field("duration", str(duration), lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
-                doc.add(lucene.Field("author", author, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
-
                 seg = Segment(content=content,
                               iri_id=content.iri_id,
                               ensemble_id=ensembleId,
@@ -124,7 +108,22 @@
                               project_id=ldt_id)
                 seg.polemics = seg.get_polemic(polemics)
                 seg.save()
-                self.writer.addDocument(doc)
+
+                ldt.indexation.add_document([
+                    ("type_doc", "annotation", False, "NOT_ANALYZED"),        
+                    ("iri_id", content.iri_id, True, "NOT_ANALYZED"),
+                    ("project_id", ldt_id, True, "NOT_ANALYZED"),
+                    ("ensemble_id", ensembleId, True, "NO"),
+                    ("decoupage_id", decoupId, True, "NO"),
+                    ("element_id", elementId, True, "NO"),
+                    ("tags", tags, True, "ANALYZED"),
+                    ("title", title, True, "ANALYZED"),
+                    ("abstract", abstract, True, "ANALYZED"),
+                    ("all", " ".join([tags, title, abstract]), True, "ANALYZED"),
+                    ("begin", str(start_ts), True, "NOT_ANALYZED"),
+                    ("duration", str(duration), True, "NOT_ANALYZED"),
+                    ("author", author, True, "ANALYZED"),                                             
+                ], self.writer)
 
 
 
@@ -147,7 +146,7 @@
         filepath = urllib.urlopen(url)
         doc = lxml.etree.parse(filepath) #@UndefinedVariable
        
-        self.writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
+        ldt.indexation.delete_document("iri_id", content.iri_id, self.writer)
         Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable
         
         res = doc.xpath("/iri/body/ensembles/ensemble")
@@ -177,7 +176,7 @@
         # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
         doc = lxml.etree.fromstring(project.ldt_encoded) #@UndefinedVariable
 
-        self.writer.deleteDocuments(lucene.Term("project_id", project.ldt_id))
+        ldt.indexation.delete_document("project_id",  project.ldt_id, self.writer)
         Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete() #@UndefinedVariable
        
         res = doc.xpath("/iri/annotations/content")
@@ -202,7 +201,7 @@
         writer = ldt.indexation.get_writer()
         try:
             if instance.state != 2:
-                writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id))
+                ldt.indexation.delete_document("project_id", instance.ldt_id, writer)
                 Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() #@UndefinedVariable
                 update_stat_project(instance)
             else:
--- a/src/ldt/ldt/ldt_utils/views/workspace.py	Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/ldt_utils/views/workspace.py	Fri Jul 20 12:40:08 2012 +0200
@@ -215,7 +215,7 @@
             segment = [seg for seg in all_segments if seg.element_id == s['element_id'] and seg.project_id == s['project_id'] and seg.iri_id == s['iri_id'] and seg.cutting_id == s['decoupage_id'] and seg.ensemble_id == s['ensemble_id'] ][0]
                 
             segment.score = s['score']
-            segment.lucene_id = s['lucene_id']
+            segment.indexation_id = s['indexation_id']
             segment.context = s['context']
             segment.context_tags = s['tags']
                                     
--- a/src/ldt/ldt/text/annotindexer.py	Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/text/annotindexer.py	Fri Jul 20 12:40:08 2012 +0200
@@ -1,5 +1,4 @@
-import lucene
-
+import ldt.indexation
 
 class AnnotIndexer(object):
     
@@ -15,9 +14,6 @@
     
     def index_annotation(self, annotation):
         
-        doc = lucene.Document()
-        
-        doc.add(lucene.Field("annotation_id", annotation.external_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))              
         
         annottags = annotation.get_tag_list()
         tags = ""
@@ -26,16 +22,15 @@
             tags = ""
         else:
             for tag in annottags:
-                tags += tag + ";" 
+                tags += tag + ";"
+
+        ldt.indexation.add_document([
+            ("annotation_id", annotation.external_id, True, "NOT_ANALYZED"),
+            ("type_doc", "text-annotation", False, "NOT_ANALYZED"),
+            ("tags", tags, False, "ANALYZED"),
+            ("title", annotation.title, False, "ANALYZED"),
+            ("abstract", annotation.description, False, "ANALYZED"),
+            ("text", annotation.text, False, "ANALYZED"),
+            ("all", " ".join([tags, annotation.title, annotation.description, annotation.text]), False, "ANALYZED"),
+        ], self.__writer)
         
-        doc.add(lucene.Field("type_doc", "text-annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED))              
-        doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-        doc.add(lucene.Field("title", annotation.title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-        doc.add(lucene.Field("abstract", annotation.description, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-        doc.add(lucene.Field("text", annotation.text, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-        doc.add(lucene.Field("all", " ".join([tags, annotation.title, annotation.description, annotation.text]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-
-        self.__writer.addDocument(doc)
-            
-        self.__writer.close()
-        
--- a/src/ldt/ldt/text/models.py	Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/text/models.py	Fri Jul 20 12:40:08 2012 +0200
@@ -4,7 +4,6 @@
 from tagging.models import Tag
 from utils import generate_uuid
 import ldt.indexation
-import lucene
 import lxml
 import tagging.fields
 #from django.core.management.validation import max_length
@@ -125,15 +124,9 @@
 
     def delete(self):
         super(Annotation, self).delete()
-        lucene.getVMEnv().attachCurrentThread()
-        writer = ldt.indexation.get_writer()
-        try:
-            writer.deleteDocuments(lucene.Term("external_id", self.external_id))
-        finally:
-            writer.close()
+        ldt.indexation.delete_document("external_id", self.external_id)
 
     def index_annot(self):
-        lucene.getVMEnv().attachCurrentThread()
         writer = ldt.indexation.get_writer()
         try:
             annotl = [self, ]
@@ -143,12 +136,7 @@
             writer.close()
 
     def update_index(self):
-        lucene.getVMEnv().attachCurrentThread()
-        writer = ldt.indexation.get_writer()
-        try:
-            writer.deleteDocuments(lucene.Term("external_id", self.external_id))
-        finally:
-            writer.close()
+        ldt.indexation.delete_document("external_id", self.external_id)
         self.index_annot()
         
         
--- a/src/ldt/ldt/text/utils.py	Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/text/utils.py	Fri Jul 20 12:40:08 2012 +0200
@@ -1,7 +1,6 @@
 from django.conf import settings
-from ldt.indexation import STORE
-import lucene
 import uuid
+import ldt.indexation
 
 __BOOLEAN_DICT = {
     'false':False,
@@ -38,18 +37,7 @@
 class TextSearch(object):
 
     def query(self, field, query):
-        indexSearcher = lucene.IndexSearcher(STORE)
-        queryParser = lucene.QueryParser(lucene.Version.LUCENE_30, field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_30))
-        queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND)
-        queryObj = queryParser.parse(query)
-        hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER)
-    
-        res = []
-        for hit in hits.scoreDocs:
-            doc = indexSearcher.doc(hit.doc)
-            res.append({"external_id":doc.get("external_id"), "title":doc.get("title")})
-        indexSearcher.close()
-        return res
+        return ldt.indexation.get_result_text(field, query)
 
     def query_all(self, query):        
         return self.query("all", query)