def get_result_text(field, query):
    """Run a full-text query against the Lucene store and return stored fields.

    Parameters:
        field: name of the indexed field to search (e.g. "all", "title").
        query: user query string, parsed with the French analyzer and an
            implicit AND between terms.

    Returns:
        A list of dicts ``{"external_id": ..., "title": ...}`` — one per hit,
        capped at ``settings.LDT_MAX_SEARCH_NUMBER``. Field values are whatever
        was stored at index time (may be None if the field was not stored).
    """
    indexSearcher = lucene.IndexSearcher(STORE)
    try:
        queryParser = lucene.QueryParser(lucene.Version.LUCENE_30, field,
                                         lucene.FrenchAnalyzer(lucene.Version.LUCENE_30))
        queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND)
        queryObj = queryParser.parse(query)
        hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER)

        res = []
        for hit in hits.scoreDocs:
            doc = indexSearcher.doc(hit.doc)
            res.append({"external_id": doc.get("external_id"),
                        "title": doc.get("title")})
    finally:
        # BUG FIX: the original closed the searcher only on the success path;
        # a parse/search error (e.g. malformed user query) leaked the searcher.
        indexSearcher.close()

    return res
def delete_document(field, value, writer=None):
    """Delete every indexed document whose `field` term equals `value`.

    Parameters:
        field: indexed field name used as the deletion term.
        value: exact (not-analyzed) term value to match.
        writer: optional shared ``IndexWriter``. BUG FIX: when the caller
            supplies a writer it keeps ownership — the original closed it in
            ``finally``, which broke callers (e.g. contentindexer passes
            ``self.writer`` and keeps indexing with it afterwards). Only a
            writer created here is closed here. The delete is committed in
            both cases, as before.
    """
    own_writer = writer is None
    if own_writer:
        writer = get_writer()
    try:
        writer.deleteDocuments(lucene.Term(field, value))
        writer.commit()
    finally:
        if own_writer:
            writer.close()


def add_document(values, writer=None):
    """Build one Lucene document from `values` and add it to the index.

    Parameters:
        values: iterable of 4-tuples ``(field_name, field_value, stored,
            index_mode)`` where ``stored`` is truthy for Field.Store.YES and
            ``index_mode`` is one of "NOT_ANALYZED", "ANALYZED" or "NO".
        writer: optional shared ``IndexWriter``; when given, the caller keeps
            ownership (no commit/close here). When omitted, a writer is
            created, committed and closed so the document is actually
            persisted — the original created one and never committed/closed
            it, silently losing the document and leaking the writer.
    """
    # BUG FIX: the original mapping's key was misspelled "ANALYSED" while every
    # caller in this changeset (contentindexer.py, annotindexer.py) passes
    # "ANALYZED", guaranteeing a KeyError. "ANALYSED" is kept as an alias for
    # backward compatibility.
    index_modes = {
        "NOT_ANALYZED": lucene.Field.Index.NOT_ANALYZED,
        "ANALYZED": lucene.Field.Index.ANALYZED,
        "ANALYSED": lucene.Field.Index.ANALYZED,
        "NO": lucene.Field.Index.NO,
    }

    own_writer = writer is None
    if own_writer:
        writer = get_writer()
    try:
        doc = lucene.Document()
        for name, value, stored, index_mode in values:
            store = lucene.Field.Store.YES if stored else lucene.Field.Store.NO
            doc.add(lucene.Field(name, value, store, index_modes[index_mode]))
        writer.addDocument(doc)
        if own_writer:
            writer.commit()
    finally:
        if own_writer:
            writer.close()
lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("begin", str(start_ts), lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("duration", str(duration), lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("author", author, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)) - seg = Segment(content=content, iri_id=content.iri_id, ensemble_id=ensembleId, @@ -124,7 +108,22 @@ project_id=ldt_id) seg.polemics = seg.get_polemic(polemics) seg.save() - self.writer.addDocument(doc) + + ldt.indexation.add_document([ + ("type_doc", "annotation", False, "NOT_ANALYZED"), + ("iri_id", content.iri_id, True, "NOT_ANALYZED"), + ("project_id", ldt_id, True, "NOT_ANALYZED"), + ("ensemble_id", ensembleId, True, "NO"), + ("decoupage_id", decoupId, True, "NO"), + ("element_id", elementId, True, "NO"), + ("tags", tags, True, "ANALYZED"), + ("title", title, True, "ANALYZED"), + ("abstract", abstract, True, "ANALYZED"), + ("all", " ".join([tags, title, abstract]), True, "ANALYZED"), + ("begin", str(start_ts), True, "NOT_ANALYZED"), + ("duration", str(duration), True, "NOT_ANALYZED"), + ("author", author, True, "ANALYZED"), + ], self.writer) @@ -147,7 +146,7 @@ filepath = urllib.urlopen(url) doc = lxml.etree.parse(filepath) #@UndefinedVariable - self.writer.deleteDocuments(lucene.Term("iri_id", content.iri_id)) + ldt.indexation.delete_document("iri_id", content.iri_id, self.writer) Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable res = doc.xpath("/iri/body/ensembles/ensemble") @@ -177,7 +176,7 @@ # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id)) doc = lxml.etree.fromstring(project.ldt_encoded) #@UndefinedVariable - self.writer.deleteDocuments(lucene.Term("project_id", project.ldt_id)) + ldt.indexation.delete_document("project_id", project.ldt_id, self.writer) 
Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete() #@UndefinedVariable res = doc.xpath("/iri/annotations/content") @@ -202,7 +201,7 @@ writer = ldt.indexation.get_writer() try: if instance.state != 2: - writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id)) + ldt.indexation.delete_document("project_id", instance.ldt_id, writer) Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() #@UndefinedVariable update_stat_project(instance) else: diff -r f21459554182 -r 31dc2726ca51 src/ldt/ldt/ldt_utils/views/workspace.py --- a/src/ldt/ldt/ldt_utils/views/workspace.py Thu Jul 19 19:21:05 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/views/workspace.py Fri Jul 20 12:40:08 2012 +0200 @@ -215,7 +215,7 @@ segment = [seg for seg in all_segments if seg.element_id == s['element_id'] and seg.project_id == s['project_id'] and seg.iri_id == s['iri_id'] and seg.cutting_id == s['decoupage_id'] and seg.ensemble_id == s['ensemble_id'] ][0] segment.score = s['score'] - segment.lucene_id = s['lucene_id'] + segment.indexation_id = s['indexation_id'] segment.context = s['context'] segment.context_tags = s['tags'] diff -r f21459554182 -r 31dc2726ca51 src/ldt/ldt/text/annotindexer.py --- a/src/ldt/ldt/text/annotindexer.py Thu Jul 19 19:21:05 2012 +0200 +++ b/src/ldt/ldt/text/annotindexer.py Fri Jul 20 12:40:08 2012 +0200 @@ -1,5 +1,4 @@ -import lucene - +import ldt.indexation class AnnotIndexer(object): @@ -15,9 +14,6 @@ def index_annotation(self, annotation): - doc = lucene.Document() - - doc.add(lucene.Field("annotation_id", annotation.external_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) annottags = annotation.get_tag_list() tags = "" @@ -26,16 +22,15 @@ tags = "" else: for tag in annottags: - tags += tag + ";" + tags += tag + ";" + + ldt.indexation.add_document([ + ("annotation_id", annotation.external_id, True, "NOT_ANALYZED"), + ("type_doc", "text-annotation", False, "NOT_ANALYZED"), + ("tags", tags, False, "ANALYZED"), + ("title", 
annotation.title, False, "ANALYZED"), + ("abstract", annotation.description, False, "ANALYZED"), + ("text", annotation.text, False, "ANALYZED"), + ("all", " ".join([tags, annotation.title, annotation.description, annotation.text]), False, "ANALYZED"), + ], self.__writer) - doc.add(lucene.Field("type_doc", "text-annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("title", annotation.title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("abstract", annotation.description, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("text", annotation.text, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("all", " ".join([tags, annotation.title, annotation.description, annotation.text]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - - self.__writer.addDocument(doc) - - self.__writer.close() - diff -r f21459554182 -r 31dc2726ca51 src/ldt/ldt/text/models.py --- a/src/ldt/ldt/text/models.py Thu Jul 19 19:21:05 2012 +0200 +++ b/src/ldt/ldt/text/models.py Fri Jul 20 12:40:08 2012 +0200 @@ -4,7 +4,6 @@ from tagging.models import Tag from utils import generate_uuid import ldt.indexation -import lucene import lxml import tagging.fields #from django.core.management.validation import max_length @@ -125,15 +124,9 @@ def delete(self): super(Annotation, self).delete() - lucene.getVMEnv().attachCurrentThread() - writer = ldt.indexation.get_writer() - try: - writer.deleteDocuments(lucene.Term("external_id", self.external_id)) - finally: - writer.close() + ldt.indexation.delete_document("external_id", self.external_id) def index_annot(self): - lucene.getVMEnv().attachCurrentThread() writer = ldt.indexation.get_writer() try: annotl = [self, ] @@ -143,12 +136,7 @@ writer.close() def update_index(self): - lucene.getVMEnv().attachCurrentThread() - writer = 
ldt.indexation.get_writer() - try: - writer.deleteDocuments(lucene.Term("external_id", self.external_id)) - finally: - writer.close() + ldt.indexation.delete_document("external_id", self.external_id) self.index_annot() diff -r f21459554182 -r 31dc2726ca51 src/ldt/ldt/text/utils.py --- a/src/ldt/ldt/text/utils.py Thu Jul 19 19:21:05 2012 +0200 +++ b/src/ldt/ldt/text/utils.py Fri Jul 20 12:40:08 2012 +0200 @@ -1,7 +1,6 @@ from django.conf import settings -from ldt.indexation import STORE -import lucene import uuid +import ldt.indexation __BOOLEAN_DICT = { 'false':False, @@ -38,18 +37,7 @@ class TextSearch(object): def query(self, field, query): - indexSearcher = lucene.IndexSearcher(STORE) - queryParser = lucene.QueryParser(lucene.Version.LUCENE_30, field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_30)) - queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND) - queryObj = queryParser.parse(query) - hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER) - - res = [] - for hit in hits.scoreDocs: - doc = indexSearcher.doc(hit.doc) - res.append({"external_id":doc.get("external_id"), "title":doc.get("title")}) - indexSearcher.close() - return res + return ldt.indexation.get_result_text(field, query) def query_all(self, query): return self.query("all", query)