--- a/src/ldt/ldt/indexation/__init__.py Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/indexation/__init__.py Fri Jul 20 12:40:08 2012 +0200
@@ -36,7 +36,7 @@
ids['title'] = title
ids['tags'] = tags
ids['score'] = score
- ids['lucene_id'] = i.doc
+ ids['indexation_id'] = i.doc
ids['begin'] = begin
ids['duration'] = duration
contexts.append(ids)
@@ -57,6 +57,23 @@
indexSearcher.close()
return hits.scoreDocs
+def get_result_text(field, query):
+
+ indexSearcher = lucene.IndexSearcher(STORE)
+ queryParser = lucene.QueryParser(lucene.Version.LUCENE_30, field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_30))
+ queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND)
+ queryObj = queryParser.parse(query)
+ hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER)
+
+ res = []
+ for hit in hits.scoreDocs:
+ doc = indexSearcher.doc(hit.doc)
+ res.append({"external_id":doc.get("external_id"), "title":doc.get("title")})
+ indexSearcher.close()
+
+ return res
+
+
def highlight_documents(results_list, query, field):
searcher = get_searcher()
try:
@@ -67,7 +84,7 @@
for project in results_list:
for segment in project['list']:
- lucene_doc = searcher.doc(segment.lucene_id)
+ lucene_doc = searcher.doc(segment.indexation_id)
segment.context = get_highlighted_text(lucene_doc, analyzer, highlighter, 'abstract')
tags = get_highlighted_text(lucene_doc, analyzer, highlighter, 'tags')
segment.title = get_highlighted_text(lucene_doc, analyzer, highlighter, 'title')
@@ -103,14 +120,35 @@
queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND)
return queryParser
-def delete_document(field, value):
- writer = get_writer()
+def delete_document(field, value, writer=None):
+ if writer is None:
+ writer = get_writer()
try:
writer.deleteDocuments(lucene.Term(field, value))
writer.commit()
finally:
writer.close()
+def add_document(values, writer=None):
+
+ if writer is None:
+ writer = get_writer()
+
+ doc = lucene.Document()
+
+ for field_val in values:
+ store = lucene.Field.Store.YES if field_val[2] else lucene.Field.Store.NO
+ index_analyse = {
+ "NOT_ANALYZED": lucene.Field.Index.NOT_ANALYZED,
+ "ANALYZED": lucene.Field.Index.ANALYZED,
+ "NO": lucene.Field.Index.NO
+ }[field_val[3]]
+ doc.add(lucene.Field(field_val[0], field_val[1], store, index_analyse))
+
+ writer.addDocument(doc)
+
+
+
--- a/src/ldt/ldt/ldt_utils/contentindexer.py Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/ldt_utils/contentindexer.py Fri Jul 20 12:40:08 2012 +0200
@@ -5,7 +5,6 @@
from ldt.ldt_utils.utils import reduce_text_node
from ldt.ldt_utils.stat import update_stat_project
import ldt.indexation
-import lucene
import lxml.etree
import urllib #@UnresolvedImport
# import ldt.utils.log
@@ -93,21 +92,6 @@
if project:
ldt_id = project.ldt_id
- doc = lucene.Document()
- doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED))
- doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
- doc.add(lucene.Field("project_id", ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
- doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
- doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
- doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
- doc.add(lucene.Field("tags", tags, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("title", title, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("begin", str(start_ts), lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
- doc.add(lucene.Field("duration", str(duration), lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
- doc.add(lucene.Field("author", author, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED))
-
seg = Segment(content=content,
iri_id=content.iri_id,
ensemble_id=ensembleId,
@@ -124,7 +108,22 @@
project_id=ldt_id)
seg.polemics = seg.get_polemic(polemics)
seg.save()
- self.writer.addDocument(doc)
+
+ ldt.indexation.add_document([
+ ("type_doc", "annotation", False, "NOT_ANALYZED"),
+ ("iri_id", content.iri_id, True, "NOT_ANALYZED"),
+ ("project_id", ldt_id, True, "NOT_ANALYZED"),
+ ("ensemble_id", ensembleId, True, "NO"),
+ ("decoupage_id", decoupId, True, "NO"),
+ ("element_id", elementId, True, "NO"),
+ ("tags", tags, True, "ANALYZED"),
+ ("title", title, True, "ANALYZED"),
+ ("abstract", abstract, True, "ANALYZED"),
+ ("all", " ".join([tags, title, abstract]), True, "ANALYZED"),
+ ("begin", str(start_ts), True, "NOT_ANALYZED"),
+ ("duration", str(duration), True, "NOT_ANALYZED"),
+ ("author", author, True, "ANALYZED"),
+ ], self.writer)
@@ -147,7 +146,7 @@
filepath = urllib.urlopen(url)
doc = lxml.etree.parse(filepath) #@UndefinedVariable
- self.writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
+ ldt.indexation.delete_document("iri_id", content.iri_id, self.writer)
Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable
res = doc.xpath("/iri/body/ensembles/ensemble")
@@ -177,7 +176,7 @@
# pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
doc = lxml.etree.fromstring(project.ldt_encoded) #@UndefinedVariable
- self.writer.deleteDocuments(lucene.Term("project_id", project.ldt_id))
+ ldt.indexation.delete_document("project_id", project.ldt_id, self.writer)
Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete() #@UndefinedVariable
res = doc.xpath("/iri/annotations/content")
@@ -202,7 +201,7 @@
writer = ldt.indexation.get_writer()
try:
if instance.state != 2:
- writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id))
+ ldt.indexation.delete_document("project_id", instance.ldt_id, writer)
Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() #@UndefinedVariable
update_stat_project(instance)
else:
--- a/src/ldt/ldt/ldt_utils/views/workspace.py Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/ldt_utils/views/workspace.py Fri Jul 20 12:40:08 2012 +0200
@@ -215,7 +215,7 @@
segment = [seg for seg in all_segments if seg.element_id == s['element_id'] and seg.project_id == s['project_id'] and seg.iri_id == s['iri_id'] and seg.cutting_id == s['decoupage_id'] and seg.ensemble_id == s['ensemble_id'] ][0]
segment.score = s['score']
- segment.lucene_id = s['lucene_id']
+ segment.indexation_id = s['indexation_id']
segment.context = s['context']
segment.context_tags = s['tags']
--- a/src/ldt/ldt/text/annotindexer.py Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/text/annotindexer.py Fri Jul 20 12:40:08 2012 +0200
@@ -1,5 +1,4 @@
-import lucene
-
+import ldt.indexation
class AnnotIndexer(object):
@@ -15,9 +14,6 @@
def index_annotation(self, annotation):
- doc = lucene.Document()
-
- doc.add(lucene.Field("annotation_id", annotation.external_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
annottags = annotation.get_tag_list()
tags = ""
@@ -26,16 +22,15 @@
tags = ""
else:
for tag in annottags:
- tags += tag + ";"
+ tags += tag + ";"
+
+ ldt.indexation.add_document([
+ ("annotation_id", annotation.external_id, True, "NOT_ANALYZED"),
+ ("type_doc", "text-annotation", False, "NOT_ANALYZED"),
+ ("tags", tags, False, "ANALYZED"),
+ ("title", annotation.title, False, "ANALYZED"),
+ ("abstract", annotation.description, False, "ANALYZED"),
+ ("text", annotation.text, False, "ANALYZED"),
+ ("all", " ".join([tags, annotation.title, annotation.description, annotation.text]), False, "ANALYZED"),
+ ], self.__writer)
- doc.add(lucene.Field("type_doc", "text-annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED))
- doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("title", annotation.title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("abstract", annotation.description, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("text", annotation.text, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
- doc.add(lucene.Field("all", " ".join([tags, annotation.title, annotation.description, annotation.text]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
-
- self.__writer.addDocument(doc)
-
- self.__writer.close()
-
--- a/src/ldt/ldt/text/models.py Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/text/models.py Fri Jul 20 12:40:08 2012 +0200
@@ -4,7 +4,6 @@
from tagging.models import Tag
from utils import generate_uuid
import ldt.indexation
-import lucene
import lxml
import tagging.fields
#from django.core.management.validation import max_length
@@ -125,15 +124,9 @@
def delete(self):
super(Annotation, self).delete()
- lucene.getVMEnv().attachCurrentThread()
- writer = ldt.indexation.get_writer()
- try:
- writer.deleteDocuments(lucene.Term("external_id", self.external_id))
- finally:
- writer.close()
+ ldt.indexation.delete_document("external_id", self.external_id)
def index_annot(self):
- lucene.getVMEnv().attachCurrentThread()
writer = ldt.indexation.get_writer()
try:
annotl = [self, ]
@@ -143,12 +136,7 @@
writer.close()
def update_index(self):
- lucene.getVMEnv().attachCurrentThread()
- writer = ldt.indexation.get_writer()
- try:
- writer.deleteDocuments(lucene.Term("external_id", self.external_id))
- finally:
- writer.close()
+ ldt.indexation.delete_document("external_id", self.external_id)
self.index_annot()
--- a/src/ldt/ldt/text/utils.py Thu Jul 19 19:21:05 2012 +0200
+++ b/src/ldt/ldt/text/utils.py Fri Jul 20 12:40:08 2012 +0200
@@ -1,7 +1,6 @@
from django.conf import settings
-from ldt.indexation import STORE
-import lucene
import uuid
+import ldt.indexation
__BOOLEAN_DICT = {
'false':False,
@@ -38,18 +37,7 @@
class TextSearch(object):
def query(self, field, query):
- indexSearcher = lucene.IndexSearcher(STORE)
- queryParser = lucene.QueryParser(lucene.Version.LUCENE_30, field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_30))
- queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND)
- queryObj = queryParser.parse(query)
- hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER)
-
- res = []
- for hit in hits.scoreDocs:
- doc = indexSearcher.doc(hit.doc)
- res.append({"external_id":doc.get("external_id"), "title":doc.get("title")})
- indexSearcher.close()
- return res
+ return ldt.indexation.get_result_text(field, query)
def query_all(self, query):
return self.query("all", query)