# Segment abstracts and content images can be retrieved directly from the search results page.
from django.conf import settings
import lucene
# Initialize the Java VM through PyLucene; done first, ahead of the lucene
# objects built below.
lucene.initVM(lucene.CLASSPATH)

# Filesystem directory holding the index, located at settings.INDEX_PATH.
STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH))

# Per-field analyzer: StandardAnalyzer is handed to the wrapper's constructor,
# and each field listed below is registered with its own FrenchAnalyzer.
ANALYZER = lucene.PerFieldAnalyzerWrapper(
    lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
for _french_field in ("tags", "title", "abstract", "all"):
    ANALYZER.addAnalyzer(
        _french_field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
# Do not leave the loop variable behind in the module namespace.
del _french_field
def get_results_with_context(field, query):
    """Run ``query`` against ``field`` and return the stored data of each hit.

    The hits come from ``get_results_list(field, query)``; every hit is then
    loaded from the index and flattened into a dict.

    Returns a list of dicts, one per hit, with the keys ``iri_id``,
    ``ensemble_id``, ``decoupage_id``, ``element_id``, ``project_id``
    (read with ``doc.get``), ``context`` (the stored ``abstract``),
    ``title``, ``tags``, ``begin``, ``duration`` (read with
    ``doc.getField(...).stringValue()``), ``score`` and ``lucene_id``
    (the hit's score and document number).
    """
    hits = get_results_list(field, query)
    searcher = get_searcher()
    # The query string is not parsed again here: get_results_list() has
    # already parsed it, and the parsed query is not needed to load the
    # stored documents.
    try:
        contexts = []
        for hit in hits:
            doc = searcher.doc(hit.doc)
            entry = {
                "iri_id": doc.get("iri_id"),
                "ensemble_id": doc.get("ensemble_id"),
                "decoupage_id": doc.get("decoupage_id"),
                "element_id": doc.get("element_id"),
                "project_id": doc.get("project_id"),
            }
            title = doc.getField('title').stringValue()
            abstract = doc.getField('abstract').stringValue()
            tags = doc.getField('tags').stringValue()
            begin = doc.getField('begin').stringValue()
            duration = doc.getField('duration').stringValue()
            entry['context'] = abstract
            entry['title'] = title
            entry['tags'] = tags
            entry['score'] = hit.score
            entry['lucene_id'] = hit.doc
            entry['begin'] = begin
            entry['duration'] = duration
            contexts.append(entry)
        return contexts
    finally:
        # Close the searcher even when loading a document or field fails.
        searcher.close()
def get_results_list(field, query):
    """Parse ``query`` for ``field`` and return the matching score docs.

    The search is limited to ``settings.LDT_MAX_SEARCH_NUMBER`` hits. The
    returned value is the ``scoreDocs`` of the search result; callers in
    this module read ``.doc`` and ``.score`` from each element.
    """
    searcher = get_searcher()
    try:
        parsed_query = get_query_parser(field).parse(query)
        top_docs = searcher.search(parsed_query, settings.LDT_MAX_SEARCH_NUMBER)
        # scoreDocs is read before the searcher is closed by the finally
        # clause below.
        return top_docs.scoreDocs
    finally:
        # Release the searcher opened above; it was previously left open.
        searcher.close()
def highlight_documents(results_list, query, field):
    """Add highlighted abstract, title and tags to every segment in place.

    ``results_list`` is iterated as a sequence of mappings whose ``'list'``
    entry holds segment objects exposing a ``lucene_id`` attribute. For each
    segment the stored document is loaded and ``segment.context``,
    ``segment.title`` and ``segment.context_tags`` are set. Query matches are
    wrapped in ``<span class="highlight">...</span>``. When a highlighted
    value comes back as an empty string, the stored field value is used
    instead.

    Returns the same ``results_list`` object that was passed in.
    """
    searcher = get_searcher()
    try:
        analyzer = lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)
        formatter = lucene.SimpleHTMLFormatter('<span class="highlight">', '</span>')
        parsed_query = get_query_parser(field).parse(query)
        highlighter = lucene.Highlighter(formatter, lucene.QueryScorer(parsed_query))

        for project in results_list:
            for segment in project['list']:
                lucene_doc = searcher.doc(segment.lucene_id)
                segment.context = get_highlighted_text(lucene_doc, analyzer, highlighter, 'abstract')
                tags = get_highlighted_text(lucene_doc, analyzer, highlighter, 'tags')
                segment.title = get_highlighted_text(lucene_doc, analyzer, highlighter, 'title')

                # Fall back to the plain stored text when highlighting
                # produced an empty string.
                if segment.context == u'':
                    segment.context = lucene_doc.getField('abstract').stringValue()
                if tags == u'':
                    tags = lucene_doc.getField('tags').stringValue()
                if segment.title == u'':
                    segment.title = lucene_doc.getField('title').stringValue()

                # Keep what follows the first ';'; when there is no ';',
                # find() returns -1 and the whole string is kept.
                segment.context_tags = tags[tags.find(';')+1:]
    finally:
        # Release the searcher opened above; it was previously left open.
        searcher.close()

    return results_list
def get_highlighted_text(doc, analyzer, highlighter, field):
    """Return the highlighted fragments of the ``field`` value stored in ``doc``.

    The stored text is tokenized with ``analyzer`` and passed to
    ``highlighter.getBestFragments`` together with
    ``settings.LDT_MAX_FRAGMENT_PER_SEARCH`` and the separator ``"..."``;
    whatever that call returns is returned unchanged.
    """
    stored_text = doc.getField(field).stringValue()
    # NOTE(review): the field name given to tokenStream is the fixed string
    # "body", not ``field`` -- confirm this is intended.
    token_stream = analyzer.tokenStream("body", lucene.StringReader(stored_text))
    return highlighter.getBestFragments(
        token_stream,
        stored_text,
        settings.LDT_MAX_FRAGMENT_PER_SEARCH,
        "...",
    )
def get_writer(new=False):
    """Return an ``IndexWriter`` on ``STORE`` that analyzes with ``ANALYZER``.

    The current thread is attached to the Java VM first. ``new`` is forwarded
    as the third ``IndexWriter`` constructor argument, and the field length
    limit is ``MaxFieldLength.UNLIMITED``.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    writer = lucene.IndexWriter(
        STORE,
        ANALYZER,
        new,
        lucene.IndexWriter.MaxFieldLength.UNLIMITED,
    )
    return writer
def get_searcher():
    """Return a new ``IndexSearcher`` opened on ``STORE``.

    The current thread is attached to the Java VM before the searcher is
    created.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    searcher = lucene.IndexSearcher(STORE)
    return searcher
def get_query_parser(field):
    """Return a ``QueryParser`` whose default field is ``field``.

    The parser is built for ``Version.LUCENE_30`` with a ``FrenchAnalyzer``,
    and its default operator is set to ``AND``.
    """
    version = lucene.Version.LUCENE_30
    parser = lucene.QueryParser(version, field, lucene.FrenchAnalyzer(version))
    parser.setDefaultOperator(lucene.QueryParser.Operator.AND)
    return parser