src/ldt/ldt/indexation/__init__.py
author ymh <ymh.work@gmail.com>
Thu, 19 Jul 2012 19:21:05 +0200
changeset 715 f21459554182
parent 602 642b3654b8f1
child 716 31dc2726ca51
permissions -rw-r--r--
Remove lucene dependencies in model

from django.conf import settings
import lucene

# Initialise the PyLucene virtual machine with the classpath shipped by the
# lucene package; this runs once, at import time of this module.
lucene.initVM(lucene.CLASSPATH)

# File-system directory holding the index, located at settings.INDEX_PATH.
STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH))

# Per-field analyzer used when writing: StandardAnalyzer is the default and
# the fields listed below are analyzed with a FrenchAnalyzer instead.
ANALYZER = lucene.PerFieldAnalyzerWrapper(
    lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)
)
for _french_field in ("tags", "title", "abstract", "all"):
    # One FrenchAnalyzer instance per field.
    ANALYZER.addAnalyzer(_french_field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
# Do not leave the loop variable behind in the module namespace.
del _french_field

def get_results_with_context(field, query, content_list=None):
    """Search the index and describe every matching document as a dict.

    The query is run through ``get_results_list`` with a searcher that is
    opened here and always closed before returning (or raising).

    :param field: name of the index field the query is parsed against.
    :param query: query string, handed as-is to the query parser.
    :param content_list: optional iterable of objects exposing an ``iri_id``
        attribute. ``None`` disables filtering; any other value (including
        an empty iterable) keeps only the hits whose stored ``iri_id`` is
        one of the given ones. The ``iri_id`` values are assumed to be
        hashable (they are compared with stored strings).
    :return: list of dicts, in hit order, with the keys ``iri_id``,
        ``ensemble_id``, ``decoupage_id``, ``element_id``, ``project_id``,
        ``context`` (stored ``abstract``), ``title``, ``tags``, ``score``,
        ``lucene_id``, ``begin`` and ``duration``.
    """
    searcher = get_searcher()

    try:
        hits = get_results_list(field, query, searcher)

        # Build the filter once as a set so the per-hit membership test does
        # not rescan the whole content list. None means "do not filter".
        allowed_iri_ids = None
        if content_list is not None:
            allowed_iri_ids = set(ctt.iri_id for ctt in content_list)

        contexts = []
        for hit in hits:
            doc = searcher.doc(hit.doc)
            iri_id = doc.get("iri_id")

            # Skip documents that do not belong to the requested contents.
            if allowed_iri_ids is not None and iri_id not in allowed_iri_ids:
                continue

            contexts.append({
                "iri_id": iri_id,
                "ensemble_id": doc.get("ensemble_id"),
                "decoupage_id": doc.get("decoupage_id"),
                "element_id": doc.get("element_id"),
                "project_id": doc.get("project_id"),
                # The stored abstract is exposed under the 'context' key.
                "context": doc.getField('abstract').stringValue(),
                "title": doc.getField('title').stringValue(),
                "tags": doc.getField('tags').stringValue(),
                "score": hit.score,
                "lucene_id": hit.doc,
                "begin": doc.getField('begin').stringValue(),
                "duration": doc.getField('duration').stringValue(),
            })
    finally:
        searcher.close()
    return contexts

def get_results_list(field, query, indexSearcher=None):
    """Run ``query`` against ``field`` and return the raw score documents.

    :param field: name of the index field the query is parsed against.
    :param query: query string handed to the query parser.
    :param indexSearcher: searcher to use. When ``None`` a searcher is
        obtained from ``get_searcher`` and closed here once the search is
        done; a searcher supplied by the caller is left open.
    :return: the ``scoreDocs`` of the search result, limited to
        ``settings.LDT_MAX_SEARCH_NUMBER`` hits.
    """
    # Only close the searcher if this function is the one that opened it.
    owns_searcher = indexSearcher is None
    if owns_searcher:
        indexSearcher = get_searcher()

    try:
        parsed_query = get_query_parser(field).parse(query)
        top_docs = indexSearcher.search(parsed_query, settings.LDT_MAX_SEARCH_NUMBER)
    finally:
        if owns_searcher:
            indexSearcher.close()

    return top_docs.scoreDocs

def highlight_documents(results_list, query, field):
    """Decorate search results with highlighted title, abstract and tags.

    For every segment found under ``project['list']`` of every entry in
    ``results_list``, the stored document is loaded through
    ``segment.lucene_id`` and three attributes are set on the segment:
    ``context`` (from the ``abstract`` field), ``title`` and
    ``context_tags`` (from the ``tags`` field). Matches are wrapped in
    ``<span class="highlight">``; when highlighting yields an empty string
    the stored field value is used unchanged.

    :param results_list: iterable of mappings with a ``'list'`` entry
        holding the segments to decorate; it is modified in place.
    :param query: query string whose terms are highlighted.
    :param field: field name used to parse ``query``.
    :return: the same ``results_list`` object.
    """
    index_searcher = get_searcher()
    try:
        french_analyzer = lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)
        span_formatter = lucene.SimpleHTMLFormatter('<span class="highlight">', '</span>')
        parsed_query = get_query_parser(field).parse(query)
        highlighter = lucene.Highlighter(span_formatter, lucene.QueryScorer(parsed_query))

        def highlighted(stored_doc, field_name):
            # Highlighted fragments of one stored field.
            return get_highlighted_text(stored_doc, french_analyzer, highlighter, field_name)

        def stored_value(stored_doc, field_name):
            # Unmodified stored value of one field.
            return stored_doc.getField(field_name).stringValue()

        for project in results_list:
            for segment in project['list']:
                stored_doc = index_searcher.doc(segment.lucene_id)

                segment.context = highlighted(stored_doc, 'abstract')
                tags = highlighted(stored_doc, 'tags')
                segment.title = highlighted(stored_doc, 'title')

                # An empty result means nothing was highlighted in that
                # field: fall back to the plain stored text.
                if segment.context == u'':
                    segment.context = stored_value(stored_doc, 'abstract')
                if tags == u'':
                    tags = stored_value(stored_doc, 'tags')
                if segment.title == u'':
                    segment.title = stored_value(stored_doc, 'title')

                # Keep what follows the first ';' (the whole string when
                # there is no ';', since find() then returns -1).
                tags_start = tags.find(';') + 1
                segment.context_tags = tags[tags_start:]
    finally:
        index_searcher.close()
    return results_list

def get_highlighted_text(doc, analyzer, highlighter, field):
    """Return the best highlighted fragments of a stored field.

    :param doc: Lucene document the field value is read from.
    :param analyzer: analyzer used to tokenize the stored text.
    :param highlighter: highlighter producing the fragments.
    :param field: name of the stored field to highlight.
    :return: at most ``settings.LDT_MAX_FRAGMENT_PER_SEARCH`` fragments
        joined with ``"..."``, as returned by ``getBestFragments``.
    """
    stored_text = doc.getField(field).stringValue()
    # The token stream is always built under the field name "body",
    # whatever ``field`` is.
    token_stream = analyzer.tokenStream("body", lucene.StringReader(stored_text))
    return highlighter.getBestFragments(
        token_stream,
        stored_text,
        settings.LDT_MAX_FRAGMENT_PER_SEARCH,
        "...",
    )

def get_writer(new=False):
    """Return an ``IndexWriter`` on ``STORE`` using the module ``ANALYZER``.

    The calling thread is attached to the Lucene VM first.

    :param new: passed through as the third ``IndexWriter`` constructor
        argument (the ``create`` flag in the Lucene 3.x API).
    :return: a writer with ``MaxFieldLength.UNLIMITED``.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    field_length = lucene.IndexWriter.MaxFieldLength.UNLIMITED
    return lucene.IndexWriter(STORE, ANALYZER, new, field_length)

def get_searcher():
    """Return a new ``IndexSearcher`` over ``STORE``.

    The calling thread is attached to the Lucene VM first. The callers in
    this module close the returned searcher once they are done with it.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    index_searcher = lucene.IndexSearcher(STORE)
    return index_searcher

def get_query_parser(field):
    """Build a query parser whose default field is ``field``.

    The parser uses a ``FrenchAnalyzer`` (Lucene 3.0 compatibility version)
    and combines terms with AND by default.

    :param field: default field queries are parsed against.
    :return: the configured ``QueryParser``.
    """
    lucene_version = lucene.Version.LUCENE_30
    parser = lucene.QueryParser(
        lucene_version,
        field,
        lucene.FrenchAnalyzer(lucene_version),
    )
    parser.setDefaultOperator(lucene.QueryParser.Operator.AND)
    return parser

def delete_document(field, value):
    """Delete from the index every document matching the term ``field:value``.

    The deletion is committed, and the writer is closed whether or not the
    deletion succeeded.

    :param field: name of the field of the term to match.
    :param value: value of the term to match.
    """
    index_writer = get_writer()
    try:
        term = lucene.Term(field, value)
        index_writer.deleteDocuments(term)
        index_writer.commit()
    finally:
        index_writer.close()