src/ldt/ldt/indexation/__init__.py
author ymh <ymh.work@gmail.com>
Thu, 20 Aug 2015 13:32:18 +0200
changeset 1411 e0cb5e956d79
parent 1371 17b7a6d9959d
child 1484 5a8702a8adf0
permissions -rw-r--r--
update version

import re
import sys

from django.conf import settings
from haystack import connections
from haystack.constants import DEFAULT_ALIAS
from haystack.query import SearchQuerySet

from ldt.indexation.highlighter import LdtHighlighter as Highlighter
from ldt.indexation.query_parser import QueryParser

from .backends import elasticsearch_backend as ldt_elasticsearch_backend


def get_results_with_context(model, field, query, content_list=None, highlight=True):
    """Search ``model`` and return the stored fields of each hit with extra keys.

    The search itself is delegated to ``get_results_list``. When
    ``content_list`` is given, only hits whose stored ``iri_id`` matches the
    ``iri_id`` of one of its items are kept; when it is ``None`` every hit is
    kept.

    Each returned dict is the hit's stored fields completed with:

    * ``score``: the hit score,
    * ``indexation_id``: the hit primary key,
    * ``context``: the stored ``abstract`` (empty string when absent),
    * ``highlighted``: the ``highlighted`` attribute of the hit.
    """
    hits = get_results_list(model, field, query, highlight)

    allowed_iri_ids = None
    if content_list is not None:
        allowed_iri_ids = [content.iri_id for content in content_list]

    enriched = []
    for hit in hits:
        stored = hit.get_stored_fields()
        # Skip hits that belong to a content outside the requested list.
        if allowed_iri_ids is not None and stored.get("iri_id") not in allowed_iri_ids:
            continue
        stored.update({
            "score": hit.score,
            "indexation_id": hit.pk,
            "context": stored.get("abstract", ""),
            "highlighted": hit.highlighted,
        })
        enriched.append(stored)
    return enriched



def get_results_list(model, field, query, highlight=True):
    """Build the search queryset for ``query`` restricted to ``model``.

    ``field`` is the index field handed to ``QueryParser``; the special value
    ``'all'`` is translated to ``'text'``. When ``highlight`` is true,
    ``highlight()`` is applied to the queryset before it is returned.
    """
    search_field = 'text' if field == 'all' else field
    parser = QueryParser(search_field)

    results = SearchQuerySet().models(model).filter(parser.parse(query))
    return results.highlight() if highlight else results
    
        

def get_result_text(field, query):
    """Search the indexed ``Annotation`` objects and summarise each hit.

    ``field`` is the index field to query: ``'all'`` is translated to
    ``'text'`` and ``'text'`` to ``'text_field'``; any other value is passed
    to ``QueryParser`` unchanged.

    Returns a list of dicts with the keys ``external_id`` and ``title``
    (both read from the hit's stored fields) and ``score``.

    Raises ``KeyError`` if a hit has no stored ``external_id`` or ``title``.
    """
    # Imported here to avoid a circular dependency.
    from ldt.text.models import Annotation

    if field == 'all':
        field = 'text'
    elif field == 'text':
        field = 'text_field'

    qp = QueryParser(field)
    # ``models`` is an instance method: the queryset has to be instantiated
    # first, otherwise ``Annotation`` would be bound as ``self``.
    qs = SearchQuerySet().models(Annotation).filter(qp.parse(query))

    results = []
    for res in qs:
        # Fetch the stored fields once per hit rather than once per key.
        stored = res.get_stored_fields()
        results.append({
            'external_id': stored['external_id'],
            'title': stored['title'],
            'score': res.score,
        })
    return results
    
def highlight_documents(results_list, query, field):
    """Set highlighted ``context``, ``title`` and ``context_tags`` on segments.

    ``results_list`` is iterated and each item's ``'list'`` entry is expected
    to hold the segments to process. For every segment:

    * if it has a truthy ``highlighted`` attribute, the first entry of its
      ``'abstract'``, ``'tags'`` and ``'title'`` values is used, falling back
      to the segment's own ``abstract``, ``get_tags()`` and ``title``;
    * otherwise those three values are run through the highlighter built
      from ``query``.

    The segments are modified in place: ``context`` and ``title`` receive
    the highlighted text, and ``context_tags`` receives the highlighted tags
    with everything up to and including the first ``';'`` removed (the whole
    string when there is no ``';'``).

    ``field`` is accepted for interface compatibility but is not used.

    Returns ``results_list`` itself.
    """
    # ``sys.maxint`` only exists on Python 2; fall back to ``sys.maxsize`` so
    # that the same "no length limit" value also works on Python 3.
    no_limit = getattr(sys, "maxint", sys.maxsize)
    highlight = Highlighter(query, html_tag="span", css_class="highlight", max_length=no_limit)

    for project in results_list:
        for segment in project['list']:
            if hasattr(segment, "highlighted") and segment.highlighted:
                # The search backend already produced highlighted fragments:
                # take the first one of each field, or the raw value when the
                # field was not highlighted.
                highlighted_text = {
                    "context": segment.highlighted.get('abstract', [segment.abstract])[0],
                    "tags": segment.highlighted.get('tags', [segment.get_tags()])[0],
                    'title': segment.highlighted.get('title', [segment.title])[0],
                }
            else:
                highlighted_text = {
                    "context": highlight.highlight(segment.abstract),
                    "tags": highlight.highlight(segment.get_tags()),
                    'title': highlight.highlight(segment.title),
                }

            segment.context = highlighted_text['context']
            segment.title = highlighted_text['title']
            tags = highlighted_text['tags']
            # find() returns -1 when ';' is absent, so the slice then starts
            # at 0 and keeps the whole string.
            segment.context_tags = tags[tags.find(';') + 1:]

    return results_list

def object_delete(model, **kwargs):
    """Delete the ``model`` rows matching ``kwargs`` and drop them from the index.

    Every keyword argument except ``using`` is used as a filter on
    ``model.objects``. ``using`` names the search connection to update; when
    it is missing or falsy, ``DEFAULT_ALIAS`` is used.
    """
    filter_args = {key: value for key, value in kwargs.items() if key != 'using'}

    # Poor man's transaction management: there is no clear transaction
    # management in Haystack, so priority is given to the database and the
    # rows are deleted there first. If that fails, the index is not updated.
    doomed = list(model.objects.filter(**filter_args))

    model.objects.filter(**filter_args).delete()

    alias = kwargs.get('using') or DEFAULT_ALIAS
    conn = connections[alias]

    if isinstance(conn, ldt_elasticsearch_backend.ElasticsearchSearchEngine):
        # This engine's backend is handed the whole list in a single call.
        conn.get_backend().remove(doomed, commit=True)
        return

    for obj in doomed:
        conn.get_backend().remove(obj, commit=True)



def object_insert(model, object_list, func_key, using=None):
    """Bulk-create ``object_list`` and set the resulting ``id`` on each object.

    After ``bulk_create``, the rows are read back by filtering on
    ``<func_key>__in`` with the ``func_key`` value of every object, and each
    object's ``id`` is set from the row that has the same ``func_key`` value.

    Nothing is done when ``object_list`` is empty or ``None``. ``using`` is
    accepted but not used.

    Raises ``KeyError`` if an object's ``func_key`` value is not found among
    the rows read back.
    """
    if not object_list:
        return

    manager = model.objects
    manager.bulk_create(object_list)

    key_values = [getattr(obj, func_key) for obj in object_list]
    lookup = {func_key + '__in': key_values}
    id_by_key = dict(manager.filter(**lookup).values_list(func_key, "id"))

    for obj in object_list:
        obj.id = id_by_key[getattr(obj, func_key)]


def object_run_index(model, object_list, using=None):
    """Push ``object_list`` to the search index registered for ``model``.

    The connection named ``using`` is used, or ``DEFAULT_ALIAS`` when
    ``using`` is falsy. Nothing is done when ``object_list`` is empty or
    ``None``.
    """
    if not object_list:
        return

    conn = connections[using or DEFAULT_ALIAS]

    backend = conn.get_backend()
    model_index = conn.get_unified_index().get_index(model)

    backend.update(model_index, object_list)



class SimpleSearch(object):
    """Small search facade returning the stored fields of every hit."""

    def query(self, model, field, query):
        """Search ``model`` on ``field`` and return the hits' stored fields.

        The search is delegated to ``get_results_list``; the result is a list
        with the ``get_stored_fields()`` value of each hit.
        """
        hits = get_results_list(model, field, query)
        return [hit.get_stored_fields() for hit in hits]

    def query_all(self, query, model=None):
        """Search the catch-all field and return the hits' stored fields.

        When ``model`` is given this is ``self.query(model, "all", query)``.
        When it is omitted the search is not restricted to any model.

        NOTE(review): this method used to call ``self.query("all", query)``,
        which always raised ``TypeError`` because ``query`` requires a model.
        Searching every indexed model is the assumed intent - confirm.
        """
        if model is not None:
            return self.query(model, "all", query)

        # get_results_list always narrows the queryset to a single model, so
        # the unrestricted queryset is built here the same way ('all' maps to
        # the 'text' field, highlighting enabled) minus the models() call.
        parser = QueryParser("text")
        hits = SearchQuerySet().filter(parser.parse(query)).highlight()
        return [hit.get_stored_fields() for hit in hits]