src/ldt/ldt/indexation/__init__.py
author cavaliet
Tue, 06 Nov 2012 13:56:54 +0100
changeset 895 e76df6d34e6f
parent 725 4f4005df9a97
child 1117 3bab1e42acfa
permissions -rw-r--r--
Merge with bd5f9dea97910caae6c2fb1ef8fc2e119e47d248
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
77
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
from django.conf import settings
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
718
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
     3
from haystack.query import SearchQuerySet
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
     4
from ldt.indexation.highlighter import LdtHighlighter as Highlighter
725
4f4005df9a97 improve indexation query language
ymh <ymh.work@gmail.com>
parents: 719
diff changeset
     5
from ldt.indexation.query_parser import QueryParser
718
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
     6
from ldt.ldt_utils.models import Segment
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
     7
from ldt.text.models import Annotation
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
     8
import re
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
     9
import sys
719
1c0ac4068bbe improve highlighting management
ymh <ymh.work@gmail.com>
parents: 718
diff changeset
    10
 
1c0ac4068bbe improve highlighting management
ymh <ymh.work@gmail.com>
parents: 718
diff changeset
    11
def get_results_with_context(field, query, content_list=None, highlight=True):
    """Search segments and return their stored fields enriched with context.

    The search itself is delegated to ``get_results_list(field, query,
    highlight)``. For every hit that is kept, the hit's stored fields are
    extended with the keys ``score``, ``indexation_id`` (the hit's ``pk``),
    ``context`` (a copy of the stored ``abstract`` value) and ``highlighted``.

    :param field: name of the index field to search, forwarded unchanged.
    :param query: query string, forwarded unchanged.
    :param content_list: optional iterable of objects exposing ``iri_id``;
        when given, only hits whose stored ``iri_id`` matches one of them
        are returned. ``None`` disables the filtering.
    :param highlight: forwarded to ``get_results_list``.
    :returns: list of the enriched stored-field mappings, in result order.
    :raises KeyError: if a kept hit has no stored ``abstract`` field.
    """
    hits = get_results_list(field, query, highlight)

    allowed_iri_ids = None
    if content_list is not None:
        allowed_iri_ids = [content.iri_id for content in content_list]

    enriched = []
    for hit in hits:
        stored = hit.get_stored_fields()

        # Skip hits that belong to a content outside the requested list.
        if allowed_iri_ids is not None and stored.get("iri_id") not in allowed_iri_ids:
            continue

        stored["score"] = hit.score
        stored["indexation_id"] = hit.pk
        stored["context"] = stored["abstract"]
        stored["highlighted"] = hit.highlighted
        enriched.append(stored)

    return enriched
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
    28
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
    29
176
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
    30
718
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
    31
def get_results_list(field, query, highlight=True):
    """Build the search query set matching ``query`` over ``Segment`` objects.

    :param field: index field to search; the pseudo-field ``'all'`` is
        mapped to the ``'text'`` field.
    :param query: query string, parsed with ``QueryParser``.
    :param highlight: when true, ``highlight()`` is applied to the query set
        before it is returned.
    :returns: the resulting ``SearchQuerySet``.
    """
    search_field = 'text' if field == 'all' else field

    parser = QueryParser(search_field)

    segment_hits = SearchQuerySet().models(Segment)
    segment_hits = segment_hits.filter(parser.parse(query))

    return segment_hits.highlight() if highlight else segment_hits
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
    42
    
5e27a39d3742 replace lucene by haystack, remove references to lucene
ymh <ymh.work@gmail.com>
parents: 716
diff changeset
    43
        
176
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
    44
716
31dc2726ca51 centralise les appel à lucene
ymh <ymh.work@gmail.com>
parents: 715
diff changeset
    45
def get_result_text(field, query):
    """Search ``Annotation`` objects and return a summary of each hit.

    :param field: index field to search. ``'all'`` is mapped to ``'text'``
        and ``'text'`` is mapped to ``'text_field'``; any other value is
        used as is.
    :param query: query string, parsed with ``QueryParser``.
    :returns: list of dicts with the keys ``external_id`` and ``title``
        (taken from the hit's stored fields) and ``score``.
    :raises KeyError: if a hit has no stored ``external_id`` or ``title``.
    """
    if field == 'all':
        field = 'text'
    elif field == 'text':
        field = 'text_field'

    qp = QueryParser(field)
    # models() is an instance method: it has to be called on a
    # SearchQuerySet *instance* (as get_results_list does), not on the class.
    qs = SearchQuerySet().models(Annotation).filter(qp.parse(query))

    results = []
    for res in qs:
        # Fetch the stored fields once per hit instead of once per key.
        stored = res.get_stored_fields()
        results.append({
            'external_id': stored['external_id'],
            'title': stored['title'],
            'score': res.score,
        })
    return results
716
31dc2726ca51 centralise les appel à lucene
ymh <ymh.work@gmail.com>
parents: 715
diff changeset
    56
    
452
8e9494006e7b segment abstracts + content images can be retrieved directly from search results page
verrierj
parents: 349
diff changeset
    57
def highlight_documents(results_list, query, field):
    """Attach highlighted context, title and tags to every segment.

    Each entry of ``results_list`` is expected to expose its segments under
    the ``'list'`` key. A segment carrying a non-empty ``highlighted``
    attribute uses the first value found there for ``abstract``, ``tags``
    and ``title``, falling back to the segment's own attribute when a key is
    absent. Any other segment is highlighted locally with ``Highlighter``.

    The segments are modified in place: ``context`` and ``title`` receive
    the highlighted abstract and title, and ``context_tags`` receives the
    highlighted tags with everything up to and including the first ``';'``
    removed (the whole string when there is no ``';'``).

    :param results_list: iterable of mappings holding segments in ``'list'``.
    :param query: query string handed to the ``Highlighter``.
    :param field: not used by this function.
    :returns: the same ``results_list`` object.
    """
    highlighter = Highlighter(query, html_tag="span", css_class="highlight", max_length=sys.maxint)

    for project in results_list:
        for segment in project['list']:
            # All three values are computed before the segment is touched,
            # because segment.title is both read and overwritten below.
            if hasattr(segment, "highlighted") and segment.highlighted:
                found = segment.highlighted
                context = found.get('abstract', [segment.abstract])[0]
                tags = found.get('tags', [segment.tags])[0]
                title = found.get('title', [segment.title])[0]
            else:
                context = highlighter.highlight(segment.abstract)
                tags = highlighter.highlight(segment.tags)
                title = highlighter.highlight(segment.title)

            segment.context = context
            segment.title = title
            first_tag_start = tags.find(';') + 1
            segment.context_tags = tags[first_tag_start:]

    return results_list
716
31dc2726ca51 centralise les appel à lucene
ymh <ymh.work@gmail.com>
parents: 715
diff changeset
    84
719
1c0ac4068bbe improve highlighting management
ymh <ymh.work@gmail.com>
parents: 718
diff changeset
    85
class SimpleSearch(object):
    """Thin search facade returning the stored fields of matching segments."""

    def query(self, field, query):
        """Return the stored fields of every hit for ``query`` on ``field``.

        The search is delegated to ``get_results_list`` with its default
        highlighting setting.
        """
        return [hit.get_stored_fields() for hit in get_results_list(field, query)]

    def query_all(self, query):
        """Run ``query`` against the ``"all"`` pseudo-field."""
        return self.query("all", query)
716
31dc2726ca51 centralise les appel à lucene
ymh <ymh.work@gmail.com>
parents: 715
diff changeset
    97
31dc2726ca51 centralise les appel à lucene
ymh <ymh.work@gmail.com>
parents: 715
diff changeset
    98
715
f21459554182 Remove lucene dependancies in model
ymh <ymh.work@gmail.com>
parents: 602
diff changeset
    99
    
f21459554182 Remove lucene dependancies in model
ymh <ymh.work@gmail.com>
parents: 602
diff changeset
   100
    
176
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
   101
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
   102
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
   103