improve highlighting management
author ymh <ymh.work@gmail.com>
Tue, 31 Jul 2012 02:27:09 +0200
changeset 719 1c0ac4068bbe
parent 718 5e27a39d3742
child 720 b93c99226832
improve highlighting management
.settings/org.eclipse.core.resources.prefs
src/ldt/ldt/indexation/__init__.py
src/ldt/ldt/indexation/backends/__init__.py
src/ldt/ldt/indexation/backends/elasticsearch_backend.py
src/ldt/ldt/indexation/templates/search/indexes/ldt_utils/segment_text.txt
src/ldt/ldt/ldt_utils/models.py
src/ldt/ldt/ldt_utils/templates/front/front_search_results.html
src/ldt/ldt/ldt_utils/utils.py
src/ldt/ldt/ldt_utils/views/lignesdetemps.py
web/ldtplatform/config.py.tmpl
--- a/.settings/org.eclipse.core.resources.prefs	Mon Jul 30 19:46:40 2012 +0200
+++ b/.settings/org.eclipse.core.resources.prefs	Tue Jul 31 02:27:09 2012 +0200
@@ -1,6 +1,7 @@
 eclipse.preferences.version=1
 encoding//src/ldt/ldt/core/migrations/0001_initial.py=utf-8
 encoding//src/ldt/ldt/core/migrations/0002_auto__del_owner.py=utf-8
+encoding//src/ldt/ldt/indexation/backends/elasticsearch_backend.py=utf-8
 encoding//src/ldt/ldt/indexation/highlighter.py=utf-8
 encoding//src/ldt/ldt/indexation/search_indexes.py=utf-8
 encoding//src/ldt/ldt/ldt_utils/migrations/0001_initial.py=utf-8
--- a/src/ldt/ldt/indexation/__init__.py	Mon Jul 30 19:46:40 2012 +0200
+++ b/src/ldt/ldt/indexation/__init__.py	Tue Jul 31 02:27:09 2012 +0200
@@ -6,11 +6,10 @@
 from ldt.text.models import Annotation
 import re
 import sys
-
-
-def get_results_with_context(field, query, content_list=None):
+ 
+def get_results_with_context(field, query, content_list=None, highlight=True):
     
-    results = get_results_list(field, query, False)
+    results = get_results_list(field, query, highlight)
     contexts = []
     content_iri_ids = None
     if content_list is not None :
@@ -58,13 +57,11 @@
     for project in results_list:
         for segment in project['list']:
             if hasattr(segment, "highlighted") and segment.highlighted:
-                #TODO : 
-                h = segment.highlighted[0]
-                hsplit = re.split("\-\-\*([\w\-]+)\*\-\-", h, flags=re.S)                
+                #TODO :                 
                 highlighted_text = {
-                     "context" : segment.abstract,
-                     "tags" : segment.tags,
-                     'title' : segment.title,
+                     "context" : segment.highlighted.get('abstract',[segment.abstract])[0],
+                     "tags" : segment.highlighted.get('tags',[segment.tags])[0],
+                     'title' : segment.highlighted.get('title',[segment.title])[0],
                 }
 
             else:
@@ -81,6 +78,18 @@
                 
     return results_list
 
+class SimpleSearch(object):
+
+    def query(self, field, query):
+        hits = get_results_list(field, query)
+    
+        res = []
+        for hit in hits:
+            res.append(hit.get_stored_fields())
+        return res
+
+    def query_all(self, query):        
+        return self.query("all", query)
 
 
     
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/ldt/ldt/indexation/backends/elasticsearch_backend.py	Tue Jul 31 02:27:09 2012 +0200
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+'''
+Created on Jul 30, 2012
+
+@author: ymh
+'''
+from django.db.models.loading import get_model
+from haystack.backends import BaseEngine, SearchResult, elasticsearch_backend
+from haystack.constants import DJANGO_CT, DJANGO_ID
+from ldt.ldt_utils.models import Segment
+import datetime
+
+class ElasticsearchSearchBackend(elasticsearch_backend.ElasticsearchSearchBackend):
+
+    def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None,
+                        fields='', highlight=False, facets=None,
+                        date_facets=None, query_facets=None,
+                        narrow_queries=None, spelling_query=None,
+                        within=None, dwithin=None, distance_point=None,
+                        models=None, limit_to_registered_models=None,
+                        result_class=None):
+        
+        kwargs = super(ElasticsearchSearchBackend, self).build_search_kwargs(query_string, sort_by=sort_by, start_offset=start_offset, end_offset=end_offset,
+                        fields=fields, highlight=highlight, facets=facets,
+                        date_facets=date_facets, query_facets=query_facets,
+                        narrow_queries=narrow_queries, spelling_query=spelling_query,
+                        within=within, dwithin=dwithin, distance_point=distance_point,
+                        models=models, limit_to_registered_models=limit_to_registered_models,
+                        result_class=result_class)
+                
+        #TODO : try to make list of field dynamic
+        #TODO : How to handle multiple 
+        if highlight:
+            fields_def = { }
+ 
+            if models is None or len(models) == 0 or Segment in models:
+                fields_def['tags'] = {}
+                fields_def['title'] = {}
+                fields_def['abstract'] = {}
+            
+            kwargs['highlight'] = {
+                'pre_tags' : ["<span class='highlight'>"],
+                'post_tags' : ["</span>"],
+                "number_of_fragments" : 0,
+                'fields': fields_def                
+            }
+        
+        return kwargs
+    
+    def _process_results(self, raw_results, highlight=False, result_class=None):
+        from haystack import connections
+        results = []
+        hits = raw_results.get('hits', {}).get('total', 0)
+        facets = {}
+        spelling_suggestion = None
+
+        if result_class is None:
+            result_class = SearchResult
+
+        if 'facets' in raw_results:
+            facets = {
+                'fields': {},
+                'dates': {},
+                'queries': {},
+            }
+
+            for facet_fieldname, facet_info in raw_results['facets'].items():
+                if facet_info.get('_type', 'terms') == 'terms':
+                    facets['fields'][facet_fieldname] = [(individual['term'], individual['count']) for individual in facet_info['terms']]
+                elif facet_info.get('_type', 'terms') == 'date_histogram':
+                    # Elasticsearch provides UTC timestamps with an extra three
+                    # decimals of precision, which datetime barfs on.
+                    facets['dates'][facet_fieldname] = [(datetime.datetime.utcfromtimestamp(individual['time'] / 1000), individual['count']) for individual in facet_info['entries']]
+                elif facet_info.get('_type', 'terms') == 'query':
+                    facets['queries'][facet_fieldname] = facet_info['count']
+
+        unified_index = connections[self.connection_alias].get_unified_index()
+        indexed_models = unified_index.get_indexed_models()
+
+        for raw_result in raw_results.get('hits', {}).get('hits', []):
+            source = raw_result['_source']
+            app_label, model_name = source[DJANGO_CT].split('.')
+            additional_fields = {}
+            model = get_model(app_label, model_name)
+
+            if model and model in indexed_models:
+                for key, value in source.items():
+                    index = unified_index.get_index(model)
+                    string_key = str(key)
+
+                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
+                        additional_fields[string_key] = index.fields[string_key].convert(value)
+                    else:
+                        additional_fields[string_key] = self.conn.to_python(value)
+
+                del(additional_fields[DJANGO_CT])
+                del(additional_fields[DJANGO_ID])
+
+                if 'highlight' in raw_result:
+                    additional_fields['highlighted'] = raw_result['highlight']
+
+                result = result_class(app_label, model_name, source[DJANGO_ID], raw_result['_score'], **additional_fields)
+                results.append(result)
+            else:
+                hits -= 1
+
+        return {
+            'results': results,
+            'hits': hits,
+            'facets': facets,
+            'spelling_suggestion': spelling_suggestion,
+        }
+
+
+class ElasticsearchSearchEngine(BaseEngine):
+    backend = ElasticsearchSearchBackend
+    query = elasticsearch_backend.ElasticsearchSearchQuery
--- a/src/ldt/ldt/indexation/templates/search/indexes/ldt_utils/segment_text.txt	Mon Jul 30 19:46:40 2012 +0200
+++ b/src/ldt/ldt/indexation/templates/search/indexes/ldt_utils/segment_text.txt	Tue Jul 31 02:27:09 2012 +0200
@@ -1,6 +1,3 @@
---*t-a-g-s*--
 {{object.tags}}
---*t-i-t-l-e*--
 {{object.title}}
---*a-b-s-t-r-a-c-t*--
 {{object.abstract}}
\ No newline at end of file
--- a/src/ldt/ldt/ldt_utils/models.py	Mon Jul 30 19:46:40 2012 +0200
+++ b/src/ldt/ldt/ldt_utils/models.py	Tue Jul 31 02:27:09 2012 +0200
@@ -13,7 +13,6 @@
 from utils import (create_ldt, copy_ldt, create_empty_iri, update_iri, 
     generate_uuid)
 import datetime
-import ldt.indexation
 import lxml.etree
 import mimetypes
 import os.path
--- a/src/ldt/ldt/ldt_utils/templates/front/front_search_results.html	Mon Jul 30 19:46:40 2012 +0200
+++ b/src/ldt/ldt/ldt_utils/templates/front/front_search_results.html	Tue Jul 31 02:27:09 2012 +0200
@@ -116,10 +116,10 @@
                             {% thumbnail res.content.image "300x200" format="PNG" crop="center" as im %}<img src="{{ im.url }}" class="img_media" width="{{ im.width }}" height="{{ im.height }}" %}">{% endthumbnail %}
                             <h4><a class="pink under b" href="{% url ldt.ldt_utils.views.front.annot_content segment.iri_id segment.project_id segment.cutting_id %}#id={{segment.element_id}}" title="{% trans 'view this annotation in the player' %}">
                             {% if segment.title %}{{ segment.title }}{% else %}{% trans "No title" %}{% endif %}</a></h4>
-                            <p class="bigmargin">{{ segment.abstract }}</p>
-                            {% if segment.tags %}
+                            <p class="bigmargin">{% if segment.context %}{{ segment.context }}{% else %}{{ segment.abstract }}{% endif %}</p>
+                            {% if segment.context_tags or segment.tags %}
                             	<h5>Tags:</h5>
-                            	<p class="result_taglist b fl">{{ segment.tags }}</p>
+                            	<p class="result_taglist b fl">{% if segment.context_tags %}{{ segment.context_tags }}{% else %}{{ segment.tags }}{% endif %}</p>
                             {% endif %}
                         </div>
                     </li>
--- a/src/ldt/ldt/ldt_utils/utils.py	Mon Jul 30 19:46:40 2012 +0200
+++ b/src/ldt/ldt/ldt_utils/utils.py	Tue Jul 31 02:27:09 2012 +0200
@@ -1,6 +1,5 @@
 from copy import deepcopy
 from django.conf import settings
-from ldt.indexation import get_results_list
 from django.utils.translation import ugettext as _
 from StringIO import StringIO
 import datetime
@@ -37,18 +36,6 @@
 def generate_uuid():
     return unicode(uuid.uuid1())
 
-class LdtSearch(object):
-
-    def query(self, field, query):
-        hits = get_results_list(field, query)
-    
-        res = []
-        for hit in hits:
-            res.append(hit.get_stored_fields())
-        return res
-
-    def query_all(self, query):        
-        return self.query("all", query)
         
 
 class LdtUtils(object):
--- a/src/ldt/ldt/ldt_utils/views/lignesdetemps.py	Mon Jul 30 19:46:40 2012 +0200
+++ b/src/ldt/ldt/ldt_utils/views/lignesdetemps.py	Tue Jul 31 02:27:09 2012 +0200
@@ -5,7 +5,8 @@
 from django.shortcuts import render_to_response, get_object_or_404
 from django.template import RequestContext
 from ldt.ldt_utils.models import Content, Project
-from ldt.ldt_utils.utils import LdtUtils, LdtSearch, clean_description
+from ldt.ldt_utils.utils import LdtUtils, clean_description
+from ldt.indexation import SimpleSearch
 from ldt.security.utils import set_forbidden_stream
 from ldt.ldt_utils.stat import update_stat_project
 import base64
@@ -38,7 +39,7 @@
 
     if query and len(query) > 0:        
         queryStr = base64.urlsafe_b64decode(query.encode("ascii")).decode("utf8")
-        searcher = LdtSearch()
+        searcher = SimpleSearch()
         ids = {}
         projIds = {}
         
@@ -68,7 +69,7 @@
 def search_segments(request, field, query, edition=None):
     
     if query and len(query) > 0:
-        searcher = LdtSearch()
+        searcher = SimpleSearch()
         
         queryStr = base64.urlsafe_b64decode(query.encode("ascii")).decode("utf8")
         res = searcher.query(field, queryStr)            
--- a/web/ldtplatform/config.py.tmpl	Mon Jul 30 19:46:40 2012 +0200
+++ b/web/ldtplatform/config.py.tmpl	Tue Jul 31 02:27:09 2012 +0200
@@ -100,6 +100,7 @@
 
 HAYSTACK_CONNECTIONS = {
     'default': {
+        # To use Elasticsearch, set 'ENGINE' to 'ldt.indexation.backends.elasticsearch_backend.ElasticsearchSearchEngine'
         'ENGINE': 'haystack.backends.simple_backend.SimpleEngine',
     },
 }