# HG changeset patch # User ymh # Date 1343694429 -7200 # Node ID 1c0ac4068bbedfc591a2243b2e2a0eed8cd0a70b # Parent 5e27a39d37425e9dd48bc6996318824301da650d improve highlighting management diff -r 5e27a39d3742 -r 1c0ac4068bbe .settings/org.eclipse.core.resources.prefs --- a/.settings/org.eclipse.core.resources.prefs Mon Jul 30 19:46:40 2012 +0200 +++ b/.settings/org.eclipse.core.resources.prefs Tue Jul 31 02:27:09 2012 +0200 @@ -1,6 +1,7 @@ eclipse.preferences.version=1 encoding//src/ldt/ldt/core/migrations/0001_initial.py=utf-8 encoding//src/ldt/ldt/core/migrations/0002_auto__del_owner.py=utf-8 +encoding//src/ldt/ldt/indexation/backends/elasticsearch_backend.py=utf-8 encoding//src/ldt/ldt/indexation/highlighter.py=utf-8 encoding//src/ldt/ldt/indexation/search_indexes.py=utf-8 encoding//src/ldt/ldt/ldt_utils/migrations/0001_initial.py=utf-8 diff -r 5e27a39d3742 -r 1c0ac4068bbe src/ldt/ldt/indexation/__init__.py --- a/src/ldt/ldt/indexation/__init__.py Mon Jul 30 19:46:40 2012 +0200 +++ b/src/ldt/ldt/indexation/__init__.py Tue Jul 31 02:27:09 2012 +0200 @@ -6,11 +6,10 @@ from ldt.text.models import Annotation import re import sys - - -def get_results_with_context(field, query, content_list=None): + +def get_results_with_context(field, query, content_list=None, highlight=True): - results = get_results_list(field, query, False) + results = get_results_list(field, query, highlight) contexts = [] content_iri_ids = None if content_list is not None : @@ -58,13 +57,11 @@ for project in results_list: for segment in project['list']: if hasattr(segment, "highlighted") and segment.highlighted: - #TODO : - h = segment.highlighted[0] - hsplit = re.split("\-\-\*([\w\-]+)\*\-\-", h, flags=re.S) + #TODO : highlighted_text = { - "context" : segment.abstract, - "tags" : segment.tags, - 'title' : segment.title, + "context" : segment.highlighted.get('abstract',[segment.abstract])[0], + "tags" : segment.highlighted.get('tags',[segment.tags])[0], + 'title' : segment.highlighted.get('title',[segment.title])[0], } else: @@ -81,6 +78,18 @@ return results_list +class SimpleSearch(object): + + def query(self, field, query): + hits = get_results_list(field, query) + + res = [] + for hit in hits: + res.append(hit.get_stored_fields()) + return res + + def query_all(self, query): + return self.query("all", query) diff -r 5e27a39d3742 -r 1c0ac4068bbe src/ldt/ldt/indexation/backends/__init__.py diff -r 5e27a39d3742 -r 1c0ac4068bbe src/ldt/ldt/indexation/backends/elasticsearch_backend.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ldt/ldt/indexation/backends/elasticsearch_backend.py Tue Jul 31 02:27:09 2012 +0200 @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +''' +Created on Jul 30, 2012 + +@author: ymh +''' +from django.db.models.loading import get_model +from haystack.backends import BaseEngine, SearchResult, elasticsearch_backend +from haystack.constants import DJANGO_CT, DJANGO_ID +from ldt.ldt_utils.models import Segment +import datetime + +class ElasticsearchSearchBackend(elasticsearch_backend.ElasticsearchSearchBackend): + + def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, + fields='', highlight=False, facets=None, + date_facets=None, query_facets=None, + narrow_queries=None, spelling_query=None, + within=None, dwithin=None, distance_point=None, + models=None, limit_to_registered_models=None, + result_class=None): + + kwargs = super(ElasticsearchSearchBackend, self).build_search_kwargs(query_string, sort_by=sort_by, start_offset=start_offset, end_offset=end_offset, + fields=fields, highlight=highlight, facets=facets, + date_facets=date_facets, query_facets=query_facets, + narrow_queries=narrow_queries, spelling_query=spelling_query, + within=within, dwithin=dwithin, distance_point=distance_point, + models=models, limit_to_registered_models=limit_to_registered_models, + result_class=result_class) + + #TODO : try to make list of field dynamic + #TODO : How to handle multiple + if highlight: + fields_def = { } + + if models is None or len(models) == 0 or Segment in models: + fields_def['tags'] = {} + fields_def['title'] = {} + fields_def['abstract'] = {} + + kwargs['highlight'] = { + 'pre_tags' : [""], + 'post_tags' : [""], + "number_of_fragments" : 0, + 'fields': fields_def + } + + return kwargs + + def _process_results(self, raw_results, highlight=False, result_class=None): + from haystack import connections + results = [] + hits = raw_results.get('hits', {}).get('total', 0) + facets = {} + spelling_suggestion = None + + if result_class is None: + result_class = SearchResult + + if 'facets' in raw_results: + facets = { + 'fields': {}, + 'dates': {}, + 'queries': {}, + } + + for facet_fieldname, facet_info in raw_results['facets'].items(): + if facet_info.get('_type', 'terms') == 'terms': + facets['fields'][facet_fieldname] = [(individual['term'], individual['count']) for individual in facet_info['terms']] + elif facet_info.get('_type', 'terms') == 'date_histogram': + # Elasticsearch provides UTC timestamps with an extra three + # decimals of precision, which datetime barfs on. + facets['dates'][facet_fieldname] = [(datetime.datetime.utcfromtimestamp(individual['time'] / 1000), individual['count']) for individual in facet_info['entries']] + elif facet_info.get('_type', 'terms') == 'query': + facets['queries'][facet_fieldname] = facet_info['count'] + + unified_index = connections[self.connection_alias].get_unified_index() + indexed_models = unified_index.get_indexed_models() + + for raw_result in raw_results.get('hits', {}).get('hits', []): + source = raw_result['_source'] + app_label, model_name = source[DJANGO_CT].split('.') + additional_fields = {} + model = get_model(app_label, model_name) + + if model and model in indexed_models: + for key, value in source.items(): + index = unified_index.get_index(model) + string_key = str(key) + + if string_key in index.fields and hasattr(index.fields[string_key], 'convert'): + additional_fields[string_key] = index.fields[string_key].convert(value) + else: + additional_fields[string_key] = self.conn.to_python(value) + + del(additional_fields[DJANGO_CT]) + del(additional_fields[DJANGO_ID]) + + if 'highlight' in raw_result: + additional_fields['highlighted'] = raw_result['highlight'] + + result = result_class(app_label, model_name, source[DJANGO_ID], raw_result['_score'], **additional_fields) + results.append(result) + else: + hits -= 1 + + return { + 'results': results, + 'hits': hits, + 'facets': facets, + 'spelling_suggestion': spelling_suggestion, + } + + +class ElasticsearchSearchEngine(BaseEngine): + backend = ElasticsearchSearchBackend + query = elasticsearch_backend.ElasticsearchSearchQuery diff -r 5e27a39d3742 -r 1c0ac4068bbe src/ldt/ldt/indexation/templates/search/indexes/ldt_utils/segment_text.txt --- a/src/ldt/ldt/indexation/templates/search/indexes/ldt_utils/segment_text.txt Mon Jul 30 19:46:40 2012 +0200 +++ b/src/ldt/ldt/indexation/templates/search/indexes/ldt_utils/segment_text.txt Tue Jul 31 02:27:09 2012 +0200 @@ -1,6 +1,3 @@ ---*t-a-g-s*-- {{object.tags}} ---*t-i-t-l-e*-- {{object.title}} ---*a-b-s-t-r-a-c-t*-- {{object.abstract}} \ No newline at end of file diff -r 5e27a39d3742 -r 1c0ac4068bbe src/ldt/ldt/ldt_utils/models.py --- a/src/ldt/ldt/ldt_utils/models.py Mon Jul 30 19:46:40 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/models.py Tue Jul 31 02:27:09 2012 +0200 @@ -13,7 +13,6 @@ from utils import (create_ldt, copy_ldt, create_empty_iri, update_iri, generate_uuid) import datetime -import ldt.indexation import lxml.etree import mimetypes import os.path diff -r 5e27a39d3742 -r 1c0ac4068bbe src/ldt/ldt/ldt_utils/templates/front/front_search_results.html --- a/src/ldt/ldt/ldt_utils/templates/front/front_search_results.html Mon Jul 30 19:46:40 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/templates/front/front_search_results.html Tue Jul 31 02:27:09 2012 +0200 @@ -116,10 +116,10 @@ {% thumbnail res.content.image "300x200" format="PNG" crop="center" as im %}{% endthumbnail %}

{% if segment.title %}{{ segment.title }}{% else %}{% trans "No title" %}{% endif %}

-

{{ segment.abstract }}

- {% if segment.tags %} +

{% if segment.context %}{{ segment.context }}{% else %}{{ segment.abstract }}{% endif %}

+ {% if segment.context_tags or segment.tags %}
Tags:
-

{{ segment.tags }}

+

{% if segment.context_tags %}{{ segment.context_tags }}{% else %}{{ segment.tags }}{% endif %}

{% endif %} diff -r 5e27a39d3742 -r 1c0ac4068bbe src/ldt/ldt/ldt_utils/utils.py --- a/src/ldt/ldt/ldt_utils/utils.py Mon Jul 30 19:46:40 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/utils.py Tue Jul 31 02:27:09 2012 +0200 @@ -1,6 +1,5 @@ from copy import deepcopy from django.conf import settings -from ldt.indexation import get_results_list from django.utils.translation import ugettext as _ from StringIO import StringIO import datetime @@ -37,18 +36,6 @@ def generate_uuid(): return unicode(uuid.uuid1()) -class LdtSearch(object): - - def query(self, field, query): - hits = get_results_list(field, query) - - res = [] - for hit in hits: - res.append(hit.get_stored_fields()) - return res - - def query_all(self, query): - return self.query("all", query) class LdtUtils(object): diff -r 5e27a39d3742 -r 1c0ac4068bbe src/ldt/ldt/ldt_utils/views/lignesdetemps.py --- a/src/ldt/ldt/ldt_utils/views/lignesdetemps.py Mon Jul 30 19:46:40 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/views/lignesdetemps.py Tue Jul 31 02:27:09 2012 +0200 @@ -5,7 +5,8 @@ from django.shortcuts import render_to_response, get_object_or_404 from django.template import RequestContext from ldt.ldt_utils.models import Content, Project -from ldt.ldt_utils.utils import LdtUtils, LdtSearch, clean_description +from ldt.ldt_utils.utils import LdtUtils, clean_description +from ldt.indexation import SimpleSearch from ldt.security.utils import set_forbidden_stream from ldt.ldt_utils.stat import update_stat_project import base64 @@ -38,7 +39,7 @@ if query and len(query) > 0: queryStr = base64.urlsafe_b64decode(query.encode("ascii")).decode("utf8") - searcher = LdtSearch() + searcher = SimpleSearch() ids = {} projIds = {} @@ -68,7 +69,7 @@ def search_segments(request, field, query, edition=None): if query and len(query) > 0: - searcher = LdtSearch() + searcher = SimpleSearch() queryStr = base64.urlsafe_b64decode(query.encode("ascii")).decode("utf8") res = searcher.query(field, queryStr) diff -r 5e27a39d3742 -r 1c0ac4068bbe web/ldtplatform/config.py.tmpl --- a/web/ldtplatform/config.py.tmpl Mon Jul 30 19:46:40 2012 +0200 +++ b/web/ldtplatform/config.py.tmpl Tue Jul 31 02:27:09 2012 +0200 @@ -100,6 +100,7 @@ HAYSTACK_CONNECTIONS = { 'default': { + #for elasticsearch use ldt.indexation.backends.elasticsearch_backend.ElasticsearchSearchEngine 'ENGINE': 'haystack.backends.simple_backend.SimpleEngine', }, }