# -*- coding: utf-8 -*-
'''

Adaptation of Haystack's Whoosh backend with French Snowball stemming

Created on May 31, 2011

@author: ymh
'''

from django.conf import settings
from django.db.models.loading import get_model
from haystack.backends.whoosh_backend import (
    SearchBackend as WhooshSearchBackend, SearchQuery as WhooshSearchQuery)
from haystack.constants import ID, DJANGO_CT, DJANGO_ID
from haystack.exceptions import SearchBackendError
from haystack.models import SearchResult
from whoosh.analysis import (default_pattern, stem, RegexTokenizer, 
    LowercaseFilter, StopFilter, PyStemmerFilter)
from whoosh.fields import (ID as WHOOSH_ID, Schema, IDLIST, TEXT, KEYWORD, 
    NUMERIC, BOOLEAN, DATETIME, NGRAM, NGRAMWORDS)

# Module-level identifier for this backend.
# NOTE(review): presumably the name Haystack uses to refer to this backend
# -- confirm against the Haystack version in use.
BACKEND_NAME = 'french_whoosh'

# French stop words, used as the default ``stoplist`` argument of
# FrenchStemmingAnalyzer below. Entries are lowercase unicode strings; the
# analyzer lowercases tokens before the stop filter is applied.
# NOTE(review): u"out" does not look like a French word -- possibly a typo,
# to be confirmed before changing the list.
STOP_WORDS = [ u"a", u"afin", u"ai", u"ainsi", u"après", u"attendu", u"au",
u"aujourd", u"auquel", u"aussi", u"autre", u"autres", u"aux", u"auxquelles",
u"auxquels", u"avait", u"avant", u"avec", u"avoir", u"c", u"car", u"ce",
u"ceci", u"cela", u"celle", u"celles", u"celui", u"cependant", u"certain",
u"certaine", u"certaines", u"certains", u"ces", u"cet", u"cette", u"ceux",
u"chez", u"ci", u"combien", u"comme", u"comment", u"concernant", u"contre",
u"d", u"dans", u"je", u"de", u"jusqu", u"debout", u"jusque", u"dedans", u"l",
u"dehors", u"la", u"delà", u"laquelle", u"depuis", u"le", u"derrière",
u"lequel", u"des", u"les", u"desquelles", u"lesquelles", u"desquels",
u"lesquels", u"dessous", u"leur", u"dessus", u"leurs", u"devant", u"lorsque",
u"devers", u"lui", u"devra", u"là", u"divers", u"ma", u"diverse", u"mais",
u"diverses", u"malgré", u"doit", u"me", u"donc", u"merci", u"dont", u"mes",
u"du", u"mien", u"duquel", u"mienne", u"durant", u"miennes", u"dès", u"miens",
u"elle", u"moi", u"elles", u"moins", u"en", u"mon", u"entre", u"moyennant",
u"environ", u"même", u"est", u"mêmes", u"et", u"n", u"etc", u"ne", u"etre",
u"ni", u"eux", u"non", u"excepté", u"nos", u"hormis", u"notre", u"hors",
u"nous", u"hélas", u"néanmoins", u"hui", u"nôtre", u"il", u"nôtres", u"ils",
u"on", u"j", u"out", u"ou", u"soit", u"outre", u"son", u"où", u"sont", u"par",
u"sous", u"parmi", u"suivant", u"partant", u"sur", u"pas", u"ta", u"passé",
u"te", u"pendant", u"tes", u"plein", u"tien", u"plus", u"tienne", u"plusieurs",
u"tiennes", u"pour", u"tiens", u"pourquoi", u"toi", u"proche", u"ton", u"près",
u"tous", u"puisque", u"tout", u"qu", u"toute", u"quand", u"toutes", u"que",
u"tu", u"quel", u"un", u"quelle", u"une", u"quelles", u"va", u"quels", u"vers",
u"qui", u"voici", u"quoi", u"voilà", u"quoique", u"vos", u"revoici", u"votre",
u"revoilà", u"vous", u"s", u"vu", u"sa", u"vôtre", u"sans", u"vôtres",
u"sauf", u"y", u"se", u"à", u"selon", u"ça", u"seront", u"ès", u"ses",
u"été", u"si", u"être", u"sien", u"ô", u"sienne", u"siennes", u"siens",
u"sinon", u"soi"]

def FrenchStemmingAnalyzer(expression=default_pattern, stoplist=STOP_WORDS,
                           minsize=2, maxsize=None, gaps=False, stemfn=stem,
                           ignore=None, cachesize=50000):
    """Build a Whoosh analyzer chain for French text.

    The returned chain is, in order: a ``RegexTokenizer``, a
    ``LowercaseFilter``, a ``StopFilter`` (only when ``stoplist`` is not
    None) and a ``PyStemmerFilter`` created with ``lang="french"``.

    :param expression: The regular expression pattern handed to the
        tokenizer to extract tokens.
    :param stoplist: The stop words handed to the stop filter. Pass None to
        leave the stop filter out of the chain entirely.
    :param minsize: Forwarded to the stop filter as its ``minsize``; has no
        effect when ``stoplist`` is None.
    :param maxsize: Forwarded to the stop filter as its ``maxsize``; has no
        effect when ``stoplist`` is None.
    :param gaps: Forwarded to the tokenizer as its ``gaps`` flag.
    :param stemfn: Not used by this function.
        NOTE(review): presumably kept so the signature mirrors Whoosh's
        StemmingAnalyzer -- confirm before removing.
    :param ignore: Forwarded to the stemming filter as its ``ignore``
        argument.
    :param cachesize: Forwarded to the stemming filter as its ``cachesize``
        argument.
    :returns: The composed analyzer.
    """
    # Tokenize, then lowercase: these two stages are always present.
    analyzer = RegexTokenizer(expression=expression, gaps=gaps) | LowercaseFilter()

    # The stop-word stage is optional and sits before stemming.
    if stoplist is not None:
        stop_filter = StopFilter(stoplist=stoplist, minsize=minsize,
                                 maxsize=maxsize)
        analyzer = analyzer | stop_filter

    french_stemmer = PyStemmerFilter(lang="french", ignore=ignore,
                                     cachesize=cachesize)
    return analyzer | french_stemmer
# Argument-type metadata attached to the factory function.
FrenchStemmingAnalyzer.__inittypes__ = dict(
    expression=unicode,
    gaps=bool,
    stoplist=list,
    minsize=int,
    maxsize=int,
)


class SearchBackend(WhooshSearchBackend):
    '''
    Adaptation of Haystack's Whoosh backend with French Snowball stemming.

    Overrides ``build_schema`` so that free-text fields are analyzed with
    FrenchStemmingAnalyzer, and ``_process_results`` so that highlighting
    uses the same analyzer.
    '''

    def build_schema(self, fields):
        """Build the Whoosh schema for the given Haystack search fields.

        :param fields: mapping of field name to Haystack field object. Each
            field object is read for ``is_multivalued``, ``indexed``,
            ``field_type``, ``index_fieldname``, ``stored``, ``boost`` and
            ``document``.
        :returns: a ``(content_field_name, Schema)`` tuple, where
            ``content_field_name`` is the ``index_fieldname`` of the last
            field whose ``document`` attribute is True ('' if there is none).
        :raises SearchBackendError: if ``fields`` contributes no schema field
            beyond the three built-in Haystack keys.
        """
        schema_fields = {
            ID: WHOOSH_ID(stored=True, unique=True),
            DJANGO_CT: WHOOSH_ID(stored=True),
            DJANGO_ID: WHOOSH_ID(stored=True),
        }
        # Grab the number of keys that are hard-coded into Haystack.
        # We'll use this to (possibly) fail slightly more gracefully later.
        initial_key_count = len(schema_fields)
        content_field_name = ''

        # Only the field objects are needed; the mapping keys are not used.
        for field_class in fields.values():
            if field_class.is_multivalued:
                if field_class.indexed is False:
                    schema_fields[field_class.index_fieldname] = IDLIST(stored=True, field_boost=field_class.boost)
                else:
                    schema_fields[field_class.index_fieldname] = KEYWORD(stored=True, commas=True, scorable=True, field_boost=field_class.boost)
            elif field_class.field_type in ['date', 'datetime']:
                schema_fields[field_class.index_fieldname] = DATETIME(stored=field_class.stored)
            elif field_class.field_type == 'integer':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=int, field_boost=field_class.boost)
            elif field_class.field_type == 'float':
                schema_fields[field_class.index_fieldname] = NUMERIC(stored=field_class.stored, type=float, field_boost=field_class.boost)
            elif field_class.field_type == 'boolean':
                # Field boost isn't supported on BOOLEAN as of 1.8.2.
                schema_fields[field_class.index_fieldname] = BOOLEAN(stored=field_class.stored)
            elif field_class.field_type == 'ngram':
                schema_fields[field_class.index_fieldname] = NGRAM(minsize=3, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            elif field_class.field_type == 'edge_ngram':
                schema_fields[field_class.index_fieldname] = NGRAMWORDS(minsize=2, maxsize=15, stored=field_class.stored, field_boost=field_class.boost)
            else:
                # Every other field type is indexed as text with the French
                # analyzer.
                schema_fields[field_class.index_fieldname] = TEXT(stored=True, analyzer=FrenchStemmingAnalyzer(), field_boost=field_class.boost)

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

        # Fail more gracefully than relying on the backend to die if no fields
        # are found.
        if len(schema_fields) <= initial_key_count:
            raise SearchBackendError("No fields were found in any search_indexes. Please correct this before attempting to search.")

        return (content_field_name, Schema(**schema_fields))

    def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None, result_class=None):
        """Convert a page of raw hits into Haystack result objects.

        :param raw_page: the page of hits; it is measured with ``len()``,
            iterated, and asked for ``score(offset)`` per hit.
        :param highlight: when true, a ``'highlighted'`` entry built with the
            French analyzer is added to each result's extra fields.
        :param query_string: the query text; its whitespace-separated terms
            (with ``*`` removed) are the terms to highlight, and it is the
            fallback input for the spelling suggestion.
        :param spelling_query: text to build the spelling suggestion from;
            ``query_string`` is used when this is empty.
        :param result_class: class instantiated for each hit; defaults to
            ``SearchResult``.
        :returns: a dict with the keys ``'results'``, ``'hits'``,
            ``'facets'`` (always an empty dict here) and
            ``'spelling_suggestion'``.
        """
        if not self.site:
            from haystack import site
        else:
            site = self.site

        results = []

        # It's important to grab the hits first before slicing. Otherwise, this
        # can cause pagination failures.
        hits = len(raw_page)

        if result_class is None:
            result_class = SearchResult

        facets = {}
        spelling_suggestion = None
        indexed_models = site.get_indexed_models()

        if highlight:
            # Import under an alias so the boolean ``highlight`` parameter is
            # not rebound to the Whoosh function, and prepare the analyzer
            # and term list once instead of once per hit.
            from whoosh.highlight import (highlight as whoosh_highlight,
                ContextFragmenter, UppercaseFormatter)
            analyzer = FrenchStemmingAnalyzer()
            terms = [term.replace('*', '') for term in query_string.split()]

        for doc_offset, raw_result in enumerate(raw_page):
            score = raw_page.score(doc_offset) or 0
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            model = get_model(app_label, model_name)

            if not model or model not in indexed_models:
                # The hit refers to a model that is unknown or not indexed by
                # this site: drop it and correct the hit count.
                hits -= 1
                continue

            # The index depends only on the model, so look it up once per
            # hit rather than once per stored key.
            index = site.get_index(model)
            additional_fields = {}

            for key, value in raw_result.items():
                string_key = str(key)

                if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                    field = index.fields[string_key]

                    # Special-cased due to the nature of KEYWORD fields.
                    if field.is_multivalued:
                        # Compare the length by value; identity comparison
                        # against an int literal is not reliable.
                        if value is None or len(value) == 0:
                            additional_fields[string_key] = []
                        else:
                            additional_fields[string_key] = value.split(',')
                    else:
                        additional_fields[string_key] = field.convert(value)
                else:
                    additional_fields[string_key] = self._to_python(value)

            # These two are passed to the result class positionally below,
            # so they must not also appear among the keyword arguments.
            del additional_fields[DJANGO_CT]
            del additional_fields[DJANGO_ID]

            if highlight:
                additional_fields['highlighted'] = {
                    self.content_field_name: [whoosh_highlight(additional_fields.get(self.content_field_name), terms, analyzer, ContextFragmenter(terms), UppercaseFormatter())],
                }

            result = result_class(app_label, model_name, raw_result[DJANGO_ID], score, searchsite=self.site, **additional_fields)
            results.append(result)

        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
            if spelling_query:
                spelling_suggestion = self.create_spelling_suggestion(spelling_query)
            else:
                spelling_suggestion = self.create_spelling_suggestion(query_string)

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

class SearchQuery(WhooshSearchQuery):
    '''
    Query class exposed alongside the French backend.

    Adds nothing of its own: all behaviour is inherited unchanged from
    Haystack's Whoosh SearchQuery.
    '''
        
