# HG changeset patch # User ymh # Date 1343729633 -7200 # Node ID b93c992268326198c741e2f59a16dc22a1adb251 # Parent 1c0ac4068bbedfc591a2243b2e2a0eed8cd0a70b# Parent a25d344cb446a82c64ca6c9cc311e903d3afb90c Merge with a25d344cb446a82c64ca6c9cc311e903d3afb90c diff -r a25d344cb446 -r b93c99226832 .settings/org.eclipse.core.resources.prefs --- a/.settings/org.eclipse.core.resources.prefs Tue Jul 31 11:51:24 2012 +0200 +++ b/.settings/org.eclipse.core.resources.prefs Tue Jul 31 12:13:53 2012 +0200 @@ -2,6 +2,9 @@ eclipse.preferences.version=1 encoding//src/ldt/ldt/core/migrations/0001_initial.py=utf-8 encoding//src/ldt/ldt/core/migrations/0002_auto__del_owner.py=utf-8 +encoding//src/ldt/ldt/indexation/backends/elasticsearch_backend.py=utf-8 +encoding//src/ldt/ldt/indexation/highlighter.py=utf-8 +encoding//src/ldt/ldt/indexation/search_indexes.py=utf-8 encoding//src/ldt/ldt/ldt_utils/migrations/0001_initial.py=utf-8 encoding//src/ldt/ldt/ldt_utils/migrations/0002_auto__add_field_media_mimetype_field__chg_field_media_external_src_url.py=utf-8 encoding//src/ldt/ldt/ldt_utils/migrations/0003_auto__chg_field_project_owner.py=utf-8 @@ -19,5 +22,6 @@ encoding//src/ldt/ldt/user/migrations/0001_initial.py=utf-8 encoding//src/ldt/ldt/user/migrations/0008_auto__chg_field_groupprofile_image__chg_field_groupprofile_group__chg_.py.old=utf-8 encoding//virtualenv/web/env/guardianenv/Lib/site-packages/guardian/migrations/0001_initial.py=utf-8 +encoding//virtualenv/web/env/venv_platform/lib/python2.7/site-packages/haystack/backends/__init__.py=utf-8 encoding//web/ldtplatform/config.py=utf-8 encoding//web/ldtplatform/settings.py=utf-8 diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/indexation/__init__.py --- a/src/ldt/ldt/indexation/__init__.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/indexation/__init__.py Tue Jul 31 12:13:53 2012 +0200 @@ -1,108 +1,99 @@ from django.conf import settings -import lucene - -lucene.initVM(lucene.CLASSPATH) - -STORE = 
lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH)) -ANALYZER = lucene.PerFieldAnalyzerWrapper(lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("tags", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("title", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("abstract", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) -ANALYZER.addAnalyzer("all", lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) - -def get_results_with_context(field, query, content_list=None): - - searcher = get_searcher() - - try: - res = get_results_list(field, query, searcher) - contexts = [] - content_iri_ids = None - if content_list is not None : - content_iri_ids = [ctt.iri_id for ctt in content_list] - - for i in res: - doc = searcher.doc(i.doc) - if content_iri_ids is None or (content_iri_ids is not None and doc.get("iri_id") in content_iri_ids) : - ids = {"iri_id":doc.get("iri_id"), "ensemble_id":doc.get("ensemble_id"), "decoupage_id":doc.get("decoupage_id"), "element_id":doc.get("element_id"), "project_id":doc.get("project_id")} - score = i.score - title = doc.getField('title').stringValue() - desc = doc.getField('abstract').stringValue() - tags = doc.getField('tags').stringValue() - begin = doc.getField('begin').stringValue() - duration = doc.getField('duration').stringValue() - - ids['context'] = desc - ids['title'] = title - ids['tags'] = tags - ids['score'] = score - ids['lucene_id'] = i.doc - ids['begin'] = begin - ids['duration'] = duration - contexts.append(ids) - finally: - searcher.close() - return contexts -def get_results_list(field, query, indexSearcher=None): - searcher_allocated = False - if indexSearcher is None: - indexSearcher, searcher_allocated = get_searcher(), True - try: - queryParser = get_query_parser(field) - queryObj = queryParser.parse(query) - hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER) - finally: - if searcher_allocated: - 
indexSearcher.close() - return hits.scoreDocs - -def highlight_documents(results_list, query, field): - searcher = get_searcher() - try: - analyzer = lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT) - formatter = lucene.SimpleHTMLFormatter('', '') - query = get_query_parser(field).parse(query) - highlighter = lucene.Highlighter(formatter, lucene.QueryScorer (query)) - - for project in results_list: - for segment in project['list']: - lucene_doc = searcher.doc(segment.lucene_id) - segment.context = get_highlighted_text(lucene_doc, analyzer, highlighter, 'abstract') - tags = get_highlighted_text(lucene_doc, analyzer, highlighter, 'tags') - segment.title = get_highlighted_text(lucene_doc, analyzer, highlighter, 'title') - - if segment.context == u'': - segment.context = lucene_doc.getField('abstract').stringValue() - if tags == u'': - tags = lucene_doc.getField('tags').stringValue() - if segment.title == u'': - segment.title = lucene_doc.getField('title').stringValue() - - segment.context_tags = tags[tags.find(';')+1:] - finally: - searcher.close() - return results_list - -def get_highlighted_text(doc, analyzer, highlighter, field): - res = doc.getField(field).stringValue() - ts = analyzer.tokenStream("body", lucene.StringReader(res)) - res = highlighter.getBestFragments(ts, res, settings.LDT_MAX_FRAGMENT_PER_SEARCH, "...") - return res - -def get_writer(new=False): - lucene.getVMEnv().attachCurrentThread() - return lucene.IndexWriter(STORE, ANALYZER, new, lucene.IndexWriter.MaxFieldLength.UNLIMITED) - -def get_searcher(): - lucene.getVMEnv().attachCurrentThread() - return lucene.IndexSearcher(STORE) - -def get_query_parser(field): - queryParser = lucene.QueryParser(lucene.Version.LUCENE_30, field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_30)) - queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND) - return queryParser +from haystack.query import SearchQuerySet +from ldt.indexation.highlighter import LdtHighlighter as Highlighter +from 
ldt.ldt_utils.models import Segment +from ldt.text.models import Annotation +import re +import sys + +def get_results_with_context(field, query, content_list=None, highlight=True): + + results = get_results_list(field, query, highlight) + contexts = [] + content_iri_ids = None + if content_list is not None : + content_iri_ids = [ctt.iri_id for ctt in content_list] + + for res in results: + doc = res.get_stored_fields() + if content_iri_ids is None or (content_iri_ids is not None and doc.get("iri_id") in content_iri_ids) : + doc["score"] = res.score + doc["indexation_id"] = res.pk + doc["context"] = doc["abstract"] + doc["highlighted"] = res.highlighted + contexts.append(doc) + return contexts +def get_results_list(field, query, highlight=True): + if field == 'all': + field = 'text' + + qs = SearchQuerySet().models(Segment).auto_query(query, field) + if highlight: + qs = qs.highlight() + return qs + + + +def get_result_text(field, query): + + if field == 'all': + field = 'text' + elif field == 'text': + field = 'text_field' + + qs = SearchQuerySet().models(Annotation).auto_query(query, field) + + return [{'external_id':res.get_stored_fields()['annotation_id'], 'title': res.get_stored_fields()['title'], 'score': res.score} for res in qs] +def highlight_documents(results_list, query, field): + + highlight = Highlighter(query, html_tag="span", css_class="highlight", max_length=sys.maxint) + + for project in results_list: + for segment in project['list']: + if hasattr(segment, "highlighted") and segment.highlighted: + #TODO : + highlighted_text = { + "context" : segment.highlighted.get('abstract',[segment.abstract])[0], + "tags" : segment.highlighted.get('tags',[segment.tags])[0], + 'title' : segment.highlighted.get('title',[segment.title])[0], + } + + else: + highlighted_text = { + "context" : highlight.highlight(segment.abstract), + "tags" : highlight.highlight(segment.tags), + 'title' : highlight.highlight(segment.title) + } + + segment.context = 
highlighted_text['context'] + segment.title = highlighted_text['title'] + tags = highlighted_text['tags'] + segment.context_tags = tags[tags.find(';')+1:] + + return results_list + +class SimpleSearch(object): + + def query(self, field, query): + hits = get_results_list(field, query) + + res = [] + for hit in hits: + res.append(hit.get_stored_fields()) + return res + + def query_all(self, query): + return self.query("all", query) + + + + + + + diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/indexation/backends/__init__.py diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/indexation/backends/elasticsearch_backend.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ldt/ldt/indexation/backends/elasticsearch_backend.py Tue Jul 31 12:13:53 2012 +0200 @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +''' +Created on Jul 30, 2012 + +@author: ymh +''' +from django.db.models.loading import get_model +from haystack.backends import BaseEngine, SearchResult, elasticsearch_backend +from haystack.constants import DJANGO_CT, DJANGO_ID +from ldt.ldt_utils.models import Segment +import datetime + +class ElasticsearchSearchBackend(elasticsearch_backend.ElasticsearchSearchBackend): + + def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, + fields='', highlight=False, facets=None, + date_facets=None, query_facets=None, + narrow_queries=None, spelling_query=None, + within=None, dwithin=None, distance_point=None, + models=None, limit_to_registered_models=None, + result_class=None): + + kwargs = super(ElasticsearchSearchBackend, self).build_search_kwargs(query_string, sort_by=sort_by, start_offset=start_offset, end_offset=end_offset, + fields=fields, highlight=highlight, facets=facets, + date_facets=date_facets, query_facets=query_facets, + narrow_queries=narrow_queries, spelling_query=spelling_query, + within=within, dwithin=dwithin, distance_point=distance_point, + models=models, limit_to_registered_models=limit_to_registered_models, + 
result_class=result_class) + + #TODO : try to make list of field dynamic + #TODO : How to handle multiple + if highlight: + fields_def = { } + + if models is None or len(models) == 0 or Segment in models: + fields_def['tags'] = {} + fields_def['title'] = {} + fields_def['abstract'] = {} + + kwargs['highlight'] = { + 'pre_tags' : [""], + 'post_tags' : [""], + "number_of_fragments" : 0, + 'fields': fields_def + } + + return kwargs + + def _process_results(self, raw_results, highlight=False, result_class=None): + from haystack import connections + results = [] + hits = raw_results.get('hits', {}).get('total', 0) + facets = {} + spelling_suggestion = None + + if result_class is None: + result_class = SearchResult + + if 'facets' in raw_results: + facets = { + 'fields': {}, + 'dates': {}, + 'queries': {}, + } + + for facet_fieldname, facet_info in raw_results['facets'].items(): + if facet_info.get('_type', 'terms') == 'terms': + facets['fields'][facet_fieldname] = [(individual['term'], individual['count']) for individual in facet_info['terms']] + elif facet_info.get('_type', 'terms') == 'date_histogram': + # Elasticsearch provides UTC timestamps with an extra three + # decimals of precision, which datetime barfs on. 
+ facets['dates'][facet_fieldname] = [(datetime.datetime.utcfromtimestamp(individual['time'] / 1000), individual['count']) for individual in facet_info['entries']] + elif facet_info.get('_type', 'terms') == 'query': + facets['queries'][facet_fieldname] = facet_info['count'] + + unified_index = connections[self.connection_alias].get_unified_index() + indexed_models = unified_index.get_indexed_models() + + for raw_result in raw_results.get('hits', {}).get('hits', []): + source = raw_result['_source'] + app_label, model_name = source[DJANGO_CT].split('.') + additional_fields = {} + model = get_model(app_label, model_name) + + if model and model in indexed_models: + for key, value in source.items(): + index = unified_index.get_index(model) + string_key = str(key) + + if string_key in index.fields and hasattr(index.fields[string_key], 'convert'): + additional_fields[string_key] = index.fields[string_key].convert(value) + else: + additional_fields[string_key] = self.conn.to_python(value) + + del(additional_fields[DJANGO_CT]) + del(additional_fields[DJANGO_ID]) + + if 'highlight' in raw_result: + additional_fields['highlighted'] = raw_result['highlight'] + + result = result_class(app_label, model_name, source[DJANGO_ID], raw_result['_score'], **additional_fields) + results.append(result) + else: + hits -= 1 + + return { + 'results': results, + 'hits': hits, + 'facets': facets, + 'spelling_suggestion': spelling_suggestion, + } + + +class ElasticsearchSearchEngine(BaseEngine): + backend = ElasticsearchSearchBackend + query = elasticsearch_backend.ElasticsearchSearchQuery diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/indexation/highlighter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ldt/ldt/indexation/highlighter.py Tue Jul 31 12:13:53 2012 +0200 @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- +''' +Created on Jul 30, 2012 + +@author: ymh +''' +from haystack.utils import Highlighter + +class LdtHighlighter(Highlighter): + + def render_html(self, 
highlight_locations=None, start_offset=None, end_offset=None): + + return super(LdtHighlighter, self).render_html(highlight_locations=highlight_locations, start_offset=0, end_offset=len(self.text_block)) \ No newline at end of file diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/indexation/search_indexes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ldt/ldt/indexation/search_indexes.py Tue Jul 31 12:13:53 2012 +0200 @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +''' +Created on Jul 23, 2012 + +@author: ymh +''' + +from haystack import indexes +from ldt.ldt_utils.models import Segment +from ldt.text.models import Annotation + +class SegmentIndex(indexes.RealTimeSearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + iri_id = indexes.CharField(model_attr='iri_id', indexed=False, stored=True) + project_id = indexes.CharField(model_attr='project_id', indexed=False, stored=True, null=True) + ensemble_id = indexes.CharField(model_attr='ensemble_id', indexed=False, stored=True) + cutting_id = indexes.CharField(model_attr='cutting_id', indexed=False, stored=True) + element_id = indexes.CharField(model_attr='element_id', indexed=False, stored=True) + tags = indexes.CharField(model_attr='tags', stored=True) + title = indexes.CharField(model_attr='title', stored=True) + abstract = indexes.CharField(model_attr='abstract', stored=True) + duration = indexes.IntegerField(model_attr='duration', indexed=False, stored=True) + author = indexes.CharField(model_attr='author', stored=True, null=True) + start_ts = indexes.IntegerField(model_attr='start_ts', indexed=False, stored=True) + date = indexes.CharField(model_attr='date', stored=True) + + def get_model(self): + return Segment + +class AnnotationIndex(indexes.RealTimeSearchIndex, indexes.Indexable): + text = indexes.CharField(document=True, use_template=True) + tags = indexes.CharField(model_attr='tags', indexed=True, stored=False) + title = indexes.CharField(model_attr='title', 
indexed=True, stored=True) + abstract = indexes.CharField(model_attr='description', indexed=True, stored=False) + text_field = indexes.CharField(model_attr='text', indexed=True, stored=False) + annotation_id = indexes.CharField(model_attr='external_id', indexed=False, stored=True) + + def get_model(self): + return Annotation + \ No newline at end of file diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/indexation/templates/search/indexes/ldt_utils/annotation_text.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ldt/ldt/indexation/templates/search/indexes/ldt_utils/annotation_text.txt Tue Jul 31 12:13:53 2012 +0200 @@ -0,0 +1,4 @@ +{{object.tags}} +{{object.title}} +{{object.description}} +{{object.text}} \ No newline at end of file diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/indexation/templates/search/indexes/ldt_utils/segment_text.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/ldt/ldt/indexation/templates/search/indexes/ldt_utils/segment_text.txt Tue Jul 31 12:13:53 2012 +0200 @@ -0,0 +1,3 @@ +{{object.tags}} +{{object.title}} +{{object.abstract}} \ No newline at end of file diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/ldt_utils/admin.py --- a/src/ldt/ldt/ldt_utils/admin.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/admin.py Tue Jul 31 12:13:53 2012 +0200 @@ -7,7 +7,6 @@ from ldt.ldt_utils.forms import LdtImportForm, ReindexForm, StatAnnotationForm from ldt.ldt_utils.models import Content, Project, Media, Author from ldt.ldt_utils.stat import update_stat_content -import ldt.indexation from guardian.admin import GuardedModelAdmin @@ -55,20 +54,15 @@ if request.method == "POST": form = ReindexForm(request.POST) if form.is_valid(): - # try: - writer = ldt.indexation.get_writer(True) - try: - contentList = form.cleaned_data["contents"] - indexer = ContentIndexer(contentList, writer) + contentList = form.cleaned_data["contents"] + indexer = ContentIndexer(contentList) + indexer.index_all() + + index_projects = 
form.cleaned_data["index_projects"] + if index_projects: + projectList = Project.objects.filter(contents__in=contentList, state=2).distinct() #filter(contents__in=contentList) @UndefinedVariable + indexer = ProjectIndexer(projectList) indexer.index_all() - - index_projects = form.cleaned_data["index_projects"] - if index_projects: - projectList = Project.objects.filter(contents__in=contentList, state=2).distinct() #filter(contents__in=contentList) @UndefinedVariable - indexer = ProjectIndexer(projectList, writer) - indexer.index_all() - finally: - writer.close() message = "Indexation ok : " + repr(form.cleaned_data["contents"]) form = ReindexForm() # except Exception, inst: diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/ldt_utils/contentindexer.py --- a/src/ldt/ldt/ldt_utils/contentindexer.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/contentindexer.py Tue Jul 31 12:13:53 2012 +0200 @@ -4,8 +4,6 @@ from ldt.ldt_utils.models import Segment, Content, Project from ldt.ldt_utils.utils import reduce_text_node from ldt.ldt_utils.stat import update_stat_project -import ldt.indexation -import lucene import lxml.etree import urllib #@UnresolvedImport # import ldt.utils.log @@ -16,9 +14,8 @@ class LdtIndexer(object): - def __init__(self, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST): + def __init__(self, decoupage_blackList=settings.DECOUPAGE_BLACKLIST): self.__decoupage_blacklist = decoupage_blackList - self.__writer = writer @Property def decoupage_blacklist(): #@NoSelf @@ -37,12 +34,6 @@ return locals() - @Property - def writer(): #@NoSelf - def fget(self): - return self.__writer - return locals() - def index_all(self): raise NotImplemented @@ -93,21 +84,6 @@ if project: ldt_id = project.ldt_id - doc = lucene.Document() - doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - 
doc.add(lucene.Field("project_id", ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("tags", tags, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("title", title, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("begin", str(start_ts), lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("duration", str(duration), lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("author", author, lucene.Field.Store.YES, lucene.Field.Index.ANALYZED)) - seg = Segment(content=content, iri_id=content.iri_id, ensemble_id=ensembleId, @@ -124,14 +100,12 @@ project_id=ldt_id) seg.polemics = seg.get_polemic(polemics) seg.save() - self.writer.addDocument(doc) - class ContentIndexer(LdtIndexer): - def __init__(self, contentList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST, callback=None): - super(ContentIndexer, self).__init__(writer, decoupage_blackList) + def __init__(self, contentList, decoupage_blackList=settings.DECOUPAGE_BLACKLIST, callback=None): + super(ContentIndexer, self).__init__(decoupage_blackList) self.__contentList = contentList self.__callback = callback @@ -141,27 +115,23 @@ self.__callback(i,content) self.index_content(content) - def index_content(self, content): - + def index_content(self, content): url = content.iri_url() filepath = urllib.urlopen(url) doc = lxml.etree.parse(filepath) #@UndefinedVariable - 
self.writer.deleteDocuments(lucene.Term("iri_id", content.iri_id)) Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable res = doc.xpath("/iri/body/ensembles/ensemble") for ensemble in res: self.index_ensemble(ensemble, content) - - self.writer.commit() - + class ProjectIndexer(LdtIndexer): - def __init__(self, projectList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST, callback=None): - super(ProjectIndexer, self).__init__(writer, decoupage_blackList) + def __init__(self, projectList, decoupage_blackList=settings.DECOUPAGE_BLACKLIST, callback=None): + super(ProjectIndexer, self).__init__(decoupage_blackList) self.__projectList = projectList self.__callback = callback @@ -177,7 +147,6 @@ # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id)) doc = lxml.etree.fromstring(project.ldt_encoded) #@UndefinedVariable - self.writer.deleteDocuments(lucene.Term("project_id", project.ldt_id)) Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete() #@UndefinedVariable res = doc.xpath("/iri/annotations/content") @@ -192,22 +161,15 @@ for ensemble in content.getchildren(): self.index_ensemble(ensemble, content_obj, project) - - self.writer.commit() @receiver(post_save, sender=Project) def index_project(sender, **kwargs): if settings.AUTO_INDEX_AFTER_SAVE: instance = kwargs['instance'] - writer = ldt.indexation.get_writer() - try: - if instance.state != 2: - writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id)) - Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() #@UndefinedVariable - update_stat_project(instance) - else: - projectIndexer = ProjectIndexer([instance], writer) - projectIndexer.index_all() - update_stat_project(instance) - finally: - writer.close() + if instance.state != 2: + Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() #@UndefinedVariable + update_stat_project(instance) + else: + projectIndexer = ProjectIndexer([instance]) + projectIndexer.index_all() 
+ update_stat_project(instance) diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/ldt_utils/models.py --- a/src/ldt/ldt/ldt_utils/models.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/models.py Tue Jul 31 12:13:53 2012 +0200 @@ -13,8 +13,6 @@ from utils import (create_ldt, copy_ldt, create_empty_iri, update_iri, generate_uuid) import datetime -import ldt.indexation -import lucene import lxml.etree import mimetypes import os.path @@ -179,16 +177,7 @@ return locals() mimetype = property(**mimetype()) - - def delete(self): - super(Content, self).delete() - writer = ldt.indexation.get_writer() - try: - writer.deleteDocuments(lucene.Term("iri_id", self.iri_id)) - writer.commit() - finally: - writer.close() - + def sync_iri_file(self): # create iri file if needed created = False diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/ldt_utils/templates/front/front_search_results.html --- a/src/ldt/ldt/ldt_utils/templates/front/front_search_results.html Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/templates/front/front_search_results.html Tue Jul 31 12:13:53 2012 +0200 @@ -116,10 +116,10 @@ {% thumbnail res.content.image "300x200" format="PNG" crop="center" as im %}{% endthumbnail %}

{% if segment.title %}{{ segment.title }}{% else %}{% trans "No title" %}{% endif %}

-

{{ segment.abstract }}

- {% if segment.tags %} +

{% if segment.context %}{{ segment.context }}{% else %}{{ segment.abstract }}{% endif %}

+ {% if segment.context_tags or segment.tags %}
Tags:
-

{{ segment.tags }}

+

{% if segment.context_tags %}{{ segment.context_tags }}{% else %}{{ segment.tags }}{% endif %}

{% endif %} diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/ldt_utils/utils.py --- a/src/ldt/ldt/ldt_utils/utils.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/utils.py Tue Jul 31 12:13:53 2012 +0200 @@ -1,6 +1,5 @@ from copy import deepcopy from django.conf import settings -from ldt.indexation import get_searcher, get_results_list from django.utils.translation import ugettext as _ from StringIO import StringIO import datetime @@ -37,21 +36,6 @@ def generate_uuid(): return unicode(uuid.uuid1()) -class LdtSearch(object): - - def query(self, field, query): - indexSearcher = get_searcher() - hits = get_results_list(field, query) - - res = [] - for hit in hits: - doc = indexSearcher.doc(hit.doc) - res.append({"iri_id":doc.get("iri_id"), "ensemble_id":doc.get("ensemble_id"), "decoupage_id":doc.get("decoupage_id"), "element_id":doc.get("element_id"), "project_id":doc.get("project_id"), "begin":doc.get("begin"), "duration":doc.get("duration")}) - indexSearcher.close() - return res - - def query_all(self, query): - return self.query("all", query) class LdtUtils(object): diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/ldt_utils/views/lignesdetemps.py --- a/src/ldt/ldt/ldt_utils/views/lignesdetemps.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/views/lignesdetemps.py Tue Jul 31 12:13:53 2012 +0200 @@ -5,7 +5,8 @@ from django.shortcuts import render_to_response, get_object_or_404 from django.template import RequestContext from ldt.ldt_utils.models import Content, Project -from ldt.ldt_utils.utils import LdtUtils, LdtSearch, clean_description +from ldt.ldt_utils.utils import LdtUtils, clean_description +from ldt.indexation import SimpleSearch from ldt.security.utils import set_forbidden_stream from ldt.ldt_utils.stat import update_stat_project import base64 @@ -41,7 +42,7 @@ queryStr = base64.urlsafe_b64decode(query.encode("ascii")).decode("utf8") else: queryStr = query - searcher = LdtSearch() + searcher = SimpleSearch() ids = {} projIds = {} 
typesIds = {} @@ -76,7 +77,7 @@ def search_segments(request, field, query, edition=None): if query and len(query) > 0: - searcher = LdtSearch() + searcher = SimpleSearch() queryStr = base64.urlsafe_b64decode(query.encode("ascii")).decode("utf8") res = searcher.query(field, queryStr) diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/ldt_utils/views/workspace.py --- a/src/ldt/ldt/ldt_utils/views/workspace.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/ldt_utils/views/workspace.py Tue Jul 31 12:13:53 2012 +0200 @@ -213,10 +213,10 @@ valid_segments = [] for s in all_related_segments: - segment = [seg for seg in all_segments if seg.element_id == s['element_id'] and seg.project_id == s['project_id'] and seg.iri_id == s['iri_id'] and seg.cutting_id == s['decoupage_id'] and seg.ensemble_id == s['ensemble_id'] ][0] + segment = [seg for seg in all_segments if seg.element_id == s['element_id'] and seg.project_id == s['project_id'] and seg.iri_id == s['iri_id'] and seg.cutting_id == s['cutting_id'] and seg.ensemble_id == s['ensemble_id'] ][0] segment.score = s['score'] - segment.lucene_id = s['lucene_id'] + segment.indexation_id = s['indexation_id'] segment.context = s['context'] segment.context_tags = s['tags'] diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/management/commands/reindex.py --- a/src/ldt/ldt/management/commands/reindex.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/management/commands/reindex.py Tue Jul 31 12:13:53 2012 +0200 @@ -1,5 +1,4 @@ from django.core.management.base import BaseCommand -import ldt.indexation from ldt.ldt_utils.models import Content, Project from ldt.ldt_utils.contentindexer import ContentIndexer, ProjectIndexer from ldt.management.utils import show_progress @@ -22,9 +21,7 @@ def handle(self, *args, **options): parser = self.create_parser("reindex", "") options, _ = parser.parse_args() - - writer = ldt.indexation.get_writer(True) - + if options.content_id: self.stdout.write('Creating index for %s\n' % options.content_id) 
contentList = Content.objects.filter(iri_id=options.content_id) @@ -36,7 +33,7 @@ c = lambda i,o: show_progress(i+1, count, o.title, 50) - indexer = ContentIndexer(contentList, writer, callback=c) + indexer = ContentIndexer(contentList, callback=c) indexer.index_all() if options.projects: @@ -44,7 +41,5 @@ projectList = Project.objects.filter(contents__in=contentList, state=2).distinct() count = projectList.count() c = lambda i,o: show_progress(i+1, count, o.title, 50) - indexer = ProjectIndexer(projectList, writer, callback=c) + indexer = ProjectIndexer(projectList, callback=c) indexer.index_all() - - writer.close() diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/settings.py --- a/src/ldt/ldt/settings.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/settings.py Tue Jul 31 12:13:53 2012 +0200 @@ -20,12 +20,14 @@ 'django.contrib.admin', 'registration', 'tagging', + 'haystack', 'ldt', 'ldt.core', 'ldt.ldt_utils', 'ldt.text', 'ldt.user', 'ldt.management', + 'ldt.indexation', 'oauth_provider', 'django_openid_consumer', 'piston', @@ -96,4 +98,10 @@ DEFAULT_USER_ICON = "thumbnails/users/user_default_icon.png" DEFAULT_GROUP_ICON = "thumbnails/groups/group_default_icon.png" -EXTERNAL_STREAM_SRC = getattr(settings, 'EXTERNAL_STREAM_SRC', ['youtube.com', 'dailymotion.com']) \ No newline at end of file +EXTERNAL_STREAM_SRC = getattr(settings, 'EXTERNAL_STREAM_SRC', ['youtube.com', 'dailymotion.com']) + +HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.simple_backend.SimpleEngine', + }, +} diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/text/annotindexer.py --- a/src/ldt/ldt/text/annotindexer.py Tue Jul 31 11:51:24 2012 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,41 +0,0 @@ -import lucene - - -class AnnotIndexer(object): - - def __init__(self, annotList, writer): - self.__annotList = annotList - self.__writer = writer - - - def index_all(self): - for annot in self.__annotList: - self.index_annotation(annot) - - - def 
index_annotation(self, annotation): - - doc = lucene.Document() - - doc.add(lucene.Field("annotation_id", annotation.external_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - - annottags = annotation.get_tag_list() - tags = "" - - if annottags is None or len(annottags) == 0: - tags = "" - else: - for tag in annottags: - tags += tag + ";" - - doc.add(lucene.Field("type_doc", "text-annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("title", annotation.title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("abstract", annotation.description, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("text", annotation.text, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("all", " ".join([tags, annotation.title, annotation.description, annotation.text]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - - self.__writer.addDocument(doc) - - self.__writer.close() - diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/text/models.py --- a/src/ldt/ldt/text/models.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/text/models.py Tue Jul 31 12:13:53 2012 +0200 @@ -1,10 +1,7 @@ -from annotindexer import AnnotIndexer from django.db import models from django.utils.translation import ugettext_lazy as _ from tagging.models import Tag from utils import generate_uuid -import ldt.indexation -import lucene import lxml import tagging.fields #from django.core.management.validation import max_length @@ -118,37 +115,8 @@ def create_annotation(external_id, uri=None, tags=None, title=None, description=None, text=None, color=None, creator=None, contributor=None, creation_date=None, update_date=None): annotation = Annotation(external_id=external_id, uri=uri, tags=tags, title=title, description=description, text=text, color=color, creator=creator, contributor=contributor, 
creation_date=creation_date, update_date=update_date) annotation.save() - annotation.index_annot() return annotation - - def delete(self): - super(Annotation, self).delete() - lucene.getVMEnv().attachCurrentThread() - writer = ldt.indexation.get_writer() - try: - writer.deleteDocuments(lucene.Term("external_id", self.external_id)) - finally: - writer.close() - - def index_annot(self): - lucene.getVMEnv().attachCurrentThread() - writer = ldt.indexation.get_writer() - try: - annotl = [self, ] - indexer = AnnotIndexer(annotl, writer) - indexer.index_all() - finally: - writer.close() - - def update_index(self): - lucene.getVMEnv().attachCurrentThread() - writer = ldt.indexation.get_writer() - try: - writer.deleteDocuments(lucene.Term("external_id", self.external_id)) - finally: - writer.close() - self.index_annot() diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/text/utils.py --- a/src/ldt/ldt/text/utils.py Tue Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/text/utils.py Tue Jul 31 12:13:53 2012 +0200 @@ -1,7 +1,6 @@ from django.conf import settings -from ldt.indexation import STORE -import lucene import uuid +import ldt.indexation __BOOLEAN_DICT = { 'false':False, @@ -38,18 +37,7 @@ class TextSearch(object): def query(self, field, query): - indexSearcher = lucene.IndexSearcher(STORE) - queryParser = lucene.QueryParser(lucene.Version.LUCENE_30, field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_30)) - queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND) - queryObj = queryParser.parse(query) - hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER) - - res = [] - for hit in hits.scoreDocs: - doc = indexSearcher.doc(hit.doc) - res.append({"external_id":doc.get("external_id"), "title":doc.get("title")}) - indexSearcher.close() - return res + return ldt.indexation.get_result_text(field, query) def query_all(self, query): return self.query("all", query) diff -r a25d344cb446 -r b93c99226832 src/ldt/ldt/text/views.py --- a/src/ldt/ldt/text/views.py Tue 
Jul 31 11:51:24 2012 +0200 +++ b/src/ldt/ldt/text/views.py Tue Jul 31 12:13:53 2012 +0200 @@ -192,7 +192,6 @@ annot.update_date = unicode(update_date[0]) annot.save() - annot.update_index() return HttpResponse(lxml.etree.tostring(annot.serialize(), pretty_print=True), mimetype="text/xml;charset=utf-8") diff -r a25d344cb446 -r b93c99226832 virtualenv/res/lib/lib_create_env.py --- a/virtualenv/res/lib/lib_create_env.py Tue Jul 31 11:51:24 2012 +0200 +++ b/virtualenv/res/lib/lib_create_env.py Tue Jul 31 12:13:53 2012 +0200 @@ -35,13 +35,13 @@ 'SSH': {'setup': 'ssh', 'url':'http://pypi.python.org/packages/source/s/ssh/ssh-1.7.14.tar.gz#md5=4cdd0549ef4699bd67b96264d3b21427', 'local':'ssh-1.7.14.tar.gz', 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}}, 'FABRIC': {'setup': 'fabric', 'url':'https://github.com/fabric/fabric/tarball/1.4.2', 'local':'fabric-1.4.2.tar.gz', 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}}, 'MERCURIAL': {'setup': 'mercurial', 'url':'http://mercurial.selenic.com/release/mercurial-2.2.2.tar.gz', 'local':'mercurial-2.2.2.tar.gz', 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}}, + 'HAYSTACK': {'setup': 'django-haystack', 'url': 'https://github.com/toastdriven/django-haystack/tarball/master', 'local': 'django-haystack-v2.0.0.tar.gz', 'install':{'method':'pip', 'option_str': None, 'dict_extra_env': None}}, + 'REQUEST': {'setup': 'requests', 'url':'https://github.com/kennethreitz/requests/tarball/v0.13.3', 'local':'requests-v0.13.3.tar.gz', 'install' : {'method':'pip', 'option_str': None, 'dict_extra_env': None}}, } if system_str == 'Windows': URLS.update({ 'PSYCOPG2': {'setup': 'psycopg2','url': 'psycopg2-2.0.14.win32-py2.6-pg8.4.3-release.zip', 'local':"psycopg2-2.0.14.win32-py2.6-pg8.4.3-release.zip", 'install': {'method': 'install_psycopg2', 'option_str': None, 'dict_extra_env': None}}, - 'JCC': {'setup': 'jcc', 'url': 
'http://pylucene-win32-binary.googlecode.com/files/JCC-2.6-py2.6-win32.egg', 'local':"JCC-2.6-py2.6-win32.egg", 'install': {'method': 'easy_install', 'option_str': None, 'dict_extra_env': None}}, - 'PYLUCENE': {'setup': 'pylucene', 'url': 'http://pylucene-win32-binary.googlecode.com/files/lucene-3.0.2-py2.6-win32.egg', 'local':"lucene-3.0.2-py2.6-win32.egg", 'install': {'method': 'easy_install', 'option_str': None, 'dict_extra_env': None}}, 'PIL': {'setup': 'pil', 'url': 'http://effbot.org/media/downloads/PIL-1.1.7.win32-py2.6.exe', 'local':"PIL-1.1.7.win32-py2.6.exe", 'install': {'method': 'easy_install', 'option_str': None, 'dict_extra_env': None}}, 'LXML': {'setup': 'lxml', 'url': 'http://pypi.python.org/packages/2.6/l/lxml/lxml-2.2.2-py2.6-win32.egg', 'local':"lxml-2.2.2-py2.6-win32.egg", 'install': {'method': 'easy_install', 'option_str': None, 'dict_extra_env': None}} }) @@ -55,7 +55,6 @@ URLS.update({ 'PSYCOPG2': {'setup': 'psycopg2','url': 'http://www.psycopg.org/psycopg/tarballs/PSYCOPG-2-4/psycopg2-2.4.5.tar.gz', 'local':"psycopg2-2.4.5.tar.gz", 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}}, - 'PYLUCENE': {'setup': 'pylucene', 'url': 'http://mirrors.ircam.fr/pub/apache/lucene/pylucene/pylucene-3.6.0-2-src.tar.gz', 'local':"pylucene-3.6.0-2-src.tar.gz", 'install': {'method': 'install_pylucene', 'option_str': None, 'dict_extra_env': None}}, 'PIL': {'setup': 'pil', 'url': 'http://effbot.org/downloads/Imaging-1.1.7.tar.gz', 'local':"Imaging-1.1.7.tar.gz", 'install': {'method': 'easy_install', 'option_str': None, 'dict_extra_env': None}}, 'LXML': {'setup': 'lxml', 'url':"lxml-2.3.4.tar.bz2", 'local':"lxml-2.3.4.tar.bz2", 'install': {'method': lxml_method, 'option_str': None, 'dict_extra_env': lxml_options}}, }) diff -r a25d344cb446 -r b93c99226832 virtualenv/res/src/JCC-2.6-py2.6-win32.egg Binary file virtualenv/res/src/JCC-2.6-py2.6-win32.egg has changed diff -r a25d344cb446 -r b93c99226832 
virtualenv/res/src/django-haystack-v2.0.0.tar.gz Binary file virtualenv/res/src/django-haystack-v2.0.0.tar.gz has changed diff -r a25d344cb446 -r b93c99226832 virtualenv/res/src/lucene-3.0.2-py2.6-win32.egg Binary file virtualenv/res/src/lucene-3.0.2-py2.6-win32.egg has changed diff -r a25d344cb446 -r b93c99226832 virtualenv/res/src/pyelasticsearch.tar.gz Binary file virtualenv/res/src/pyelasticsearch.tar.gz has changed diff -r a25d344cb446 -r b93c99226832 virtualenv/res/src/pylucene-3.6.0-2-src.tar.gz Binary file virtualenv/res/src/pylucene-3.6.0-2-src.tar.gz has changed diff -r a25d344cb446 -r b93c99226832 virtualenv/res/src/requests-v0.13.3.tar.gz Binary file virtualenv/res/src/requests-v0.13.3.tar.gz has changed diff -r a25d344cb446 -r b93c99226832 virtualenv/web/res/res_create_env.py --- a/virtualenv/web/res/res_create_env.py Tue Jul 31 11:51:24 2012 +0200 +++ b/virtualenv/web/res/res_create_env.py Tue Jul 31 12:13:53 2012 +0200 @@ -7,7 +7,6 @@ INSTALLS = [ #(key,method, option_str, dict_extra_env) 'LXML', - 'PYLUCENE', 'PSYCOPG2', 'SOUTH', 'PIL', @@ -26,9 +25,6 @@ 'SORL_THUMBNAIL', ] -if system_str == 'Windows': - INSTALLS.insert(0, 'JCC') - if system_str == "Linux": INSTALLS.insert(2, 'DISTRIBUTE') diff -r a25d344cb446 -r b93c99226832 web/ldtplatform/config.py.tmpl --- a/web/ldtplatform/config.py.tmpl Tue Jul 31 11:51:24 2012 +0200 +++ b/web/ldtplatform/config.py.tmpl Tue Jul 31 12:13:53 2012 +0200 @@ -98,5 +98,12 @@ FRONT_TAG_LIST = [u"Enmi 2011", u"film", u"conférence"] +HAYSTACK_CONNECTIONS = { + 'default': { + #for elasticsearch use ldt.indexation.backends.elasticsearch_backend.ElasticsearchSearchEngine + 'ENGINE': 'haystack.backends.simple_backend.SimpleEngine', + }, +} + diff -r a25d344cb446 -r b93c99226832 web/ldtplatform/settings.py --- a/web/ldtplatform/settings.py Tue Jul 31 11:51:24 2012 +0200 +++ b/web/ldtplatform/settings.py Tue Jul 31 12:13:53 2012 +0200 @@ -129,6 +129,7 @@ 'django.contrib.messages', 'django.contrib.admin', 
'django.contrib.staticfiles', + 'haystack', 'ldtplatform', 'registration', 'tagging', @@ -138,6 +139,7 @@ 'ldt.text', 'ldt.user', 'ldt.management', + 'ldt.indexation', 'oauth_provider', 'django_openid_consumer', 'piston', @@ -231,6 +233,12 @@ EXTERNAL_STREAM_SRC = ['youtube.com', 'dailymotion.com'] +HAYSTACK_CONNECTIONS = { + 'default': { + 'ENGINE': 'haystack.backends.simple_backend.SimpleEngine', + }, +} + from config import * if not "LOGIN_URL" in locals():