# HG changeset patch # User cavaliet # Date 1398250827 -7200 # Node ID 7a638196577dbdba0ada5436c114335e39faf62a # Parent 22db13f77355030df7abbf709ab7297f353f6e72 v1.53.2 : taggit optimisation diff -r 22db13f77355 -r 7a638196577d src/ldt/ldt/__init__.py --- a/src/ldt/ldt/__init__.py Mon Apr 14 15:17:27 2014 +0200 +++ b/src/ldt/ldt/__init__.py Wed Apr 23 13:00:27 2014 +0200 @@ -1,6 +1,6 @@ __all__ = ["VERSION", "get_version", "__version__"] -VERSION = (1, 53, 1, "final", 0) +VERSION = (1, 53, 2, "final", 0) def get_version(): diff -r 22db13f77355 -r 7a638196577d src/ldt/ldt/indexation/__init__.py --- a/src/ldt/ldt/indexation/__init__.py Mon Apr 14 15:17:27 2014 +0200 +++ b/src/ldt/ldt/indexation/__init__.py Wed Apr 23 13:00:27 2014 +0200 @@ -118,18 +118,25 @@ for o in objs: conn.get_backend().remove(o, commit=True) + + def object_insert(model, object_list, func_key, using = None): if not object_list: return model.objects.bulk_create(object_list) - obj_dict = dict(model.objects.filter(**{func_key+'__in':[getattr(o, func_key) for o in object_list]}).values_list(func_key,"id")) for o in object_list: o.id = obj_dict[getattr(o,func_key)] + +def object_run_index(model, object_list, using = None): + + if not object_list: + return + if not using: using = DEFAULT_ALIAS @@ -141,8 +148,7 @@ index = unified_index.get_index(model) backend.update(index, object_list) - - + class SimpleSearch(object): diff -r 22db13f77355 -r 7a638196577d src/ldt/ldt/indexation/search_indexes.py --- a/src/ldt/ldt/indexation/search_indexes.py Mon Apr 14 15:17:27 2014 +0200 +++ b/src/ldt/ldt/indexation/search_indexes.py Wed Apr 23 13:00:27 2014 +0200 @@ -30,8 +30,12 @@ return Segment def prepare_tags(self, obj): + if obj.tag_list is not None: + obj.tags = None # To avoid a second and useless db request + return ",".join(obj.tag_list) return ",".join([tag.name for tag in obj.tags.all()]) + class AnnotationIndex(indexes.SearchIndex, indexes.Indexable): text = indexes.CharField(document=True, use_template=True) tags = indexes.CharField(model_attr='tags', indexed=True, stored=False) diff -r 22db13f77355 -r 7a638196577d src/ldt/ldt/ldt_utils/contentindexer.py --- a/src/ldt/ldt/ldt_utils/contentindexer.py Mon Apr 14 15:17:27 2014 +0200 +++ b/src/ldt/ldt/ldt_utils/contentindexer.py Wed Apr 23 13:00:27 2014 +0200 @@ -1,16 +1,18 @@ from StringIO import StringIO +from django.contrib.contenttypes.models import ContentType from django.dispatch import receiver from ldt import settings -from ldt.indexation import object_delete, object_insert +from ldt.indexation import object_delete, object_insert, object_run_index from ldt.ldt_utils.events import post_project_save from ldt.ldt_utils.models import Segment, Content, Project from ldt.ldt_utils.stat import update_stat_project, add_annotation_to_stat from ldt.ldt_utils.utils import reduce_text_node from ldt.utils.url import request_with_auth -import logging +from taggit.models import Tag, TaggedItem +from taggit.utils import parse_tags import lxml.etree #@UnresolvedImport -from taggit.utils import parse_tags +import logging logger = logging.getLogger(__name__) def Property(func): @@ -24,6 +26,9 @@ self.__decoupage_blacklist = decoupage_blackList self.__callback = callback self.__segment_cache = [] + self.__all_tags_cache = {} + self.__segment_tags_cache = {} + self.__tags_cache = [] @Property def decoupage_blacklist(): #@NoSelf @@ -50,11 +55,11 @@ def index_object(self, obj): - self._do_index_object(obj) if self.__segment_cache: object_insert(Segment, self.__segment_cache, 'id_hash') + object_run_index(Segment, self.__segment_cache) self.__segment_cache = [] @@ -63,6 +68,7 @@ def index_ensemble(self, ensemble, content, project=None): ensembleId = ensemble.get(u"id", None) + ctp = ContentType.objects.get_for_model(Segment) for decoupageNode in ensemble.getchildren(): if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id", None) in self.decoupage_blacklist: @@ -74,7 +80,7 @@ elementId = elementNode.get(u"id", None) tags = elementNode.get(u"tags", None) - + if tags is None or len(tags) == 0: tags = u"" restagnode = elementNode.xpath("tag/text()", smart_strings=False) @@ -110,7 +116,7 @@ if audio_node: audio_src = audio_node[0].get(u"source", u"") audio_href = audio_node[0].text - + seg = Segment.create(content=content, iri_id=content.iri_id, ensemble_id=ensembleId, @@ -126,20 +132,75 @@ project_id=ldt_id, audio_src=audio_src, audio_href=audio_href) - # Because of taggit managing (we HAVE to have primary key to ad tags), we save segment and then tags + + tags = parse_tags(tags) + self.__segment_tags_cache[seg.id_hash] = tags seg.polemics = seg.get_polemic(polemics) - seg.save() - for t in parse_tags(tags): - seg.tags.add(t) - seg.save() - -# if settings.LDT_INDEXATION_INSERT_BATCH_SIZE < 2: -# seg.save() -# else: -# self.__segment_cache.append(seg) -# if not (len(self.__segment_cache)%settings.LDT_INDEXATION_INSERT_BATCH_SIZE): -# object_insert(Segment, self.__segment_cache) -# self.__segment_cache = [] + if settings.LDT_INDEXATION_INSERT_BATCH_SIZE < 2: + seg.save() + seg.tags.add(*tags) + else: + self.__segment_cache.append(seg) + self.__tags_cache = set( list(self.__tags_cache) + tags) + + if not (len(self.__segment_cache)%settings.LDT_INDEXATION_INSERT_BATCH_SIZE): + # First we insert/bulk_create the segments + object_insert(Segment, self.__segment_cache, 'id_hash') + # Filter already existing tags in current dict + for t in list(self.__tags_cache): + if t in self.__all_tags_cache: + self.__tags_cache.remove(t) + # Filter already existing tags in database + current_tags = Tag.objects.filter(name__in=self.__tags_cache) + for t in current_tags: + self.__all_tags_cache[t.name] = t + self.__tags_cache.remove(t.name) + # If the rest of tags were never in the db, we save them + if len(self.__tags_cache)>0: + for t in self.__tags_cache: + tag = Tag.objects.create(name=t) + self.__all_tags_cache[t] = tag + + # Prepare taggeditems + ti = [] + for s in self.__segment_cache: + s.tag_list = self.__segment_tags_cache[s.id_hash] + for t in self.__segment_tags_cache[s.id_hash]: + ti.append( TaggedItem(tag=self.__all_tags_cache[t], content_type=ctp, object_id=s.pk) ) + TaggedItem.objects.bulk_create(ti) + object_run_index(Segment, self.__segment_cache) + self.__segment_cache = [] + + # last loop if necessary + if len(self.__segment_cache) > 0: + # First we insert/bulk_create the segments + object_insert(Segment, self.__segment_cache, 'id_hash') + # Filter already existing tags in current dict + for t in list(self.__tags_cache): + if t in self.__all_tags_cache: + self.__tags_cache.remove(t) + # Filter already existing tags in database + current_tags = Tag.objects.filter(name__in=self.__tags_cache) + for t in current_tags: + self.__all_tags_cache[t.name] = t + self.__tags_cache.remove(t.name) + # If the rest of tags were never in the db, we save them + if len(self.__tags_cache)>0: + for t in self.__tags_cache: + tag = Tag.objects.create(name=t) + self.__all_tags_cache[t] = tag + + # Prepare taggeditems + ti = [] + for s in self.__segment_cache: + s.tag_list = self.__segment_tags_cache[s.id_hash] + for t in self.__segment_tags_cache[s.id_hash]: + ti.append( TaggedItem(tag=self.__all_tags_cache[t], content_type=ctp, object_id=s.pk) ) + TaggedItem.objects.bulk_create(ti) + object_run_index(Segment, self.__segment_cache) + # End of batch + self.__segment_cache = [] + class ContentIndexer(LdtIndexer):