v1.53.2 : taggit optimisation V01.53.02
author: cavaliet
Wed, 23 Apr 2014 13:00:27 +0200
changeset 1300 7a638196577d
parent 1299 22db13f77355
child 1301 3b27efb4e2f5
v1.53.2 : taggit optimisation
src/ldt/ldt/__init__.py
src/ldt/ldt/indexation/__init__.py
src/ldt/ldt/indexation/search_indexes.py
src/ldt/ldt/ldt_utils/contentindexer.py
--- a/src/ldt/ldt/__init__.py	Mon Apr 14 15:17:27 2014 +0200
+++ b/src/ldt/ldt/__init__.py	Wed Apr 23 13:00:27 2014 +0200
@@ -1,6 +1,6 @@
 __all__ = ["VERSION", "get_version", "__version__"]
 
-VERSION = (1, 53, 1, "final", 0)
+VERSION = (1, 53, 2, "final", 0)
 
 
 def get_version():
--- a/src/ldt/ldt/indexation/__init__.py	Mon Apr 14 15:17:27 2014 +0200
+++ b/src/ldt/ldt/indexation/__init__.py	Wed Apr 23 13:00:27 2014 +0200
@@ -118,18 +118,25 @@
         for o in objs:
             conn.get_backend().remove(o, commit=True)
 
+
+
 def object_insert(model, object_list, func_key, using = None):
     
     if not object_list:
         return
 
     model.objects.bulk_create(object_list)
-    
     obj_dict = dict(model.objects.filter(**{func_key+'__in':[getattr(o, func_key) for o in object_list]}).values_list(func_key,"id"))
 
     for o in object_list:
         o.id = obj_dict[getattr(o,func_key)]
 
+
+def object_run_index(model, object_list, using = None):
+    
+    if not object_list:
+        return
+
     if not using:
         using = DEFAULT_ALIAS        
     
@@ -141,8 +148,7 @@
     index = unified_index.get_index(model)
     
     backend.update(index, object_list)
-    
-    
+
 
 
 class SimpleSearch(object):
--- a/src/ldt/ldt/indexation/search_indexes.py	Mon Apr 14 15:17:27 2014 +0200
+++ b/src/ldt/ldt/indexation/search_indexes.py	Wed Apr 23 13:00:27 2014 +0200
@@ -30,8 +30,12 @@
         return Segment
     
     def prepare_tags(self, obj):
+        if obj.tag_list is not None:
+            obj.tags = None # To avoid a second and useless db request
+            return ",".join(obj.tag_list)
         return ",".join([tag.name for tag in obj.tags.all()])
     
+    
 class AnnotationIndex(indexes.SearchIndex, indexes.Indexable):
     text = indexes.CharField(document=True, use_template=True)
     tags = indexes.CharField(model_attr='tags', indexed=True, stored=False)
--- a/src/ldt/ldt/ldt_utils/contentindexer.py	Mon Apr 14 15:17:27 2014 +0200
+++ b/src/ldt/ldt/ldt_utils/contentindexer.py	Wed Apr 23 13:00:27 2014 +0200
@@ -1,16 +1,18 @@
 from StringIO import StringIO
+from django.contrib.contenttypes.models import ContentType
 from django.dispatch import receiver
 from ldt import settings
-from ldt.indexation import object_delete, object_insert
+from ldt.indexation import object_delete, object_insert, object_run_index
 from ldt.ldt_utils.events import post_project_save
 from ldt.ldt_utils.models import Segment, Content, Project
 from ldt.ldt_utils.stat import update_stat_project, add_annotation_to_stat
 from ldt.ldt_utils.utils import reduce_text_node
 from ldt.utils.url import request_with_auth
-import logging
+from taggit.models import Tag, TaggedItem
+from taggit.utils import parse_tags
 import lxml.etree #@UnresolvedImport
-from taggit.utils import parse_tags
 
+import logging
 logger = logging.getLogger(__name__)
 
 def Property(func):
@@ -24,6 +26,9 @@
         self.__decoupage_blacklist = decoupage_blackList
         self.__callback = callback
         self.__segment_cache = []
+        self.__all_tags_cache = {}
+        self.__segment_tags_cache = {}
+        self.__tags_cache = []
         
     @Property
     def decoupage_blacklist(): #@NoSelf
@@ -50,11 +55,11 @@
     
     
     def index_object(self, obj):
-        
         self._do_index_object(obj)
         
         if self.__segment_cache:
             object_insert(Segment, self.__segment_cache, 'id_hash')
+            object_run_index(Segment, self.__segment_cache)
             self.__segment_cache = []
 
 
@@ -63,6 +68,7 @@
     
     def index_ensemble(self, ensemble, content, project=None):
         ensembleId = ensemble.get(u"id", None)
+        ctp = ContentType.objects.get_for_model(Segment)
         
         for decoupageNode in ensemble.getchildren():
             if decoupageNode.tag != "decoupage"  or decoupageNode.get(u"id", None) in self.decoupage_blacklist:
@@ -74,7 +80,7 @@
                 
                 elementId = elementNode.get(u"id", None)
                 tags = elementNode.get(u"tags", None)
-                                
+                
                 if tags is None or len(tags) == 0:
                     tags = u""
                     restagnode = elementNode.xpath("tag/text()", smart_strings=False)
@@ -110,7 +116,7 @@
                 if audio_node:
                     audio_src = audio_node[0].get(u"source", u"")
                     audio_href = audio_node[0].text
-
+                
                 seg = Segment.create(content=content,
                               iri_id=content.iri_id,
                               ensemble_id=ensembleId,
@@ -126,20 +132,75 @@
                               project_id=ldt_id,
                               audio_src=audio_src,
                               audio_href=audio_href)
-                # Because of taggit managing (we HAVE to have primary key to ad tags), we save segment and then tags
+                
+                tags = parse_tags(tags)
+                self.__segment_tags_cache[seg.id_hash] = tags
                 seg.polemics = seg.get_polemic(polemics)
-                seg.save()
-                for t in parse_tags(tags):
-                    seg.tags.add(t)
-                seg.save()
-                
-#                 if settings.LDT_INDEXATION_INSERT_BATCH_SIZE < 2:
-#                     seg.save()
-#                 else:
-#                     self.__segment_cache.append(seg)
-#                     if not (len(self.__segment_cache)%settings.LDT_INDEXATION_INSERT_BATCH_SIZE):
-#                         object_insert(Segment, self.__segment_cache)
-#                         self.__segment_cache = []
+                if settings.LDT_INDEXATION_INSERT_BATCH_SIZE < 2:
+                    seg.save()
+                    seg.tags.add(*tags)
+                else:
+                    self.__segment_cache.append(seg)
+                    self.__tags_cache = set( list(self.__tags_cache) + tags)
+                    
+                    if not (len(self.__segment_cache)%settings.LDT_INDEXATION_INSERT_BATCH_SIZE):
+                        # First we insert/bulk_create the segments
+                        object_insert(Segment, self.__segment_cache, 'id_hash')
+                        # Filter already existing tags in current dict
+                        for t in list(self.__tags_cache):
+                            if t in self.__all_tags_cache:
+                                self.__tags_cache.remove(t)
+                        # Filter already existing tags in database
+                        current_tags = Tag.objects.filter(name__in=self.__tags_cache)
+                        for t in current_tags:
+                            self.__all_tags_cache[t.name] = t
+                            self.__tags_cache.remove(t.name)
+                        # If the rest of tags were never in the db, we save them
+                        if len(self.__tags_cache)>0:
+                            for t in self.__tags_cache:
+                                tag = Tag.objects.create(name=t)
+                                self.__all_tags_cache[t] = tag
+                        
+                        # Prepare taggeditems
+                        ti = []
+                        for s in self.__segment_cache:
+                            s.tag_list = self.__segment_tags_cache[s.id_hash]
+                            for t in self.__segment_tags_cache[s.id_hash]:
+                                ti.append( TaggedItem(tag=self.__all_tags_cache[t], content_type=ctp, object_id=s.pk) )
+                        TaggedItem.objects.bulk_create(ti)
+                        object_run_index(Segment, self.__segment_cache)
+                        self.__segment_cache = []
+        
+        # last loop if necessary
+        if len(self.__segment_cache) > 0:
+            # First we insert/bulk_create the segments
+            object_insert(Segment, self.__segment_cache, 'id_hash')
+            # Filter already existing tags in current dict
+            for t in list(self.__tags_cache):
+                if t in self.__all_tags_cache:
+                    self.__tags_cache.remove(t)
+            # Filter already existing tags in database
+            current_tags = Tag.objects.filter(name__in=self.__tags_cache)
+            for t in current_tags:
+                self.__all_tags_cache[t.name] = t
+                self.__tags_cache.remove(t.name)
+            # If the rest of tags were never in the db, we save them
+            if len(self.__tags_cache)>0:
+                for t in self.__tags_cache:
+                    tag = Tag.objects.create(name=t)
+                    self.__all_tags_cache[t] = tag
+            
+            # Prepare taggeditems
+            ti = []
+            for s in self.__segment_cache:
+                s.tag_list = self.__segment_tags_cache[s.id_hash]
+                for t in self.__segment_tags_cache[s.id_hash]:
+                    ti.append( TaggedItem(tag=self.__all_tags_cache[t], content_type=ctp, object_id=s.pk) )
+            TaggedItem.objects.bulk_create(ti)
+            object_run_index(Segment, self.__segment_cache)
+            # End of batch
+            self.__segment_cache = []
+
 
 
 class ContentIndexer(LdtIndexer):