--- a/src/ldt/ldt/ldt_utils/contentindexer.py Mon Apr 14 15:17:27 2014 +0200
+++ b/src/ldt/ldt/ldt_utils/contentindexer.py Wed Apr 23 13:00:27 2014 +0200
@@ -1,16 +1,18 @@
from StringIO import StringIO
+from django.contrib.contenttypes.models import ContentType
from django.dispatch import receiver
from ldt import settings
-from ldt.indexation import object_delete, object_insert
+from ldt.indexation import object_delete, object_insert, object_run_index
from ldt.ldt_utils.events import post_project_save
from ldt.ldt_utils.models import Segment, Content, Project
from ldt.ldt_utils.stat import update_stat_project, add_annotation_to_stat
from ldt.ldt_utils.utils import reduce_text_node
from ldt.utils.url import request_with_auth
-import logging
+from taggit.models import Tag, TaggedItem
+from taggit.utils import parse_tags
import lxml.etree #@UnresolvedImport
-from taggit.utils import parse_tags
+import logging
logger = logging.getLogger(__name__)
def Property(func):
@@ -24,6 +26,9 @@
self.__decoupage_blacklist = decoupage_blackList
self.__callback = callback
self.__segment_cache = []
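+ # Caches for batched tagging: Tag objects keyed by name, parsed tag names
+ # keyed by segment id_hash, and tag names pending for the current batch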
+ self.__all_tags_cache = {}
+ self.__segment_tags_cache = {}
+ self.__tags_cache = []
@Property
def decoupage_blacklist(): #@NoSelf
@@ -50,11 +55,11 @@
def index_object(self, obj):
-
self._do_index_object(obj)
if self.__segment_cache:
object_insert(Segment, self.__segment_cache, 'id_hash')
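+ # Run indexation on the freshly inserted segments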
+ object_run_index(Segment, self.__segment_cache)
self.__segment_cache = []
@@ -63,6 +68,7 @@
def index_ensemble(self, ensemble, content, project=None):
ensembleId = ensemble.get(u"id", None)
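+ # ContentType of Segment, needed to build TaggedItem rows in bulk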
+ ctp = ContentType.objects.get_for_model(Segment)
for decoupageNode in ensemble.getchildren():
if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id", None) in self.decoupage_blacklist:
@@ -74,7 +80,7 @@
elementId = elementNode.get(u"id", None)
tags = elementNode.get(u"tags", None)
-
+
if tags is None or len(tags) == 0:
tags = u""
restagnode = elementNode.xpath("tag/text()", smart_strings=False)
@@ -110,7 +116,7 @@
if audio_node:
audio_src = audio_node[0].get(u"source", u"")
audio_href = audio_node[0].text
-
+
seg = Segment.create(content=content,
iri_id=content.iri_id,
ensemble_id=ensembleId,
@@ -126,20 +132,75 @@
project_id=ldt_id,
audio_src=audio_src,
audio_href=audio_href)
- # Because of taggit managing (we HAVE to have primary key to ad tags), we save segment and then tags
+
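+ # Parse the tag string now but defer tagging: taggit needs a saved primary key before tags can be attached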
+ tags = parse_tags(tags)
+ self.__segment_tags_cache[seg.id_hash] = tags
seg.polemics = seg.get_polemic(polemics)
- seg.save()
- for t in parse_tags(tags):
- seg.tags.add(t)
- seg.save()
-
-# if settings.LDT_INDEXATION_INSERT_BATCH_SIZE < 2:
-# seg.save()
-# else:
-# self.__segment_cache.append(seg)
-# if not (len(self.__segment_cache)%settings.LDT_INDEXATION_INSERT_BATCH_SIZE):
-# object_insert(Segment, self.__segment_cache)
-# self.__segment_cache = []
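+ # Batch size below 2: save and tag each segment immediately; otherwise accumulate segments and flush in batches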
+ if settings.LDT_INDEXATION_INSERT_BATCH_SIZE < 2:
+ seg.save()
+ seg.tags.add(*tags)
+ else:
+ self.__segment_cache.append(seg)
+ self.__tags_cache = set(list(self.__tags_cache) + tags)
+
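+ # Flush when the cache size reaches a multiple of the configured batch size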
+ if not (len(self.__segment_cache)%settings.LDT_INDEXATION_INSERT_BATCH_SIZE):
+ # First we insert/bulk_create the segments
+ object_insert(Segment, self.__segment_cache, 'id_hash')
+ # Drop tag names that are already cached as Tag objects
+ for t in list(self.__tags_cache):
+ if t in self.__all_tags_cache:
+ self.__tags_cache.remove(t)
+ # Fetch tags that already exist in the database, cache them and drop them from the pending set
+ current_tags = Tag.objects.filter(name__in=self.__tags_cache)
+ for t in current_tags:
+ self.__all_tags_cache[t.name] = t
+ self.__tags_cache.remove(t.name)
+ # Any remaining tag names are new: create them and cache the Tag objects
+ if len(self.__tags_cache) > 0:
+ for t in self.__tags_cache:
+ tag = Tag.objects.create(name=t)
+ self.__all_tags_cache[t] = tag
+
+ # Build the TaggedItem rows linking each segment to its tags, then bulk-create them
+ ti = []
+ for s in self.__segment_cache:
+ s.tag_list = self.__segment_tags_cache[s.id_hash]
+ for t in self.__segment_tags_cache[s.id_hash]:
+ ti.append(TaggedItem(tag=self.__all_tags_cache[t], content_type=ctp, object_id=s.pk))
+ TaggedItem.objects.bulk_create(ti)
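+ # Run indexation on the batch now that segments and their tags are persisted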
+ object_run_index(Segment, self.__segment_cache)
+ self.__segment_cache = []
+
+ # Final flush: handle any segments left in the cache after the loop ends
+ if len(self.__segment_cache) > 0:
+ # First we insert/bulk_create the segments
+ object_insert(Segment, self.__segment_cache, 'id_hash')
+ # Drop tag names that are already cached as Tag objects
+ for t in list(self.__tags_cache):
+ if t in self.__all_tags_cache:
+ self.__tags_cache.remove(t)
+ # Fetch tags that already exist in the database, cache them and drop them from the pending set
+ current_tags = Tag.objects.filter(name__in=self.__tags_cache)
+ for t in current_tags:
+ self.__all_tags_cache[t.name] = t
+ self.__tags_cache.remove(t.name)
+ # Any remaining tag names are new: create them and cache the Tag objects
+ if len(self.__tags_cache) > 0:
+ for t in self.__tags_cache:
+ tag = Tag.objects.create(name=t)
+ self.__all_tags_cache[t] = tag
+
+ # Build the TaggedItem rows linking each segment to its tags, then bulk-create them
+ ti = []
+ for s in self.__segment_cache:
+ s.tag_list = self.__segment_tags_cache[s.id_hash]
+ for t in self.__segment_tags_cache[s.id_hash]:
+ ti.append(TaggedItem(tag=self.__all_tags_cache[t], content_type=ctp, object_id=s.pk))
+ TaggedItem.objects.bulk_create(ti)
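+ # Run indexation on this last batch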
+ object_run_index(Segment, self.__segment_cache)
+ # End of batch
+ self.__segment_cache = []
+
class ContentIndexer(LdtIndexer):