# HG changeset patch # User ymh # Date 1304633202 -7200 # Node ID 4ddb88f103ada1a56b451f7d9dccf7c013d6598b # Parent 95cbac192438d954cc21843d6149f522bacc1bb5 improve indexation, limit tags to 50 characters, improve wsgi diff -r 95cbac192438 -r 4ddb88f103ad src/ldt/ldt/__init__.py --- a/src/ldt/ldt/__init__.py Wed May 04 12:44:51 2011 +0200 +++ b/src/ldt/ldt/__init__.py Fri May 06 00:06:42 2011 +0200 @@ -1,4 +1,4 @@ -VERSION = (0, 6, 0, "final", 0) +VERSION = (0, 7, 0, "final", 0) def get_version(): diff -r 95cbac192438 -r 4ddb88f103ad src/ldt/ldt/ldt_utils/contentindexer.py --- a/src/ldt/ldt/ldt_utils/contentindexer.py Wed May 04 12:44:51 2011 +0200 +++ b/src/ldt/ldt/ldt_utils/contentindexer.py Fri May 06 00:06:42 2011 +0200 @@ -9,35 +9,124 @@ import urllib #@UnresolvedImport # import ldt.utils.log - def Property(func): return property(**func()) + + +class LdtIndexer(object): + + def __init__(self, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST): + self.__decoupage_blacklist = decoupage_blackList + self.__writer = writer + @Property + def decoupage_blacklist(): #@NoSelf + doc = """get blacklist""" #@UnusedVariable + + def fget(self): + if self.__decoupage_blacklist is None: + self.__decoupage_blacklist = () + return self.__decoupage_blacklist + + def fset(self, value): + self.__decoupage_blacklist = value + + def fdel(self): + del self.__decoupage_blacklist + + return locals() + + @Property + def writer(): #@NoSelf + def fget(self): + return self.__writer + return locals() -class ContentIndexer(object): + def index_all(self): + raise NotImplemented + + def index_ensemble(self, ensemble, content, project=None): + ensembleId = ensemble.get(u"id", None) + + for decoupageNode in ensemble.getchildren(): + if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id", None) in self.decoupage_blacklist: + continue + + decoupId = decoupageNode.get(u"id", None) + res = decoupageNode.xpath("elements/element") + for elementNode in res: + + elementId = elementNode.get(u"id", None) + tags = elementNode.get(u"tags", None) + + if tags is not None: + tags.replace(",", ";") + + if tags is None or len(tags) == 0: + tags = u"" + restagnode = elementNode.xpath("tag/text()", smart_strings=False) + for tagnode in restagnode: + tags = tags + u" ; " + tagnode + + if tags is None or len(tags) == 0: + tags = u"" + restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False) + + for tagnode in restagnode: + tags = tags + u" ; " + tagnode + + if tags is None: + tags = "" + tags = ";".join([tag[0:50] for tag in tags.split(";")]) + + + title = reduce_text_node(elementNode, "title/text()") + abstract = reduce_text_node(elementNode, "abstract/text()") + + author = elementNode.get("author", "") + start_ts = int(elementNode.get("begin", "-1")) + duration = int(elementNode.get("dur", "-1")) + date_str = elementNode.get("date", "") + ldt_id = "" + if project: + ldt_id = project.ldt_id + + doc = lucene.Document() + doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)) + doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) + doc.add(lucene.Field("project_id", ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) + doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO)) + doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO)) + doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO)) + doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + + seg = Segment(content=content, + iri_id=content.iri_id, + ensemble_id=ensembleId, + cutting_id=decoupId, + element_id=elementId, + tags=tags, + title=title, + abstract=abstract, + duration=duration, + author=author, + start_ts=start_ts, + date=date_str, + project_obj=project) + seg.save() + self.writer.addDocument(doc) + + + +class ContentIndexer(LdtIndexer): def __init__(self, contentList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST): - self.__contentList = contentList - self.__decoupage_blacklist = decoupage_blackList - self.__writer = writer - - @Property - def decoupage_blacklist(): #@NoSelf - doc = """get blacklist""" #@UnusedVariable - - def fget(self): - if self.__decoupage_blacklist is None: - self.__decoupage_blacklist = () - return self.__decoupage_blacklist - - def fset(self, value): - self.__decoupage_blacklist = value - - def fdel(self): - del self.__decoupage_blacklist - - return locals() - + super(ContentIndexer, self).__init__(writer, decoupage_blackList) + self.__contentList = contentList + def index_all(self): for content in self.__contentList: self.index_content(content) @@ -47,104 +136,23 @@ filepath = urllib.urlopen(url) doc = lxml.etree.parse(filepath) #@UndefinedVariable - self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id)) + self.writer.deleteDocuments(lucene.Term("iri_id", content.iri_id)) Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable res = doc.xpath("/iri/body/ensembles/ensemble") - for ensemble in res: - ensembleId = ensemble.get(u"id", None) - - for decoupageNode in ensemble.getchildren(): - if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id", None) in self.decoupage_blacklist: - - continue - - decoupId = decoupageNode.get(u"id", None) - res = decoupageNode.xpath("elements/element") - for elementNode in res: - doc = lucene.Document() - elementId = elementNode.get(u"id", None) - tags = elementNode.get(u"tags", None) - - if tags is not None: - tags.replace(",", ";") - - if tags is None or len(tags) == 0: - tags = u"" - restagnode = elementNode.xpath("tag/text()", smart_strings=False) - for tagnode in restagnode: - tags = tags + u" ; " + tagnode - - if tags is None or len(tags) == 0: - tags = u"" - restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False) - - for tagnode in restagnode: - tags = tags + u" ; " + tagnode - - title = reduce_text_node(elementNode, "title/text()") - abstract = reduce_text_node(elementNode, "abstract/text()") - - author = elementNode.get("author", "") - start_ts = int(elementNode.get("begin", "-1")) - duration = int(elementNode.get("dur", "-1")) - date_str = elementNode.get("date", "") - - - doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - - seg = Segment(content=content, - iri_id=content.iri_id, - ensemble_id=ensembleId, - cutting_id=decoupId, - element_id=elementId, - tags=tags, - title=title, - abstract=abstract, - duration=duration, - author=author, - start_ts=start_ts, - date=date_str) - seg.save() - + for ensemble in res: + self.index_ensemble(ensemble, content) - self.__writer.addDocument(doc) - - self.__writer.commit() + self.writer.commit() -class ProjectIndexer(object): +class ProjectIndexer(LdtIndexer): def __init__(self, projectList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST): - self.__projectList = projectList - self.__decoupage_blacklist = decoupage_blackList - self.__writer = writer - - @Property - def decoupage_blacklist(): #@NoSelf - doc = """get blacklist""" #@UnusedVariable - - def fget(self): - if self.__decoupage_blacklist is None: - self.__decoupage_blacklist = () - return self.__decoupage_blacklist - - def fset(self, value): - self.__decoupage_blacklist = value - - def fdel(self): - del self.__decoupage_blacklist - - return locals() - + super(ProjectIndexer, self).__init__(writer, decoupage_blackList) + self.__projectList = projectList + def index_all(self): for project in self.__projectList: self.index_project(project) @@ -154,8 +162,8 @@ # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id)) doc = lxml.etree.fromstring(project.ldt) #@UndefinedVariable - self.__writer.deleteDocuments(lucene.Term("project_id", project.ldt_id)) - Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete() + self.writer.deleteDocuments(lucene.Term("project_id", project.ldt_id)) + Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete() #@UndefinedVariable res = doc.xpath("/iri/annotations/content") @@ -163,79 +171,14 @@ contentId = content.get(u"id", None) content_obj = None - clist = Content.objects.filter(iri_id = contentId) + clist = Content.objects.filter(iri_id = contentId) #@UndefinedVariable if len(clist) > 0: content_obj = clist[0] - for ensembleNode in content.getchildren(): - ensembleId = ensembleNode.get(u"id",None) - - for decoupageNode in ensembleNode.getchildren(): - # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist)) - if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id", None) in self.decoupage_blacklist: - continue - - decoupId = decoupageNode.get(u"id", None) - res = decoupageNode.xpath("elements/element") - for elementNode in res: - doc = lucene.Document() - elementId = elementNode.get(u"id", None) - tags = elementNode.get(u"tags", None) - - if tags is not None: - tags.replace(",", ";") - - if tags is None or len(tags) == 0: - tags = u"" - restagnode = elementNode.xpath("tag/text()", smart_strings=False) - for tagnode in restagnode: - tags = tags + u" ; " + tagnode - - if tags is None or len(tags) == 0: - tags = u"" - restagnode = elementNode.xpath("tags/tag/text()", smart_strings=False) - - for tagnode in restagnode: - tags = tags + u" ; " + tagnode - - title = reduce_text_node(elementNode, "title/text()") - abstract = reduce_text_node(elementNode, "abstract/text()") - - author = elementNode.get("author", "") - start_ts = int(elementNode.get("begin", "-1")) - duration = int(elementNode.get("dur", "-1")) - date_str = elementNode.get("date", "") - - - doc.add(lucene.Field("type_doc", "annotation", lucene.Field.Store.NO, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("project_id", project.ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) - doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO)) - doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) - - seg = Segment(content=content_obj, - iri_id=contentId, - ensemble_id=ensembleId, - cutting_id=decoupId, - element_id=elementId, - tags=tags, - title=title, - abstract=abstract, - duration=duration, - author=author, - start_ts=start_ts, - date=date_str, - project_obj = project) - seg.save() - - self.__writer.addDocument(doc) + for ensemble in content.getchildren(): + self.index_ensemble(ensemble, content_obj, project) - self.__writer.commit() + self.writer.commit() @receiver(post_save, sender=Project) def index_project(sender, **kwargs): @@ -243,7 +186,7 @@ writer = ldt.indexation.get_writer() if instance.state != 2: writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id)) - Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() + Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete() #@UndefinedVariable else: projectIndexer = ProjectIndexer([instance], writer) projectIndexer.index_all() diff -r 95cbac192438 -r 4ddb88f103ad src/ldt/ldt/ldt_utils/forms.py --- a/src/ldt/ldt/ldt_utils/forms.py Wed May 04 12:44:51 2011 +0200 +++ b/src/ldt/ldt/ldt_utils/forms.py Fri May 06 00:06:42 2011 +0200 @@ -11,15 +11,15 @@ class LdtAddForm(forms.ModelForm): title = forms.CharField() - contents = forms.ModelMultipleChoiceField(Content.objects.all()) + contents = forms.ModelMultipleChoiceField(Content.objects.all()) #@UndefinedVariable # owner = forms.ModelChoiceField(Author.objects.all()) class Meta: model = Project exclude = ("ldt_id", "ldt", "created_by", "changed_by", "creation_date", "modification_date", "state", "owner") class ReindexForm(forms.Form): - contents = forms.ModelMultipleChoiceField(Content.objects.all()) - index_projects = forms.BooleanField(initial=False) + contents = forms.ModelMultipleChoiceField(Content.objects.all()) #@UndefinedVariable + index_projects = forms.BooleanField(required=False, initial=False) class SearchForm(forms.Form): search = forms.CharField() diff -r 95cbac192438 -r 4ddb88f103ad web/ldtplatform/modwsgi.wsgi --- a/web/ldtplatform/modwsgi.wsgi Wed May 04 12:44:51 2011 +0200 +++ b/web/ldtplatform/modwsgi.wsgi Fri May 06 00:06:42 2011 +0200 @@ -1,21 +1,26 @@ import os, sys, site def application(environ, start_response): - os.environ['DJANGO_SETTINGS_MODULE'] = environ['DJANGO_SETTINGS_MODULE'] - - prev_sys_path = list(sys.path) - - sys.path.append(environ['PROJECT_PATH']) - for path in environ.get('PYTHON_PATH',"").split(os.pathsep): - if path: - site.addsitedir(path) - - new_sys_path = [] - for item in list(sys.path): - if item not in prev_sys_path: - new_sys_path.append(item) - sys.path.remove(item) - sys.path[:0] = new_sys_path + + global g_env_set + + if 'g_env_set' not in globals() or not g_env_set: + os.environ['DJANGO_SETTINGS_MODULE'] = environ['DJANGO_SETTINGS_MODULE'] + + prev_sys_path = list(sys.path) + + sys.path.append(environ['PROJECT_PATH']) + for path in environ.get('PYTHON_PATH',"").split(os.pathsep): + if path: + site.addsitedir(path) + + new_sys_path = [] + for item in list(sys.path): + if item not in prev_sys_path and item not in new_sys_path: + new_sys_path.append(item) + sys.path.remove(item) + sys.path[:0] = new_sys_path + g_env_set = True import django.core.handlers.wsgi