src/ldt/ldt/ldt_utils/contentindexer.py
author ymh <ymh.work@gmail.com>
Sun, 01 May 2011 03:30:40 +0200
changeset 77 7923feb2e362
parent 63 93325a5d61f0
child 90 4ddb88f103ad
permissions -rw-r--r--
improve indexation
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
57a2650a7f87 update nyromodal
ymh <ymh.work@gmail.com>
parents: 19
diff changeset
     1
from django.conf import settings
77
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
     2
from django.db.models.signals import post_save
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
     3
from django.dispatch import receiver
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
     4
from ldt.ldt_utils.models import Segment, Content, Project
63
93325a5d61f0 organize format and import
ymh <ymh.work@gmail.com>
parents: 54
diff changeset
     5
from ldt.ldt_utils.utils import reduce_text_node
77
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
     6
import ldt.indexation
24
57a2650a7f87 update nyromodal
ymh <ymh.work@gmail.com>
parents: 19
diff changeset
     7
import lucene
57a2650a7f87 update nyromodal
ymh <ymh.work@gmail.com>
parents: 19
diff changeset
     8
import lxml.etree
54
e111c8a3b4ac change to reindex and import
ymh <ymh.work@gmail.com>
parents: 24
diff changeset
     9
import urllib #@UnresolvedImport
0
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
# import ldt.utils.log
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    11
77
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
    12
0
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    13
def Property(func):
    """Build a ``property`` from the local names returned by ``func``.

    ``func`` is called once with no arguments and must return a mapping
    whose keys are valid ``property`` keyword arguments (``fget``, ``fset``,
    ``fdel``, ``doc``); in this file it is used as a decorator on functions
    that end with ``return locals()``.
    """
    accessors = func()
    return property(**accessors)
54
e111c8a3b4ac change to reindex and import
ymh <ymh.work@gmail.com>
parents: 24
diff changeset
    15
        
0
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    16
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    17
class ContentIndexer(object):
    """Index the annotation elements of content objects.

    For every content, the IRI XML document found at ``content.iri_url()``
    is parsed and each ``elements/element`` node of every non-blacklisted
    ``decoupage`` is written both to the Lucene index (through ``writer``)
    and to the ``Segment`` table.
    """

    def __init__(self, contentList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Store the contents to index, the Lucene writer and the blacklist.

        ``decoupage_blackList`` holds the ids of the decoupages to skip;
        ``None`` is treated as an empty blacklist by the property below.
        """
        self.__contentList = contentList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        # NOTE: the locals of this function are passed verbatim to
        # property(), so only doc/fget/fset/fdel may be defined here.
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    @staticmethod
    def _read_tags(elementNode):
        """Return the tag string of an ``element`` node.

        The ``tags`` attribute is used when it is non-empty, with commas
        turned into semicolons. Otherwise the text of the ``tag`` children
        is used, then the text of the ``tags/tag`` descendants; each text is
        prefixed with ``" ; "``. An empty unicode string is returned when no
        tag is found.
        """
        tags = elementNode.get(u"tags", None)
        if tags:
            # str.replace returns a new string: the result must be kept.
            return tags.replace(",", ";")

        for path in ("tag/text()", "tags/tag/text()"):
            texts = elementNode.xpath(path, smart_strings=False)
            if texts:
                return u"".join([u" ; " + text for text in texts])
        return u""

    def index_all(self):
        """Index every content of the list given to the constructor."""
        for content in self.__contentList:
            self.index_content(content)

    def index_content(self, content):
        """(Re)index a single content.

        Existing Lucene documents and ``Segment`` rows for
        ``content.iri_id`` are deleted first, then one document and one
        segment are created per element. The writer is committed once at
        the end.
        """
        url = content.iri_url()
        stream = urllib.urlopen(url)
        try:
            iri_tree = lxml.etree.parse(stream) #@UndefinedVariable
        finally:
            # Release the connection/file handle as soon as parsing is done.
            stream.close()

        self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
        Segment.objects.filter(iri_id=content.iri_id).delete() #@UndefinedVariable

        blacklist = self.decoupage_blacklist
        store = lucene.Field.Store
        index = lucene.Field.Index

        for ensemble in iri_tree.xpath("/iri/body/ensembles/ensemble"):
            ensembleId = ensemble.get(u"id", None)

            for decoupageNode in ensemble:
                if decoupageNode.tag != "decoupage":
                    continue
                decoupId = decoupageNode.get(u"id", None)
                if decoupId in blacklist:
                    continue

                for elementNode in decoupageNode.xpath("elements/element"):
                    elementId = elementNode.get(u"id", None)
                    tags = self._read_tags(elementNode)

                    title = reduce_text_node(elementNode, "title/text()")
                    abstract = reduce_text_node(elementNode, "abstract/text()")

                    author = elementNode.get("author", "")
                    start_ts = int(elementNode.get("begin", "-1"))
                    duration = int(elementNode.get("dur", "-1"))
                    date_str = elementNode.get("date", "")

                    lucene_doc = lucene.Document()
                    lucene_doc.add(lucene.Field("iri_id", content.iri_id, store.YES, index.NOT_ANALYZED))
                    lucene_doc.add(lucene.Field("ensemble_id", ensembleId, store.YES, index.NO))
                    lucene_doc.add(lucene.Field("decoupage_id", decoupId, store.YES, index.NO))
                    lucene_doc.add(lucene.Field("element_id", elementId, store.YES, index.NO))
                    lucene_doc.add(lucene.Field("tags", tags, store.NO, index.ANALYZED))
                    lucene_doc.add(lucene.Field("title", title, store.NO, index.ANALYZED))
                    lucene_doc.add(lucene.Field("abstract", abstract, store.NO, index.ANALYZED))
                    lucene_doc.add(lucene.Field("all", " ".join([tags, title, abstract]), store.NO, index.ANALYZED))

                    seg = Segment(content=content,
                                  iri_id=content.iri_id,
                                  ensemble_id=ensembleId,
                                  cutting_id=decoupId,
                                  element_id=elementId,
                                  tags=tags,
                                  title=title,
                                  abstract=abstract,
                                  duration=duration,
                                  author=author,
                                  start_ts=start_ts,
                                  date=date_str)
                    seg.save()

                    self.__writer.addDocument(lucene_doc)

        self.__writer.commit()
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   122
            
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   123
            
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   124
class ProjectIndexer(object):
    """Index the annotation elements stored in projects.

    For every project, the XML held in ``project.ldt`` is parsed and each
    ``elements/element`` node of every non-blacklisted ``decoupage`` is
    written both to the Lucene index (through ``writer``) and to the
    ``Segment`` table, linked to the project.
    """

    def __init__(self, projectList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Store the projects to index, the Lucene writer and the blacklist.

        ``decoupage_blackList`` holds the ids of the decoupages to skip;
        ``None`` is treated as an empty blacklist by the property below.
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        # NOTE: the locals of this function are passed verbatim to
        # property(), so only doc/fget/fset/fdel may be defined here.
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    @staticmethod
    def _read_tags(elementNode):
        """Return the tag string of an ``element`` node.

        The ``tags`` attribute is used when it is non-empty, with commas
        turned into semicolons. Otherwise the text of the ``tag`` children
        is used, then the text of the ``tags/tag`` descendants; each text is
        prefixed with ``" ; "``. An empty unicode string is returned when no
        tag is found.
        """
        tags = elementNode.get(u"tags", None)
        if tags:
            # str.replace returns a new string: the result must be kept.
            return tags.replace(",", ";")

        for path in ("tag/text()", "tags/tag/text()"):
            texts = elementNode.xpath(path, smart_strings=False)
            if texts:
                return u"".join([u" ; " + text for text in texts])
        return u""

    def index_all(self):
        """Index every project of the list given to the constructor."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """(Re)index a single project.

        Existing Lucene documents and ``Segment`` rows for
        ``project.ldt_id`` are deleted first, then one document and one
        segment are created per element found under
        ``/iri/annotations/content``. The writer is committed once at the
        end.
        """
        ldt_tree = lxml.etree.fromstring(project.ldt) #@UndefinedVariable

        self.__writer.deleteDocuments(lucene.Term("project_id", project.ldt_id))
        Segment.objects.filter(project_obj__ldt_id=project.ldt_id).delete()

        blacklist = self.decoupage_blacklist
        store = lucene.Field.Store
        index = lucene.Field.Index

        for contentNode in ldt_tree.xpath("/iri/annotations/content"):
            contentId = contentNode.get(u"id", None)

            # The matching Content row is optional: segments are still
            # created with content=None when no row has this iri_id.
            content_obj = None
            clist = Content.objects.filter(iri_id=contentId)
            if len(clist) > 0:
                content_obj = clist[0]

            for ensembleNode in contentNode:
                ensembleId = ensembleNode.get(u"id", None)

                for decoupageNode in ensembleNode:
                    if decoupageNode.tag != "decoupage":
                        continue
                    decoupId = decoupageNode.get(u"id", None)
                    if decoupId in blacklist:
                        continue

                    for elementNode in decoupageNode.xpath("elements/element"):
                        elementId = elementNode.get(u"id", None)
                        tags = self._read_tags(elementNode)

                        title = reduce_text_node(elementNode, "title/text()")
                        abstract = reduce_text_node(elementNode, "abstract/text()")

                        author = elementNode.get("author", "")
                        start_ts = int(elementNode.get("begin", "-1"))
                        duration = int(elementNode.get("dur", "-1"))
                        date_str = elementNode.get("date", "")

                        lucene_doc = lucene.Document()
                        lucene_doc.add(lucene.Field("type_doc", "annotation", store.NO, index.NOT_ANALYZED))
                        lucene_doc.add(lucene.Field("project_id", project.ldt_id, store.YES, index.NOT_ANALYZED))
                        lucene_doc.add(lucene.Field("iri_id", contentId, store.YES, index.NOT_ANALYZED))
                        lucene_doc.add(lucene.Field("ensemble_id", ensembleId, store.YES, index.NO))
                        lucene_doc.add(lucene.Field("decoupage_id", decoupId, store.YES, index.NO))
                        lucene_doc.add(lucene.Field("element_id", elementId, store.YES, index.NO))
                        lucene_doc.add(lucene.Field("tags", tags, store.NO, index.ANALYZED))
                        lucene_doc.add(lucene.Field("title", title, store.NO, index.ANALYZED))
                        lucene_doc.add(lucene.Field("abstract", abstract, store.NO, index.ANALYZED))
                        lucene_doc.add(lucene.Field("all", " ".join([tags, title, abstract]), store.NO, index.ANALYZED))

                        seg = Segment(content=content_obj,
                                      iri_id=contentId,
                                      ensemble_id=ensembleId,
                                      cutting_id=decoupId,
                                      element_id=elementId,
                                      tags=tags,
                                      title=title,
                                      abstract=abstract,
                                      duration=duration,
                                      author=author,
                                      start_ts=start_ts,
                                      date=date_str,
                                      project_obj=project)
                        seg.save()

                        self.__writer.addDocument(lucene_doc)

        self.__writer.commit()
77
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
   239
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
   240
@receiver(post_save, sender=Project)
def index_project(sender, **kwargs):
    """Keep the index in sync with a project each time it is saved.

    When ``instance.state`` is 2 the project is (re)indexed through
    ``ProjectIndexer``; for any other state its Lucene documents and
    ``Segment`` rows are removed.
    """
    instance = kwargs['instance']
    writer = ldt.indexation.get_writer()
    # NOTE(review): state 2 is presumably the "published" state -- confirm
    # against the Project model.
    if instance.state != 2:
        writer.deleteDocuments(lucene.Term("project_id", instance.ldt_id))
        Segment.objects.filter(project_obj__ldt_id=instance.ldt_id).delete()
        # The indexing branch commits through ProjectIndexer.index_project;
        # commit here as well so the deletions are not left pending.
        writer.commit()
    else:
        projectIndexer = ProjectIndexer([instance], writer)
        projectIndexer.index_all()
    # NOTE(review): the writer is never closed here; check whether
    # ldt.indexation.get_writer() expects the caller to close it.
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
   250
        
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents: 63
diff changeset
   251