src/ldt/ldt/ldt_utils/projectindexer.py
author ymh <ymh.work@gmail.com>
Thu, 19 May 2011 10:38:36 +0200
changeset 113 cf3bdb2a4216
parent 63 93325a5d61f0
child 560 1cb2a4a573e1
permissions -rw-r--r--
improve ldt generation for merging projects
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
from ldt import settings
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
import lucene
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
import lxml.etree
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
def Property(func):
    """Build a ``property`` from a function that returns its components.

    ``func`` is called once with no arguments and must return a mapping
    (typically ``locals()``) that may contain any of the keys ``fget``,
    ``fset``, ``fdel`` and ``doc``.  Only those keys are forwarded to the
    builtin ``property``; any other entry (for instance a helper local
    defined next to the accessors) is ignored instead of triggering a
    ``TypeError`` for an unexpected keyword argument.

    Returns the resulting ``property`` object, so this can be used as a
    decorator inside a class body.
    """
    accepted = ("fget", "fset", "fdel", "doc")
    components = func()
    # dict(generator) rather than a dict comprehension: keeps the helper
    # usable on old interpreters this code base may still target.
    kwargs = dict((name, components[name]) for name in accepted if name in components)
    return property(**kwargs)
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
bdf22b140727 first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
class ProjectIndexer(object):
    """Index the annotation elements of LDT projects with Lucene.

    For each project, the XML held in ``project.ldt`` is parsed and one
    Lucene document is written per ``element`` node found under
    ``/iri/annotations/content/ensemble/decoupage/elements``.  Decoupages
    whose ``id`` is listed in the blacklist are skipped.
    """

    def __init__(self, projectList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Store the projects to index, the index writer and the blacklist.

        projectList -- iterable of projects; each must expose ``ldt`` (the
            XML text) and ``ldt_id``.
        writer -- object providing ``deleteDocuments``, ``addDocument`` and
            ``commit`` (presumably a ``lucene.IndexWriter`` -- confirm
            against callers).
        decoupage_blackList -- container of decoupage ids to skip; ``None``
            is treated as an empty blacklist.
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # Normalise a missing blacklist to an empty tuple so callers can
            # always use the ``in`` operator on the returned value.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    def index_all(self):
        """Index every project of the list given to the constructor."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """(Re)index a single project and commit the writer.

        Documents previously indexed with the same ``ldt_id`` are deleted
        first, then one document is added per element of every decoupage
        that is not blacklisted.
        """
        ldt_doc = lxml.etree.fromstring(project.ldt.encode("utf-8"))

        # Remove what was indexed before for this project so that the
        # documents added below replace it instead of duplicating it.
        self.__writer.deleteDocuments(lucene.Term("ldt_id", project.ldt_id))

        for content in ldt_doc.xpath("/iri/annotations/content"):
            contentId = content.get("id")

            for ensemble in content.xpath("ensemble"):
                ensembleId = ensemble.get("id")

                # Iterating an lxml element yields its direct children.
                for decoupageNode in ensemble:
                    # The tag is tested first: children may be comments or
                    # processing instructions, which are not decoupages.
                    if decoupageNode.tag != "decoupage":
                        continue
                    decoupId = decoupageNode.get("id")
                    if decoupId in self.decoupage_blacklist:
                        continue

                    for elementNode in decoupageNode.xpath("elements/element"):
                        self.__writer.addDocument(
                            self._build_document(project.ldt_id, contentId, ensembleId, decoupId, elementNode)
                        )

        self.__writer.commit()

    @staticmethod
    def _element_tags(elementNode):
        """Return the tags of an element as a single string.

        The ``tags`` attribute is used when present and non-empty (commas
        are turned into semicolons).  Otherwise the text of the ``tag``
        children is used, and failing that the text of ``tags/tag``
        descendants, joined with " ; ".  Returns "" when nothing is found.
        """
        tags = elementNode.get("tags")
        if tags:
            # str.replace returns a new string; the result must be kept.
            return tags.replace(",", ";")

        for path in ("tag/text()", "tags/tag/text()"):
            # A ".../text()" XPath query yields strings, not nodes.
            found = elementNode.xpath(path)
            if found:
                return " ; ".join(found)
        return ""

    @staticmethod
    def _element_text(elementNode, path):
        """Concatenate the strings returned by the text XPath ``path``."""
        return "".join(elementNode.xpath(path))

    @classmethod
    def _build_document(cls, ldtId, contentId, ensembleId, decoupId, elementNode):
        """Build the Lucene document describing one ``element`` node.

        The identifiers are stored; ``ldt_id`` and ``iri_id`` are also
        indexed verbatim.  ``tags``, ``title``, ``abstract`` and their
        concatenation ``all`` are analyzed for search but not stored.
        """
        tags = cls._element_tags(elementNode)
        title = cls._element_text(elementNode, "title/text()")
        abstract = cls._element_text(elementNode, "abstract/text()")

        store_yes = lucene.Field.Store.YES
        store_no = lucene.Field.Store.NO
        analyzed = lucene.Field.Index.ANALYZED
        not_analyzed = lucene.Field.Index.NOT_ANALYZED
        not_indexed = lucene.Field.Index.NO

        document = lucene.Document()
        document.add(lucene.Field("ldt_id", ldtId, store_yes, not_analyzed))
        document.add(lucene.Field("iri_id", contentId, store_yes, not_analyzed))
        document.add(lucene.Field("ensemble_id", ensembleId, store_yes, not_indexed))
        document.add(lucene.Field("decoupage_id", decoupId, store_yes, not_indexed))
        document.add(lucene.Field("element_id", elementNode.get("id"), store_yes, not_indexed))
        document.add(lucene.Field("tags", tags, store_no, analyzed))
        document.add(lucene.Field("title", title, store_no, analyzed))
        document.add(lucene.Field("abstract", abstract, store_no, analyzed))
        document.add(lucene.Field("all", " ".join([tags, title, abstract]), store_no, analyzed))
        return document