web/ldt/ldt_utils/projectindexer.py
author ymh <ymh.work@gmail.com>
Tue, 08 Jun 2010 15:44:35 +0200
changeset 5 ae8593287883
permissions -rw-r--r--
correct error changing ldt.ldt to ldt.ldt_utils
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
5
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
import tempfile
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
import os
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
import os.path
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
import shutil
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
import ldt.utils.xml
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
     6
from ldt import settings
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
import xml
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
import xml.dom
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
import xml.dom.minidom
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
import xml.dom.ext
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
    11
import xml.xpath
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
    12
import lucene
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
    13
from ldt.ldt_utils import STORE
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
    14
from ldt.ldt_utils import ANALYZER
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
    15
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
    16
def Property(func):
    """Build a ``property`` from the mapping returned by calling *func*.

    *func* is called once with no arguments; the mapping it returns is
    expanded into keyword arguments for the builtin ``property`` (so its
    keys must be among ``fget``, ``fset``, ``fdel`` and ``doc``).  The
    usual pattern is a function that defines those names locally and ends
    with ``return locals()``.
    """
    accessors = func()
    return property(**accessors)
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
    18
ae8593287883 correct error changing ldt.ldt to ldt.ldt_utils
ymh <ymh.work@gmail.com>
parents:
diff changeset
    19
class ProjectIndexer(object):
    """Feed the annotation elements of a list of projects to an index writer.

    Each project is expected to expose ``ldt`` (its XML as text) and
    ``ldt_id``.  For every ``element`` found under
    ``/iri/annotations/content/ensemble/decoupage/elements`` one Lucene
    document is added to the writer.
    """

    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
        """Store the projects to index, the writer and the blacklist.

        projectList         -- iterable of projects walked by index_all()
        writer              -- object providing deleteDocuments(),
                               addDocument() and flush()
        decoupage_blackList -- ids of the ``decoupage`` nodes to skip;
                               None is treated as "no blacklist"
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # Normalise a missing blacklist to an empty tuple so that
            # callers can always use the ``in`` operator on the result.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        # Property() expands these locals (doc, fget, fset, fdel) into the
        # keyword arguments of the builtin property().
        return locals()

    def index_all(self):
        """Index every project of the list given to the constructor."""
        for project in self.__projectList:
            self.index_project(project)

    def _joined_text(self, expression, node, prefix=""):
        """Concatenate the ``data`` of the nodes matched by *expression*.

        *expression* is an XPath evaluated relative to *node*; *prefix* is
        put in front of every matched piece of text.  Returns an empty
        string when nothing matches.
        """
        matches = xml.xpath.Evaluate(expression, node)
        return "".join(prefix + match.data for match in matches)

    def _element_tags(self, elementNode):
        """Return the tag text to index for *elementNode*.

        The ``tags`` attribute is used when it is not empty, with commas
        turned into semicolons.  Otherwise the text of the ``tag`` children
        is used and, failing that, the text of the ``tags/tag`` descendants;
        in both fallbacks every tag is prefixed with " ; ".
        """
        tags = elementNode.getAttribute("tags")

        if tags is not None:
            # str.replace returns a new string: the result has to be kept,
            # otherwise the commas are never actually replaced.
            tags = tags.replace(",", ";")

        if not tags:
            tags = self._joined_text("tag/text()", elementNode, " ; ")

        if not tags:
            tags = self._joined_text("tags/tag/text()", elementNode, " ; ")

        return tags

    def index_project(self, project):
        """Replace the indexed documents of *project* with fresh ones.

        The documents already indexed under the project's ``ldt_id`` are
        deleted, one document is added per annotation element that does not
        belong to a blacklisted decoupage, and the writer is flushed.
        """
        # ldt.utils.log.debug("Indexing project : "+str(project.ldt_id))
        ldt = project.ldt
        ldtDoc = xml.dom.minidom.parseString(ldt.encode("utf-8"))

        # Remove what was indexed before for this project so that the
        # documents added below do not pile up as duplicates.
        self.__writer.deleteDocuments(lucene.Term("ldt_id", project.ldt_id))

        con = xml.xpath.Context.Context(ldtDoc, 1, 1, None)
        contentNodes = xml.xpath.Evaluate("/iri/annotations/content", context=con)

        for content in contentNodes:
            contentId = content.getAttribute("id")

            for ensemble in xml.xpath.Evaluate("ensemble", content):
                ensembleId = ensemble.getAttribute("id")

                for decoupageNode in ensemble.childNodes:
                    # ldt.utils.log.debug("Indexing project decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
                    # Only non-blacklisted <decoupage> elements are indexed;
                    # the node type is tested first because only element
                    # nodes have a tagName and attributes.
                    if (decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE
                            or decoupageNode.tagName != "decoupage"
                            or decoupageNode.getAttribute("id") in self.decoupage_blacklist):
                        continue

                    decoupId = decoupageNode.getAttribute("id")

                    for elementNode in xml.xpath.Evaluate("elements/element", decoupageNode):
                        elementId = elementNode.getAttribute("id")
                        tags = self._element_tags(elementNode)
                        title = self._joined_text("title/text()", elementNode)
                        abstract = self._joined_text("abstract/text()", elementNode)

                        luceneDoc = lucene.Document()
                        # Identifiers are stored; ldt_id and iri_id are also
                        # indexed as single untokenized terms (ldt_id is the
                        # term used by deleteDocuments above).
                        luceneDoc.add(lucene.Field("ldt_id", project.ldt_id, lucene.Field.Store.YES, lucene.Field.Index.UN_TOKENIZED))
                        luceneDoc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.UN_TOKENIZED))
                        luceneDoc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                        luceneDoc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                        luceneDoc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
                        # Text fields are tokenized for searching but not stored.
                        luceneDoc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
                        luceneDoc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
                        luceneDoc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))
                        luceneDoc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.TOKENIZED))

                        self.__writer.addDocument(luceneDoc)

        self.__writer.flush()