web/ldt/ldt/contentindexer.py
author ymh <ymh.work@gmail.com>
Tue, 08 Jun 2010 01:16:35 +0200
changeset 0 ecdfc63274bf
child 3 651f67b66c51
permissions -rw-r--r--
first import
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
import tempfile
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
import os
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
import os.path
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
import shutil
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     5
from ldt.utils import zipfileext
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     6
import urllib
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     7
# import ldt.utils.log
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     8
import ldt.utils.xml
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
     9
from django.conf import settings
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    10
from models import Content
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    11
import xml
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    12
import xml.dom
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    13
import xml.dom.minidom
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    14
import xml.dom.ext
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    15
import xml.xpath
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    16
import fnmatch
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    17
import uuid
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    18
import shutil
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    19
import lucene
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    20
from ldt.ldt import STORE
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    21
from ldt.ldt import ANALYZER
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    22
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    23
def Property(func):
    """Build a ``property`` from the mapping returned by calling *func*.

    *func* is called without arguments and its result is expanded as keyword
    arguments to ``property``; the callers in this module return ``locals()``
    holding ``fget``, ``fset``, ``fdel`` and ``doc``.
    """
    accessors = func()
    return property(**accessors)
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    25
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    26
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
    27
class ContentIndexer(object):
    """Index the annotated elements of a list of contents with a Lucene writer.

    For each content, the XML document behind ``content.iri_url()`` is fetched
    and parsed, and one Lucene document is written for every
    ``elements/element`` node found in each non-blacklisted ``decoupage``
    child of each ``/iri/body/ensembles/ensemble`` node.
    """

    def __init__(self, contentList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Store the indexing parameters.

        contentList -- iterable of content objects; ``iri_url()`` and
            ``iri_id`` are read from each of them.
        writer -- index writer; ``deleteDocuments``, ``addDocument`` and
            ``flush`` are called on it.
        decoupage_blackList -- ids of the decoupages to skip (defaults to
            ``settings.DECOUPAGE_BLACKLIST``); ``None`` is treated as empty
            by the ``decoupage_blacklist`` property.
        """
        self.__contentList = contentList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # A blacklist of None is replaced by an empty tuple so that
            # membership tests always work.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    def index_all(self):
        """Index every content of the list given to the constructor."""
        for content in self.__contentList:
            self.index_content(content)

    def index_content(self, content):
        """Replace the index entries of *content* with freshly built ones.

        The document is fetched and parsed first; the existing entries whose
        ``iri_id`` term equals ``content.iri_id`` are then deleted, one Lucene
        document is added per element, and the writer is flushed.
        """
        stream = urllib.urlopen(content.iri_url())
        try:
            iri_doc = xml.dom.minidom.parse(stream)
        finally:
            # Release the handle as soon as parsing is over, even on failure.
            stream.close()

        self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))

        context = xml.xpath.Context.Context(iri_doc, 1, 1, None)
        ensembles = xml.xpath.Evaluate("/iri/body/ensembles/ensemble", context=context)

        for ensemble in ensembles:
            ensembleId = ensemble.getAttribute("id")

            for decoupageNode in ensemble.childNodes:
                if not self._is_indexable_decoupage(decoupageNode):
                    continue

                decoupId = decoupageNode.getAttribute("id")
                for elementNode in xml.xpath.Evaluate("elements/element", decoupageNode):
                    self.__writer.addDocument(
                        self._build_document(content.iri_id, ensembleId, decoupId, elementNode))

        self.__writer.flush()

    def _is_indexable_decoupage(self, node):
        """Tell whether *node* is a ``decoupage`` element that is not blacklisted."""
        if node.nodeType != xml.dom.Node.ELEMENT_NODE:
            return False
        if node.tagName != "decoupage":
            return False
        return node.getAttribute("id") not in self.decoupage_blacklist

    def _build_document(self, iriId, ensembleId, decoupId, elementNode):
        """Create the Lucene document describing *elementNode*."""
        tags = self._element_tags(elementNode)
        title = self._joined_text("title/text()", elementNode)
        abstract = self._joined_text("abstract/text()", elementNode)

        stored = lucene.Field.Store.YES
        not_stored = lucene.Field.Store.NO
        tokenized = lucene.Field.Index.TOKENIZED

        entry = lucene.Document()
        entry.add(lucene.Field("iri_id", iriId, stored, lucene.Field.Index.UN_TOKENIZED))
        entry.add(lucene.Field("ensemble_id", ensembleId, stored, lucene.Field.Index.NO))
        entry.add(lucene.Field("decoupage_id", decoupId, stored, lucene.Field.Index.NO))
        entry.add(lucene.Field("element_id", elementNode.getAttribute("id"), stored, lucene.Field.Index.NO))
        entry.add(lucene.Field("tags", tags, not_stored, tokenized))
        entry.add(lucene.Field("title", title, not_stored, tokenized))
        entry.add(lucene.Field("abstract", abstract, not_stored, tokenized))
        entry.add(lucene.Field("all", " ".join([tags, title, abstract]), not_stored, tokenized))
        return entry

    @staticmethod
    def _joined_text(expression, node):
        """Concatenate the ``data`` of the text nodes matched by *expression*."""
        return "".join([text.data for text in xml.xpath.Evaluate(expression, node)])

    @staticmethod
    def _element_tags(elementNode):
        """Return the tag text of *elementNode*.

        The ``tags`` attribute is used when it is non-empty, with commas
        turned into semicolons. Otherwise the text of the ``tag`` children,
        then of the ``tags/tag`` descendants, is used, each entry prefixed
        with " ; ". An empty string is returned when nothing is found.
        """
        tags = elementNode.getAttribute("tags")
        if tags:
            # str.replace returns a new string: the result has to be kept
            # for the separator normalisation to take effect.
            return tags.replace(",", ";")

        for expression in ("tag/text()", "tags/tag/text()"):
            tags = "".join([" ; " + tagnode.data
                            for tagnode in xml.xpath.Evaluate(expression, elementNode)])
            if tags:
                return tags
        return ""
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   114
            
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   115
            
ecdfc63274bf first import
ymh <ymh.work@gmail.com>
parents:
diff changeset
   116
class ProjectIndexer(object):
    """Index the annotated elements of a list of projects with a Lucene writer.

    For each project, the XML held in ``project.ldt`` is parsed and one Lucene
    document is written for every ``elements/element`` node found in each
    non-blacklisted ``decoupage`` child of each ``/iri/annotations/content``
    node.
    """

    def __init__(self, projectList, writer, decoupage_blackList=settings.DECOUPAGE_BLACKLIST):
        """Store the indexing parameters.

        projectList -- iterable of project objects; ``ldt`` and ``iri_id``
            are read from each of them.
        writer -- index writer; ``deleteDocuments``, ``addDocument`` and
            ``flush`` are called on it.
        decoupage_blackList -- ids of the decoupages to skip (defaults to
            ``settings.DECOUPAGE_BLACKLIST``); ``None`` is treated as empty
            by the ``decoupage_blacklist`` property.
        """
        self.__projectList = projectList
        self.__decoupage_blacklist = decoupage_blackList
        self.__writer = writer

    @Property
    def decoupage_blacklist(): #@NoSelf
        doc = """get blacklist""" #@UnusedVariable

        def fget(self):
            # A blacklist of None is replaced by an empty tuple so that
            # membership tests always work.
            if self.__decoupage_blacklist is None:
                self.__decoupage_blacklist = ()
            return self.__decoupage_blacklist

        def fset(self, value):
            self.__decoupage_blacklist = value

        def fdel(self):
            del self.__decoupage_blacklist

        return locals()

    def index_all(self):
        """Index every project of the list given to the constructor."""
        for project in self.__projectList:
            self.index_project(project)

    def index_project(self, project):
        """Replace the index entries of *project* with freshly built ones.

        ``project.ldt`` is parsed first; the entries previously written for
        the project are then deleted, one Lucene document is added per
        element, and the writer is flushed.
        """
        ldt_doc = xml.dom.minidom.parseString(project.ldt)

        # The documents added below carry the project id in "project_id"
        # (their "iri_id" field holds the content id), so stale entries have
        # to be removed through that field for re-indexing not to duplicate.
        self.__writer.deleteDocuments(lucene.Term("project_id", project.iri_id))

        context = xml.xpath.Context.Context(ldt_doc, 1, 1, None)
        contents = xml.xpath.Evaluate("/iri/annotations/content", context=context)

        for content in contents:
            contentId = content.getAttribute("id")

            # Every document of a project is filed under this fixed ensemble id.
            ensembleId = "ens_perso"

            for decoupageNode in content.childNodes:
                if not self._is_indexable_decoupage(decoupageNode):
                    continue

                decoupId = decoupageNode.getAttribute("id")
                for elementNode in xml.xpath.Evaluate("elements/element", decoupageNode):
                    self.__writer.addDocument(
                        self._build_document(project.iri_id, contentId, ensembleId, decoupId, elementNode))

        self.__writer.flush()

    def _is_indexable_decoupage(self, node):
        """Tell whether *node* is a ``decoupage`` element that is not blacklisted."""
        if node.nodeType != xml.dom.Node.ELEMENT_NODE:
            return False
        if node.tagName != "decoupage":
            return False
        return node.getAttribute("id") not in self.decoupage_blacklist

    def _build_document(self, projectId, contentId, ensembleId, decoupId, elementNode):
        """Create the Lucene document describing *elementNode*."""
        tags = self._element_tags(elementNode)
        title = self._joined_text("title/text()", elementNode)
        abstract = self._joined_text("abstract/text()", elementNode)

        stored = lucene.Field.Store.YES
        not_stored = lucene.Field.Store.NO
        tokenized = lucene.Field.Index.TOKENIZED

        entry = lucene.Document()
        entry.add(lucene.Field("project_id", projectId, stored, lucene.Field.Index.UN_TOKENIZED))
        entry.add(lucene.Field("iri_id", contentId, stored, lucene.Field.Index.UN_TOKENIZED))
        entry.add(lucene.Field("ensemble_id", ensembleId, stored, lucene.Field.Index.NO))
        entry.add(lucene.Field("decoupage_id", decoupId, stored, lucene.Field.Index.NO))
        entry.add(lucene.Field("element_id", elementNode.getAttribute("id"), stored, lucene.Field.Index.NO))
        entry.add(lucene.Field("tags", tags, not_stored, tokenized))
        entry.add(lucene.Field("title", title, not_stored, tokenized))
        entry.add(lucene.Field("abstract", abstract, not_stored, tokenized))
        entry.add(lucene.Field("all", " ".join([tags, title, abstract]), not_stored, tokenized))
        return entry

    @staticmethod
    def _joined_text(expression, node):
        """Concatenate the ``data`` of the text nodes matched by *expression*."""
        return "".join([text.data for text in xml.xpath.Evaluate(expression, node)])

    @staticmethod
    def _element_tags(elementNode):
        """Return the tag text of *elementNode*.

        The ``tags`` attribute is used when it is non-empty, with commas
        turned into semicolons. Otherwise the text of the ``tag`` children,
        then of the ``tags/tag`` descendants, is used, each entry prefixed
        with " ; ". An empty string is returned when nothing is found.
        """
        tags = elementNode.getAttribute("tags")
        if tags:
            # str.replace returns a new string: the result has to be kept
            # for the separator normalisation to take effect.
            return tags.replace(",", ";")

        for expression in ("tag/text()", "tags/tag/text()"):
            tags = "".join([" ; " + tagnode.data
                            for tagnode in xml.xpath.Evaluate(expression, elementNode)])
            if tags:
                return tags
        return ""