web/ldt/ldt_utils/projectindexer.py
changeset 1 3a30d255c235
child 22 83b28fc0d731
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/web/ldt/ldt_utils/projectindexer.py	Sun Nov 14 20:25:22 2010 +0100
@@ -0,0 +1,108 @@
+import tempfile
+import os
+import os.path
+import shutil
+import ldt.utils.xml
+from ldt import settings
+import lucene
+from ldt.ldt_utils import STORE
+from ldt.ldt_utils import ANALYZER
+import lxml.etree
+
+def Property(func):
+    return property(**func()) 
+
+class ProjectIndexer(object):
+    def __init__(self, projectList, writer, decoupage_blackList = settings.DECOUPAGE_BLACKLIST):
+        self.__projectList = projectList
+        self.__decoupage_blacklist = decoupage_blackList
+        self.__writer = writer
+            
+    @Property
+    def decoupage_blacklist(): #@NoSelf
+        doc = """get blacklist""" #@UnusedVariable
+       
+        def fget(self):
+            if self.__decoupage_blacklist is None:
+                self.__decoupage_blacklist = ()
+            return self.__decoupage_blacklist
+           
+        def fset(self, value):
+            self.__decoupage_blacklist = value
+           
+        def fdel(self):
+            del self.__decoupage_blacklist
+           
+        return locals()
+               
+    def index_all(self):
+        for project in self.__projectList:
+            self.index_project(project)
+
+    def index_project(self, project):
+        # ldt.utils.log.debug("Indexing project : "+str(project.ldt_id))
+        
+        ldt=project.ldt
+        doc = lxml.etree.fromstring(ldt.encode( "utf-8" ))
+ 
+        self.__writer.deleteDocuments(lucene.Term("ldt_id", project.ldt_id))
+            
+        res = doc.xpath("/iri/annotations/content")
+        project.ldt.encode( "utf-8 " )
+
+        for content in res:
+            contentId = content.get("id")
+ 
+            res =content.xpath("ensemble")
+            for ensemble in res:
+                ensembleId = ensemble.get("id")
+ 
+                for decoupageNode in ensemble.getchildren():
+                    # ldt.utils.log.debug("Indexing project decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
+                    if decoupageNode.tag != "decoupage"  or decoupageNode.get("id") in self.decoupage_blacklist:
+                        continue
+                
+                    decoupId = decoupageNode.get("id")
+                    res = decoupageNode.xpath("elements/element")
+
+                    for elementNode in res:
+                        doc = lucene.Document()
+                        elementId = elementNode.get("id")
+                        tags = elementNode.get("tags")
+                        
+                        if tags is not None:                            
+                            tags.replace(",", ";")
+                        
+                        if tags is None or len(tags) == 0:
+                            tags = ""
+                            restagnode = elementNode.xpath("tag/text()")
+                            for tagnode in restagnode:
+                                tags = tags + " ; " + tagnode.text()
+                                
+                        if tags is None or len(tags) == 0:
+                            tags = ""
+                            restagnode = elementNode.xpath("tags/tag/text()")
+                            for tagnode in restagnode:
+                                tags = tags + " ; " + tagnode.text()                          
+
+                        title = ""
+                        for txtRes in elementNode.xpath("title/text()"): 
+                            title = title + txtRes.text()
+                
+                        abstract = ""
+                        for txtRes in elementNode.xpath("abstract/text()"): 
+                            abstract = abstract + txtRes.text() 
+                            
+                        doc.add(lucene.Field("ldt_id", project.ldt_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))              
+                        doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
+                        doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                        doc.add(lucene.Field("decoupage_id", decoupId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                        doc.add(lucene.Field("element_id", elementId, lucene.Field.Store.YES, lucene.Field.Index.NO))
+                        doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                        doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                        doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+                        doc.add(lucene.Field("all", " ".join([tags, title, abstract]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+            
+                        self.__writer.addDocument(doc)
+    
+        self.__writer.commit()