web/ldt/ldt_utils/contentindexer.py
changeset 94 9927a619d2b5
parent 91 9c83809fda01
--- a/web/ldt/ldt_utils/contentindexer.py	Thu Oct 14 12:17:31 2010 +0200
+++ b/web/ldt/ldt_utils/contentindexer.py	Fri Oct 15 12:36:43 2010 +0200
@@ -8,19 +8,13 @@
 import ldt.utils.xml
 from django.conf import settings
 from models import Content
-import xml
-import xml.dom
-import xml.dom.minidom
-import xml.dom.ext
-import xml.xpath
 import fnmatch
-import Ft
 import uuid
 import shutil
 import lucene
 from ldt.ldt_utils import STORE
 from ldt.ldt_utils import ANALYZER
-## import lxml.etree
+import lxml.etree
 
 def Property(func):
     return property(**func()) 
@@ -57,52 +51,50 @@
         def index_content(self, content):
             url =content.iri_url()
             filepath = urllib.urlopen(url)
-            doc = xml.dom.minidom.parse(filepath)
-            doc = Ft.Xml.Domlette.ConvertDocument(doc)
-            
+            doc = lxml.etree.parse(filepath)  # urlopen() returns a file-like object: parse() reads it, fromstring() only accepts a string
+           
             self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
             
-            con = xml.xpath.Context.Context(doc, 1, 1, None)
-            res = xml.xpath.Evaluate("/iri/body/ensembles/ensemble", context=con)
+            res = doc.xpath("/iri/body/ensembles/ensemble")
 
             for ensemble in res:
-                ensembleId = ensemble.getAttributeNS(None,u"id")
+                ensembleId = ensemble.get(u"id", u"")  # lxml signature is get(key, default); u"" mirrors getAttributeNS() for a missing attribute
                 
-                for decoupageNode in ensemble.childNodes:
-                    if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage"  or decoupageNode.getAttributeNS(None,u"id") in self.decoupage_blacklist:
+                for decoupageNode in ensemble.getchildren():
+                    if decoupageNode.tag != "decoupage"  or decoupageNode.get(u"id") in self.decoupage_blacklist:
 
                         continue
                     
-                    decoupId = decoupageNode.getAttributeNS(None,u"id")
-                    res = xml.xpath.Evaluate("elements/element", decoupageNode)
+                    decoupId = decoupageNode.get(u"id", u"")
+                    res = decoupageNode.xpath("elements/element")
                     for elementNode in res:
                         doc = lucene.Document()
-                        elementId = elementNode.getAttributeNS(None,u"id")
-                        tags = elementNode.getAttributeNS(None,u"tags")
+                        elementId = elementNode.get(u"id", u"")
+                        tags = elementNode.get(u"tags")  # None when the attribute is absent; handled by the checks below
                         
                         if tags is not None:                            
                             tags.replace(",", ";")
                         
                         if tags is None or len(tags) == 0:
                             tags = ""
-                            restagnode = xml.xpath.Evaluate("tag/text()", elementNode)
+                            restagnode = elementNode.xpath("tag/text()")
                             for tagnode in restagnode:
-                                tags = tags + " ; " + tagnode.data
+                                tags = tags + " ; " + tagnode  # text() XPath results are already strings in lxml (no .text() method)
                                 
                         if tags is None or len(tags) == 0:
                             tags = ""
-                            restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode)
+                            restagnode = elementNode.xpath("tags/tag/text()")
 
                             for tagnode in restagnode:
-                                tags = tags + " ; " + tagnode.data                            
+                                tags = tags + " ; " + tagnode
     
                         title = ""
-                        for txtRes in xml.xpath.Evaluate("title/text()", elementNode): 
-                            title = title + txtRes.data 
+                        for txtRes in elementNode.xpath("title/text()"): 
+                            title = title + txtRes
                 
                         abstract = ""
-                        for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode): 
-                            abstract = abstract + txtRes.data 
+                        for txtRes in elementNode.xpath("abstract/text()"): 
+                            abstract = abstract + txtRes
                 
                         doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
                         doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
@@ -164,53 +156,51 @@
         def index_project(self, project):
             
             # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
-            doc = xml.dom.minidom.parseString(project.ldt)
-            doc = Ft.Xml.Domlette.ConvertDocument(doc) 
+            doc = lxml.etree.fromstring(project.ldt)  # NOTE(review): lxml rejects unicode input that carries an XML encoding declaration - confirm project.ldt is a byte string
 
             self.__writer.deleteDocuments(lucene.Term("iri_id", project.iri_id))
             
-            con = xml.xpath.Context.Context(doc, 1, 1, None)
-            res = xml.xpath.Evaluate("/iri/annotations/content", context=con)
+            res = doc.xpath("/iri/annotations/content")
 
             for content in res:
-                contentId = content.getAttributeNS(None,u"id")
+                contentId = content.get(u"id", u"")  # lxml signature is get(key, default); u"" mirrors getAttributeNS() for a missing attribute
  
                 ensembleId = "ens_perso"
                 
-                for decoupageNode in content.childNodes:
+                for decoupageNode in content.getchildren():
                     # pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
-                    if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage"  or decoupageNode.getAttributeNS(None,"id") in self.decoupage_blacklist:
-			continue
+                    if decoupageNode.tag != "decoupage"  or decoupageNode.get("id") in self.decoupage_blacklist:
+                        continue
                     
-                    decoupId = decoupageNode.getAttributeNS(None,u"id")
-                    res = xml.xpath.Evaluate("elements/element", decoupageNode)
+                    decoupId = decoupageNode.get(u"id", u"")
+                    res = decoupageNode.xpath("elements/element")
                     for elementNode in res:
                         doc = lucene.Document()
-                        elementId = elementNode.getAttributeNS(None,u"id")
-                        tags = elementNode.getAttributeNS(None,u"tags")
+                        elementId = elementNode.get(u"id", u"")
+                        tags = elementNode.get(u"tags")  # None when the attribute is absent; handled by the checks below
                         
                         if tags is not None:                            
                             tags.replace(",", ";")
                         
                         if tags is None or len(tags) == 0:
                             tags = ""
-                            restagnode = xml.xpath.Evaluate("tag/text()", elementNode)
+                            restagnode = elementNode.xpath("tag/text()")
                             for tagnode in restagnode:
-                                tags = tags + " ; " + tagnode.data
+                                tags = tags + " ; " + tagnode  # text() XPath results are already strings in lxml (no .text() method)
                                 
                         if tags is None or len(tags) == 0:
                             tags = ""
-                            restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode)
+                            restagnode = elementNode.xpath("tags/tag/text()")
                             for tagnode in restagnode:
-                                tags = tags + " ; " + tagnode.data                            
+                                tags = tags + " ; " + tagnode
     
                         title = ""
-                        for txtRes in xml.xpath.Evaluate("title/text()", elementNode): 
-                            title = title + txtRes.data 
+                        for txtRes in elementNode.xpath("title/text()"): 
+                            title = title + txtRes
                 
                         abstract = ""
-                        for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode): 
-                            abstract = abstract + txtRes.data 
+                        for txtRes in elementNode.xpath("abstract/text()"): 
+                            abstract = abstract + txtRes
                 
                         doc.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))              
                         doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))