--- a/web/ldt/ldt_utils/contentindexer.py Thu Oct 14 12:17:31 2010 +0200
+++ b/web/ldt/ldt_utils/contentindexer.py Fri Oct 15 12:36:43 2010 +0200
@@ -8,19 +8,13 @@
import ldt.utils.xml
from django.conf import settings
from models import Content
-import xml
-import xml.dom
-import xml.dom.minidom
-import xml.dom.ext
-import xml.xpath
import fnmatch
-import Ft
import uuid
import shutil
import lucene
from ldt.ldt_utils import STORE
from ldt.ldt_utils import ANALYZER
-## import lxml.etree
+import lxml.etree
def Property(func):
return property(**func())
@@ -57,52 +51,50 @@
def index_content(self, content):
url =content.iri_url()
filepath = urllib.urlopen(url)
- doc = xml.dom.minidom.parse(filepath)
- doc = Ft.Xml.Domlette.ConvertDocument(doc)
-
+ doc = lxml.etree.parse(filepath)
+
self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id))
- con = xml.xpath.Context.Context(doc, 1, 1, None)
- res = xml.xpath.Evaluate("/iri/body/ensembles/ensemble", context=con)
+ res = doc.xpath("/iri/body/ensembles/ensemble")
for ensemble in res:
- ensembleId = ensemble.getAttributeNS(None,u"id")
+ ensembleId = ensemble.get(u"id")
- for decoupageNode in ensemble.childNodes:
- if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage" or decoupageNode.getAttributeNS(None,u"id") in self.decoupage_blacklist:
+ for decoupageNode in ensemble:
+ if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id") in self.decoupage_blacklist:
continue
- decoupId = decoupageNode.getAttributeNS(None,u"id")
- res = xml.xpath.Evaluate("elements/element", decoupageNode)
+ decoupId = decoupageNode.get(u"id")
+ res = decoupageNode.xpath("elements/element")
for elementNode in res:
doc = lucene.Document()
- elementId = elementNode.getAttributeNS(None,u"id")
- tags = elementNode.getAttributeNS(None,u"tags")
+ elementId = elementNode.get(u"id")
+ tags = elementNode.get(u"tags")
if tags is not None:
tags.replace(",", ";")
if tags is None or len(tags) == 0:
tags = ""
- restagnode = xml.xpath.Evaluate("tag/text()", elementNode)
+ restagnode = elementNode.xpath("tag/text()")
for tagnode in restagnode:
- tags = tags + " ; " + tagnode.data
+ tags = tags + " ; " + tagnode
if tags is None or len(tags) == 0:
tags = ""
- restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode)
+ restagnode = elementNode.xpath("tags/tag/text()")
for tagnode in restagnode:
- tags = tags + " ; " + tagnode.data
+ tags = tags + " ; " + tagnode
title = ""
- for txtRes in xml.xpath.Evaluate("title/text()", elementNode):
- title = title + txtRes.data
+ for txtRes in elementNode.xpath("title/text()"):
+ title = title + txtRes
abstract = ""
- for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode):
- abstract = abstract + txtRes.data
+ for txtRes in elementNode.xpath("abstract/text()"):
+ abstract = abstract + txtRes
doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO))
@@ -164,53 +156,51 @@
def index_project(self, project):
# pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id))
- doc = xml.dom.minidom.parseString(project.ldt)
- doc = Ft.Xml.Domlette.ConvertDocument(doc)
+ doc = lxml.etree.fromstring(project.ldt)
self.__writer.deleteDocuments(lucene.Term("iri_id", project.iri_id))
- con = xml.xpath.Context.Context(doc, 1, 1, None)
- res = xml.xpath.Evaluate("/iri/annotations/content", context=con)
+ res = doc.xpath("/iri/annotations/content")
for content in res:
- contentId = content.getAttributeNS(None,u"id")
+ contentId = content.get(u"id")
ensembleId = "ens_perso"
- for decoupageNode in content.childNodes:
+ for decoupageNode in content:
# pocketfilms.utils.log.debug("Indexing content decoupage : "+ repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist))
- if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage" or decoupageNode.getAttributeNS(None,"id") in self.decoupage_blacklist:
- continue
+ if decoupageNode.tag != "decoupage" or decoupageNode.get("id") in self.decoupage_blacklist:
+ continue
- decoupId = decoupageNode.getAttributeNS(None,u"id")
- res = xml.xpath.Evaluate("elements/element", decoupageNode)
+ decoupId = decoupageNode.get(u"id")
+ res = decoupageNode.xpath("elements/element")
for elementNode in res:
doc = lucene.Document()
- elementId = elementNode.getAttributeNS(None,u"id")
- tags = elementNode.getAttributeNS(None,u"tags")
+ elementId = elementNode.get(u"id")
+ tags = elementNode.get(u"tags")
if tags is not None:
tags.replace(",", ";")
if tags is None or len(tags) == 0:
tags = ""
- restagnode = xml.xpath.Evaluate("tag/text()", elementNode)
+ restagnode = elementNode.xpath("tag/text()")
for tagnode in restagnode:
- tags = tags + " ; " + tagnode.data
+ tags = tags + " ; " + tagnode
if tags is None or len(tags) == 0:
tags = ""
- restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode)
+ restagnode = elementNode.xpath("tags/tag/text()")
for tagnode in restagnode:
- tags = tags + " ; " + tagnode.data
+ tags = tags + " ; " + tagnode
title = ""
- for txtRes in xml.xpath.Evaluate("title/text()", elementNode):
- title = title + txtRes.data
+ for txtRes in elementNode.xpath("title/text()"):
+ title = title + txtRes
abstract = ""
- for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode):
- abstract = abstract + txtRes.data
+ for txtRes in elementNode.xpath("abstract/text()"):
+ abstract = abstract + txtRes
doc.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))