# Port contentindexer.py from xml.dom/Ft XPath to lxml.etree.
# lxml usage in the added lines: etree.parse() reads the file-like object returned by urlopen(),
# Element.get(name) takes the attribute name as its first argument, and XPath text() results are plain strings.
diff -r 2f978b081c4c -r 9927a619d2b5 web/ldt/ldt_utils/contentindexer.py --- a/web/ldt/ldt_utils/contentindexer.py Thu Oct 14 12:17:31 2010 +0200 +++ b/web/ldt/ldt_utils/contentindexer.py Fri Oct 15 12:36:43 2010 +0200 @@ -8,19 +8,13 @@ import ldt.utils.xml from django.conf import settings from models import Content -import xml -import xml.dom -import xml.dom.minidom -import xml.dom.ext -import xml.xpath import fnmatch -import Ft import uuid import shutil import lucene from ldt.ldt_utils import STORE from ldt.ldt_utils import ANALYZER -## import lxml.etree +import lxml.etree def Property(func): return property(**func()) @@ -57,52 +51,50 @@ def index_content(self, content): url =content.iri_url() filepath = urllib.urlopen(url) - doc = xml.dom.minidom.parse(filepath) - doc = Ft.Xml.Domlette.ConvertDocument(doc) - + doc = lxml.etree.parse(filepath) + self.__writer.deleteDocuments(lucene.Term("iri_id", content.iri_id)) - con = xml.xpath.Context.Context(doc, 1, 1, None) - res = xml.xpath.Evaluate("/iri/body/ensembles/ensemble", context=con) + res = doc.xpath("/iri/body/ensembles/ensemble") for ensemble in res: - ensembleId = ensemble.getAttributeNS(None,u"id") + ensembleId = ensemble.get(u"id") - for decoupageNode in ensemble.childNodes: - if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage" or decoupageNode.getAttributeNS(None,u"id") in self.decoupage_blacklist: + for decoupageNode in ensemble.getchildren(): + if decoupageNode.tag != "decoupage" or decoupageNode.get(u"id") in self.decoupage_blacklist: continue - decoupId = decoupageNode.getAttributeNS(None,u"id") - res = xml.xpath.Evaluate("elements/element", decoupageNode) + decoupId = decoupageNode.get(u"id") + res = decoupageNode.xpath("elements/element") for elementNode in res: doc = lucene.Document() - elementId = elementNode.getAttributeNS(None,u"id") - tags = elementNode.getAttributeNS(None,u"tags") + elementId = elementNode.get(u"id") + tags = 
elementNode.get(u"tags") if tags is not None: tags.replace(",", ";") if tags is None or len(tags) == 0: tags = "" - restagnode = xml.xpath.Evaluate("tag/text()", elementNode) + restagnode = elementNode.xpath("tag/text()") for tagnode in restagnode: - tags = tags + " ; " + tagnode.data + tags = tags + " ; " + tagnode if tags is None or len(tags) == 0: tags = "" - restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode) + restagnode = elementNode.xpath("tags/tag/text()") for tagnode in restagnode: - tags = tags + " ; " + tagnode.data + tags = tags + " ; " + tagnode title = "" - for txtRes in xml.xpath.Evaluate("title/text()", elementNode): - title = title + txtRes.data + for txtRes in elementNode.xpath("title/text()"): + title = title + txtRes abstract = "" - for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode): - abstract = abstract + txtRes.data + for txtRes in elementNode.xpath("abstract/text()"): + abstract = abstract + txtRes doc.add(lucene.Field("iri_id", content.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) doc.add(lucene.Field("ensemble_id", ensembleId, lucene.Field.Store.YES, lucene.Field.Index.NO)) @@ -164,53 +156,51 @@ def index_project(self, project): # pocketfilms.utils.log.debug("Indexing project : "+str(project.iri_id)) - doc = xml.dom.minidom.parseString(project.ldt) - doc = Ft.Xml.Domlette.ConvertDocument(doc) + doc = lxml.etree.fromstring(project.ldt) self.__writer.deleteDocuments(lucene.Term("iri_id", project.iri_id)) - con = xml.xpath.Context.Context(doc, 1, 1, None) - res = xml.xpath.Evaluate("/iri/annotations/content", context=con) + res = doc.xpath("/iri/annotations/content") for content in res: - contentId = content.getAttributeNS(None,u"id") + contentId = content.get(u"id") ensembleId = "ens_perso" - for decoupageNode in content.childNodes: + for decoupageNode in content.getchildren(): # pocketfilms.utils.log.debug("Indexing content decoupage : "+ 
repr(decoupageNode.nodeType) + " in " + repr(self.decoupage_blacklist)) - if decoupageNode.nodeType != xml.dom.Node.ELEMENT_NODE or decoupageNode.tagName != "decoupage" or decoupageNode.getAttributeNS(None,"id") in self.decoupage_blacklist: - continue + if decoupageNode.tag != "decoupage" or decoupageNode.get("id") in self.decoupage_blacklist: + continue - decoupId = decoupageNode.getAttributeNS(None,u"id") - res = xml.xpath.Evaluate("elements/element", decoupageNode) + decoupId = decoupageNode.get(u"id") + res = decoupageNode.xpath("elements/element") for elementNode in res: doc = lucene.Document() - elementId = elementNode.getAttributeNS(None,u"id") - tags = elementNode.getAttributeNS(None,u"tags") + elementId = elementNode.get(u"id") + tags = elementNode.get(u"tags") if tags is not None: tags.replace(",", ";") if tags is None or len(tags) == 0: tags = "" - restagnode = xml.xpath.Evaluate("tag/text()", elementNode) + restagnode = elementNode.xpath("tag/text()") for tagnode in restagnode: - tags = tags + " ; " + tagnode.data + tags = tags + " ; " + tagnode if tags is None or len(tags) == 0: tags = "" - restagnode = xml.xpath.Evaluate("tags/tag/text()", elementNode) + restagnode = elementNode.xpath("tags/tag/text()") for tagnode in restagnode: - tags = tags + " ; " + tagnode.data + tags = tags + " ; " + tagnode title = "" - for txtRes in xml.xpath.Evaluate("title/text()", elementNode): - title = title + txtRes.data + for txtRes in elementNode.xpath("title/text()"): + title = title + txtRes abstract = "" - for txtRes in xml.xpath.Evaluate("abstract/text()", elementNode): - abstract = abstract + txtRes.data + for txtRes in elementNode.xpath("abstract/text()"): + abstract = abstract + txtRes doc.add(lucene.Field("project_id", project.iri_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) doc.add(lucene.Field("iri_id", contentId, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))