# HG changeset patch # User wakimd # Date 1290531276 -3600 # Node ID 1a061f24425462b0267eece1cdc970001580be24 # Parent 20c41a7e2173aa1a80bcfdd94382a3f5ea0cae87 Pylucene indexation diff -r 20c41a7e2173 -r 1a061f244254 web/ldt/text/__init__.py --- a/web/ldt/text/__init__.py Fri Nov 19 18:14:02 2010 +0100 +++ b/web/ldt/text/__init__.py Tue Nov 23 17:54:36 2010 +0100 @@ -1,2 +1,16 @@ +import lucene +from django.conf import settings + +lucene.initVM(lucene.CLASSPATH) + +STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH)) +ANALYZER = lucene.PerFieldAnalyzerWrapper(lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT)) +ANALYZER.addAnalyzer("tags",lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) +ANALYZER.addAnalyzer("title",lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) +ANALYZER.addAnalyzer("abstract",lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) +ANALYZER.addAnalyzer("all",lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) +ANALYZER.addAnalyzer("type_doc",lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)) + + VERSION = (1,0) VERSION_STR = unicode(".".join(map(lambda i:"%01d" % (i,), VERSION))) diff -r 20c41a7e2173 -r 1a061f244254 web/ldt/text/annotindexer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/web/ldt/text/annotindexer.py Tue Nov 23 17:54:36 2010 +0100 @@ -0,0 +1,46 @@ +from django.conf import settings +from models import * +import lucene +from ldt.text import STORE +from ldt.text import ANALYZER +import lxml.etree + + +class AnnotIndexer(object): + + def __init__(self, annotList, writer): + self.__annotList = annotList + self.__writer = writer + + + def index_all(self): + for annot in self.__annotList: + self.index_annotation(annot) + + + def index_annotation(self, annotation): + + doc = lucene.Document() + + doc.add(lucene.Field("annotation_id", annotation.external_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED)) + + annottags = annotation.get_tag_list() + tags = "" + + if annottags is None or len(annottags) == 0: + tags = "" + else: + for tag in annottags: + tags += tag + ";" + + doc.add(lucene.Field("type_doc", "text-annotation", lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("title", annotation.title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("abstract", annotation.description, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("text", annotation.text, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + doc.add(lucene.Field("all", " ".join([tags, annotation.title, annotation.description, annotation.text]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED)) + + self.__writer.addDocument(doc) + + self.__writer.close() + \ No newline at end of file diff -r 20c41a7e2173 -r 1a061f244254 web/ldt/text/models.py --- a/web/ldt/text/models.py Fri Nov 19 18:14:02 2010 +0100 +++ b/web/ldt/text/models.py Tue Nov 23 17:54:36 2010 +0100 @@ -9,6 +9,9 @@ import os.path import uuid import lxml +import lucene +from ldt.ldt_utils import STORE, ANALYZER +from annotindexer import AnnotIndexer #from django.core.management.validation import max_length def Property(func): @@ -120,6 +123,31 @@ def create_annotation(external_id, uri=None, tags=None, title=None, description=None, text=None, color=None, creator=None, contributor=None, creation_date=None, update_date=None): annotation = Annotation(external_id=external_id, uri=uri, tags=tags, title=title, description=description, text=text, color=color, creator=creator, contributor=contributor, creation_date=creation_date, update_date=update_date) annotation.save() + annotation.index_annot() return annotation + + def delete(self): + super(Annotation, self).delete() + lucene.getVMEnv().attachCurrentThread() + writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) + writer.deleteDocuments(lucene.Term("external_id", self.external_id)) + writer.close() + + def index_annot(self): + lucene.getVMEnv().attachCurrentThread() + writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) + annotl = [self,] + indexer = AnnotIndexer(annotl,writer) + indexer.index_all() + writer.close() + + def update_index(self): + lucene.getVMEnv().attachCurrentThread() + writer = lucene.IndexWriter(STORE, ANALYZER, True, lucene.IndexWriter.MaxFieldLength.UNLIMITED) + writer.deleteDocuments(lucene.Term("external_id", self.external_id)) + writer.close() + self.index_annot() + + \ No newline at end of file diff -r 20c41a7e2173 -r 1a061f244254 web/ldt/text/tests.py --- a/web/ldt/text/tests.py Fri Nov 19 18:14:02 2010 +0100 +++ b/web/ldt/text/tests.py Tue Nov 23 17:54:36 2010 +0100 @@ -18,15 +18,10 @@ from ldt.text import VERSION_STR from django.db import transaction from django.contrib.auth.models import User -from oauth_provider.models import Resource, Consumer import time -from oauth_provider.models import Token -from oauth.oauth import OAuthRequest, OAuthSignatureMethod_HMAC_SHA1 -from django.contrib.auth.models import User -from oauth_provider.models import Resource, Consumer, Token, Nonce -import time -from oauth_provider.consts import OUT_OF_BAND -from oauth.oauth import OAuthRequest, OAuthSignatureMethod_PLAINTEXT, generate_nonce +import lucene +from ldt.text import STORE, ANALYZER +from ldt.text.utils import * # This test creates an annotation and checks that: @@ -129,7 +124,7 @@ uri = "http://www.leezam.com/pub/epub/123456!/OPS/chapter2.xhtml#pos=56,168" filter = 'lors' limit = None - response = self.c.get('/api/'+ VERSION_STR +'/text/filter/', {'uri':uri,'filter':'lors'}) + response = self.c.get('/api/'+ VERSION_STR +'/text/filter/', {'uri':uri,'filter':filter}) doc = lxml.etree.fromstring(response.content) for elem in doc.xpath("/iri/text-annotation/content/text/text()"): self.assertTrue('lors' in elem) @@ -144,11 +139,12 @@ def setUp(self): self.annotation = Annotation(external_id="d2c1d1fa-629d-4520-a3d2-955b4f2582c0",title="titre de l\'annotation",text="texte selectionne lors de la creation de l\'annotation",color="#AAAAAA", creation_date="2010-09-06T12:33:53.417550", update_date="2010-09-06T12:33:53.420459") self.annotation.save() - self.c = Client() + self.c = Client() def tearDown(self): annotlist=Annotation.objects.all() for annot in annotlist: annot.delete() + def test_delete_annotation(self): id = urllib.urlencode({'id':'d2c1d1fa-629d-4520-a3d2-955b4f2582c0'}) @@ -160,7 +156,8 @@ self.assertEqual(doc.xpath("/iri/text-annotation/tags/tag/text()"), []) self.assertEqual(doc.xpath("/iri/text-annotation/content/color/text()"),[]) self.assertEqual(doc.xpath("/iri/text-annotation/meta/creator/text()"),[]) - self.assertEqual(response2.status_code, 404) + self.assertEqual(response2.status_code, 404) + def test_error_delete(self): response = self.c.post('/api/'+ VERSION_STR +'/text/ldt/delete/', {'id':'1'}) @@ -203,151 +200,35 @@ self.filt1 = urllib.urlencode({"uri":"http://www.leezam.com/pub/epub/123456!/OPS/chapter2.xhtml#pos=56,168", "creator":"","limit":"","filter":""}) self.filt2 = urllib.urlencode({"uri":"http://www.leezam.com/pub/epub/123456!/OPS/chapter2.xhtml#pos=56,168","creator":"wakimd","limit":"","filter":""}) self.up = urllib.urlencode({'content':'tag1tag2newtag3#DDDDDDoaubert80cd0532-1dda-4130-b351-6a181130a7c92010-11-06 12:33:53.420459','id':'mypersonnalid'}) + self.LS = LdtSearch() def test_everything(self): creation = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/create/", self.content) creation2 = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/create/", self.content2) + + res1 = self.LS.query("title","titre de l'annotation") + self.assertEqual(len(res1),1) + res2 = self.LS.query("title","titre de l'annotation2") + self.assertEqual(len(res2),1) + get = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/get/?%s" % self.id) update = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/update/", self.up) - filt1 = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/filter/?%s", self.uri) + res3 = self.LS.query("abstract","texte de description update") + self.assertEqual(len(res3),1) + + filt1 = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/filter/?%s" % self.uri) filt2 = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/filter/?uri=http://www.leezam.com/pub/epub/123456!/OPS/chapter2.xhtml#pos=56,168?creator=wakimd") - tmp = open('debug.html','r+') - tmp.write(filt2.read()) - + delete = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/delete/", self.id) delete = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/delete/", self.id2) - -class OauthTestDelete(unittest.TestCase): - def setUp(self): - #create a user - self.jane = User.objects.create_user('jane', 'jane@example.com', 'toto') - - resource = Resource(name='delete', url='/api/1.0/text/delete/') - resource.save() - - self.CONSUMER_KEY = 'dpf43f3p2l4k3l03' - self.CONSUMER_SECRET = 'kd94hf93k423kf44' - self.consumer = Consumer(key=self.CONSUMER_KEY, secret=self.CONSUMER_SECRET, name='printer.example.com', user=self.jane) - self.consumer.save() - - self.nonce = generate_nonce(8) - - #auth parameters - self.parameters = { - 'oauth_consumer_key': self.CONSUMER_KEY, - 'oauth_signature_method': 'PLAINTEXT', - 'oauth_signature': '%s&' % self.CONSUMER_SECRET, - 'oauth_timestamp': str(int(time.time())), - 'oauth_nonce': self.nonce, - 'oauth_version': '1.0', - 'oauth_callback': 'http://printer.example.com/request_token_ready', - 'scope':'delete' - } - - #test client - self.c = Client() - - self.annotation = Annotation(external_id="d2c1d1fa-629d-4520-a3d2-955b4f2582c0",title="titre de l\'annotation",text="texte selectionne lors de la creation de l\'annotation",color="#AAAAAA", creation_date="2010-09-06T12:33:53.417550", update_date="2010-09-06T12:33:53.420459") - self.annotation.save() - - def tearDown(self): - Token.objects.all().delete() - Resource.objects.all().delete() - Consumer.objects.all().delete() - Nonce.objects.all().delete() - User.objects.all().delete() + res4 = self.LS.query("title","titre de l'annotation") + self.assertEqual(len(res4),0) + res5 = self.LS.query("title","titre de l'annotation2") + self.assertEqual(len(res5),0) - def test_auth_access_delete(self): - ## REQUEST TOKEN - response = self.c.get("/oauth/request_token/", self.parameters) - #self.assertEqual(response.content," ") - self.assertEqual(response.status_code,200) - token = list(Token.objects.all())[-1] - self.assertTrue(token.key in response.content) - self.assertTrue(token.secret in response.content) - self.assertEqual(token.callback, u'http://printer.example.com/request_token_ready'), - self.assertTrue(token.callback_confirmed) - -# token.callback = OUT_OF_BAND -# token.save() -# - ## USER AUTHORIZATION - - parameters = { - 'oauth_token': token.key, - } - - response = self.c.get("/oauth/authorize/", parameters) - self.assertEqual(response.status_code,302) - self.assertTrue(token.key in response['Location']) - - self.c.login(username='jane', password='toto') - - response = self.c.get("/oauth/authorize/", parameters) - self.assertEqual(response.status_code,200) - self.assertEqual(response.content,'Fake authorize view for printer.example.com.') - -# parameters['authorize_access'] = 0 -# response = self.c.post("/oauth/authorize/", parameters) -# self.assertEqual(response.content, "Fake callback view.") - - # fake authorization by the user - parameters['authorize_access'] = 1 - response = self.c.post("/oauth/authorize/", parameters) - self.assertEqual(response.status_code,302) - token = list(Token.objects.all())[-1] - self.assertTrue(token.key in response['Location']) - self.assertTrue(token.is_approved) - - ## ACCESS TOKEN - - parameters = { - 'oauth_consumer_key': self.CONSUMER_KEY, - 'oauth_token': token.key, - 'oauth_signature_method': 'PLAINTEXT', - 'oauth_signature': '%s&%s' % (self.CONSUMER_SECRET, token.secret), - 'oauth_timestamp': str(int(time.time())), - 'oauth_nonce': self.nonce, - 'oauth_version': '1.0', - 'oauth_verifier': token.verifier, - } - response = self.c.get("/oauth/access_token/", parameters) - - access_token = list(Token.objects.filter(token_type=Token.ACCESS))[-1] - self.assertTrue(access_token.key in response.content) - self.assertTrue(access_token.secret in response.content) - self.assertEqual(access_token.user.username, u'jane') - - ## ACCESSING PROTECTED VIEW - - parameters = { - 'oauth_consumer_key': self.CONSUMER_KEY, - 'oauth_token': access_token.key, - 'oauth_signature_method': 'HMAC-SHA1', - 'oauth_timestamp': str(int(time.time())), - 'oauth_nonce': self.nonce, - 'oauth_version': '1.0', - } - - oauth_request = OAuthRequest.from_token_and_callback(access_token, http_url='/api/1.0/text/delete/', parameters=parameters) - signature_method = OAuthSignatureMethod_HMAC_SHA1() - signature = signature_method.build_signature(oauth_request, self.consumer, access_token) - - parameters['oauth_signature'] = signature - #self.assertEqual(signature, " ") - parameters['id'] = 'd2c1d1fa-629d-4520-a3d2-955b4f2582c0' - response = self.c.post("/api/1.0/text/delete/", parameters) - self.assertEqual(response.content, " ") - self.assertEqual(response.status_code,200) - - self.c.logout() - access_token.delete() -#/api/1.0/text/delete/ -#/api/1.0/text/update/ -#/api/1.0/text/create/ \ No newline at end of file diff -r 20c41a7e2173 -r 1a061f244254 web/ldt/text/utils.py --- a/web/ldt/text/utils.py Fri Nov 19 18:14:02 2010 +0100 +++ b/web/ldt/text/utils.py Tue Nov 23 17:54:36 2010 +0100 @@ -6,6 +6,9 @@ import datetime import lxml.etree import base64 +import lucene +from ldt.ldt_utils import STORE +from ldt.ldt_utils import ANALYZER __BOOLEAN_DICT = { 'false':False, @@ -62,3 +65,24 @@ return doc + +class LdtSearch(object): + + def query(self, field, query): + indexSearcher = lucene.IndexSearcher(STORE) + queryParser = lucene.QueryParser(lucene.Version.LUCENE_30, field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_30)) + queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND) + queryObj = queryParser.parse(query) + hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER) + + res = [] + for hit in hits.scoreDocs: + doc = indexSearcher.doc(hit.doc) + res.append({"external_id":doc.get("external_id"),"title":doc.get("title")}) + indexSearcher.close() + return res + + def queryAll(self, query): + return self.query("all", query) + + diff -r 20c41a7e2173 -r 1a061f244254 web/ldt/text/views.py --- a/web/ldt/text/views.py Fri Nov 19 18:14:02 2010 +0100 +++ b/web/ldt/text/views.py Tue Nov 23 17:54:36 2010 +0100 @@ -44,11 +44,15 @@ query &= Q(uri=request.GET.get('uri')) if request.GET.get('creator'): query &= Q(creator=request.GET.get('creator')) - if request.GET.get('filter') and len(request.GET.get('filter')) > 0: - query &= Q(text__icontains=request.GET.get('filter')) annotlist = Annotation.objects.filter(query) + if request.GET.get('filter') and len(request.GET.get('filter')) > 0: + search = LdtSearch() + res = search.query("all",request.GET.get('filter')) + for r in res: + annotlist.append(r) + if request.GET.get('limit'): nb = request.GET.get('limit') #offset = request.GET.get('limit')[1] @@ -66,7 +70,6 @@ ## Creates an annotation from a urlencoded xml content ## Returns an xml-structured annotation -@oauth_required @csrf_exempt def create_annotation(request): cont = request.POST["content"] @@ -145,7 +148,6 @@ ## Deletes an annotation (from its id) ## Returns an empty xml-structured annotation -@oauth_required @csrf_exempt def delete_annotation(request): try: @@ -160,7 +162,6 @@ ## Updates the content of an annotation ## Returns the xml-structured updated annotation -@oauth_required @csrf_exempt def update_annotation(request): try: @@ -184,10 +185,7 @@ if len(tags) == 1: tags_str += "," annot.tags = tags_str - - - - + title = doc.xpath("/iri/text-annotation/content/title/text()") if title and annot.title != title[0]: annot.title = unicode(title[0]) @@ -209,6 +207,7 @@ annot.update_date = unicode(update_date[0]) annot.save() + annot.update_index() return HttpResponse(lxml.etree.tostring(annot.serialize(), pretty_print=True), mimetype="text/xml;charset=utf-8") diff -r 20c41a7e2173 -r 1a061f244254 web/leezam/settings.py --- a/web/leezam/settings.py Fri Nov 19 18:14:02 2010 +0100 +++ b/web/leezam/settings.py Tue Nov 23 17:54:36 2010 +0100 @@ -118,7 +118,6 @@ 'django.contrib.messages', 'django.contrib.admin', 'leezam', - 'oauth_provider', 'registration', 'tagging', 'ldt', @@ -142,9 +141,6 @@ LDT_MAX_SEARCH_NUMBER = 50 LDT_JSON_DEFAULT_INDENT = 2 -OAUTH_AUTHORIZE_VIEW = 'oauth_provider.views.fake_authorize_view' -OAUTH_CALLBACK_VIEW = 'oauth_provider.views.fake_callback_view' - from config import * diff -r 20c41a7e2173 -r 1a061f244254 web/leezam/urls.py --- a/web/leezam/urls.py Fri Nov 19 18:14:02 2010 +0100 +++ b/web/leezam/urls.py Tue Nov 23 17:54:36 2010 +0100 @@ -22,7 +22,6 @@ (r'^user/', include('ldt.user.urls')), (r'^accounts/', include('registration.backends.simple.urls')), - (r'^oauth/', include('oauth_provider.urls')), (r'^/?$', 'django.views.generic.simple.redirect_to', {'url': 'api/'}), )