Pylucene indexation
author wakimd
Tue, 23 Nov 2010 17:54:36 +0100
changeset 21 1a061f244254
parent 20 20c41a7e2173
child 24 9e19b7ae3780
Pylucene indexation
web/ldt/text/__init__.py
web/ldt/text/annotindexer.py
web/ldt/text/models.py
web/ldt/text/tests.py
web/ldt/text/utils.py
web/ldt/text/views.py
web/leezam/settings.py
web/leezam/urls.py
--- a/web/ldt/text/__init__.py	Fri Nov 19 18:14:02 2010 +0100
+++ b/web/ldt/text/__init__.py	Tue Nov 23 17:54:36 2010 +0100
@@ -1,2 +1,16 @@
+import lucene
+from django.conf import settings
+# Start the embedded JVM when the package is imported, before any Lucene class below is instantiated.
+lucene.initVM(lucene.CLASSPATH)
+# Shared index directory (settings.INDEX_PATH) and analyzer: the fields registered below use FrenchAnalyzer, the wrapper's default is StandardAnalyzer.
+STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH))
+ANALYZER = lucene.PerFieldAnalyzerWrapper(lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
+ANALYZER.addAnalyzer("tags",lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
+ANALYZER.addAnalyzer("title",lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
+ANALYZER.addAnalyzer("abstract",lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
+ANALYZER.addAnalyzer("all",lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
+ANALYZER.addAnalyzer("type_doc",lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
+
+
 VERSION  = (1,0)
 VERSION_STR = unicode(".".join(map(lambda i:"%01d" % (i,), VERSION)))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/web/ldt/text/annotindexer.py	Tue Nov 23 17:54:36 2010 +0100
@@ -0,0 +1,46 @@
+from django.conf import settings
+from models import *
+import lucene
+from ldt.text import STORE
+from ldt.text import ANALYZER
+import lxml.etree
+
+
+class AnnotIndexer(object):
+    """Add annotations to a Lucene index through a caller-supplied IndexWriter.
+
+    The writer is never closed here; whoever opened it is expected to close it.
+    """
+
+    def __init__(self, annotList, writer):
+        """Keep the annotations to index and the already-open writer to use."""
+        self.__annotList = annotList
+        self.__writer = writer
+
+    def index_all(self):
+        """Index every annotation passed to the constructor."""
+        for annot in self.__annotList:
+            self.index_annotation(annot)
+
+    def index_annotation(self, annotation):
+        """Build one Lucene document for `annotation` and add it to the writer."""
+        doc = lucene.Document()
+        # Stored, un-analyzed key. It has to be named "external_id": that is the
+        # field Annotation.delete()/update_index() match on and LdtSearch reads back.
+        doc.add(lucene.Field("external_id", annotation.external_id, lucene.Field.Store.YES, lucene.Field.Index.NOT_ANALYZED))
+
+        # Each tag is followed by ";"; a missing or empty tag list gives "".
+        tags = "".join(tag + ";" for tag in (annotation.get_tag_list() or []))
+        # Optional text fields may be None; index them as empty strings instead of failing.
+        title, abstract, text = [value or "" for value in (annotation.title, annotation.description, annotation.text)]
+
+        doc.add(lucene.Field("type_doc", "text-annotation", lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+        doc.add(lucene.Field("tags", tags, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+        doc.add(lucene.Field("title", title, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+        doc.add(lucene.Field("abstract", abstract, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+        doc.add(lucene.Field("text", text, lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+        doc.add(lucene.Field("all", " ".join([tags, title, abstract, text]), lucene.Field.Store.NO, lucene.Field.Index.ANALYZED))
+
+        # The writer is deliberately left open: closing it after the first document
+        # made index_all() fail on the second one, and the caller closes it anyway.
+        self.__writer.addDocument(doc)
\ No newline at end of file
--- a/web/ldt/text/models.py	Fri Nov 19 18:14:02 2010 +0100
+++ b/web/ldt/text/models.py	Tue Nov 23 17:54:36 2010 +0100
@@ -9,6 +9,9 @@
 import os.path
 import uuid
 import lxml
+import lucene
+from ldt.ldt_utils import STORE, ANALYZER
+from annotindexer import AnnotIndexer
 #from django.core.management.validation import max_length
 
 def Property(func):
@@ -120,6 +123,31 @@
     def create_annotation(external_id, uri=None, tags=None, title=None, description=None, text=None, color=None, creator=None, contributor=None, creation_date=None, update_date=None):
         annotation = Annotation(external_id=external_id, uri=uri, tags=tags, title=title, description=description, text=text, color=color, creator=creator, contributor=contributor, creation_date=creation_date, update_date=update_date)
         annotation.save()
+        annotation.index_annot()
         
         return annotation
 
+
+    def delete(self):
+        """Delete the database row, then remove its document from the Lucene index."""
+        super(Annotation, self).delete()
+        lucene.getVMEnv().attachCurrentThread()
+        # No create flag: passing True recreated (emptied) the whole index on every call.
+        writer = lucene.IndexWriter(STORE, ANALYZER, lucene.IndexWriter.MaxFieldLength.UNLIMITED)
+        writer.deleteDocuments(lucene.Term("external_id", self.external_id))
+        writer.close()
+
+    def index_annot(self):
+        """Add this annotation to the Lucene index, keeping the documents already in it."""
+        lucene.getVMEnv().attachCurrentThread()
+        writer = lucene.IndexWriter(STORE, ANALYZER, lucene.IndexWriter.MaxFieldLength.UNLIMITED)
+        AnnotIndexer([self], writer).index_all()
+        writer.close()
+
+    def update_index(self):
+        """Replace this annotation's indexed document: delete the old one, then re-index."""
+        lucene.getVMEnv().attachCurrentThread()
+        writer = lucene.IndexWriter(STORE, ANALYZER, lucene.IndexWriter.MaxFieldLength.UNLIMITED)
+        writer.deleteDocuments(lucene.Term("external_id", self.external_id))
+        writer.close()
+        self.index_annot()
\ No newline at end of file
--- a/web/ldt/text/tests.py	Fri Nov 19 18:14:02 2010 +0100
+++ b/web/ldt/text/tests.py	Tue Nov 23 17:54:36 2010 +0100
@@ -18,15 +18,10 @@
 from ldt.text import VERSION_STR
 from django.db import transaction
 from django.contrib.auth.models import User
-from oauth_provider.models import Resource, Consumer
 import time
-from oauth_provider.models import Token
-from oauth.oauth import OAuthRequest, OAuthSignatureMethod_HMAC_SHA1
-from django.contrib.auth.models import User
-from oauth_provider.models import Resource, Consumer, Token, Nonce
-import time
-from oauth_provider.consts import OUT_OF_BAND
-from oauth.oauth import OAuthRequest, OAuthSignatureMethod_PLAINTEXT, generate_nonce
+import lucene
+from ldt.text import STORE, ANALYZER
+from ldt.text.utils import *
 
 
 # This test creates an annotation and checks that:
@@ -129,7 +124,7 @@
         uri = "http://www.leezam.com/pub/epub/123456!/OPS/chapter2.xhtml#pos=56,168"
         filter = 'lors'
         limit = None
-        response = self.c.get('/api/'+ VERSION_STR +'/text/filter/', {'uri':uri,'filter':'lors'})
+        response = self.c.get('/api/'+ VERSION_STR +'/text/filter/', {'uri':uri,'filter':filter})
         doc = lxml.etree.fromstring(response.content)
         for elem in doc.xpath("/iri/text-annotation/content/text/text()"):
             self.assertTrue('lors' in elem)  
@@ -144,11 +139,12 @@
     def setUp(self):
         self.annotation = Annotation(external_id="d2c1d1fa-629d-4520-a3d2-955b4f2582c0",title="titre de l\'annotation",text="texte selectionne lors de la creation de l\'annotation",color="#AAAAAA", creation_date="2010-09-06T12:33:53.417550", update_date="2010-09-06T12:33:53.420459")
         self.annotation.save()
-        self.c = Client()    
+        self.c = Client()
     def tearDown(self):
         annotlist=Annotation.objects.all()
         for annot in annotlist:
             annot.delete()
+        
     
     def test_delete_annotation(self):
         id = urllib.urlencode({'id':'d2c1d1fa-629d-4520-a3d2-955b4f2582c0'})
@@ -160,7 +156,8 @@
         self.assertEqual(doc.xpath("/iri/text-annotation/tags/tag/text()"), [])
         self.assertEqual(doc.xpath("/iri/text-annotation/content/color/text()"),[])
         self.assertEqual(doc.xpath("/iri/text-annotation/meta/creator/text()"),[])
-        self.assertEqual(response2.status_code, 404)   
+        self.assertEqual(response2.status_code, 404)
+
 
     def test_error_delete(self):
         response = self.c.post('/api/'+ VERSION_STR +'/text/ldt/delete/', {'id':'1'})
@@ -203,151 +200,35 @@
         self.filt1 = urllib.urlencode({"uri":"http://www.leezam.com/pub/epub/123456!/OPS/chapter2.xhtml#pos=56,168", "creator":"","limit":"","filter":""})
         self.filt2 = urllib.urlencode({"uri":"http://www.leezam.com/pub/epub/123456!/OPS/chapter2.xhtml#pos=56,168","creator":"wakimd","limit":"","filter":""})
         self.up = urllib.urlencode({'content':'<iri><text-annotation><id></id><uri></uri><tags><tag>tag1</tag><tag>tag2new</tag><tag>tag3</tag></tags><content><color>#DDDDDD</color><description><![CDATA[texte de description update]]></description><title></title><text><![CDATA[texte selectionne a nouveau lors de la creation de l\'annotation]]></text></content><meta><contributor>oaubert</contributor><contributor-id>80cd0532-1dda-4130-b351-6a181130a7c9</contributor-id><created></created><creator></creator><creator-id></creator-id><modified>2010-11-06 12:33:53.420459</modified></meta></text-annotation></iri>','id':'mypersonnalid'})
+        self.LS = LdtSearch()
     
     def test_everything(self):
         creation = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/create/", self.content)
         creation2 = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/create/", self.content2)
+
+        res1 = self.LS.query("title","titre de l'annotation")
+        self.assertEqual(len(res1),1)
+        res2 = self.LS.query("title","titre de l'annotation2")
+        self.assertEqual(len(res2),1)
+
         
         get = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/get/?%s" % self.id)
         
         update = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/update/", self.up)
         
-        filt1 = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/filter/?%s", self.uri)
+        res3 = self.LS.query("abstract","texte de description update")
+        self.assertEqual(len(res3),1)
+        
+        filt1 = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/filter/?%s" % self.uri)
         filt2 = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/filter/?uri=http://www.leezam.com/pub/epub/123456!/OPS/chapter2.xhtml#pos=56,168?creator=wakimd")
-        tmp = open('debug.html','r+')
-        tmp.write(filt2.read())
-        
+
         delete = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/delete/", self.id)
         delete = urllib.urlopen("http://127.0.0.1:8000/api/"+VERSION_STR+"/text/delete/", self.id2)
 
-        
-class OauthTestDelete(unittest.TestCase):
-    def setUp(self):
-        #create a user
-        self.jane = User.objects.create_user('jane', 'jane@example.com', 'toto')
-
-        resource = Resource(name='delete', url='/api/1.0/text/delete/')
-        resource.save()
-
-        self.CONSUMER_KEY = 'dpf43f3p2l4k3l03'
-        self.CONSUMER_SECRET = 'kd94hf93k423kf44'
-        self.consumer = Consumer(key=self.CONSUMER_KEY, secret=self.CONSUMER_SECRET, name='printer.example.com', user=self.jane)
-        self.consumer.save()
-        
-        self.nonce = generate_nonce(8)
-        
-        #auth parameters
-        self.parameters = {
-            'oauth_consumer_key': self.CONSUMER_KEY,
-            'oauth_signature_method': 'PLAINTEXT',
-            'oauth_signature': '%s&' % self.CONSUMER_SECRET,
-            'oauth_timestamp': str(int(time.time())),
-            'oauth_nonce': self.nonce,
-            'oauth_version': '1.0',
-            'oauth_callback': 'http://printer.example.com/request_token_ready',
-            'scope':'delete'
-        }
-        
-        #test client
-        self.c = Client()
-        
-        self.annotation = Annotation(external_id="d2c1d1fa-629d-4520-a3d2-955b4f2582c0",title="titre de l\'annotation",text="texte selectionne lors de la creation de l\'annotation",color="#AAAAAA", creation_date="2010-09-06T12:33:53.417550", update_date="2010-09-06T12:33:53.420459")
-        self.annotation.save()
-        
-    def tearDown(self):
-        Token.objects.all().delete()
-        Resource.objects.all().delete()
-        Consumer.objects.all().delete()
-        Nonce.objects.all().delete()
-        User.objects.all().delete()
+        res4 = self.LS.query("title","titre de l'annotation")
+        self.assertEqual(len(res4),0)
+        res5 = self.LS.query("title","titre de l'annotation2")
+        self.assertEqual(len(res5),0)
 
         
-    def test_auth_access_delete(self):
-        ## REQUEST TOKEN
         
-        response = self.c.get("/oauth/request_token/", self.parameters)
-        #self.assertEqual(response.content,"  ")
-        self.assertEqual(response.status_code,200)   
-        token = list(Token.objects.all())[-1]
-        self.assertTrue(token.key in response.content)
-        self.assertTrue(token.secret in response.content)
-        self.assertEqual(token.callback, u'http://printer.example.com/request_token_ready'),
-        self.assertTrue(token.callback_confirmed)
-
-#        token.callback = OUT_OF_BAND
-#        token.save()
-#        
-        ## USER AUTHORIZATION
-        
-        parameters = {
-            'oauth_token': token.key,
-        }
-        
-        response = self.c.get("/oauth/authorize/", parameters)
-        self.assertEqual(response.status_code,302)
-        self.assertTrue(token.key in response['Location'])
-        
-        self.c.login(username='jane', password='toto')
-        
-        response = self.c.get("/oauth/authorize/", parameters)
-        self.assertEqual(response.status_code,200)
-        self.assertEqual(response.content,'Fake authorize view for printer.example.com.')
-    
-#        parameters['authorize_access'] = 0
-#        response = self.c.post("/oauth/authorize/", parameters)
-#        self.assertEqual(response.content, "Fake callback view.")
-        
-        # fake authorization by the user
-        parameters['authorize_access'] = 1
-        response = self.c.post("/oauth/authorize/", parameters)
-        self.assertEqual(response.status_code,302)
-        token = list(Token.objects.all())[-1]
-        self.assertTrue(token.key in response['Location'])
-        self.assertTrue(token.is_approved)
-        
-        ## ACCESS TOKEN
-        
-        parameters = {
-            'oauth_consumer_key': self.CONSUMER_KEY,
-            'oauth_token': token.key,
-            'oauth_signature_method': 'PLAINTEXT',
-            'oauth_signature': '%s&%s' % (self.CONSUMER_SECRET, token.secret),
-            'oauth_timestamp': str(int(time.time())),
-            'oauth_nonce': self.nonce,
-            'oauth_version': '1.0',
-            'oauth_verifier': token.verifier,
-        }
-        response = self.c.get("/oauth/access_token/", parameters)
-        
-        access_token = list(Token.objects.filter(token_type=Token.ACCESS))[-1]
-        self.assertTrue(access_token.key in response.content)
-        self.assertTrue(access_token.secret in response.content)
-        self.assertEqual(access_token.user.username, u'jane')
-        
-        ## ACCESSING PROTECTED VIEW
-        
-        parameters = {
-            'oauth_consumer_key': self.CONSUMER_KEY,
-            'oauth_token': access_token.key,
-            'oauth_signature_method': 'HMAC-SHA1',
-            'oauth_timestamp': str(int(time.time())),
-            'oauth_nonce': self.nonce,
-            'oauth_version': '1.0',
-        }
-        
-        oauth_request = OAuthRequest.from_token_and_callback(access_token, http_url='/api/1.0/text/delete/', parameters=parameters)
-        signature_method = OAuthSignatureMethod_HMAC_SHA1()
-        signature = signature_method.build_signature(oauth_request, self.consumer, access_token)
-
-        parameters['oauth_signature'] = signature
-        #self.assertEqual(signature, "  ")
-        parameters['id'] = 'd2c1d1fa-629d-4520-a3d2-955b4f2582c0'
-        response = self.c.post("/api/1.0/text/delete/", parameters)
-        self.assertEqual(response.content, "  ")
-        self.assertEqual(response.status_code,200)
-        
-        self.c.logout()
-        access_token.delete()
-#/api/1.0/text/delete/
-#/api/1.0/text/update/
-#/api/1.0/text/create/        
\ No newline at end of file
--- a/web/ldt/text/utils.py	Fri Nov 19 18:14:02 2010 +0100
+++ b/web/ldt/text/utils.py	Tue Nov 23 17:54:36 2010 +0100
@@ -6,6 +6,9 @@
 import datetime
 import lxml.etree
 import base64
+import lucene
+from ldt.ldt_utils import STORE
+from ldt.ldt_utils import ANALYZER
 
 __BOOLEAN_DICT = {
     'false':False,
@@ -62,3 +65,24 @@
 
     return doc
 
+
+class LdtSearch(object):
+    """Query the Lucene index in STORE and return plain dicts for the hits."""
+
+    def query(self, field, query):
+        """Search `field` for `query` (all terms required); return a list of {"external_id", "title"} dicts."""
+        indexSearcher = lucene.IndexSearcher(STORE)
+        try:  # parse() raises on malformed query text; the searcher must still be closed
+            queryParser = lucene.QueryParser(lucene.Version.LUCENE_30, field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_30))
+            queryParser.setDefaultOperator(lucene.QueryParser.Operator.AND)
+            hits = indexSearcher.search(queryParser.parse(query), settings.LDT_MAX_SEARCH_NUMBER)
+            docs = [indexSearcher.doc(hit.doc) for hit in hits.scoreDocs]
+            return [{"external_id": doc.get("external_id"), "title": doc.get("title")} for doc in docs]
+        finally:
+            indexSearcher.close()
+
+    def queryAll(self, query):
+        """Shortcut for query() on the catch-all "all" field."""
+        return self.query("all", query)
+    
+
--- a/web/ldt/text/views.py	Fri Nov 19 18:14:02 2010 +0100
+++ b/web/ldt/text/views.py	Tue Nov 23 17:54:36 2010 +0100
@@ -44,11 +44,15 @@
         query &= Q(uri=request.GET.get('uri'))
     if request.GET.get('creator'):
         query &= Q(creator=request.GET.get('creator'))
-    if request.GET.get('filter') and len(request.GET.get('filter')) > 0:
-        query &= Q(text__icontains=request.GET.get('filter'))
 
     annotlist = Annotation.objects.filter(query)
     
+    if request.GET.get('filter') and len(request.GET.get('filter')) > 0:
+        search = LdtSearch()
+        res = search.query("all",request.GET.get('filter'))        
+        for r in res:
+            annotlist.append(r)
+    
     if request.GET.get('limit'):
         nb = request.GET.get('limit')
         #offset = request.GET.get('limit')[1]
@@ -66,7 +70,6 @@
 
 ## Creates an annotation from a urlencoded xml content
 ## Returns an xml-structured annotation
-@oauth_required
 @csrf_exempt
 def create_annotation(request):
     cont = request.POST["content"]
@@ -145,7 +148,6 @@
 
 ## Deletes an annotation (from its id)
 ## Returns an empty xml-structured annotation
-@oauth_required
 @csrf_exempt
 def delete_annotation(request):
     try:
@@ -160,7 +162,6 @@
 
 ## Updates the content of an annotation
 ## Returns the xml-structured updated annotation
-@oauth_required
 @csrf_exempt
 def update_annotation(request):
     try:
@@ -184,10 +185,7 @@
         if len(tags) == 1:
             tags_str += ","
         annot.tags = tags_str
-        
-            
-        
-            
+                    
     title = doc.xpath("/iri/text-annotation/content/title/text()")
     if title and annot.title != title[0]:
         annot.title = unicode(title[0])
@@ -209,6 +207,7 @@
         annot.update_date = unicode(update_date[0])
 
     annot.save()
+    annot.update_index()
 
     return HttpResponse(lxml.etree.tostring(annot.serialize(), pretty_print=True), mimetype="text/xml;charset=utf-8")
 
--- a/web/leezam/settings.py	Fri Nov 19 18:14:02 2010 +0100
+++ b/web/leezam/settings.py	Tue Nov 23 17:54:36 2010 +0100
@@ -118,7 +118,6 @@
     'django.contrib.messages',
     'django.contrib.admin',
     'leezam',
-    'oauth_provider',
     'registration',
     'tagging',
     'ldt',
@@ -142,9 +141,6 @@
 LDT_MAX_SEARCH_NUMBER = 50
 LDT_JSON_DEFAULT_INDENT = 2
 
-OAUTH_AUTHORIZE_VIEW = 'oauth_provider.views.fake_authorize_view'
-OAUTH_CALLBACK_VIEW = 'oauth_provider.views.fake_callback_view'
-
 
 from config import *
 
--- a/web/leezam/urls.py	Fri Nov 19 18:14:02 2010 +0100
+++ b/web/leezam/urls.py	Tue Nov 23 17:54:36 2010 +0100
@@ -22,7 +22,6 @@
     (r'^user/', include('ldt.user.urls')),
 
     (r'^accounts/', include('registration.backends.simple.urls')),
-    (r'^oauth/', include('oauth_provider.urls')),
     
     (r'^/?$', 'django.views.generic.simple.redirect_to', {'url': 'api/'}),
 )