improve indexation query language
author ymh <ymh.work@gmail.com>
Thu, 02 Aug 2012 08:45:12 +0200
changeset 725 4f4005df9a97
parent 724 30f0bf1d3f58
child 726 c1529d821263
improve indexation query language
.settings/org.eclipse.core.resources.prefs
src/ldt/ldt/indexation/__init__.py
src/ldt/ldt/indexation/models.py
src/ldt/ldt/indexation/query_parser.py
src/ldt/ldt/indexation/tests.py
src/ldt/ldt/ldt_utils/utils.py
src/ldt/ldt/test/test_runner.py
virtualenv/res/lib/lib_create_env.py
virtualenv/res/src/mercurial-2.2.2.tar.gz
virtualenv/res/src/mercurial-2.2.3.tar.gz
--- a/.settings/org.eclipse.core.resources.prefs	Tue Jul 31 17:45:14 2012 +0200
+++ b/.settings/org.eclipse.core.resources.prefs	Thu Aug 02 08:45:12 2012 +0200
@@ -1,10 +1,12 @@
-#Fri Jul 27 18:53:06 CEST 2012
 eclipse.preferences.version=1
 encoding//src/ldt/ldt/core/migrations/0001_initial.py=utf-8
 encoding//src/ldt/ldt/core/migrations/0002_auto__del_owner.py=utf-8
 encoding//src/ldt/ldt/indexation/backends/elasticsearch_backend.py=utf-8
 encoding//src/ldt/ldt/indexation/highlighter.py=utf-8
+encoding//src/ldt/ldt/indexation/models.py=utf-8
+encoding//src/ldt/ldt/indexation/query_parser.py=utf-8
 encoding//src/ldt/ldt/indexation/search_indexes.py=utf-8
+encoding//src/ldt/ldt/indexation/tests.py=utf-8
 encoding//src/ldt/ldt/ldt_utils/migrations/0001_initial.py=utf-8
 encoding//src/ldt/ldt/ldt_utils/migrations/0002_auto__add_field_media_mimetype_field__chg_field_media_external_src_url.py=utf-8
 encoding//src/ldt/ldt/ldt_utils/migrations/0003_auto__chg_field_project_owner.py=utf-8
@@ -18,6 +20,7 @@
 encoding//src/ldt/ldt/ldt_utils/migrations/0017_correct_image_path.py=utf-8
 encoding//src/ldt/ldt/ldt_utils/views/json.py=utf-8
 encoding//src/ldt/ldt/management/utils.py=utf-8
+encoding//src/ldt/ldt/test/test_runner.py=utf-8
 encoding//src/ldt/ldt/text/migrations/0001_initial.py=utf-8
 encoding//src/ldt/ldt/user/migrations/0001_initial.py=utf-8
 encoding//src/ldt/ldt/user/migrations/0008_auto__chg_field_groupprofile_image__chg_field_groupprofile_group__chg_.py.old=utf-8
--- a/src/ldt/ldt/indexation/__init__.py	Tue Jul 31 17:45:14 2012 +0200
+++ b/src/ldt/ldt/indexation/__init__.py	Thu Aug 02 08:45:12 2012 +0200
@@ -2,6 +2,7 @@
 
 from haystack.query import SearchQuerySet
 from ldt.indexation.highlighter import LdtHighlighter as Highlighter
+from ldt.indexation.query_parser import QueryParser
 from ldt.ldt_utils.models import Segment
 from ldt.text.models import Annotation
 import re
@@ -32,7 +33,9 @@
     if field == 'all':
         field = 'text'
     
-    qs = SearchQuerySet().models(Segment).auto_query(query, field)
+    qp = QueryParser(field)
+    
+    qs = SearchQuerySet().models(Segment).filter(qp.parse(query))
     if highlight:
         qs = qs.highlight()
     return qs
@@ -46,7 +49,8 @@
     elif field == 'text':
         field = 'text_field'
         
-    qs = SearchQuerySet.models(Annotation).auto_query(query, field)
+    qp = QueryParser(field)
+    qs = SearchQuerySet().models(Annotation).filter(qp.parse(query))  # models() is an instance method: build the queryset first
     
     return [{'external_id':res.get_stored_fields()['external_id'], 'title': res.get_stored_fields()['title'], 'score': res.score} for res in qs] 
     
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/ldt/ldt/indexation/models.py	Thu Aug 02 08:45:12 2012 +0200
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+'''
+Created on Aug 1, 2012
+
+@author: ymh
+'''
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/ldt/ldt/indexation/query_parser.py	Thu Aug 02 08:45:12 2012 +0200
@@ -0,0 +1,175 @@
+# -*- coding: utf-8 -*-
+'''
+Created on Aug 1, 2012
+
+@author: ymh
+
+Translate a textual search query into a haystack SQ filter.
+
+The text is parsed with a whoosh SimpleParser and the resulting whoosh query
+tree is mirrored, node by node, as a tree of haystack SQ objects.
+'''
+
+#TODO: unittest for (note left unfinished)
+
+from whoosh.qparser import SimpleParser, FieldsPlugin, OperatorsPlugin, PhrasePlugin, SingleQuotePlugin, GroupPlugin, PrefixPlugin, GtLtPlugin, RangePlugin
+from whoosh.query import Term, And, AndMaybe, Or, AndNot, Not, Phrase, Prefix, TermRange
+from haystack.query import SQ
+from django.conf import settings
+
+# connector given to the root SQ node in parse(); 'AND' when the setting is absent
+HAYSTACK_DEFAULT_OPERATOR = getattr(settings,'HAYSTACK_DEFAULT_OPERATOR','AND')
+
+
+class QueryParser(object):
+    '''
+    Turn a query string into a haystack SQ expression.
+
+    A whoosh SimpleParser (fields, operators, phrase, single quote, group,
+    prefix, gt/lt and range plugins) does the parsing; the visit methods then
+    rebuild the whoosh query tree with SQ nodes.
+    '''
+
+    def __init__(self, fieldname):
+        '''
+        Create the whoosh parser.
+
+        :param fieldname: passed to SimpleParser as its field name.
+        '''
+        self.w_parser = SimpleParser(fieldname, None)
+        for plugin_class in (FieldsPlugin, OperatorsPlugin, PhrasePlugin,
+                             SingleQuotePlugin, GroupPlugin, PrefixPlugin,
+                             GtLtPlugin, RangePlugin):
+            self.w_parser.add_plugin(plugin_class())
+        self.query = None
+        # stack of (SQ node, connector) pairs: the last entry is the node being
+        # filled and the connector used to add children to it
+        self.current_node_stack = []
+
+    def parse(self, query):
+        '''
+        Parse a query string.
+
+        :param query: the text to parse.
+        :return: the SQ tree built from the query; when the root only holds one
+                 SQ child, that child is returned instead of the root.
+        '''
+        self.query = SQ()
+        self.current_node_stack = [(self.query, HAYSTACK_DEFAULT_OPERATOR)]
+
+        self.visit(self.w_parser.parse(query))
+
+        return self._unwrap(self.query)
+
+    def visit(self, q):
+        '''
+        Add the SQ equivalent of the whoosh query node q to the current node.
+
+        Node types that are not listed below are silently ignored.
+        '''
+        if isinstance(q, Term):
+            self._attach(SQ(**{q.fieldname: q.text}))
+        elif isinstance(q, And):
+            self._add_compound_query(q, SQ.AND)
+        elif isinstance(q, AndMaybe):
+            self._add_andmaybe(q)
+        elif isinstance(q, Or):
+            self._add_compound_query(q, SQ.OR)
+        elif isinstance(q, AndNot):
+            self._add_andnot(q)
+        elif isinstance(q, Not):
+            self._add_not(q)
+        elif isinstance(q, Phrase):
+            self._add_phrase(q)
+        elif isinstance(q, Prefix):
+            self._add_prefix(q)
+        elif isinstance(q, TermRange):
+            self._add_range(q)
+
+    @staticmethod
+    def _unwrap(node):
+        '''
+        Return the single SQ child of node when it has exactly one, else node.
+        '''
+        if len(node) == 1 and isinstance(node.children[0], SQ):
+            return node.children[0]
+        return node
+
+    def _attach(self, node):
+        '''
+        Add node to the node on top of the stack, with that entry's connector.
+        '''
+        current_node, current_connector = self.current_node_stack[-1]
+        current_node.add(node, current_connector)
+
+    def _build_group(self, subqueries, connector):
+        '''
+        Visit subqueries inside a new SQ node and return it, unwrapped.
+
+        :param subqueries: whoosh query nodes to visit, in order.
+        :param connector: connector used to join them inside the new node.
+        '''
+        new_node = SQ()
+        self.current_node_stack.append((new_node, connector))
+        for subquery in subqueries:
+            self.visit(subquery)
+        self.current_node_stack.pop()
+        return self._unwrap(new_node)
+
+    def _add_compound_query(self, q, connector):
+        '''Add the subqueries of q, joined by connector, as one group.'''
+        self._attach(self._build_group(q.subqueries, connector))
+
+    def _add_andnot(self, q):
+        '''Add "a AND NOT b" for the whoosh AndNot node q.'''
+        self._attach(self._build_group([q.a, Not(q.b)], SQ.AND))
+
+    def _add_andmaybe(self, q):
+        '''
+        Add the whoosh AndMaybe node q; both sides are joined with AND.
+
+        NOTE(review): this makes q.b mandatory as well - confirm it is intended.
+        '''
+        self._attach(self._build_group([q.a, q.b], SQ.AND))
+
+    def _add_not(self, q):
+        '''Add the negation of the query wrapped by the whoosh Not node q.'''
+        self._attach(~self._build_group([q.query], SQ.AND))
+
+    def _add_phrase(self, q):
+        '''Add an __exact lookup on the words of the phrase, joined by spaces.'''
+        self._attach(SQ(**{q.fieldname + "__exact": " ".join(q.words)}))
+
+    def _add_prefix(self, q):
+        '''Add a __startswith lookup for the whoosh Prefix node q.'''
+        self._attach(SQ(**{q.fieldname + "__startswith": q.text}))
+
+    def _add_range(self, q):
+        '''
+        Add a range lookup for the whoosh TermRange node q.
+
+        An open start gives __lt/__lte, an open end gives __gt/__gte (depending
+        on the exclusive flags) and a range with both bounds gives __range, for
+        which the exclusive flags are not looked at.
+        '''
+        if q.start is None:
+            lookup = "__lt" if q.endexcl else "__lte"
+            value = self.__convert_nb(q.end)
+        elif q.end is None:
+            lookup = "__gt" if q.startexcl else "__gte"
+            value = self.__convert_nb(q.start)
+        else:
+            lookup = "__range"
+            value = [self.__convert_nb(q.start), self.__convert_nb(q.end)]
+        self._attach(SQ(**{q.fieldname + lookup: value}))
+
+    def __convert_nb(self, value):
+        '''
+        Return value as an int, else as a float, else unchanged.
+        '''
+        for cast in (int, float):
+            try:
+                return cast(value)
+            except ValueError:
+                continue
+        return value
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/ldt/ldt/indexation/tests.py	Thu Aug 02 08:45:12 2012 +0200
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+'''
+Created on Aug 1, 2012
+
+@author: ymh
+'''
+from django.test import SimpleTestCase
+from ldt.indexation.query_parser import QueryParser
+from haystack.query import SQ
+import unittest
+
+
+class QueryParserTest(SimpleTestCase):
+    '''
+    Check the SQ expressions built by QueryParser.
+
+    Every test parses a query with "text" as the default field and compares
+    the str() form of the result with the str() form of the expected SQ.
+    '''
+
+    def test_simple_term(self):
+        qp = QueryParser("text")
+        res = qp.parse("hello")
+        self.assertEqual(str(res), str(SQ(text="hello")))
+
+    def test_multiple_terms(self):
+        qp = QueryParser("text")
+        res = qp.parse("hello title:world")
+        self.assertEqual(str(res), str(SQ(text="hello")|SQ(title="world")))
+
+    def test_operator(self):
+        qp = QueryParser("text")
+        res = qp.parse("title:hello AND world")
+        self.assertEqual(str(res), str(SQ(title="hello")&SQ(text="world")))
+
+    def test_complex(self):
+        qp = QueryParser("text")
+        res = qp.parse("hello AND world foo")
+        self.assertEqual(str(res), str(SQ(text="hello")&SQ(text="world")|SQ(text="foo")))
+
+    def test_minus(self):
+        qp = QueryParser("text")
+        res = qp.parse("hello -world")
+        self.assertEqual(str(res), str(SQ(text="hello")&~SQ(text="world")))
+
+    def test_not(self):
+        qp = QueryParser("text")
+        res = qp.parse("hello NOT world")
+        self.assertEqual(str(res), str(SQ(text="hello")|~SQ(text="world")))
+
+    def test_exact(self):
+        qp = QueryParser("text")
+        res = qp.parse('title:"hello world"')
+        self.assertEqual(str(res), str(SQ(title__exact="hello world")))
+
+    def test_single_quote(self):
+        qp = QueryParser("text")
+        res = qp.parse("title:'hello world'")
+        self.assertEqual(str(res), str(SQ(title="hello world")))
+
+    def test_group(self):
+        qp = QueryParser("text")
+        res = qp.parse("(hello world) AND (foo bar)")
+        self.assertEqual(str(res), str(SQ(text="hello")&SQ(text="world")&SQ(text="foo")&SQ(text="bar")))
+
+    def test_group_or(self):
+        qp = QueryParser("text")
+        res = qp.parse("(hello world) OR (foo bar)")
+        self.assertEqual(str(res), str((SQ(text="hello")&SQ(text="world"))|(SQ(text="foo")&SQ(text="bar")) ))
+
+    def test_prefix(self):
+        qp = QueryParser("text")
+        res = qp.parse("title:foo*")
+        self.assertEqual(str(res), str(SQ(title__startswith='foo')))
+
+    def test_plus(self):
+        qp = QueryParser("text")
+        res = qp.parse("title:foo +bar")
+        self.assertEqual(str(res), str(SQ(text='bar')&SQ(title='foo')))
+
+    def test_plus_multiple(self):
+        qp = QueryParser("text")
+
+        res = qp.parse("title:foo +bar +fighter")
+        self.assertEqual(str(res), str(SQ(text='bar')&SQ(text="fighter")&SQ(title='foo')))
+
+        res = qp.parse("+title:foo +bar +fighter")
+        self.assertEqual(str(res), str(SQ(title='foo')&SQ(text='bar')&SQ(text="fighter")))
+
+    def test_ltgt(self):
+        qp = QueryParser("text")
+        res = qp.parse("count:<10")
+        self.assertEqual(str(res), str(SQ(count__lt=10)))
+        res = qp.parse("count:>10")
+        self.assertEqual(str(res), str(SQ(count__gt=10)))
+        res = qp.parse("count:<=10")
+        self.assertEqual(str(res), str(SQ(count__lte=10)))
+        res = qp.parse("count:>=10")
+        self.assertEqual(str(res), str(SQ(count__gte=10)))
+
+    def test_ltgt_float(self):
+        qp = QueryParser("text")
+        res = qp.parse("count:<3.14")
+        self.assertEqual(str(res), str(SQ(count__lt=3.14)))
+        res = qp.parse("count:>3.14")
+        self.assertEqual(str(res), str(SQ(count__gt=3.14)))
+        res = qp.parse("count:<=3.14")
+        self.assertEqual(str(res), str(SQ(count__lte=3.14)))
+        res = qp.parse("count:>=3.14")
+        self.assertEqual(str(res), str(SQ(count__gte=3.14)))
+
+    def test_ltgt_str(self):
+        qp = QueryParser("text")
+        res = qp.parse("count:<foo")
+        self.assertEqual(str(res), str(SQ(count__lt='foo')))
+        res = qp.parse("count:>foo")
+        self.assertEqual(str(res), str(SQ(count__gt='foo')))
+        res = qp.parse("count:<=foo")
+        self.assertEqual(str(res), str(SQ(count__lte='foo')))
+        res = qp.parse("count:>=foo")
+        self.assertEqual(str(res), str(SQ(count__gte='foo')))
+
+    def test_range(self):
+        qp = QueryParser("text")
+        res = qp.parse("count:[foo to bar]")
+        self.assertEqual(str(res), str(SQ(count__range=['foo','bar'])))
+
+    def test_range_nb(self):
+        qp = QueryParser("text")
+        res = qp.parse("count:[3 to 5]")
+        self.assertEqual(str(res), str(SQ(count__range=[3,5])))
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/src/ldt/ldt/ldt_utils/utils.py	Tue Jul 31 17:45:14 2012 +0200
+++ b/src/ldt/ldt/ldt_utils/utils.py	Thu Aug 02 08:45:12 2012 +0200
@@ -36,8 +36,6 @@
 def generate_uuid():
     return unicode(uuid.uuid1())
 
-        
-
 class LdtUtils(object):
     
     def generate_ldt(self, contentList, title=u"", author=u"IRI Web", web_url=u"", startSegment=None, projects=None, types_id_list=None):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/ldt/ldt/test/test_runner.py	Thu Aug 02 08:45:12 2012 +0200
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+'''
+Created on Aug 1, 2012
+
+@author: ymh
+'''
+from django.test.simple import DjangoTestSuiteRunner
+
+
+class NoDbTestRunner(DjangoTestSuiteRunner):
+    """Test runner that neither creates nor destroys any database."""
+
+    def setup_databases(self, **kwargs):
+        """Do nothing in place of the parent's database creation."""
+        return None
+
+    def teardown_databases(self, old_config, **kwargs):
+        """Do nothing in place of the parent's database teardown."""
+        return None
\ No newline at end of file
--- a/virtualenv/res/lib/lib_create_env.py	Tue Jul 31 17:45:14 2012 +0200
+++ b/virtualenv/res/lib/lib_create_env.py	Thu Aug 02 08:45:12 2012 +0200
@@ -34,7 +34,7 @@
     'PYCRYPTO': {'setup': 'pycrypto', 'url':'https://ftp.dlitz.net/pub/dlitz/crypto/pycrypto/pycrypto-2.6.tar.gz', 'local':'pycrypto-2.6.tar.gz', 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
     'SSH': {'setup': 'ssh', 'url':'http://pypi.python.org/packages/source/s/ssh/ssh-1.7.14.tar.gz#md5=4cdd0549ef4699bd67b96264d3b21427', 'local':'ssh-1.7.14.tar.gz', 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
     'FABRIC': {'setup': 'fabric', 'url':'https://github.com/fabric/fabric/tarball/1.4.2', 'local':'fabric-1.4.2.tar.gz', 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
-    'MERCURIAL': {'setup': 'mercurial', 'url':'http://mercurial.selenic.com/release/mercurial-2.2.2.tar.gz', 'local':'mercurial-2.2.2.tar.gz', 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
+    'MERCURIAL': {'setup': 'mercurial', 'url':'http://mercurial.selenic.com/release/mercurial-2.2.3.tar.gz', 'local':'mercurial-2.2.3.tar.gz', 'install': {'method': 'pip', 'option_str': None, 'dict_extra_env': None}},
     'HAYSTACK': {'setup': 'django-haystack', 'url': 'https://github.com/toastdriven/django-haystack/tarball/master', 'local': 'django-haystack-v2.0.0.tar.gz', 'install':{'method':'pip', 'option_str': None, 'dict_extra_env': None}},
     'REQUEST': {'setup': 'requests', 'url':'https://github.com/kennethreitz/requests/tarball/v0.13.3', 'local':'requests-v0.13.3.tar.gz', 'install' : {'method':'pip', 'option_str': None, 'dict_extra_env': None}},
     'PYELASTICSEARCH': {'setup': 'pyelasticsearch', 'url':'https://github.com/toastdriven/pyelasticsearch/tarball/master', 'local':'pyelasticsearch.tar.gz', 'install' : {'method':'pip', 'option_str': None, 'dict_extra_env': None}},
Binary file virtualenv/res/src/mercurial-2.2.2.tar.gz has changed
Binary file virtualenv/res/src/mercurial-2.2.3.tar.gz has changed