web/ldt/text/annotindexer.py
author wakimd
Tue, 23 Nov 2010 17:54:36 +0100
changeset 21 1a061f244254
permissions -rw-r--r--
Pylucene indexation
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
21
1a061f244254 Pylucene indexation
wakimd
parents:
diff changeset
     1
from django.conf import settings
1a061f244254 Pylucene indexation
wakimd
parents:
diff changeset
     2
from models import *
1a061f244254 Pylucene indexation
wakimd
parents:
diff changeset
     3
import lucene
1a061f244254 Pylucene indexation
wakimd
parents:
diff changeset
     4
from ldt.text import STORE
1a061f244254 Pylucene indexation
wakimd
parents:
diff changeset
     5
from ldt.text import ANALYZER
1a061f244254 Pylucene indexation
wakimd
parents:
diff changeset
     6
import lxml.etree
1a061f244254 Pylucene indexation
wakimd
parents:
diff changeset
     7
1a061f244254 Pylucene indexation
wakimd
parents:
diff changeset
     8
1a061f244254 Pylucene indexation
wakimd
parents:
diff changeset
     9
class AnnotIndexer(object):
    """Index text annotations as Lucene documents through a supplied writer.

    Each annotation becomes one document with a stored, non-analyzed
    ``annotation_id`` field and analyzed, non-stored ``type_doc``, ``tags``,
    ``title``, ``abstract``, ``text`` and ``all`` fields.

    The writer is closed by both public entry points once they are done,
    so an indexer instance is good for a single ``index_all()`` call or a
    single ``index_annotation()`` call.
    """

    def __init__(self, annotList, writer):
        """Remember what to index and which writer to index it with.

        annotList -- iterable of annotation objects; each one must expose
            ``external_id``, ``title``, ``description``, ``text`` and a
            ``get_tag_list()`` method.
        writer -- object exposing ``addDocument(doc)`` and ``close()``
            (presumably a ``lucene.IndexWriter`` -- confirm with callers).
        """
        self.__annotList = annotList
        self.__writer = writer

    def index_all(self):
        """Index every annotation of the list, then close the writer.

        The writer is closed exactly once, after the last annotation, and
        also when the list is empty or when indexing fails part-way, so it
        is never left open. Exceptions raised while indexing propagate to
        the caller.
        """
        try:
            for annot in self.__annotList:
                # Must not go through index_annotation(): that closes the
                # writer and would break every annotation after the first.
                self._add_document(annot)
        finally:
            self.__writer.close()

    def index_annotation(self, annotation):
        """Index a single annotation, then close the writer.

        annotation -- annotation object (see ``__init__`` for the expected
            attributes).

        The writer is closed even if building or adding the document
        raises; the exception still propagates to the caller.
        """
        try:
            self._add_document(annotation)
        finally:
            self.__writer.close()

    def _add_document(self, annotation):
        """Build the document for ``annotation`` and add it; never closes."""
        self.__writer.addDocument(self._build_document(annotation))

    def _build_document(self, annotation):
        """Return the ``lucene.Document`` describing ``annotation``."""
        tags = self._join_tags(annotation.get_tag_list())
        # A missing (None) value is indexed as an empty string instead of
        # making Field() or the " ".join() below raise.
        title = annotation.title or ""
        description = annotation.description or ""
        text = annotation.text or ""

        store = lucene.Field.Store
        index = lucene.Field.Index

        doc = lucene.Document()
        # The identifier is the only stored field and is kept as one exact term.
        doc.add(lucene.Field("annotation_id", annotation.external_id, store.YES, index.NOT_ANALYZED))
        doc.add(lucene.Field("type_doc", "text-annotation", store.NO, index.ANALYZED))
        doc.add(lucene.Field("tags", tags, store.NO, index.ANALYZED))
        doc.add(lucene.Field("title", title, store.NO, index.ANALYZED))
        doc.add(lucene.Field("abstract", description, store.NO, index.ANALYZED))
        doc.add(lucene.Field("text", text, store.NO, index.ANALYZED))
        # Catch-all field: all of the textual content in a single field.
        doc.add(lucene.Field("all", " ".join([tags, title, description, text]), store.NO, index.ANALYZED))
        return doc

    @staticmethod
    def _join_tags(annottags):
        """Return the tags concatenated, each one followed by ``;``.

        ``None`` or an empty collection yields an empty string.
        """
        if not annottags:
            return ""
        return "".join([tag + ";" for tag in annottags])
1a061f244254 Pylucene indexation
wakimd
parents:
diff changeset
    46