src/ldt/ldt/indexation/__init__.py
author hamidouk
Thu, 02 Feb 2012 11:07:20 +0100
changeset 500 10ec59f06198
parent 452 8e9494006e7b
child 568 b67fc0fd2389
permissions -rw-r--r--
Merge with upstream
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
77
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     1
from django.conf import settings
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     2
import lucene
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     3
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents:
diff changeset
     4
# Start the Java VM at import time, using PyLucene's bundled classpath.
lucene.initVM(lucene.CLASSPATH)

# Index directory on disk, located by the Django setting INDEX_PATH.
STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH))

# StandardAnalyzer is the default; the fields listed below are analyzed
# with FrenchAnalyzer instead.
ANALYZER = lucene.PerFieldAnalyzerWrapper(lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
for _french_field in ("tags", "title", "abstract", "all"):
    ANALYZER.addAnalyzer(_french_field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
del _french_field  # keep the module namespace identical to the unrolled version
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    12
176
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
    13
def get_results_with_context(field, query):
    """Search ``field`` for ``query`` and return one dict per hit.

    Each dict contains the stored identifiers ``iri_id``, ``ensemble_id``,
    ``decoupage_id``, ``element_id`` and ``project_id``, plus:

    * ``context``   -- stored value of the ``abstract`` field
    * ``title``, ``tags``, ``begin``, ``duration`` -- stored field values
    * ``score``     -- score of the hit
    * ``lucene_id`` -- Lucene document number of the hit

    The searcher opened here is always closed, even when reading a document
    raises (for instance when a document lacks one of the expected fields).
    """
    res = get_results_list(field, query)
    searcher = get_searcher()
    contexts = []

    try:
        for hit in res:
            doc = searcher.doc(hit.doc)
            contexts.append({
                "iri_id": doc.get("iri_id"),
                "ensemble_id": doc.get("ensemble_id"),
                "decoupage_id": doc.get("decoupage_id"),
                "element_id": doc.get("element_id"),
                "project_id": doc.get("project_id"),
                "context": doc.getField('abstract').stringValue(),
                "title": doc.getField('title').stringValue(),
                "tags": doc.getField('tags').stringValue(),
                "score": hit.score,
                "lucene_id": hit.doc,
                "begin": doc.getField('begin').stringValue(),
                "duration": doc.getField('duration').stringValue(),
            })
    finally:
        # Release the index reader held by the searcher on every exit path.
        searcher.close()

    return contexts
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
    40
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
    41
def get_results_list(field, query):
    """Return the score docs matching ``query`` on ``field``.

    The query string is parsed with the parser from ``get_query_parser`` and
    at most ``settings.LDT_MAX_SEARCH_NUMBER`` hits are requested.

    The searcher is closed before returning: the returned ``scoreDocs`` only
    carry document numbers and scores, so callers that need stored fields
    open their own searcher.
    """
    indexSearcher = get_searcher()
    try:
        queryObj = get_query_parser(field).parse(query)
        hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER)
        return hits.scoreDocs
    finally:
        # Previously the searcher was never closed, leaking one per search.
        indexSearcher.close()
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
    48
452
8e9494006e7b segment abstracts + content images can be retrieved directly from search results page
verrierj
parents: 349
diff changeset
    49
def highlight_documents(results_list, query, field):
    """Fill search-result segments with highlighted text, in place.

    ``results_list`` is an iterable of mappings whose ``'list'`` entry holds
    segment objects exposing a ``lucene_id`` attribute. For every segment the
    ``context`` (abstract), ``title`` and ``context_tags`` attributes are set
    to the highlighted fragments of the corresponding stored field; when the
    highlighter yields an empty string the raw stored value is used instead.

    Returns the same ``results_list`` object.
    """
    searcher = get_searcher()
    try:
        analyzer = lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)
        formatter = lucene.SimpleHTMLFormatter('<span class="highlight">', '</span>')
        query = get_query_parser(field).parse(query)
        highlighter = lucene.Highlighter(formatter, lucene.QueryScorer(query))

        for project in results_list:
            for segment in project['list']:
                lucene_doc = searcher.doc(segment.lucene_id)
                segment.context = get_highlighted_text(lucene_doc, analyzer, highlighter, 'abstract')
                tags = get_highlighted_text(lucene_doc, analyzer, highlighter, 'tags')
                segment.title = get_highlighted_text(lucene_doc, analyzer, highlighter, 'title')

                # No highlighted fragment: fall back to the stored value.
                if segment.context == u'':
                    segment.context = lucene_doc.getField('abstract').stringValue()
                if tags == u'':
                    tags = lucene_doc.getField('tags').stringValue()
                if segment.title == u'':
                    segment.title = lucene_doc.getField('title').stringValue()

                # Keep what follows the first ';' (the whole string when
                # there is none, since find() then returns -1).
                segment.context_tags = tags[tags.find(';')+1:]
    finally:
        # Previously the searcher was never closed, leaking one per call.
        searcher.close()

    return results_list
183
89334901b7c2 Improve search page
verrierj
parents: 176
diff changeset
    73
89334901b7c2 Improve search page
verrierj
parents: 176
diff changeset
    74
def get_highlighted_text(doc, analyzer, highlighter, field):
    """Return the highlighted fragments of ``doc``'s stored ``field``.

    The stored text is tokenized with ``analyzer`` and passed to
    ``highlighter.getBestFragments``, asking for at most
    ``settings.LDT_MAX_FRAGMENT_PER_SEARCH`` fragments joined by ``"..."``.
    """
    stored_text = doc.getField(field).stringValue()
    token_stream = analyzer.tokenStream("body", lucene.StringReader(stored_text))
    return highlighter.getBestFragments(
        token_stream,
        stored_text,
        settings.LDT_MAX_FRAGMENT_PER_SEARCH,
        "...",
    )
89334901b7c2 Improve search page
verrierj
parents: 176
diff changeset
    79
142
0b2f32965787 small corrections
ymh <ymh.work@gmail.com>
parents: 119
diff changeset
    80
def get_writer(new=False):
    """Return an ``IndexWriter`` on ``STORE`` using ``ANALYZER``.

    ``new`` is forwarded as the writer's third constructor argument.
    The calling thread is attached to the JVM first.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    max_length = lucene.IndexWriter.MaxFieldLength.UNLIMITED
    return lucene.IndexWriter(STORE, ANALYZER, new, max_length)
77
7923feb2e362 improve indexation
ymh <ymh.work@gmail.com>
parents:
diff changeset
    83
95
8e57d39f660d correct indexation problem
ymh <ymh.work@gmail.com>
parents: 84
diff changeset
    84
def get_searcher():
    """Return a new ``IndexSearcher`` on ``STORE``.

    The calling thread is attached to the JVM first. This function does not
    close the searcher; that is left to the caller.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    index_searcher = lucene.IndexSearcher(STORE)
    return index_searcher
8e57d39f660d correct indexation problem
ymh <ymh.work@gmail.com>
parents: 84
diff changeset
    87
97
10f69a5bd9e1 correct propagation of project id on indexation
ymh <ymh.work@gmail.com>
parents: 95
diff changeset
    88
def get_query_parser(field):
    """Return a ``QueryParser`` whose default field is ``field``.

    The parser uses a French analyzer (both at version ``LUCENE_30``) and
    combines terms with AND by default.
    """
    version = lucene.Version.LUCENE_30
    french_analyzer = lucene.FrenchAnalyzer(version)
    parser = lucene.QueryParser(version, field, french_analyzer)
    parser.setDefaultOperator(lucene.QueryParser.Operator.AND)
    return parser
176
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
    92
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
    93
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
    94
a88714473302 Added a page to display search results
verrierj
parents: 142
diff changeset
    95