| author | hamidouk |
| Thu, 02 Feb 2012 11:07:20 +0100 | |
| changeset 500 | 10ec59f06198 |
| parent 452 | 8e9494006e7b |
| child 568 | b67fc0fd2389 |
| permissions | -rw-r--r-- |
| 77 | 1 |
from django.conf import settings |
2 |
import lucene |
|
3 |
||
4 |
# Start the embedded JVM before any other Lucene class is touched.
lucene.initVM(lucene.CLASSPATH)

# On-disk index location, taken from the Django setting INDEX_PATH.
STORE = lucene.SimpleFSDirectory(lucene.File(settings.INDEX_PATH))

# Per-field analyzer: StandardAnalyzer by default, FrenchAnalyzer for the
# fields listed below.
_FRENCH_FIELDS = ("tags", "title", "abstract", "all")

ANALYZER = lucene.PerFieldAnalyzerWrapper(
    lucene.StandardAnalyzer(lucene.Version.LUCENE_CURRENT))
for _french_field in _FRENCH_FIELDS:
    # A fresh FrenchAnalyzer instance is registered for every field.
    ANALYZER.addAnalyzer(
        _french_field, lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT))
del _french_field
|
12 |
||
| 176 | 13 |
def get_results_with_context(field, query):
    """Search the index and return one dict of stored values per hit.

    ``field`` and ``query`` are passed unchanged to ``get_results_list``.
    For every returned hit the stored Lucene document is loaded and
    flattened into a dict with the keys ``iri_id``, ``ensemble_id``,
    ``decoupage_id``, ``element_id``, ``project_id``, ``context`` (the
    stored ``abstract``), ``title``, ``tags``, ``score``, ``lucene_id``,
    ``begin`` and ``duration``.

    Returns the list of those dicts, in the order of the hits.
    """
    res = get_results_list(field, query)
    searcher = get_searcher()
    contexts = []

    # try/finally guarantees the searcher is released even when a document
    # lookup fails (e.g. a stored field is missing and getField() is None).
    try:
        for hit in res:
            doc = searcher.doc(hit.doc)
            ids = {
                "iri_id": doc.get("iri_id"),
                "ensemble_id": doc.get("ensemble_id"),
                "decoupage_id": doc.get("decoupage_id"),
                "element_id": doc.get("element_id"),
                "project_id": doc.get("project_id"),
            }
            score = hit.score
            title = doc.getField('title').stringValue()
            desc = doc.getField('abstract').stringValue()
            tags = doc.getField('tags').stringValue()
            begin = doc.getField('begin').stringValue()
            duration = doc.getField('duration').stringValue()

            ids['context'] = desc
            ids['title'] = title
            ids['tags'] = tags
            ids['score'] = score
            ids['lucene_id'] = hit.doc
            ids['begin'] = begin
            ids['duration'] = duration
            contexts.append(ids)
    finally:
        searcher.close()

    return contexts
|
40 |
||
41 |
def get_results_list(field, query):
    """Run ``query`` against ``field`` and return the resulting score docs.

    The query string is parsed with the parser from ``get_query_parser``
    and at most ``settings.LDT_MAX_SEARCH_NUMBER`` hits are requested.

    Returns ``hits.scoreDocs``; each entry exposes ``doc`` (the Lucene
    document number) and ``score``.
    """
    indexSearcher = get_searcher()
    # The searcher was previously never closed, leaking one per call.
    # The score docs only carry a document number and a score, so they
    # stay usable after the searcher is released.
    try:
        queryObj = get_query_parser(field).parse(query)
        hits = indexSearcher.search(queryObj, settings.LDT_MAX_SEARCH_NUMBER)
    finally:
        indexSearcher.close()

    return hits.scoreDocs
|
48 |
||
|
452
8e9494006e7b
segment abstracts + content images can be retrieved directly from search results page
verrierj
parents:
349
diff
changeset
|
49 |
def highlight_documents(results_list, query, field):
    """Fill search-result segments with highlighted abstract, title and tags.

    ``results_list`` is iterated as a sequence of mappings whose ``'list'``
    entry holds segment objects; each segment must expose ``lucene_id``.
    For every segment the stored Lucene document is loaded and:

    * ``segment.context`` is set to the highlighted ``abstract``,
    * ``segment.title`` is set to the highlighted ``title``,
    * ``segment.context_tags`` is set to the highlighted ``tags`` with
      everything up to and including the first ``';'`` removed.

    When highlighting yields an empty string, the raw stored value is used
    instead. Segments are modified in place and ``results_list`` is returned.
    """
    searcher = get_searcher()
    # The searcher was previously never closed; release it on every path.
    try:
        analyzer = lucene.FrenchAnalyzer(lucene.Version.LUCENE_CURRENT)
        formatter = lucene.SimpleHTMLFormatter('<span class="highlight">', '</span>')
        query = get_query_parser(field).parse(query)
        highlighter = lucene.Highlighter(formatter, lucene.QueryScorer(query))

        for project in results_list:
            for segment in project['list']:
                lucene_doc = searcher.doc(segment.lucene_id)
                segment.context = get_highlighted_text(lucene_doc, analyzer, highlighter, 'abstract')
                tags = get_highlighted_text(lucene_doc, analyzer, highlighter, 'tags')
                segment.title = get_highlighted_text(lucene_doc, analyzer, highlighter, 'title')

                # Fall back to the raw stored text when highlighting
                # produced an empty string.
                if segment.context == u'':
                    segment.context = lucene_doc.getField('abstract').stringValue()
                if tags == u'':
                    tags = lucene_doc.getField('tags').stringValue()
                if segment.title == u'':
                    segment.title = lucene_doc.getField('title').stringValue()

                # Keep only what follows the first ';'. When there is no
                # ';', find() returns -1 and the whole string is kept.
                segment.context_tags = tags[tags.find(';')+1:]
    finally:
        searcher.close()

    return results_list
| 183 | 73 |
|
74 |
def get_highlighted_text(doc, analyzer, highlighter, field):
    """Return the highlighted fragments of ``field`` from document ``doc``.

    The stored string value of ``field`` is tokenized with ``analyzer`` and
    passed to ``highlighter.getBestFragments``. At most
    ``settings.LDT_MAX_FRAGMENT_PER_SEARCH`` fragments are requested, joined
    with ``"..."``.
    """
    stored_text = doc.getField(field).stringValue()
    reader = lucene.StringReader(stored_text)
    token_stream = analyzer.tokenStream("body", reader)
    return highlighter.getBestFragments(
        token_stream,
        stored_text,
        settings.LDT_MAX_FRAGMENT_PER_SEARCH,
        "...",
    )
|
79 |
||
| 142 | 80 |
def get_writer(new=False):
    """Return an ``IndexWriter`` on the module-level ``STORE``.

    The calling thread is attached to the Lucene JVM first. ``new`` is
    passed through as the writer's third constructor argument; the writer
    uses the module-level ``ANALYZER`` and an unlimited field length.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    max_length = lucene.IndexWriter.MaxFieldLength.UNLIMITED
    return lucene.IndexWriter(STORE, ANALYZER, new, max_length)
| 77 | 83 |
|
| 95 | 84 |
def get_searcher():
    """Return a new ``IndexSearcher`` on the module-level ``STORE``.

    The calling thread is attached to the Lucene JVM before the searcher
    is created.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    index_searcher = lucene.IndexSearcher(STORE)
    return index_searcher
|
87 |
||
|
97
10f69a5bd9e1
correct propagation of project id on indexation
ymh <ymh.work@gmail.com>
parents:
95
diff
changeset
|
88 |
def get_query_parser(field):
    """Return a ``QueryParser`` whose default search field is ``field``.

    The parser uses a ``FrenchAnalyzer`` built for ``Version.LUCENE_30``
    and combines terms with AND by default.
    """
    parser = lucene.QueryParser(
        lucene.Version.LUCENE_30,
        field,
        lucene.FrenchAnalyzer(lucene.Version.LUCENE_30),
    )
    and_operator = lucene.QueryParser.Operator.AND
    parser.setDefaultOperator(and_operator)
    return parser
|
| 176 | 92 |
|
93 |
||
94 |
||
95 |