- correct CSS and display
- add action to sort tags: bug #16
- correct text sort
- correct text encoding #3
# -*- coding: utf-8 -*-
from django.conf import settings
from django.utils.http import urlquote
from haystack.constants import DJANGO_ID
from haystack.query import SearchQuerySet
from hdabo.models import Tag, TaggedSheet
from wikitools import api, wiki
def normalize_tag(tag):
    """Return *tag* in the canonical form used for tag labels.

    Underscores become spaces, leading/trailing whitespace is removed, runs
    of whitespace are collapsed to a single space and the first character is
    upper-cased (the remaining characters are left untouched).

    Falsy input (empty string, ``None``) is returned unchanged. Input that
    is empty once cleaned (only whitespace and/or underscores) yields an
    empty string instead of raising ``IndexError``.
    """
    if not tag:
        return tag
    # split() without argument both strips the ends and collapses inner
    # whitespace runs, so a single split/join does the whole clean-up.
    cleaned = " ".join(tag.replace("_", " ").split())
    if not cleaned:
        # Nothing left to capitalise (e.g. "   " or "_").
        return cleaned
    return cleaned[0].upper() + cleaned[1:]
def urlize_for_wikipedia(label):
    """Return *label* as a URL-quoted Wikipedia page name.

    Spaces are replaced by underscores before the result is passed to
    ``urlquote``.
    """
    page_name = label.replace(" ", "_")
    return urlquote(page_name)
def __is_homonymie(page_dict):
    """Tell whether a Wikipedia page dict is in a disambiguation category.

    Inspects the ``title`` of each entry of ``page_dict["categories"]`` (a
    missing key means no categories, a missing title counts as empty) and
    returns ``True`` as soon as one contains the French or the English
    disambiguation category name; returns ``False`` otherwise.
    """
    disambiguation_markers = (u'Catégorie:Homonymie', u'Category:Disambiguation pages')
    for category in page_dict.get(u"categories", []):
        category_title = category.get(u"title", u"")
        if any(marker in category_title for marker in disambiguation_markers):
            return True
    return False
def query_wikipedia_title(site, label=None, pageid=None):
    """Look up a single Wikipedia page, by title or by page id.

    ``site`` is handed to ``api.APIRequest``. The page is selected with
    ``label`` when it is truthy, with ``pageid`` otherwise.

    Returns a dict with the keys ``new_label``, ``status``,
    ``wikipedia_url``, ``pageid``, ``dbpedia_uri``, ``revision_id`` and
    ``response``. ``status`` is the ``Tag.TAG_URL_STATUS_DICT`` value for
    ``null_result``, ``homonyme``, ``redirection`` or ``match``. When the
    answer does not hold exactly one existing page, every field other than
    ``status`` and ``response`` is ``None``.
    """
    params = {
        'action': 'query',
        'prop': 'info|categories|langlinks',
        'inprop': 'url',
        'lllimit': '500',
        'cllimit': '500',
        'rvprop': 'ids',
    }
    if label:
        params['titles'] = label
    else:
        params['pageids'] = pageid

    response = api.APIRequest(site, params).query() #@UndefinedVariable
    original_response = response

    def null_result():
        # ``response`` is read at call time, so the dict carries whichever
        # answer was received last (first query or redirect follow-up).
        return {
            'new_label': None,
            'status': Tag.TAG_URL_STATUS_DICT["null_result"],
            'wikipedia_url': None,
            'pageid': None,
            'dbpedia_uri': None,
            'revision_id': None,
            'response': response,
        }

    # Zero pages or several pages are both treated as a null result.
    pages = response['query'].get("pages", {})
    if len(pages) != 1:
        return null_result()

    page = list(pages.values())[0]
    if u"invalid" in page or u"missing" in page:
        return null_result()

    # These three always describe the page that was asked for, even when it
    # turns out to be a redirect.
    url = page.get(u'fullurl', None)
    pageid = page.get(u'pageid', None)
    new_label = page[u'title']

    if __is_homonymie(page):
        status = Tag.TAG_URL_STATUS_DICT["homonyme"]
    elif u"redirect" in page:
        status = Tag.TAG_URL_STATUS_DICT["redirection"]
    else:
        status = Tag.TAG_URL_STATUS_DICT["match"]

    if status == Tag.TAG_URL_STATUS_DICT["redirection"]:
        # Ask again with redirects resolved; the revision id and language
        # links below are then read from the page returned by that query.
        params['redirects'] = True
        response = api.APIRequest(site, params).query() #@UndefinedVariable
        pages = response['query'].get("pages", {})
        if len(pages) != 1:
            return null_result()
        page = list(pages.values())[0]

    revision_id = page.get('lastrevid', None)

    # The English language link, when there is one, is what the DBpedia URI
    # is built from.
    english_label = None
    if status in (Tag.TAG_URL_STATUS_DICT['match'], Tag.TAG_URL_STATUS_DICT['redirection']):
        english_label = next(
            (link["*"] for link in page.get('langlinks', []) if link['lang'] == "en"),
            None,
        )

    dbpedia_uri = None
    if english_label and "#" not in english_label:
        dbpedia_uri = settings.DBPEDIA_URI_TEMPLATE % (urlize_for_wikipedia(english_label))

    return {
        'new_label': new_label,
        'status': status,
        'wikipedia_url': url,
        'pageid': pageid,
        'dbpedia_uri': dbpedia_uri,
        'revision_id': revision_id,
        'response': original_response,
    }
def get_or_create_tag(tag_label):
    """Return the ``Tag`` matching *tag_label*, creating it when none exists.

    The label is normalised with ``normalize_tag`` and compared
    case-insensitively with the existing tags; a tag whose ``url_status`` is
    not ``null_result`` is preferred over one that is. A newly created tag
    is filled in from a Wikipedia title query and saved.

    Returns a ``(tag, wikipedia_revision_id, created)`` tuple. For an
    existing tag the revision id is fetched by page id, or is ``None`` when
    the tag has no ``wikipedia_pageid``.
    """
    tag_label_normalized = normalize_tag(tag_label)
    null_status = Tag.TAG_URL_STATUS_DICT['null_result']

    # Take the first tag with a usable Wikipedia status; fall back to the
    # first tag found when every candidate is a null result.
    tag = None
    for candidate in Tag.objects.filter(label__iexact=tag_label_normalized):
        if candidate.url_status != null_status:
            tag = candidate
            break
        if tag is None:
            tag = candidate

    created = tag is None
    if created:
        tag = Tag(label=tag_label_normalized, original_label=tag_label)

    site = wiki.Wiki(settings.WIKIPEDIA_API_URL) #@UndefinedVariable

    if created:
        wp_res = query_wikipedia_title(site, label=tag_label_normalized)
        wikipedia_revision_id = wp_res['revision_id']
        # Store the Wikipedia data on the new tag.
        if wp_res['new_label'] is not None:
            tag.label = wp_res['new_label']
        if wp_res['status'] is not None:
            tag.url_status = wp_res['status']
        tag.wikipedia_url = wp_res['wikipedia_url']
        tag.wikipedia_pageid = wp_res['pageid']
        tag.dbpedia_uri = wp_res["dbpedia_uri"]
        tag.save()
    elif tag.wikipedia_pageid:
        wp_res = query_wikipedia_title(site, pageid=tag.wikipedia_pageid)
        wikipedia_revision_id = wp_res['revision_id']
    else:
        wikipedia_revision_id = None

    return tag, wikipedia_revision_id, created
def process_tag(site, tag, verbosity=0):
    """Refresh *tag* from Wikipedia and save it.

    Queries ``site`` for the page titled ``tag.label`` and copies the
    returned label, status, URL, page id and DBpedia URI onto the tag (label
    and status only when they are not ``None``). When the page id differs
    from the one the tag had before, ``wikipedia_revision_id`` is updated on
    every ``TaggedSheet`` that references the tag.

    With ``verbosity >= 2`` the queried label and the response returned by
    ``query_wikipedia_title`` are printed.
    """
    queried_label = tag.label
    wp_res = query_wikipedia_title(site, label=queried_label)
    new_label = wp_res['new_label']
    status = wp_res['status']
    url = wp_res['wikipedia_url']
    pageid = wp_res['pageid']
    response = wp_res['response']
    dbpedia_uri = wp_res["dbpedia_uri"]
    revision_id = wp_res["revision_id"]

    if verbosity >= 2:
        # Report the label that was sent, not the one that came back (which
        # is None when nothing was found). A single parenthesised argument
        # prints the same under the Python 2 print statement.
        print("response from query to %s with parameters %s :" % (site.apibase, repr(queried_label)))
        print(repr(response))

    # Remember the old page id before it is overwritten below.
    prev_wikipedia_pageid = tag.wikipedia_pageid

    if new_label is not None:
        tag.label = new_label
    if status is not None:
        tag.url_status = status
    tag.wikipedia_url = url
    tag.wikipedia_pageid = pageid
    tag.dbpedia_uri = dbpedia_uri
    tag.save()

    if prev_wikipedia_pageid != pageid:
        TaggedSheet.objects.filter(tag=tag).update(wikipedia_revision_id=revision_id)
def reorder_datasheet_tags(ds):
    """Re-rank the tagged sheets of *ds* by search score and renumber them.

    For every tagged sheet of ``ds`` a ``SearchQuerySet`` is filtered on the
    tag label as ``title``, or-ed with the label as ``description``, and
    restricted to the entry whose ``DJANGO_ID`` equals ``ds.pk``. On a hit,
    the score of the first result is stored in ``index_note`` and saved.

    The tagged sheets are then sorted by decreasing ``index_note`` (ties by
    ascending ``order``), their ``order`` is rewritten as 1, 2, 3, ... and
    each one is saved. Finally ``ds.manual_order`` is reset to ``False``
    (and ``ds`` saved) when it was set.
    """
    # The same for every tag: only look at the entry of this datasheet.
    id_filter = {DJANGO_ID + "__exact": unicode(ds.pk)}

    tagged_sheets = []
    for tagged_sheet in ds.taggedsheet_set.all():
        label = tagged_sheet.tag.label
        hits = SearchQuerySet().filter(title=label).filter_or(description=label).filter(**id_filter)
        if len(hits) > 0:
            tagged_sheet.index_note = hits[0].score
            tagged_sheet.save()
        tagged_sheets.append(tagged_sheet)

    def ranking(tagged_sheet):
        # Negated note puts the best score first; the previous order breaks ties.
        return (-tagged_sheet.index_note, tagged_sheet.order)

    tagged_sheets.sort(key=ranking)

    for position, tagged_sheet in enumerate(tagged_sheets, 1):
        tagged_sheet.order = position
        tagged_sheet.save()

    if ds.manual_order:
        ds.manual_order = False
        ds.save()