web/hdabo/wp_utils.py
changeset 271 8f77cf71ab02
parent 265 73f19fa4f997
child 272 1c774f7a0341
--- a/web/hdabo/wp_utils.py	Fri Nov 16 18:12:05 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,222 +0,0 @@
-# -*- coding: utf-8 -*-
-from django.conf import settings
-from django.utils.http import urlquote
-from haystack.constants import DJANGO_ID
-from haystack.query import SearchQuerySet
-from hdabo.models import Tag, TaggedSheet
-from wikitools import api, wiki
-
-
-def normalize_tag(tag):
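-    """Strip the label, turn underscores into spaces, collapse repeated
-    whitespace and capitalize the first character."""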
-    tag = tag.strip()
-    if len(tag) == 0:
-        return tag
-    tag = tag.replace("_", " ")
-    tag = " ".join(tag.split())
-    tag = tag[0].upper() + tag[1:]
-    return tag
-
-def urlize_for_wikipedia(label):
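-    """Return the URL-quoted Wikipedia form of a label (spaces become underscores)."""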
-    return urlquote(label.replace(" ", "_"))
-
-
-def __is_homonymie(page_dict):
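-    """Return True if the page belongs to a disambiguation ('Homonymie') category."""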
-    for cat in page_dict.get(u"categories", []):
-        if u'Catégorie:Homonymie' in cat.get(u"title", u"") or u'Category:Disambiguation pages' in cat.get(u"title", u""):
-            return True
-    return False
-
-
-def query_wikipedia_title(site, label=None, pageid=None):
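-    """Query the MediaWiki API for a page, by title or by page id, and return
-    a dict with the resolved label, match status, URLs, page ids, DBpedia URI,
-    last revision id and the raw API response."""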
-    
-    params = {'action': 'query', 'prop': 'info|categories|langlinks',
-              'inprop': 'url', 'lllimit': '500', 'cllimit': '500',
-              'rvprop': 'ids'}
-        
-    if label:
-        params['titles'] = label
-    else:
-        params['pageids'] = pageid
-    wpquery = api.APIRequest(site, params) #@UndefinedVariable
-    
-    response = wpquery.query()
-    original_response = response
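-    # helper returning the result dict for a lookup that found nothing usable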
-    def return_null_result():
-        return {'new_label': None, 'alternative_label': None,
-                'status': Tag.TAG_URL_STATUS_DICT["null_result"],
-                'wikipedia_url': None, 'pageid': None,
-                'alternative_wikipedia_url': None, 'alternative_pageid': None,
-                'dbpedia_uri': None, 'revision_id': None, 'response': response}
-    
-
-    query_dict = response['query']
-    # no page, or more than one page, in the response -> return a null result
-    pages = query_dict.get("pages", {})
-    if len(pages) > 1 or len(pages) == 0:
-        return return_null_result()
-    
-    page = pages.values()[0]
-    
-    if u"invalid" in page or u"missing" in page:
-        return return_null_result()
-
-    url = page.get(u'fullurl', None)
-    pageid = page.get(u'pageid', None)
-    new_label = page[u'title']
-    alternative_label = None
-    alternative_url = None
-    alternative_pageid = None
-    
-    if __is_homonymie(page):
-        status = Tag.TAG_URL_STATUS_DICT["homonyme"]
-    elif u"redirect" in page:
-        status = Tag.TAG_URL_STATUS_DICT["redirection"]
-    else:
-        status = Tag.TAG_URL_STATUS_DICT["match"]
-    
-    if status == Tag.TAG_URL_STATUS_DICT["redirection"]:
-        params['redirects'] = True
-        wpquery = api.APIRequest(site, params) #@UndefinedVariable    
-        response = wpquery.query()
-        query_dict = response['query']
-        pages = query_dict.get("pages", {})
-        # the follow-up query should return exactly one page; bail out otherwise
-        if len(pages) > 1 or len(pages) == 0:
-            return return_null_result()
-        page = pages.values()[0]
-        alternative_label = page.get('title', None)
-        alternative_url = page.get('fullurl', None)
-        alternative_pageid = page.get('pageid', None)
-        
-
-    revision_id = page.get('lastrevid', None)
-    
-    # process the language links to extract the English label
-    english_label = None
-    
-    if status == Tag.TAG_URL_STATUS_DICT['match'] or status == Tag.TAG_URL_STATUS_DICT['redirection']:
-        lang_links = page.get('langlinks', [])
-        for lang_info_dict in lang_links:
-            if lang_info_dict['lang'] == "en":
-                english_label = lang_info_dict["*"]
-                break
-    
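-    # labels containing '#' presumably point to a page section and get no DBpedia URI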
-    if english_label and "#" not in english_label:
-        dbpedia_uri = settings.DBPEDIA_URI_TEMPLATE % (urlize_for_wikipedia(english_label))
-    else:
-        dbpedia_uri = None
-            
-
-    return {'new_label': new_label, 'alternative_label': alternative_label,
-            'status': status, 'wikipedia_url': url, 'pageid': pageid,
-            'alternative_wikipedia_url': alternative_url,
-            'alternative_pageid': alternative_pageid,
-            'dbpedia_uri': dbpedia_uri, 'revision_id': revision_id,
-            'response': original_response}
-
-
-
-def get_or_create_tag(tag_label):
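-    """Return a (tag, wikipedia_revision_id, created) tuple for the normalized
-    label, creating the Tag and filling its Wikipedia data when needed."""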
-    
-    tag_label_normalized = normalize_tag(tag_label)
-    # We get or create the tag object; for a new tag we also fetch
-    # its Wikipedia references below.
-    
-    tag = None
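-    # prefer an existing tag whose Wikipedia lookup did not end in "null_result"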
-    for t in Tag.objects.filter(label__iexact=tag_label_normalized):
-        if tag is None or t.url_status != Tag.TAG_URL_STATUS_DICT['null_result']:
-            tag = t
-            if tag.url_status != Tag.TAG_URL_STATUS_DICT['null_result']:
-                break
- 
-    if tag is None:
-        tag = Tag(label=tag_label_normalized, original_label=tag_label)
-        created = True
-    else:
-        created = False
-    
-    site = wiki.Wiki(settings.WIKIPEDIA_API_URL) #@UndefinedVariable
-
-    if created:
-        wp_res = query_wikipedia_title(site, label=tag_label_normalized)
-        new_label = wp_res['new_label']
-        alternative_label = wp_res['alternative_label']
-        status = wp_res['status']
-        url = wp_res['wikipedia_url']
-        alternative_url = wp_res['alternative_wikipedia_url']
-        pageid = wp_res['pageid']
-        alternative_pageid = wp_res['alternative_pageid']
-        dbpedia_uri = wp_res["dbpedia_uri"]
-        wikipedia_revision_id = wp_res['revision_id']
-
-    
-        # We save the data
-        if new_label is not None:
-            tag.label = new_label
-        if status is not None:
-            tag.url_status = status
-        tag.alternative_label = alternative_label
-        tag.alternative_wikipedia_url = alternative_url
-        tag.alternative_wikipedia_pageid = alternative_pageid
-        tag.wikipedia_url = url            
-        tag.wikipedia_pageid = pageid
-        tag.dbpedia_uri = dbpedia_uri 
-
-        tag.save()
-        
-    elif tag.wikipedia_pageid:
-        wp_res = query_wikipedia_title(site, pageid=tag.wikipedia_pageid)
-        wikipedia_revision_id = wp_res['revision_id']
-    else:
-        wikipedia_revision_id = None
-        
-        
-    return tag, wikipedia_revision_id, created
-
-def process_tag(site, tag, verbosity=0):
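-    """Refresh the Wikipedia data of an existing tag and, when its page id
-    changed, update the revision id of the related TaggedSheets."""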
-    
-    wp_res = query_wikipedia_title(site, label=tag.label)
-    new_label = wp_res['new_label']
-    alternative_label = wp_res['alternative_label']
-    status = wp_res['status']
-    url = wp_res['wikipedia_url']
-    alternative_url = wp_res['alternative_wikipedia_url']
-    pageid = wp_res['pageid']
-    alternative_pageid = wp_res['alternative_pageid']
-    response = wp_res['response']
-    dbpedia_uri = wp_res["dbpedia_uri"]
-    revision_id = wp_res["revision_id"]
-    
-    if verbosity >= 2:
-        print "response from query to %s for label %s:" % (site.apibase, repr(tag.label))
-        print repr(response)
-    
-    prev_wikipedia_pageid = tag.wikipedia_pageid
-    
-    if new_label is not None:
-        tag.label = new_label
-    if status is not None:
-        tag.url_status = status
-    tag.wikipedia_url = url
-    tag.wikipedia_pageid = pageid
-    tag.dbpedia_uri = dbpedia_uri
-    tag.alternative_label = alternative_label
-    tag.alternative_wikipedia_url = alternative_url
-    tag.alternative_wikipedia_pageid = alternative_pageid
-        
-    tag.save()
-    
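-    # the resolved page id changed: propagate the new revision id to the tagged sheets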
-    if prev_wikipedia_pageid != pageid:
-        TaggedSheet.objects.filter(tag=tag).update(wikipedia_revision_id=revision_id)
-
-
-def reorder_datasheet_tags(ds):
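-    """Reorder the TaggedSheets of a datasheet by decreasing search relevance
-    of each tag in the datasheet's indexed document, then clear manual ordering."""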
-    ts_list = []
-    for ts in ds.taggedsheet_set.all():
-        ts.index_note = 0
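-        # restrict the search to the index document of this datasheet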
-        kwargs = {DJANGO_ID + "__exact": unicode(ds.pk)}
-
-        results = (SearchQuerySet().filter(title=ts.tag.label)
-                   .filter_or(description=ts.tag.label).filter(**kwargs))
-        if len(results) > 0:
-            ts.index_note += results[0].score
-            ts.save()
-
-        ts_list.append(ts)
-    ts_list.sort(key=lambda t: (-t.index_note, t.order))
-    for k, ts in enumerate(ts_list):
-        ts.order = k + 1
-        ts.save()
-    if ds.manual_order:
-        ds.manual_order = False
-        ds.save()
-
-