# -*- coding: utf-8 -*-
'''
Created on Jun 7, 2011
@author: ymh
'''
from django.conf import settings
from django.core.management.base import NoArgsCommand
from django.core.management.color import no_style
from hdabo.models import Tag
from hdabo.wp_utils import process_tag
from optparse import make_option
from wikitools import wiki
import math
import sys
class Command(NoArgsCommand):
    '''
    Query Wikipedia for each tag label and update the tag with the matching
    page title, URL and status.
    '''
    args = ''
    help = """Query Wikipedia for each tag label and update the tag with the matching page title, URL and status."""
option_list = NoArgsCommand.option_list + (
make_option('--force',
action='store_true',
dest='force',
default=False,
help='force all tags to be updated, not only those not yet processed'),
make_option('--random',
action='store_true',
dest='random',
default=False,
            help='process the tags in random order'),
make_option('--site',
action='store',
type='string',
dest='site_url',
default="http://fr.wikipedia.org/w/api.php",
            help='the URL of the MediaWiki API endpoint to query'),
make_option('--limit',
action='store',
type='int',
dest='limit',
            default=-1,
            help='number of tags to process (-1 means no limit)'),
make_option('--start',
action='store',
type='int',
dest='start',
default=0,
            help='number of tags to skip before processing'),
)
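    # Example invocation (the command name "update_wikipedia" is an assumption;
    # it is whatever name this module has under management/commands):
    #
    #   python manage.py update_wikipedia --limit=100 --site=http://en.wikipedia.org/w/api.php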
    def __is_homonymie(self, page_dict):
        # a page is a disambiguation page if one of its categories says so
        for cat in page_dict.get(u"categories", []):
if u'Catégorie:Homonymie' in cat.get(u"title", u"") or u'Category:Disambiguation pages' in cat.get(u"title", u""):
return True
return False
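    # Illustrative response shape consumed by process_wp_response below,
    # assuming a standard MediaWiki API reply (action=query with
    # prop=info|categories); the concrete values are invented for the example:
    #
    #   {u'query': {u'pages': {u'12345': {
    #       u'pageid': 12345,
    #       u'title': u'Paris',
    #       u'fullurl': u'http://fr.wikipedia.org/wiki/Paris',
    #       u'categories': [{u'ns': 14, u'title': u'Catégorie:Homonymie'}]}}}}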
def process_wp_response(self, label, response):
query_dict = response['query']
        # the query must resolve to exactly one page; anything else counts as a null result
        pages = query_dict.get("pages", {})
        if len(pages) != 1:
return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
page = pages.values()[0]
if u"invalid" in page or u"missing" in page:
return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None
url = page.get(u'fullurl', None)
pageid = page.get(u'pageid', None)
new_label = page[u'title']
if self.__is_homonymie(page):
status = Tag.TAG_URL_STATUS_DICT["homonyme"]
elif u"redirect" in page:
status = Tag.TAG_URL_STATUS_DICT["redirection"]
else:
status = Tag.TAG_URL_STATUS_DICT["match"]
return new_label, status, url, pageid
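        # With the sample response sketched above, this method would return
        # (u'Paris', Tag.TAG_URL_STATUS_DICT["homonyme"],
        #  u'http://fr.wikipedia.org/wiki/Paris', 12345).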
def show_progress(self, current_line, total_line, label, width):
percent = (float(current_line) / float(total_line)) * 100.0
marks = math.floor(width * (percent / 100.0))
spaces = math.floor(width - marks)
loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']'
sys.stdout.write(u"%s %d%% %d/%d - %s\r" % (loader, percent, current_line - 1, total_line - 1, repr(label))) #takes the header into account
if percent >= 100:
sys.stdout.write("\n")
sys.stdout.flush()
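        # e.g. show_progress(3, 10, u"Paris", 60) redraws the current line as:
        # [==================                                          ] 30% 3/10 - u'Paris'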
def handle_noargs(self, **options):
self.style = no_style()
interactive = options.get('interactive', True)
verbosity = int(options.get('verbosity', '1'))
force = options.get('force', False)
limit = options.get("limit", -1)
start = options.get("start", 0)
site_url = options.get('site_url', settings.WIKIPEDIA_API_URL)
random = options.get('random', False)
if verbosity > 2:
print "option passed : " + repr(options)
if force and interactive:
            confirm = raw_input("""You have requested to query and replace the Wikipedia information for all tags.
Are you sure you want to do this?
Type 'yes' to continue, or 'no' to cancel: """)
else:
confirm = 'yes'
if confirm != "yes":
print "wikipedia query cancelled"
return
if force:
queryset = Tag.objects.all()
else:
queryset = Tag.objects.filter(url_status=None)
if random:
queryset = queryset.order_by("?")
else:
queryset = queryset.order_by("label")
if limit >= 0:
            queryset = queryset[start:start + limit]
else:
queryset = queryset[start:]
        if verbosity > 2:
            print "Tag query is %s" % (queryset.query)
        site = wiki.Wiki(site_url)
count = queryset.count()
if verbosity > 1:
print "Processing %d tags" % (count)
for i, tag in enumerate(queryset):
if verbosity > 1:
print "processing tag %s (%d/%d)" % (tag.label, i + 1, count)
else:
self.show_progress(i + 1, count, tag.label, 60)
process_tag(site, tag, verbosity)