# HG changeset patch # User ymh # Date 1308697247 -7200 # Node ID 08b008c5a07d4c2b6bc76ec7cdefcba24961f5ad # Parent 3ad571e54608b3362888ec2c93d2184c68d38e67 - add popularity - calculate dbpedia_uri - display dbpedia uri - add manual_order - various corrections diff -r 3ad571e54608 -r 08b008c5a07d .settings/org.eclipse.core.resources.prefs --- a/.settings/org.eclipse.core.resources.prefs Mon Jun 20 15:49:22 2011 +0200 +++ b/.settings/org.eclipse.core.resources.prefs Wed Jun 22 01:00:47 2011 +0200 @@ -1,4 +1,4 @@ -#Fri Jun 17 01:31:06 CEST 2011 +#Fri Jun 17 17:33:03 CEST 2011 eclipse.preferences.version=1 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/haystack/backends/__init__.py=utf-8 encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/sortedm2m/fields.py=utf-8 @@ -8,8 +8,9 @@ encoding//virtualenv/web/env/hdabo/lib/python2.6/site-packages/wikitools/wiki.py=utf-8 encoding//web/hdabo/fields.py=utf-8 encoding//web/hdabo/forms.py=utf-8 -encoding//web/hdabo/management/commands/importcsv.py=utf-8 -encoding//web/hdabo/management/commands/querywikipedia.py=utf-8 +encoding//web/hdabo/management/commands/import_csv.py=utf-8 +encoding//web/hdabo/management/commands/import_tag_popularity.py=utf-8 +encoding//web/hdabo/management/commands/query_wikipedia.py=utf-8 encoding//web/hdabo/models.py=utf-8 encoding//web/hdabo/search/french_whoosh_backend.py=utf-8 encoding//web/hdabo/tests/models.py=utf-8 diff -r 3ad571e54608 -r 08b008c5a07d sql/create_db.sql --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/sql/create_db.sql Wed Jun 22 01:00:47 2011 +0200 @@ -0,0 +1,7 @@ +CREATE DATABASE hdabo + WITH ENCODING='UTF8' + OWNER=iri + TEMPLATE=template0 + LC_COLLATE='fr_FR.UTF-8' + LC_CTYPE='fr_FR.UTF-8' + CONNECTION LIMIT=-1; \ No newline at end of file diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/fixtures/datasheet_10.yaml.bz2 Binary file web/hdabo/fixtures/datasheet_10.yaml.bz2 has changed diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/fixtures/datasheet_347.yaml.bz2 Binary file web/hdabo/fixtures/datasheet_347.yaml.bz2 has changed diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/fixtures/initial_data.yaml.bz2 Binary file web/hdabo/fixtures/initial_data.yaml.bz2 has changed diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/management/commands/import_csv.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/web/hdabo/management/commands/import_csv.py Wed Jun 22 01:00:47 2011 +0200 @@ -0,0 +1,257 @@ +# -*- coding: utf-8 -*- +''' +Created on May 25, 2011 + +@author: ymh +''' +#Auteur,Chemin,Comment,Controle,Datcre,Datmaj,Desc,Domaine,Format,ID,Insee,Org,Org_Home,OrgID,Periode1,Periode2,Periode3,Satut,Sousdom,Tag,Theme2,Theme3,Titre,Url,Vignette,Ville +#"Auteur","Chemin","Comment","Controle","Datcre","Datmaj","Desc","Domaine","Format","ID","Insee","Org","Org_Home","OrgID","Periode1","Periode2","Periode3","Satut","Sousdom","Tag","Theme2","Theme3","Titre","Url","Vignette","Ville", + +from django.core.management.base import BaseCommand, CommandError +from django.db import transaction +from hdabo.models import (Author, Datasheet, DocumentFormat, Domain, Organisation, + Tag, TaggedSheet, TimePeriod, Location) +from hdabo.wp_utils import normalize_tag +from optparse import make_option +import csv +import datetime +import math +import sys + +class Command(BaseCommand): + ''' + Command to import csvfile + ''' + args = '' + options = '[--ignore-existing] [--lines] [--encoding]' + help = """Import of a csv file for hdabo +Options: + --ignore-existing : ignore existing datasheets + --lines : max number of lines to load (for each file). 0 means all. + --encoding : files encoding. default to latin-1""" + + option_list = BaseCommand.option_list + ( + make_option('--encoding', + action='store', + type='string', + dest='encoding', + default="latin-1", + help='fix the file encoding. default to latin-1'), + make_option('--delimiter', + action='store', + type='string', + dest='delimiter', + default=";", + help='csv file delimiter'), + make_option('--dialect', + action='store', + type='string', + dest='dialect', + default="excel", + help='csv dialect'), + make_option('--fieldnames', + action='store', + type='string', + dest='fieldnames', + default=None, + help='fields list (comma separated)'), + make_option('--lines', + action='store', + type='int', + dest='lines', + default=0, + help='Number of lines to read. 0 means all.'), + make_option('--ignore-existing', + action='store_true', + dest='ignore_existing', + default=False, + help='force insertion'), + + ) + + def show_progress(self, current_line, total_line, width): + + percent = (float(current_line) / float(total_line)) * 100.0 + + marks = math.floor(width * (percent / 100.0)) + spaces = math.floor(width - marks) + + loader = '[' + ('=' * int(marks)) + (' ' * int(spaces)) + ']' + + sys.stdout.write("%s %d%% %d/%d\r" % (loader, percent, current_line - 1, total_line - 1)) #takes the header into account + if percent >= 100: + sys.stdout.write("\n") + sys.stdout.flush() + + + def create_domain_period(self, row_value, klass, school_period): + res_list = [] + if not row_value: + return res_list + for label_str in [dstr.strip() for dstr in row_value.split('\x0b')]: + if label_str: + res_obj, created = klass.objects.get_or_create(label=label_str, school_period=school_period, defaults={"label":label_str, "school_period":school_period}) #@UnusedVariable + res_list.append(res_obj) + return res_list + + def create_datasheet(self, row): + + if self.ignore_existing and Datasheet.objects.filter(hda_id=row[u"ID"]).count() > 0: + return + + author_str = row[u'Auteur'] + if author_str: + author_array = author_str.split(" ") + if len(author_array) == 0: + firstname = "" + lastname = "" + elif len(author_array) == 1: + firstname = "" + lastname = author_array[0] + elif len(author_array) == 2: + firstname = author_array[0] + lastname = author_array[1] + + author, created = Author.objects.get_or_create(hda_id=author_str, defaults={"firstname":firstname, "lastname":lastname}) #@UnusedVariable + else: + author = None + + org_str = row[u"Org"] + if org_str: + url_str = row[u'Org_Home'] + if url_str is not None: + url_str = url_str.strip() + org, created = Organisation.objects.get_or_create(hda_id=org_str, defaults={"name":org_str, "website" : url_str}) #@UnusedVariable + else: + org = None + + town_str = row[u"Ville"] + if town_str: + insee_str = row[u'Insee'].strip() if row[u'Insee'] else row[u'Insee'] + if len(insee_str) > 5: + insee_str = "" + loc, created = Location.objects.get_or_create(insee=insee_str, defaults={"name": town_str, "insee": insee_str}) #@UnusedVariable + else: + loc = None + + format_str = row[u"Format"] + if format_str: + format, created = DocumentFormat.objects.get_or_create(label=format_str, defaults={"label": format_str}) #@UnusedVariable + else: + format = None + + domains = self.create_domain_period(row[u"Domaine"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Global']) + + primary_periods = self.create_domain_period(row[u"Periode1"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Primaire']) + college_periods = self.create_domain_period(row[u"Periode2"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Collège']) + highschool_periods = self.create_domain_period(row[u"Periode3"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Lycée']) + + primary_themes = self.create_domain_period(row[u"Sousdom"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Primaire']) + college_themes = self.create_domain_period(row[u"Theme2"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Collège']) + highschool_themes = self.create_domain_period(row[u"Theme3"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Lycée']) + + url = row[u"Url"] + if url is not None: + url = url.strip() + + datasheet = Datasheet.objects.create( + hda_id=row[u"ID"], + author=author, + organisation=org, + title=row[u"Titre"], + description=row[u"Desc"], + url=url, + town=loc, + format=format, + original_creation_date=datetime.datetime.strptime(row[u"Datcre"], "%d/%m/%Y").date(), + original_modification_date=datetime.datetime.strptime(row[u"Datmaj"], "%d/%m/%Y").date(), + validated=False + ) + + datasheet.save() + + datasheet.domains = domains + datasheet.primary_periods = primary_periods + datasheet.college_periods = college_periods + datasheet.highschool_periods = highschool_periods + datasheet.primary_themes = primary_themes + datasheet.college_themes = college_themes + datasheet.highschool_themes = highschool_themes + + + if row[u'Tag']: + for i, tag in enumerate([t.strip() for t in row[u'Tag'].split(u";")]): + if len(tag) == 0: + continue + tag_label = normalize_tag(tag) + tag_obj, created = Tag.objects.get_or_create(label__iexact=tag_label, defaults={'label':tag_label, 'original_label':tag}) #@UnusedVariable + tagged_ds = TaggedSheet(datasheet=datasheet, tag=tag_obj, original_order=i + 1, order=i + 1) + tagged_ds.save() + + + def handle(self, *args, **options): + + if len(args) == 0: + raise CommandError("Gives at lat one csv file to import") + + self.encoding = options.get('encoding', "latin-1") + lines = options.get('lines', 0) + self.ignore_existing = options.get('ignore_existing', False) + fieldnames = options.get('fieldnames', None) + + transaction.commit_unless_managed() + transaction.enter_transaction_management() + transaction.managed(True) + + try: + for csv_path in args: + try: + print "Processing %s " % (csv_path) + with open(csv_path, 'rU') as csv_file: + + # get the number of lines if necessary + if not lines: + for i, l in enumerate(csv_file): #@UnusedVariable + pass + total_line = i + 1 + if fieldnames: + total_line = total_line + 1 + csv_file.seek(0) + else: + total_line = lines + 1 + + delimiter = options.get('delimiter', ";") + if delimiter == "TAB" or delimiter == "\\t": + delimiter = '\t' + + dr_kwargs = {'delimiter':delimiter} + if fieldnames is not None: + dr_kwargs['fieldnames'] = [f.strip() for f in fieldnames.split(",")] + dialect = options.get('dialect', "excel") + if dialect is not None: + dr_kwargs['dialect'] = dialect + + reader = csv.DictReader(csv_file, **dr_kwargs) + + for j, row in enumerate(reader): + if lines and j >= lines: + break + line_num = reader.line_num if fieldnames is None else reader.line_num + 1 + self.show_progress(line_num, total_line, 60) + def safe_decode(val, encoding): + if val: + return val.decode(encoding) + else: + return val + + row = dict([(safe_decode(key, self.encoding), safe_decode(value, self.encoding)) for key, value in row.items()]) + self.create_datasheet(row) + + transaction.commit() + except Exception: + transaction.rollback() + raise + finally: + print('') + finally: + transaction.leave_transaction_management() diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/management/commands/import_tag_popularity.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/web/hdabo/management/commands/import_tag_popularity.py Wed Jun 22 01:00:47 2011 +0200 @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +''' +Created on Jun 17, 2011 + +@author: ymh + +command to import tag popularity + +''' + +from django.core.management.base import BaseCommand, CommandError +from hdabo.models import Tag +from optparse import make_option +import csv +import math +import sys + + +class Command(BaseCommand): + ''' + Command to import csvfile + ''' + args = '' + options = '[--ignore-existing] [--lines] [--encoding]' + help = """Import of a tag popularity file for hdabo +Options: + --ignore-existing : ignore existing datasheets + --lines : max number of lines to load (for each file). 0 means all. + --encoding : files encoding. default to latin-1""" + + option_list = BaseCommand.option_list + ( + make_option('--encoding', + action='store', + type='string', + dest='encoding', + default="latin-1", + help='fix the file encoding. default to latin-1'), + make_option('--delimiter', + action='store', + type='string', + dest='delimiter', + default=";", + help='csv file delimiter'), + make_option('--dialect', + action='store', + type='string', + dest='dialect', + default="excel", + help='csv dialect'), + make_option('--fieldnames', + action='store', + type='string', + dest='fieldnames', + default="label,popularity", + help='fields list (comma separated)'), + make_option('--lines', + action='store', + type='int', + dest='lines', + default=0, + help='Number of lines to read. 0 means all.'), + + ) + + def show_progress(self, current_line, total_line, width): + + percent = (float(current_line) / float(total_line)) * 100.0 + + marks = math.floor(width * (percent / 100.0)) + spaces = math.floor(width - marks) + + loader = '[' + ('=' * int(marks)) + (' ' * int(spaces)) + ']' + + sys.stdout.write("%s %d%% %d/%d\r" % (loader, percent, current_line - 1, total_line - 1)) #takes the header into account + if percent >= 100: + sys.stdout.write("\n") + sys.stdout.flush() + + def handle(self, *args, **options): + + if len(args) == 0: + raise CommandError("Give one csv file to import") + elif len(args) > 1: + raise CommandError("Only one file can be imported") + + self.encoding = options.get('encoding', "latin-1") + lines = options.get('lines', 0) + fieldnames = options.get('fieldnames', "label,popularity") + + csv_path = args[0] + + print("Processing %s " % (csv_path)) + + with open(csv_path, 'rU') as csv_file: + # get the number of lines if necessary + if not lines: + for i, l in enumerate(csv_file): #@UnusedVariable + pass + total_line = i + 1 + if fieldnames: + total_line = total_line + 1 + csv_file.seek(0) + else: + total_line = lines + 1 + + delimiter = options.get('delimiter', ";") + if delimiter == "TAB" or delimiter == "\\t": + delimiter = '\t' + dr_kwargs = {'delimiter':delimiter} + if fieldnames is not None: + dr_kwargs['fieldnames'] = [f.strip() for f in fieldnames.split(",")] + dialect = options.get('dialect', "excel") + if dialect is not None: + dr_kwargs['dialect'] = dialect + + reader = csv.DictReader(csv_file, **dr_kwargs) + + for j, row in enumerate(reader): + if lines and j >= lines: + break + line_num = reader.line_num if fieldnames is None else reader.line_num + 1 + self.show_progress(line_num, total_line, 60) + def safe_decode(val, encoding): + if val: + return val.decode(encoding) + else: + return val + + row = dict([(safe_decode(key, self.encoding), safe_decode(value, self.encoding)) for key, value in row.items()]) + + label = row['label'].strip() + + if not label: + continue + + for tag in Tag.objects.filter(label__iexact=label): + tag.popularity = int(row['popularity']) + tag.save() diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/management/commands/importcsv.py --- a/web/hdabo/management/commands/importcsv.py Mon Jun 20 15:49:22 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,253 +0,0 @@ -# -*- coding: utf-8 -*- -''' -Created on May 25, 2011 - -@author: ymh -''' -#Auteur,Chemin,Comment,Controle,Datcre,Datmaj,Desc,Domaine,Format,ID,Insee,Org,Org_Home,OrgID,Periode1,Periode2,Periode3,Satut,Sousdom,Tag,Theme2,Theme3,Titre,Url,Vignette,Ville -#"Auteur","Chemin","Comment","Controle","Datcre","Datmaj","Desc","Domaine","Format","ID","Insee","Org","Org_Home","OrgID","Periode1","Periode2","Periode3","Satut","Sousdom","Tag","Theme2","Theme3","Titre","Url","Vignette","Ville", - -from django.core.management.base import BaseCommand, CommandError -from django.db import transaction -from hdabo.models import (Author, Datasheet, DocumentFormat, Domain, Organisation, - Tag, TaggedSheet, TimePeriod, Location) -from hdabo.wp_utils import normalize_tag -from optparse import make_option -import csv -import datetime -import math -import sys - -class Command(BaseCommand): - ''' - Command to import csvfile - ''' - args = '' - options = '[--ignore-existing] [--lines] [--encoding]' - help = """Import of a csv file for hdabo -Options: - --ignore-existing : ignore existing datasheets - --lines : max number of lines to load (for each file). 0 means all. - --encoding : files encoding. default to latin-1""" - - option_list = BaseCommand.option_list + ( - make_option('--encoding', - action='store', - type='string', - dest='encoding', - default="latin-1", - help='fix the file encoding. default to latin-1'), - make_option('--delimiter', - action='store', - type='string', - dest='delimiter', - default=";", - help='csv file delimiter'), - make_option('--dialect', - action='store', - type='string', - dest='dialect', - default="excel", - help='csv dialect'), - make_option('--fieldnames', - action='store', - type='string', - dest='fieldnames', - default=None, - help='fields list (comma separated)'), - make_option('--lines', - action='store', - type='int', - dest='lines', - default=0, - help='Number of lines to read. 0 means all.'), - make_option('--ignore-existing', - action='store_true', - dest='ignore_existing', - default=False, - help='force insertion'), - - ) - - def show_progress(self, current_line, total_line, width): - - percent = (float(current_line) / float(total_line)) * 100.0 - - marks = math.floor(width * (percent / 100.0)) - spaces = math.floor(width - marks) - - loader = '[' + ('=' * int(marks)) + (' ' * int(spaces)) + ']' - - sys.stdout.write("%s %d%% %d/%d\r" % (loader, percent, current_line - 1, total_line - 1)) #takes the header into account - if percent >= 100: - sys.stdout.write("\n") - sys.stdout.flush() - - - def create_domain_period(self, row_value, klass, school_period): - res_list = [] - if not row_value: - return res_list - for label_str in [dstr.strip() for dstr in row_value.split('\x0b')]: - if label_str: - res_obj, created = klass.objects.get_or_create(label=label_str, school_period=school_period, defaults={"label":label_str, "school_period":school_period}) #@UnusedVariable - res_list.append(res_obj) - return res_list - - def create_datasheet(self, row): - - if self.ignore_existing and Datasheet.objects.filter(hda_id=row[u"ID"]).count() > 0: - return - - author_str = row[u'Auteur'] - if author_str: - author_array = author_str.split(" ") - if len(author_array) == 0: - firstname = "" - lastname = "" - elif len(author_array) == 1: - firstname = "" - lastname = author_array[0] - elif len(author_array) == 2: - firstname = author_array[0] - lastname = author_array[1] - - author, created = Author.objects.get_or_create(hda_id=author_str, defaults={"firstname":firstname, "lastname":lastname}) #@UnusedVariable - else: - author = None - - org_str = row[u"Org"] - if org_str: - url_str = row[u'Org_Home'] - if url_str is not None: - url_str = url_str.strip() - org, created = Organisation.objects.get_or_create(hda_id=org_str, defaults={"name":org_str, "website" : url_str}) #@UnusedVariable - else: - org = None - - town_str = row[u"Ville"] - if town_str: - insee_str = row[u'Insee'].strip() if row[u'Insee'] else row[u'Insee'] - if len(insee_str) > 5: - insee_str = "" - loc, created = Location.objects.get_or_create(insee=insee_str, defaults={"name": town_str, "insee": insee_str}) #@UnusedVariable - else: - loc = None - - format_str = row[u"Format"] - if format_str: - format, created = DocumentFormat.objects.get_or_create(label=format_str, defaults={"label": format_str}) #@UnusedVariable - else: - format = None - - domains = self.create_domain_period(row[u"Domaine"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Global']) - - primary_periods = self.create_domain_period(row[u"Periode1"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Primaire']) - college_periods = self.create_domain_period(row[u"Periode2"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Collège']) - highschool_periods = self.create_domain_period(row[u"Periode3"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Lycée']) - - primary_themes = self.create_domain_period(row[u"Sousdom"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Primaire']) - college_themes = self.create_domain_period(row[u"Theme2"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Collège']) - highschool_themes = self.create_domain_period(row[u"Theme3"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Lycée']) - - url = row[u"Url"] - if url is not None: - url = url.strip() - - datasheet = Datasheet.objects.create( - hda_id=row[u"ID"], - author=author, - organisation=org, - title=row[u"Titre"], - description=row[u"Desc"], - url=url, - town=loc, - format=format, - original_creation_date=datetime.datetime.strptime(row[u"Datcre"], "%d/%m/%Y").date(), - original_modification_date=datetime.datetime.strptime(row[u"Datmaj"], "%d/%m/%Y").date(), - validated=False - ) - - datasheet.save() - - datasheet.domains = domains - datasheet.primary_periods = primary_periods - datasheet.college_periods = college_periods - datasheet.highschool_periods = highschool_periods - datasheet.primary_themes = primary_themes - datasheet.college_themes = college_themes - datasheet.highschool_themes = highschool_themes - - - if row[u'Tag']: - for i, tag in enumerate([t.strip() for t in row[u'Tag'].split(u";")]): - if len(tag) == 0: - continue - tag_label = normalize_tag(tag) - tag_obj, created = Tag.objects.get_or_create(label__iexact=tag_label, defaults={'label':tag_label, 'original_label':tag}) #@UnusedVariable - tagged_ds = TaggedSheet(datasheet=datasheet, tag=tag_obj, original_order=i + 1, order=i + 1) - tagged_ds.save() - - - def handle(self, *args, **options): - - if len(args) == 0: - raise CommandError("Gives at lat one csv file to import") - - self.encoding = options.get('encoding', "latin-1") - lines = options.get('lines', 0) - self.ignore_existing = options.get('ignore_existing', False) - fieldnames = options.get('fieldnames', None) - - transaction.commit_unless_managed() - transaction.enter_transaction_management() - transaction.managed(True) - - try: - for csv_path in args: - try: - print "Processing %s " % (csv_path) - with open(csv_path, 'rU') as csv_file: - - # get the number of lines if necessary - if not lines: - for i, l in enumerate(csv_file): #@UnusedVariable - pass - total_line = i + 1 - if fieldnames: - total_line = total_line + 1 - csv_file.seek(0) - else: - total_line = lines + 1 - - dr_kwargs = {'delimiter':options.get('delimiter', ";")} - if fieldnames is not None: - dr_kwargs['fieldnames'] = [f.strip() for f in fieldnames.split(",")] - dialect = options.get('dialect', "excel") - if dialect is not None: - dr_kwargs['dialect'] = dialect - - reader = csv.DictReader(csv_file, **dr_kwargs) - - for j, row in enumerate(reader): - if lines and j >= lines: - break - line_num = reader.line_num if fieldnames is None else reader.line_num + 1 - self.show_progress(line_num, total_line, 60) - def safe_decode(val, encoding): - if val: - return val.decode(encoding) - else: - return val - - row = dict([(safe_decode(key, self.encoding), safe_decode(value, self.encoding)) for key, value in row.items()]) - self.create_datasheet(row) - - transaction.commit() - except Exception: - transaction.rollback() - raise - finally: - print('') - finally: - transaction.leave_transaction_management() diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/management/commands/order_tags.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/web/hdabo/management/commands/order_tags.py Wed Jun 22 01:00:47 2011 +0200 @@ -0,0 +1,105 @@ +''' +Created on Jun 7, 2011 + +@author: ymh +''' + +from django.core.management.base import NoArgsCommand +from django.core.management.color import no_style +from haystack.constants import DJANGO_ID +from haystack.query import SearchQuerySet +from hdabo.models import Datasheet +import math +import sys +from optparse import make_option +from django.db import transaction + + +class Command(NoArgsCommand): + ''' + Command to calculate the order of tags based on indexation + recalculate all tags. Will ask for confirmation + ''' + + args = '' + options = '-f : force ' + help = "calculate the order of tags based on indexation recalculate all tags. Will ask for confirmation" + + option_list = NoArgsCommand.option_list + ( + make_option('-f', '--force', + action='store_true', + dest='force', + default=False, + help='force reordering of all datasheets'), + ) + + + def show_progress(self, current_line, total_line, width): + + percent = (float(current_line) / float(total_line)) * 100.0 + + marks = math.floor(width * (percent / 100.0)) + spaces = math.floor(width - marks) + + loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']' + + sys.stdout.write(u"%s %d%% %d/%d\r" % (loader, percent, current_line, total_line)) #takes the header into account + if percent >= 100: + sys.stdout.write("\n") + sys.stdout.flush() + + + def handle_noargs(self, **options): + + self.style = no_style() + + interactive = options.get('interactive', True) + force = options.get('force', True) + + if interactive: + confirm = raw_input("""You have requested to recalculate the index order of all the tags. +This will process all the tags in %s datasheets. Are you sure you want to do this ? + Type 'yes' to continue, or 'no' to cancel: """ % ("all" if force else "not validated")) + else: + confirm = 'yes' + + if confirm != "yes": + print "Tag reordering cancelled" + return + + if force: + queryset = Datasheet.objects.all() + else: + queryset = Datasheet.objects.filter(validated=False, manual_order=False) + total = queryset.count() + + transaction.commit_unless_managed() + transaction.enter_transaction_management() + transaction.managed(True) + + try: + for i, ds in enumerate(queryset): + self.show_progress(i + 1, total, 60) + ts_list = [] + for ts in ds.taggedsheet_set.all(): + kwargs = {DJANGO_ID + "__exact": unicode(ds.pk)} + results = SearchQuerySet().filter(title=ts.tag.label).filter_or(description=ts.tag.label).filter(**kwargs) + if len(results) > 0: + ts.index_note = results[0].score + ts.save() + ts_list.append(ts) + ts_list.sort(key=lambda t: (-t.index_note, t.order)) + for i, ts in enumerate(ts_list): + ts.order = i + 1 + ts.save() + if ds.manual_order: + ds.manual_order = False + ds.save() + transaction.commit() + except: + transaction.rollback() + raise + finally: + transaction.leave_transaction_management() + + diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/management/commands/ordertags.py --- a/web/hdabo/management/commands/ordertags.py Mon Jun 20 15:49:22 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ -''' -Created on Jun 7, 2011 - -@author: ymh -''' - -from django.core.management.base import NoArgsCommand -from django.core.management.color import no_style -from haystack.constants import DJANGO_ID -from haystack.query import SearchQuerySet -from hdabo.models import Datasheet -import math -import sys - - -class Command(NoArgsCommand): - ''' - Command to calculate the order of tags based on indexation - recalculate all tags. Will ask for confirmation - ''' - - args = '' - options = '' - help = "calculate the order of tags based on indexation recalculate all tags. Will ask for confirmation" - - def show_progress(self, current_line, total_line, width): - - percent = (float(current_line) / float(total_line)) * 100.0 - - marks = math.floor(width * (percent / 100.0)) - spaces = math.floor(width - marks) - - loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']' - - sys.stdout.write(u"%s %d%% %d/%d\r" % (loader, percent, current_line, total_line)) #takes the header into account - if percent >= 100: - sys.stdout.write("\n") - sys.stdout.flush() - - - def handle_noargs(self, **options): - - self.style = no_style() - - interactive = options.get('interactive', True) - - if interactive: - confirm = raw_input("""You have requested to recalculate the index order of all the tags. -This will process all the tags in all datasheets. Are you sure you want to do this ? - Type 'yes' to continue, or 'no' to cancel: """) - else: - confirm = 'yes' - - if confirm != "yes": - print "Tag reordering cancelled" - return - - total = Datasheet.objects.all().count() - - for i, ds in enumerate(Datasheet.objects.all()): - self.show_progress(i + 1, total, 60) - ts_list = [] - for ts in ds.taggedsheet_set.all(): - kwargs = {DJANGO_ID + "__exact": unicode(ds.pk)} - results = SearchQuerySet().filter(title=ts.tag.label).filter_or(description=ts.tag.label).filter(**kwargs) - if len(results) > 0: - ts.index_note = results[0].score - ts.save() - ts_list.append(ts) - ts_list.sort(key=lambda t: (-t.index_note, t.order)) - for i, ts in enumerate(ts_list): - ts.order = i + 1 - ts.save() - diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/management/commands/query_wikipedia.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/web/hdabo/management/commands/query_wikipedia.py Wed Jun 22 01:00:47 2011 +0200 @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- +''' +Created on Jun 7, 2011 + +@author: ymh +''' + +from django.conf import settings +from django.core.management.base import NoArgsCommand +from django.core.management.color import no_style +from hdabo.models import Tag +from hdabo.wp_utils import process_tag +from optparse import make_option +from wikitools import wiki +import math +import sys + + + +class Command(NoArgsCommand): + ''' + query and update wikipedia for tag title. + ''' + options = '' + help = """query and update wikipedia for tag title.""" + + option_list = NoArgsCommand.option_list + ( + make_option('--force', + action='store_true', + dest='force', + default=False, + help='force all tags to be updated, not only those not yet processed'), + make_option('--random', + action='store_true', + dest='random', + default=False, + help='randomize query on tags'), + make_option('--site', + action='store', + type='string', + dest='site_url', + default="http://fr.wikipedia.org/w/api.php", + help='the url for the wikipedia site'), + make_option('--limit', + action='store', + type='int', + dest='limit', + default= -1, + help='number of tag to process'), + make_option('--start', + action='store', + type='int', + dest='start', + default=0, + help='number of tag to ignore'), + ) + + def __is_homonymie(self, page_dict): + for cat in page_dict.get(u"categories", []): + if u'Catégorie:Homonymie' in cat.get(u"title", u"") or u'Category:Disambiguation pages' in cat.get(u"title", u""): + return True + return False + + + def process_wp_response(self, label, response): + + + query_dict = response['query'] + # get page if multiple pages or none -> return Tag.null_result + pages = query_dict.get("pages", {}) + if len(pages) > 1 or len(pages) == 0: + return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None + + page = pages.values()[0] + + if u"invalid" in page or u"missing" in page: + return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None + + url = page.get(u'fullurl', None) + pageid = page.get(u'pageid', None) + new_label = page[u'title'] + + if self.__is_homonymie(page): + status = Tag.TAG_URL_STATUS_DICT["homonyme"] + elif u"redirect" in page: + status = Tag.TAG_URL_STATUS_DICT["redirection"] + else: + status = Tag.TAG_URL_STATUS_DICT["match"] + + return new_label, status, url, pageid + + def show_progress(self, current_line, total_line, label, width): + + percent = (float(current_line) / float(total_line)) * 100.0 + + marks = math.floor(width * (percent / 100.0)) + spaces = math.floor(width - marks) + + loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']' + + sys.stdout.write(u"%s %d%% %d/%d - %s\r" % (loader, percent, current_line - 1, total_line - 1, repr(label))) #takes the header into account + if percent >= 100: + sys.stdout.write("\n") + sys.stdout.flush() + + def handle_noargs(self, **options): + + self.style = no_style() + + interactive = options.get('interactive', True) + + verbosity = int(options.get('verbosity', '1')) + + force = options.get('force', False) + + limit = options.get("limit", -1) + start = options.get("start", 0) + + site_url = options.get('site_url', settings.WIKIPEDIA_API_URL) + + random = options.get('random', False) + + if verbosity > 2: + print "option passed : " + repr(options) + + if force and interactive: + confirm = raw_input("""You have requested to query and replace the wikipedia information for all datasheets. +Are you sure you want to do this ? + Type 'yes' to continue, or 'no' to cancel: """) + else: + confirm = 'yes' + + if confirm != "yes": + print "wikipedia query cancelled" + return + + if force: + queryset = Tag.objects.all() + else: + queryset = Tag.objects.filter(url_status=None) + + if random: + queryset = queryset.order_by("?") + else: + queryset = queryset.order_by("label") + + if limit >= 0: + queryset = queryset[start:limit] + else: + queryset = queryset[start:] + + + if verbosity > 2 : + print "Tag Query is %s" % (queryset.query) + + site = wiki.Wiki(site_url) #@UndefinedVariable + + + count = queryset.count() + if verbosity > 1: + print "Processing %d tags" % (count) + + + + for i, tag in enumerate(queryset): + + if verbosity > 1: + print "processing tag %s (%d/%d)" % (tag.label, i + 1, count) + else: + self.show_progress(i + 1, count, tag.label, 60) + + process_tag(site, tag, verbosity) + + diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/management/commands/querywikipedia.py --- a/web/hdabo/management/commands/querywikipedia.py Mon Jun 20 15:49:22 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,193 +0,0 @@ -# -*- coding: utf-8 -*- -''' -Created on Jun 7, 2011 - -@author: ymh -''' - -from django.conf import settings -from django.core.management.base import NoArgsCommand -from django.core.management.color import no_style -from hdabo.models import Tag -from hdabo.wp_utils import query_wikipedia_title -from optparse import make_option -from wikitools import wiki -import math -import sys - - -def process_tag(site, tag, verbosity): - new_label, status, url, pageid, response = query_wikipedia_title(site, tag.label) - - if verbosity >= 2 : - print "response from query to %s with parameters %s :" % (site.apibase, repr(new_label)) - print repr(response) - - if new_label is not None: - tag.label = new_label - if status is not None: - tag.url_status = status - if url is not None: - tag.wikipedia_url = url - if pageid is not None: - tag.wikipedia_pageid = pageid - - tag.save() - - - -class Command(NoArgsCommand): - ''' - query and update wikipedia for tag title. - ''' - options = '' - help = """query and update wikipedia for tag title.""" - - option_list = NoArgsCommand.option_list + ( - make_option('--force', - action='store_true', - dest='force', - default=False, - help='force all tags to be updated, not only those not yet processed'), - make_option('--random', - action='store_true', - dest='random', - default=False, - help='randomize query on tags'), - make_option('--site', - action='store', - type='string', - dest='site_url', - default="http://fr.wikipedia.org/w/api.php", - help='the url for the wikipedia site'), - make_option('--limit', - action='store', - type='int', - dest='limit', - default= -1, - help='number of tag to process'), - make_option('--start', - action='store', - type='int', - dest='start', - default=0, - help='number of tag to ignore'), - ) - - def __is_homonymie(self, page_dict): - for cat in page_dict.get(u"categories", []): - if u'Catégorie:Homonymie' in cat.get(u"title", u"") or u'Category:Disambiguation pages' in cat.get(u"title", u""): - return True - return False - - - def process_wp_response(self, label, response): - - - query_dict = response['query'] - # get page if multiple pages or none -> return Tag.null_result - pages = query_dict.get("pages", {}) - if len(pages) > 1 or len(pages) == 0: - return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None - - page = pages.values()[0] - - if u"invalid" in page or u"missing" in page: - return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None - - url = page.get(u'fullurl', None) - pageid = page.get(u'pageid', None) - new_label = page[u'title'] - - if self.__is_homonymie(page): - status = Tag.TAG_URL_STATUS_DICT["homonyme"] - elif u"redirect" in page: - status = Tag.TAG_URL_STATUS_DICT["redirection"] - else: - status = Tag.TAG_URL_STATUS_DICT["match"] - - return new_label, status, url, pageid - - def show_progress(self, current_line, total_line, label, width): - - percent = (float(current_line) / float(total_line)) * 100.0 - - marks = math.floor(width * (percent / 100.0)) - spaces = math.floor(width - marks) - - loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']' - - sys.stdout.write(u"%s %d%% %d/%d - %s\r" % (loader, percent, current_line - 1, total_line - 1, repr(label))) #takes the header into account - if percent >= 100: - sys.stdout.write("\n") - sys.stdout.flush() - - def handle_noargs(self, **options): - - self.style = no_style() - - interactive = options.get('interactive', True) - - verbosity = int(options.get('verbosity', '1')) - - force = options.get('force', False) - - limit = options.get("limit", -1) - start = options.get("start", 0) - - site_url = options.get('site_url', settings.WIKIPEDIA_API_URL) - - random = options.get('random', False) - - if verbosity > 2: - print "option passed : " + repr(options) - - if force and interactive: - confirm = raw_input("""You have requested to query and replace the wikipedia information for all datasheets. -Are you sure you want to do this ? - Type 'yes' to continue, or 'no' to cancel: """) - else: - confirm = 'yes' - - if confirm != "yes": - print "wikipedia query cancelled" - return - - if force: - queryset = Tag.objects.all() - else: - queryset = Tag.objects.filter(url_status=None) - - if random: - queryset = queryset.order_by("?") - else: - queryset = queryset.order_by("label") - - if limit >= 0: - queryset = queryset[start:limit] - else: - queryset = queryset[start:] - - - if verbosity > 2 : - print "Tag Query is %s" % (queryset.query) - - site = wiki.Wiki(site_url) #@UndefinedVariable - - - count = queryset.count() - if verbosity > 1: - print "Processing %d tags" % (count) - - - - for i, tag in enumerate(queryset): - - if verbosity > 1: - print "processing tag %s (%d/%d)" % (tag.label, i + 1, count) - else: - self.show_progress(i + 1, count, tag.label, 60) - - process_tag(site, tag, verbosity) - - diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/models.py --- a/web/hdabo/models.py Mon Jun 20 15:49:22 2011 +0200 +++ b/web/hdabo/models.py Wed Jun 22 01:00:47 2011 +0200 @@ -1,278 +1,280 @@ -# -*- coding: utf-8 -*- - -from django.contrib.auth.models import User -from django.db import models -from hdabo.fields import SortedManyToManyField -from hdabo.utils import Property -import datetime - -class Organisation(models.Model): - hda_id = models.CharField(max_length=512, unique=True, blank=False, null=False) - name = models.CharField(max_length=512, unique=False, blank=False, null=False) - location = models.CharField(max_length=512, unique=False, blank=True, null=True) - website = models.CharField(max_length=2048, unique=False, blank=True, null=True) - - -class Author(models.Model): - hda_id = models.CharField(max_length=512, unique=True, blank=False, null=False) - lastname = models.CharField(max_length=512, unique=False, blank=True, null=True) - firstname = models.CharField(max_length=512, unique=False, blank=True, null=True) - -class TimePeriod(models.Model): - TIME_PERIOD_CHOICES = ( - (1, u'Primaire'), - (2, u'Collège'), - (3, u'Lycée'), - ) - TIME_PERIOD_DICT = { - u'Primaire': 1, - u'Collège': 2, - u'Lycée': 3, - } - label = models.CharField(max_length=512, unique=False, blank=False, null=False) - school_period = models.IntegerField(choices=TIME_PERIOD_CHOICES) - - class Meta: - unique_together = ("label", "school_period") - - def __unicode__(self): - return unicode(self.label) - - -class Domain(models.Model): - DOMAIN_PERIOD_CHOICES = ( - (0, u'Global'), - (1, u'Primaire'), - (2, u'Collège'), - (3, u'Lycée'), - ) - DOMAIN_PERIOD_DICT = { - u'Global': 0, - u'Primaire': 1, - u'Collège': 2, - u'Lycée': 3, - } - label = models.CharField(max_length=512, unique=False, blank=False, null=False) - school_period = models.IntegerField(choices=DOMAIN_PERIOD_CHOICES) - - class Meta: - unique_together = ("label", "school_period") - - def __unicode__(self): - return unicode(self.label) - - -class DocumentFormat(models.Model): - label = models.CharField(max_length=512, unique=True, blank=False, null=False) - - def __unicode__(self): - return unicode(self.label) - -class TagCategory(models.Model): - label = models.CharField(max_length=512, unique=True, blank=False, null=False) - - def __unicode__(self): - return unicode(self.label) - - class Meta: - verbose_name_plural = "TagCategories" - -class Tag(models.Model): - TAG_URL_STATUS_CHOICES = ( - (0, "null_result"), - (1, "redirection"), - (2, "homonyme"), - (3, "match"), - ) - - TAG_URL_STATUS_DICT = { - "null_result":0, - "redirection":1, - "homonyme":2, - "match":3, - } - - label = models.CharField(max_length=1024, unique=False, blank=False, null=False) - original_label = models.CharField(max_length=1024, unique=True, blank=False, null=False, editable=False) - alias = models.CharField(max_length=1024, unique=False, blank=True, null=True) - category = models.ForeignKey(TagCategory, null=True, blank=True) - wikipedia_url = models.URLField(verify_exists=False, max_length=2048, blank=True, null=True) - wikipedia_pageid = models.BigIntegerField(unique=False, blank=True, null=True) - url_status = models.IntegerField(choices=TAG_URL_STATUS_CHOICES, blank=True, null=True, default=None) - dbpedia_uri = models.URLField(verify_exists=False, max_length=2048, blank=True, null=True) - - @Property - def url_status_text(): #@NoSelf - def fget(self): - return self.TAG_URL_STATUS_CHOICES[self.url_status][1] - - return locals() - -class Location(models.Model): - name = models.CharField(max_length=512, unique=False, blank=False, null=False) - insee = models.CharField(max_length=5, unique=True, blank=False, null=False) - - def __unicode__(self): - return unicode("%s : %s" % (self.name, self.insee)) - -class Datasheet(models.Model): - hda_id = models.CharField(max_length=512, unique=True, blank=False, null=False) - author = models.ForeignKey(Author, null=True, blank=True) - organisation = models.ForeignKey(Organisation) - title = models.CharField(max_length=2048, unique=False, blank=False, null=False) - description = models.TextField(blank=True, null=True) - url = models.URLField(verify_exists=False, max_length=2048, blank=True, null=True) - domains = SortedManyToManyField(Domain, limit_choices_to={'school_period':Domain.DOMAIN_PERIOD_DICT[u'Global']}, related_name="datasheets") - primary_periods = SortedManyToManyField(TimePeriod, limit_choices_to={'school_period':TimePeriod.TIME_PERIOD_DICT[u'Primaire']}, related_name="primary_periods_datasheets") - college_periods = SortedManyToManyField(TimePeriod, limit_choices_to={'school_period':TimePeriod.TIME_PERIOD_DICT[u'Collège']}, related_name="college_periods_datasheets") - highschool_periods = SortedManyToManyField(TimePeriod, limit_choices_to={'school_period':TimePeriod.TIME_PERIOD_DICT[u'Lycée']}, related_name="highschool_periods_datasheets") - primary_themes = SortedManyToManyField(Domain, limit_choices_to={'school_period':Domain.DOMAIN_PERIOD_DICT[u'Primaire']}, related_name="primary_themes_datasheets") - college_themes = SortedManyToManyField(Domain, limit_choices_to={'school_period':Domain.DOMAIN_PERIOD_DICT[u'Collège']}, related_name="college_themes_datasheets") - highschool_themes = SortedManyToManyField(Domain, limit_choices_to={'school_period':Domain.DOMAIN_PERIOD_DICT[u'Lycée']}, related_name="highschool_themes_datasheets") - town = models.ForeignKey(Location, null=True, blank=True) - format = models.ForeignKey(DocumentFormat, null=True, blank=True) - original_creation_date = models.DateField() - original_modification_date = models.DateField() - modification_datetime = models.DateTimeField(auto_now=True) - validation_date = models.DateTimeField(null=True, blank=True) - validated = models.BooleanField(default=False) - validator = models.ForeignKey(User, null=True, blank=True) - tags = models.ManyToManyField(Tag, through='TaggedSheet') - - - def validate(self, user): - self.validation_date = datetime.datetime.now() - self.validated = True - self.validator = user - self.save() - - def unvalidate(self): - self.validation_date = datetime.datetime.min - self.validated = False - self.validator = None - self.save() - - @Property - def domains_list(): #@NoSelf - def fget(self): - return [d.label for d in self.domains.all()] - - return locals() - - @Property - def domains_text(): #@NoSelf - def fget(self): - return "; ".join(self.domains_list) - - return locals() - - @Property - def primary_periods_list(): #@NoSelf - def fget(self): - return [d.label for d in self.primary_periods.all()] - - return locals() - - - @Property - def primary_periods_text(): #@NoSelf - def fget(self): - return "; ".join(self.primary_periods_list) - - return locals() - - @Property - def college_periods_list(): #@NoSelf - def fget(self): - return [d.label for d in self.college_periods.all()] - - return locals() - - @Property - def college_periods_text(): #@NoSelf - def fget(self): - return "; ".join(self.college_periods_list) - - return locals() - - @Property - def highschool_periods_list(): #@NoSelf - def fget(self): - return [d.label for d in self.highschool_periods.all()] - - return locals() - - @Property - def highschool_periods_text(): #@NoSelf - def fget(self): - return "; ".join(self.highschool_periods_list) - - return locals() - - - @Property - def primary_themes_list(): #@NoSelf - def fget(self): - return [d.label for d in self.primary_themes.all()] - - return locals() - - - @Property - def primary_themes_text(): #@NoSelf - def fget(self): - return "; ".join(self.primary_themes_list) - - return locals() - - @Property - def college_themes_list(): #@NoSelf - def fget(self): - return [d.label for d in self.college_themes.all()] - - return locals() - - @Property - def college_themes_text(): #@NoSelf - def fget(self): - return "; ".join(self.college_themes_list) - - return locals() - - @Property - def highschool_themes_list(): #@NoSelf - def fget(self): - return [d.label for d in self.highschool_themes.all()] - - return locals() - - @Property - def highschool_themes_text(): #@NoSelf - def fget(self): - return "; ".join(self.highschool_themes_list) - - return locals() - - @Property - def town_text(): #@NoSelf - def fget(self): - return self.town.name if self.town else "" - - return locals() - - @Property - def tags_text(): #@NoSelf - def fget(self): - return "; ".join([t.label for t in self.tags.all()]) - - return locals() - - -class TaggedSheet(models.Model): - datasheet = models.ForeignKey(Datasheet) - tag = models.ForeignKey(Tag) - original_order = models.IntegerField(default=0) - order = models.IntegerField(default=0) - index_note = models.FloatField(default=0.0) - - +# -*- coding: utf-8 -*- + +from django.contrib.auth.models import User +from django.db import models +from hdabo.fields import SortedManyToManyField +from hdabo.utils import Property +import datetime + +class Organisation(models.Model): + hda_id = models.CharField(max_length=512, unique=True, blank=False, null=False) + name = models.CharField(max_length=512, unique=False, blank=False, null=False) + location = models.CharField(max_length=512, unique=False, blank=True, null=True) + website = models.CharField(max_length=2048, unique=False, blank=True, null=True) + + +class Author(models.Model): + hda_id = models.CharField(max_length=512, unique=True, blank=False, null=False) + lastname = models.CharField(max_length=512, unique=False, blank=True, null=True) + firstname = models.CharField(max_length=512, unique=False, blank=True, null=True) + +class TimePeriod(models.Model): + TIME_PERIOD_CHOICES = ( + (1, u'Primaire'), + (2, u'Collège'), + (3, u'Lycée'), + ) + TIME_PERIOD_DICT = { + u'Primaire': 1, + u'Collège': 2, + u'Lycée': 3, + } + label = models.CharField(max_length=512, unique=False, blank=False, null=False) + school_period = models.IntegerField(choices=TIME_PERIOD_CHOICES) + + class Meta: + unique_together = ("label", "school_period") + + def __unicode__(self): + return unicode(self.label) + + +class Domain(models.Model): + DOMAIN_PERIOD_CHOICES = ( + (0, u'Global'), + (1, u'Primaire'), + (2, u'Collège'), + (3, u'Lycée'), + ) + DOMAIN_PERIOD_DICT = { + u'Global': 0, + u'Primaire': 1, + u'Collège': 2, + u'Lycée': 3, + } + label = models.CharField(max_length=512, unique=False, blank=False, null=False) + school_period = models.IntegerField(choices=DOMAIN_PERIOD_CHOICES) + + class Meta: + unique_together = ("label", "school_period") + + def __unicode__(self): + return unicode(self.label) + + +class DocumentFormat(models.Model): + label = models.CharField(max_length=512, unique=True, blank=False, null=False) + + def __unicode__(self): + return unicode(self.label) + +class TagCategory(models.Model): + label = models.CharField(max_length=512, unique=True, blank=False, null=False) + + def __unicode__(self): + return unicode(self.label) + + class Meta: + verbose_name_plural = "TagCategories" + +class Tag(models.Model): + TAG_URL_STATUS_CHOICES = ( + (0, "null_result"), + (1, "redirection"), + (2, "homonyme"), + (3, "match"), + ) + + TAG_URL_STATUS_DICT = { + "null_result":0, + "redirection":1, + "homonyme":2, + "match":3, + } + + label = models.CharField(max_length=1024, unique=False, blank=False, null=False, db_index=True) + original_label = models.CharField(max_length=1024, unique=True, blank=False, null=False, editable=False) + alias = models.CharField(max_length=1024, unique=False, blank=True, null=True) + category = models.ForeignKey(TagCategory, null=True, blank=True) + wikipedia_url = models.URLField(verify_exists=False, max_length=2048, blank=True, null=True) + wikipedia_pageid = models.BigIntegerField(unique=False, blank=True, null=True) + url_status = models.IntegerField(choices=TAG_URL_STATUS_CHOICES, blank=True, null=True, default=None) + dbpedia_uri = models.URLField(verify_exists=False, max_length=2048, blank=True, null=True) + popularity = models.IntegerField(blank=False, null=False, default=0, db_index=True) + + @Property + def url_status_text(): #@NoSelf + def fget(self): + return self.TAG_URL_STATUS_CHOICES[self.url_status][1] + + return locals() + +class Location(models.Model): + name = models.CharField(max_length=512, unique=False, blank=False, null=False) + insee = models.CharField(max_length=5, unique=True, blank=False, null=False) + + def __unicode__(self): + return unicode("%s : %s" % (self.name, self.insee)) + +class Datasheet(models.Model): + hda_id = models.CharField(max_length=512, unique=True, blank=False, null=False) + author = models.ForeignKey(Author, null=True, blank=True) + organisation = models.ForeignKey(Organisation) + title = models.CharField(max_length=2048, unique=False, blank=False, null=False) + description = models.TextField(blank=True, null=True) + url = models.URLField(verify_exists=False, max_length=2048, blank=True, null=True) + domains = SortedManyToManyField(Domain, limit_choices_to={'school_period':Domain.DOMAIN_PERIOD_DICT[u'Global']}, related_name="datasheets") + primary_periods = SortedManyToManyField(TimePeriod, limit_choices_to={'school_period':TimePeriod.TIME_PERIOD_DICT[u'Primaire']}, related_name="primary_periods_datasheets") + college_periods = SortedManyToManyField(TimePeriod, limit_choices_to={'school_period':TimePeriod.TIME_PERIOD_DICT[u'Collège']}, related_name="college_periods_datasheets") + highschool_periods = SortedManyToManyField(TimePeriod, limit_choices_to={'school_period':TimePeriod.TIME_PERIOD_DICT[u'Lycée']}, related_name="highschool_periods_datasheets") + primary_themes = SortedManyToManyField(Domain, limit_choices_to={'school_period':Domain.DOMAIN_PERIOD_DICT[u'Primaire']}, related_name="primary_themes_datasheets") + college_themes = SortedManyToManyField(Domain, limit_choices_to={'school_period':Domain.DOMAIN_PERIOD_DICT[u'Collège']}, related_name="college_themes_datasheets") + highschool_themes = SortedManyToManyField(Domain, limit_choices_to={'school_period':Domain.DOMAIN_PERIOD_DICT[u'Lycée']}, related_name="highschool_themes_datasheets") + town = models.ForeignKey(Location, null=True, blank=True) + format = models.ForeignKey(DocumentFormat, null=True, blank=True) + original_creation_date = models.DateField() + original_modification_date = models.DateField() + modification_datetime = models.DateTimeField(auto_now=True) + validation_date = models.DateTimeField(null=True, blank=True) + validated = models.BooleanField(default=False, db_index=True) + validator = models.ForeignKey(User, null=True, blank=True) + manual_order = models.BooleanField(default=False, db_index=True) + tags = models.ManyToManyField(Tag, through='TaggedSheet') + + + def validate(self, user): + self.validation_date = datetime.datetime.now() + self.validated = True + self.validator = user + self.save() + + def unvalidate(self): + self.validation_date = datetime.datetime.min + self.validated = False + self.validator = None + self.save() + + @Property + def domains_list(): #@NoSelf + def fget(self): + return [d.label for d in self.domains.all()] + + return locals() + + @Property + def domains_text(): #@NoSelf + def fget(self): + return "; ".join(self.domains_list) + + return locals() + + @Property + def primary_periods_list(): #@NoSelf + def fget(self): + return [d.label for d in self.primary_periods.all()] + + return locals() + + + @Property + def primary_periods_text(): #@NoSelf + def fget(self): + return "; ".join(self.primary_periods_list) + + return locals() + + @Property + def college_periods_list(): #@NoSelf + def fget(self): + return [d.label for d in self.college_periods.all()] + + return locals() + + @Property + def college_periods_text(): #@NoSelf + def fget(self): + return "; ".join(self.college_periods_list) + + return locals() + + @Property + def highschool_periods_list(): #@NoSelf + def fget(self): + return [d.label for d in self.highschool_periods.all()] + + return locals() + + @Property + def highschool_periods_text(): #@NoSelf + def fget(self): + return "; ".join(self.highschool_periods_list) + + return locals() + + + @Property + def primary_themes_list(): #@NoSelf + def fget(self): + return [d.label for d in self.primary_themes.all()] + + return locals() + + + @Property + def primary_themes_text(): #@NoSelf + def fget(self): + return "; ".join(self.primary_themes_list) + + return locals() + + @Property + def college_themes_list(): #@NoSelf + def fget(self): + return [d.label for d in self.college_themes.all()] + + return locals() + + @Property + def college_themes_text(): #@NoSelf + def fget(self): + return "; ".join(self.college_themes_list) + + return locals() + + @Property + def highschool_themes_list(): #@NoSelf + def fget(self): + return [d.label for d in self.highschool_themes.all()] + + return locals() + + @Property + def highschool_themes_text(): #@NoSelf + def fget(self): + return "; ".join(self.highschool_themes_list) + + return locals() + + @Property + def town_text(): #@NoSelf + def fget(self): + return self.town.name if self.town else "" + + return locals() + + @Property + def tags_text(): #@NoSelf + def fget(self): + return "; ".join([t.label for t in self.tags.all()]) + + return locals() + + +class TaggedSheet(models.Model): + datasheet = models.ForeignKey(Datasheet) + tag = models.ForeignKey(Tag) + original_order = models.IntegerField(null=False, blank=False, default=0) + order = models.IntegerField(null=False, blank=False, default=0, db_index=True) + index_note = models.FloatField(null=False, blank=False, default=0.0, db_index=True) + + diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/settings.py --- a/web/hdabo/settings.py Mon Jun 20 15:49:22 2011 +0200 +++ b/web/hdabo/settings.py Wed Jun 22 01:00:47 2011 +0200 @@ -148,4 +148,6 @@ WIKIPEDIA_API_URL = "http://fr.wikipedia.org/w/api.php" +DBPEDIA_URI_TEMPLATE = "http://dbpedia.org/resource/%s" + from hdabo.config import * #@UnusedWildImport diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/static/hdabo/img/arrow_green_right.png Binary file web/hdabo/static/hdabo/img/arrow_green_right.png has changed diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/templates/partial/all_tags_table.html --- a/web/hdabo/templates/partial/all_tags_table.html Mon Jun 20 15:49:22 2011 +0200 +++ b/web/hdabo/templates/partial/all_tags_table.html Wed Jun 22 01:00:47 2011 +0200 @@ -4,6 +4,7 @@ label {% comment %}original_label{% endcomment %} Lien W + Lien D Catégorie Supprimer
le lien W Alias @@ -17,7 +18,15 @@ {% else %} - {% endif %} + {% endif %} + + + {% if tag.dbpedia_uri and tag.dbpedia_uri != "" %} + + {% else %} +   + {% endif %} + {% if tag.category %}{{ tag.category }}{% endif %} {{tag.label}} {% if tag.alias %}{{tag.alias}}{% endif %} diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/templates/partial/tag_table.html --- a/web/hdabo/templates/partial/tag_table.html Mon Jun 20 15:49:22 2011 +0200 +++ b/web/hdabo/templates/partial/tag_table.html Wed Jun 22 01:00:47 2011 +0200 @@ -8,6 +8,7 @@ label {% comment %}original_label{% endcomment %} Lien W + Lien D Catégorie Supprimer
le lien W Alias @@ -28,7 +29,15 @@ {% else %} - {% endif %} + {% endif %} + + + {% if tag.dbpedia_uri and tag.dbpedia_uri != "" %} + + {% else %} +   + {% endif %} + {% if t.tag.category %}{{ t.tag.category }}{% endif %} {{t.tag.label}} {% if t.tag.alias %}{{t.tag.alias}}{% endif %} @@ -50,7 +59,15 @@ {% else %} - {% endif %} + {% endif %} + + + {% if tag.dbpedia_uri and tag.dbpedia_uri != "" %} + + {% else %} +   + {% endif %} + {% if t.category %}{{ t.category }}{% endif %} {{t.alias}} diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/views.py --- a/web/hdabo/views.py Mon Jun 20 15:49:22 2011 +0200 +++ b/web/hdabo/views.py Wed Jun 22 01:00:47 2011 +0200 @@ -4,15 +4,15 @@ from django.contrib.auth.decorators import login_required #@UnusedImport from django.core.paginator import Paginator from django.db.models import Max -from django.http import HttpResponse, HttpResponseBadRequest +from django.http import HttpResponseBadRequest from django.shortcuts import render_to_response, redirect from django.template import RequestContext from haystack.constants import DJANGO_ID from haystack.query import SearchQuerySet -from hdabo.management.commands.querywikipedia import process_tag -from hdabo.wp_utils import (normalize_tag, query_wikipedia_title, +from hdabo.wp_utils import process_tag +from hdabo.utils import OrderedDict +from hdabo.wp_utils import (normalize_tag, query_wikipedia_title, get_or_create_tag) -from hdabo.utils import OrderedDict from models import Datasheet, Organisation, Tag, TagCategory, TaggedSheet from wikitools import wiki import django.utils.simplejson as json @@ -22,7 +22,7 @@ #@login_required def home(request): - # Get all organisations + # Get all organizations orgas = Organisation.objects.all().order_by('name') # Count all validated, unvalidated sheets for each organisation org_list = [] @@ -132,7 +132,6 @@ # NB : it is different from the TagSheet.order in the database. new_order = int(request.POST["new_order"]) - 1 old_order = int(request.POST["old_order"]) - 1 - s = "new_order = " + str(new_order) + ", old_order = " + str(old_order) # First we get the datasheet's TaggedSheets (list to force evaluation) ordered_tags = list(TaggedSheet.objects.filter(datasheet=Datasheet.objects.get(id=ds_id)).order_by('order')) # We change the moved TaggedSheets's order @@ -143,16 +142,19 @@ # We move the TaggedSheets's order if new_order > old_order : # And we decrease the other ones - for i in range(old_order+1,new_order+1) : + for i in range(old_order + 1, new_order + 1) : ts = ordered_tags[i] ts.order = ts.order - 1 ts.save() else : # And we increase the other ones - for i in range(new_order,old_order) : + for i in range(new_order, old_order) : ts = ordered_tags[i] ts.order = ts.order + 1 ts.save() + ds = Datasheet.objects.get(id=ds_id) + ds.manual_order = True + ds.save() return get_tag_table(request=request, ds_id=ds_id, valid=0) @@ -205,6 +207,10 @@ ts = ds_tags.filter(tag=Tag.objects.filter(id=tag_id))[0] ts.delete() + ds = Datasheet.objects.get(id=ds_id) + ds.manual_order = True + ds.save() + return get_tag_table(request=request, ds_id=ds_id, valid=0) @@ -218,23 +224,18 @@ if tag.label != tag_label: - tag.label = tag_label site = wiki.Wiki(settings.WIKIPEDIA_API_URL) #@UndefinedVariable - new_label, status, url, pageid, response = query_wikipedia_title(site, tag_label) #@UnusedVariable + wp_res = query_wikipedia_title(site, tag_label) + status, url, pageid, dbpedia_uri = (wp_res['status'], wp_res['wikipedia_url'], wp_res['page_id'], wp_res["dbpedia_uri"]) if status is not None: tag.url_status = status - if url is not None: - tag.wikipedia_url = url - else: - tag.wikipedia_url = None - - if pageid is not None: - tag.wikipedia_pageid = pageid - else: - tag.wikipedia_pageid = None + + tag.wikipedia_url = url + tag.wikipedia_pageid = pageid + tag.dbpedia_uri = dbpedia_uri tag.save() @@ -246,7 +247,7 @@ tag_id = request.POST["id"] tag_label = request.POST["value"] - ds_id=request.POST["datasheet_id"] + ds_id = request.POST["datasheet_id"] tag = Tag.objects.get(id=tag_id) @@ -268,6 +269,8 @@ ts.save() + ds.manual_order = True + ds.save() return get_tag_table(request=request, ds_id=ds_id, valid=0) @@ -313,10 +316,12 @@ # if the tag is created or if the tag is not in the list list_ts = TaggedSheet.objects.filter(datasheet=ds) - if created or len(list_ts.filter(tag=tag))==0 : + if created or len(list_ts.filter(tag=tag)) == 0 : new_order = list_ts.aggregate(Max('order'))['order__max'] + 1 ts = TaggedSheet.objects.create(datasheet=ds, tag=tag, original_order=new_order, order=new_order) ts.save() + ds.manual_order = True + ds.save() return get_tag_table(request=request, ds_id=ds_id, valid=0) @@ -347,9 +352,15 @@ else : valid = False # We validate or unvalidate the requester datasheet + + if request.user.is_authenticated(): + user = request.user + else: + user = None + ds = Datasheet.objects.get(id=ds_id) if valid : - ds.validate(None) + ds.validate(user) else : ds.unvalidate() ds.save() @@ -361,7 +372,7 @@ else : # We ask to display the validated ds valid_req = 1 - if len(same_organisation_ds)>0 : + if len(same_organisation_ds) > 0 : return redirect('list_for_orga', orga_id=ds.organisation.id, valid=valid_req) else : return redirect('home') @@ -407,4 +418,4 @@ # This function is available only in all_tags_table context return get_all_tags_table(request=request, num_page=request.POST["num_page"], nb_by_page=request.POST["nb_by_page"]) - \ No newline at end of file + diff -r 3ad571e54608 -r 08b008c5a07d web/hdabo/wp_utils.py --- a/web/hdabo/wp_utils.py Mon Jun 20 15:49:22 2011 +0200 +++ b/web/hdabo/wp_utils.py Wed Jun 22 01:00:47 2011 +0200 @@ -2,6 +2,20 @@ from django.conf import settings from hdabo.models import Tag from wikitools import api, wiki +from django.utils.http import urlquote + +def normalize_tag(tag): + if len(tag) == 0: + return tag + tag = tag.strip() + tag = tag.replace("_", " ") + tag = " ".join(tag.split()) + tag = tag[0].upper() + tag[1:] + return tag + +def urlize_for_wkipedia(label): + return urlquote(label.replace(" ","_")) + def __is_homonymie(page_dict): for cat in page_dict.get(u"categories", []): @@ -11,21 +25,22 @@ def query_wikipedia_title(site, label): - params = {'action':'query', 'titles': label, 'prop':'info|categories', 'inprop':'url'} + params = {'action':'query', 'titles': label, 'prop':'info|categories|langlinks', 'inprop':'url', 'lllimit':'500', 'cllimit':'500'} wpquery = api.APIRequest(site, params) #@UndefinedVariable - response = wpquery.query() + response = wpquery.query() + original_response = response query_dict = response['query'] # get page if multiple pages or none -> return Tag.null_result pages = query_dict.get("pages", {}) if len(pages) > 1 or len(pages) == 0: - return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None, response + return { 'new_label': None, 'status': Tag.TAG_URL_STATUS_DICT["null_result"], 'wikipedia_url': None, 'pageid': None, 'dbpedia_uri': None, 'response': response } page = pages.values()[0] if u"invalid" in page or u"missing" in page: - return None, Tag.TAG_URL_STATUS_DICT["null_result"], None, None, response + return { 'new_label': None, 'status': Tag.TAG_URL_STATUS_DICT["null_result"], 'wikipedia_url': None, 'pageid': None, 'dbpedia_uri': None, 'response': response } url = page.get(u'fullurl', None) pageid = page.get(u'pageid', None) @@ -37,17 +52,37 @@ status = Tag.TAG_URL_STATUS_DICT["redirection"] else: status = Tag.TAG_URL_STATUS_DICT["match"] - - return new_label, status, url, pageid, response + + if status == Tag.TAG_URL_STATUS_DICT["redirection"]: + params = {'action':'query', 'titles': label, 'prop':'info|categories|langlinks', 'inprop':'url', 'lllimit':'500', 'cllimit':'500', 'redirects':True} + wpquery = api.APIRequest(site, params) #@UndefinedVariable + response = wpquery.query() + query_dict = response['query'] + pages = query_dict.get("pages", {}) + #we know that we have at least one answer + if len(pages) > 1 or len(pages) == 0: + return { 'new_label': None, 'status': Tag.TAG_URL_STATUS_DICT["null_result"], 'wikipedia_url': None, 'pageid': None, 'dbpedia_uri': None, 'response': response } + page = pages.values()[0] + -def normalize_tag(tag): - if len(tag) == 0: - return tag - tag = tag.strip() - tag = tag.replace("_", " ") - tag = " ".join(tag.split()) - tag = tag[0].upper() + tag[1:] - return tag + + #process language to extract the english label + english_label = None + + if status == Tag.TAG_URL_STATUS_DICT['match'] or status == Tag.TAG_URL_STATUS_DICT['redirection']: + lang_links = page.get('langlinks', []) + for lang_info_dict in lang_links: + if lang_info_dict['lang'] == "en": + english_label = lang_info_dict["*"] + break + + if english_label and "#" not in english_label: + dbpedia_uri = settings.DBPEDIA_URI_TEMPLATE % (urlize_for_wkipedia(english_label)) + else: + dbpedia_uri = None + + return { 'new_label': new_label, 'status': status, 'wikipedia_url': url, 'pageid': pageid, 'dbpedia_uri': dbpedia_uri, 'response': original_response } + def get_or_create_tag(tag_label): @@ -60,25 +95,41 @@ if created: site = wiki.Wiki(settings.WIKIPEDIA_API_URL) #@UndefinedVariable - new_label, status, url, pageid, response = query_wikipedia_title(site, tag_label_normalized) #@UnusedVariable + wp_res = query_wikipedia_title(site, tag_label_normalized) #@UnusedVariable + new_label, status, url, pageid, dbpedia_uri = wp_res['new_label'], wp_res['status'], wp_res['wikipedia_url'], wp_res['pageid'], wp_res["dbpedia_uri"] + # We save the datas if new_label is not None: tag.label = new_label if status is not None: tag.url_status = status - if url is not None: - tag.wikipedia_url = url - else: - tag.wikipedia_url = None - - if pageid is not None: - tag.wikipedia_pageid = pageid - else: - tag.wikipedia_pageid = None + tag.wikipedia_url = url + tag.wikipedia_pageid = pageid + tag.dbpedia_uri = dbpedia_uri tag.save() return tag, created +def process_tag(site, tag, verbosity): + wp_res = query_wikipedia_title(site, tag.label) + new_label, status, url, pageid, response, dbpedia_uri = wp_res['new_label'], wp_res['status'], wp_res['wikipedia_url'], wp_res['pageid'], wp_res['response'], wp_res["dbpedia_uri"] + + if verbosity >= 2 : + print "response from query to %s with parameters %s :" % (site.apibase, repr(new_label)) + print repr(response) + + if new_label is not None: + tag.label = new_label + if status is not None: + tag.url_status = status + tag.wikipedia_url = url + tag.wikipedia_pageid = pageid + tag.dbpedia_uri = dbpedia_uri + + tag.save() + + +