src/p4l/utils.py
author ymh <ymh.work@gmail.com>
Sat, 05 Apr 2014 23:29:54 +0200
changeset 168 ac755301cea3
parent 161 216b3f9582aa
permissions -rw-r--r--
Version 0.7

# -*- coding: utf-8 -*-
#
# Copyright IRI (c) 2013
#
# contact@iri.centrepompidou.fr
#
# This software is governed by the CeCILL-B license under French law and
# abiding by the rules of distribution of free software.  You can  use, 
# modify and/ or redistribute the software under the terms of the CeCILL-B
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info". 
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or 
# data to be ensured and,  more generally, to use and operate it in the 
# same conditions as regards security. 
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-B license and that you accept its terms.
#

import codecs
import hashlib
import logging
import math
import sys
import unicodedata

from django.conf import settings
from django.core.paginator import Paginator, PageNotAnInteger, EmptyPage, Page
from django.core.validators import URLValidator
from django.utils.http import urlquote_plus
import requests


logger = logging.getLogger(__name__)

def show_progress(current_line, total_line, label, width, writer=None, newline=False):

    if writer is None:
        writer = sys.stdout
        if sys.stdout.encoding is not None:
            writer = codecs.getwriter(sys.stdout.encoding)(sys.stdout)

    percent = (float(current_line) / float(total_line)) * 100.0

    marks = math.floor(width * (percent / 100.0)) #@UndefinedVariable
    spaces = math.floor(width - marks) #@UndefinedVariable

    loader = u'[' + (u'=' * int(marks)) + (u' ' * int(spaces)) + u']'
        
    s = u"%s %3d%% %*d/%d - %*s%s" % (loader, percent, len(str(total_line)), current_line, total_line, width, label[:width], "\n" if newline else "\r")
    
    writer.write(s) #takes the header into account
    if percent >= 100:
        writer.write("\n")
    writer.flush()
    
    return writer

LANGUAGE_NS = u"http://psi.oasis-open.org/iso/639/#"
LANGUAGE_URI_MAP = {u'roh': u'rm', u'sco': u'sco', u'scn': u'scn', u'rom': u'rom', u'ron': u'ro', u'oss': u'os', u'ale': u'ale', u'mni': u'mni', u'nwc': u'nwc', u'osa': u'osa', u'mnc': u'mnc', u'mwr': u'mwr', u'ven': u'ven', u'uga': u'uga', u'mwl': u'mwl', u'fas': u'fa', u'fat': u'fat', u'fan': u'fan', u'fao': u'fo', u'din': u'din', u'hye': u'hy', u'dsb': u'dsb', u'srd': u'sc', u'iba': u'iba', u'div': u'div', u'tel': u'te', u'tem': u'tem', u'nbl': u'nr', u'ter': u'ter', u'tet': u'tet', u'sun': u'su', u'kut': u'kut', u'suk': u'suk', u'kur': u'ku', u'kum': u'kum', u'sus': u'sus', u'new': u'new', u'nep': u'ne', u'sux': u'sux', u'men': u'men', u'lez': u'lez', u'gla': u'gd', u'bos': u'bs', u'gle': u'ga', u'eka': u'eka', u'glg': u'gl', u'akk': u'akk', u'aka': u'aka', u'bod': u'bo', u'glv': u'gv', u'jrb': u'jrb', u'vie': u'vi', u'ipk': u'ik', u'uzb': u'uz', u'sga': u'sga', u'bre': u'br', u'bra': u'bra', u'aym': u'ay', u'cha': u'ch', u'chb': u'chb', u'che': u'ce', u'chg': u'chg', u'chk': u'chk', u'chm': u'chm', u'chn': u'chn', u'cho': u'cho', u'chp': u'chp', u'chr': u'chr', u'chu': u'cu', u'chv': u'cv', u'chy': u'chy', u'msa': u'ms', u'iii': u'ii', u'ndo': u'ng', u'ibo': u'ibo', u'car': u'car', u'xho': u'xh', u'deu': u'de', u'cat': u'ca', u'del': u'del', u'den': u'den', u'cad': u'cad', u'tat': u'tt', u'srn': u'srn', u'raj': u'raj', u'spa': u'es', u'tam': u'ta', u'tah': u'ty', u'afh': u'afh', u'eng': u'en', u'enm': u'enm', u'csb': u'csb', u'nyn': u'nyn', u'nyo': u'nyo', u'sid': u'sid', u'nya': u'ny', u'sin': u'si', u'afr': u'af', u'lam': u'lam', u'snd': u'sd', u'mar': u'mr', u'lah': u'lah', u'sna': u'sn', u'lad': u'lad', u'snk': u'snk', u'mad': u'mad', u'mag': u'mag', u'lat': u'la', u'mah': u'mh', u'mak': u'mak', u'mal': u'ml', u'man': u'man', u'egy': u'egy', u'znd': u'znd', u'zen': u'zen', u'kbd': u'kbd', u'ita': u'it', u'tsn': u'tn', u'tso': u'ts', u'tsi': u'tsi', u'byn': u'byn', u'fij': u'fj', u'fin': u'fi', u'eus': u'eu', u'non': u'non', u'ceb': u'ceb', u'dan': u'da', u'nym': u'nym', u'nob': u'nb', u'dak': u'dak', u'ces': u'cs', u'dar': u'dar', u'day': u'day', u'nor': u'no', u'kpe': u'kpe', u'guj': u'gu', u'mdf': u'mdf', u'mas': u'mas', u'lao': u'lo', u'mdr': u'mdr', u'gon': u'gon', u'goh': u'goh', u'sms': u'sms', u'smo': u'sm', u'smn': u'smn', u'smj': u'smj', u'got': u'got', u'sme': u'se', u'bla': u'bla', u'sma': u'sma', u'gor': u'gor', u'ast': u'ast', u'orm': u'om', u'que': u'qu', u'ori': u'or', u'bal': u'bal', u'asm': u'as', u'pus': u'ps', u'kik': u'ki', u'ltz': u'lb', u'wln': u'wa', u'isl': u'is', u'mai': u'mai', u'lav': u'lv', u'zap': u'zap', u'yid': u'yi', u'kok': u'kok', u'kom': u'kv', u'kon': u'kon', u'ukr': u'uk', u'ton': u'to', u'kos': u'kos', u'kor': u'ko', u'tog': u'tog', u'hun': u'hu', u'hup': u'hup', u'cym': u'cy', u'udm': u'udm', u'bej': u'bej', u'ben': u'bn', u'bel': u'be', u'bem': u'bem', u'aar': u'aa', u'nzi': u'nzi', u'sah': u'sah', u'san': u'sa', u'sam': u'sam', u'pro': u'pro', u'sag': u'sg', u'sad': u'sad', u'rar': u'rar', u'rap': u'rap', u'sas': u'sas', u'sat': u'sat', u'min': u'min', u'lim': u'li', u'lin': u'ln', u'lit': u'lt', u'efi': u'efi', u'btk': u'btk', u'kac': u'kac', u'kab': u'kab', u'kaa': u'kaa', u'kan': u'kn', u'kam': u'kam', u'kal': u'kl', u'kas': u'ks', u'kar': u'kar', u'kaw': u'kaw', u'kau': u'kau', u'kat': u'ka', u'kaz': u'kk', u'tyv': u'tyv', u'awa': u'awa', u'urd': u'ur', u'doi': u'doi', u'tpi': u'tpi', u'mri': u'mi', u'abk': u'ab', u'tkl': u'tkl', u'nld': u'nl', u'oji': u'oji', u'oci': u'oc', u'wol': u'wo', u'jav': u'jv', u'hrv': u'hr', u'mga': u'mga', u'hit': u'hit', u'gez': u'gez', u'ssw': u'ss', u'hil': u'hil', u'him': u'him', u'hin': u'hi', u'bas': u'bas', u'gba': u'gba', u'bad': u'bad', u'kua': u'kj', u'cre': u'cre', u'ban': u'ban', u'crh': u'crh', u'bam': u'bam', u'bak': u'ba', u'shn': u'shn', u'arp': u'arp', u'arw': u'arw', u'ara': u'ar', u'arc': u'arc', u'sel': u'sel', u'arn': u'arn', u'lus': u'lus', u'mus': u'mus', u'lua': u'lua', u'lub': u'lub', u'lug': u'lug', u'lui': u'lui', u'lun': u'lun', u'luo': u'luo', u'iku': u'iu', u'tur': u'tr', u'tuk': u'tk', u'tum': u'tum', u'mkd': u'mk', u'cop': u'cop', u'cos': u'co', u'ile': u'ie', u'ilo': u'ilo', u'gwi': u'gwi', u'und': u'und', u'tli': u'tli', u'tlh': u'tlh', u'por': u'pt', u'pon': u'pon', u'pol': u'pl', u'ang': u'ang', u'tgk': u'tg', u'tgl': u'tl', u'fra': u'fr', u'fre': u'fr', u'dum': u'dum', u'swa': u'sw', u'dua': u'dua', u'swe': u'sv', u'yap': u'yap', u'frm': u'frm', u'tiv': u'tiv', u'yao': u'yao', u'xal': u'xal', u'fry': u'fy', u'gay': u'gay', u'ota': u'ota', u'hmn': u'hmn', u'hmo': u'ho', u'an': u'arg', u'gaa': u'gaa', u'fur': u'fur', u'mlg': u'mg', u'slv': u'sl', u'fil': u'fil', u'mlt': u'mt', u'slk': u'sk', u'ful': u'ful', u'jpn': u'ja', u'vol': u'vo', u'vot': u'vot', u'ind': u'id', u'ave': u'ae', u'jpr': u'jpr', u'ava': u'ava', u'pap': u'pap', u'ewo': u'ewo', u'pau': u'pau', u'ewe': u'ewe', u'pag': u'pag', u'pal': u'pal', u'pam': u'pam', u'pan': u'pa', u'nog': u'nog', u'phn': u'phn', u'kir': u'ky', u'nia': u'nia', u'dgr': u'dgr', u'syr': u'syr', u'kin': u'rw', u'niu': u'niu', u'epo': u'eo', u'jbo': u'jbo', u'mic': u'mic', u'tha': u'th', u'hai': u'hai', u'gmh': u'gmh', u'ell': u'el', u'ady': u'ady', u'elx': u'elx', u'ada': u'ada', u'nav': u'nv', u'hat': u'ht', u'hau': u'ha', u'haw': u'haw', u'bin': u'bin', u'amh': u'am', u'bik': u'bik', u'bih': u'bh', u'mos': u'mos', u'moh': u'moh', u'mon': u'mn', u'bho': u'bho', u'mol': u'mo', u'bis': u'bi', u'tvl': u'tvl', u'ijo': u'ijo', u'est': u'et', u'kmb': u'kmb', u'peo': u'peo', u'umb': u'umb', u'tmh': u'tmh', u'fon': u'fon', u'hsb': u'hsb', u'run': u'rn', u'rus': u'ru', u'pli': u'pi', u'ace': u'ace', u'ach': u'ach', u'nde': u'nd', u'dzo': u'dz', u'kru': u'kru', u'srr': u'srr', u'ido': u'io', u'srp': u'sr', u'kro': u'kro', u'krc': u'krc', u'nds': u'nds', u'zun': u'zun', u'zul': u'zu', u'twi': u'tw', u'sog': u'sog', u'nso': u'nso', u'fro': u'fro', u'som': u'so', u'son': u'son', u'sot': u'st', u'vai': u'vai', u'her': u'hz', u'lol': u'lol', u'heb': u'he', u'loz': u'loz', u'gil': u'gil', u'was': u'was', u'war': u'war', u'bul': u'bg', u'wal': u'wal', u'bua': u'bua', u'bug': u'bug', u'aze': u'az', u'zha': u'za', u'zho': u'zh', u'nno': u'nn', u'uig': u'ug', u'myv': u'myv', u'inh': u'inh', u'khm': u'km', u'kho': u'kho', u'mya': u'my', u'kha': u'kha', u'ina': u'ia', u'nah': u'nah', u'tir': u'ti', u'nap': u'nap', u'grb': u'grb', u'grc': u'grc', u'nau': u'na', u'grn': u'gn', u'tig': u'tig', u'yor': u'yo', u'cor': u'kw', u'sqi': u'sq', u'dyu': u'dyu'}

def get_code_from_language_uri(lang_uri):

    if not lang_uri:
        return None
    if lang_uri.startswith(LANGUAGE_NS):
        lang_uri = lang_uri[len(LANGUAGE_NS):]

    return LANGUAGE_URI_MAP.get(lang_uri, None)



def get_labels_for_uris(uri_list, scheme_uri, lang, acronyms=False):
    '''
    This methods gathers labels for concept in thesaurus.
    
    :param uri_list: The list of uris of thesaurus entries.    
    :param scheme_uri: The scheme (uri) of the thesaurus 
    :param lang: The language for the label
    :param acronyms: do the labels must include the acronym (altLabel) 
    '''
    query_without_acronym = """
PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:<http://www.w3.org/2002/07/owl#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
SELECT ?uri ?label
WHERE {
    ?uri skos:inScheme <%s> .
    ?uri skos:prefLabel|skos:label ?label .
    FILTER (%s)
}
"""
    query_with_acronym = """
PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:<http://www.w3.org/2002/07/owl#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
SELECT ?uri ?label ?acro
WHERE {
    ?uri skos:inScheme <%s> .
    ?uri skos:prefLabel|skos:label ?label .
    OPTIONAL { ?uri skos:altLabel ?acro }
    FILTER (%s)
}
"""
    if acronyms:
        query = query_with_acronym
    else:
        query = query_without_acronym
    res_dict = {}
    if not uri_list:
        return res_dict
    # We build the filter string
    filter_str = ""
    validate = URLValidator()
    for i,uri in enumerate(uri_list):
        res_dict[uri] = ""
        # We test if the uri is correct. If not, all the sparql request fails
        try:
            validate(uri)
        except:
            continue
        uri = uri.replace(" ", "") # avoid bug when only few urls are not good
        filter_str += (" || ?uri = <" + uri + ">") if i else ("?uri = <" + uri + ">")
    # We request the labels
    res = requests.get(
        settings.SPARQL_QUERY_ENDPOINT,
        params={'query':query % (scheme_uri, filter_str), 'timeout':10},#, '$root' : "<"+uri+">"},
        headers={'accept':'application/sparql-results+json'},
    )
    if res.ok and res.text:
        json_res = res.json()
        if 'results' in json_res and 'bindings' in json_res['results'] and len(json_res['results']['bindings'])>0:
            # json_res['results']['bindings'] has several languages. If we find french, we save the french label.
            # If not, we save the first one.
            tmp_dict = {}
            first_label = None
            # We create a temporary dict with the lang code and the label
            for b in json_res['results']['bindings']:
                if lang:
                    if 'label' in b and 'value' in b['label'] and 'xml:lang' in b['label']:
                        tmp_dict[b['label']['xml:lang']] = b['label']['value']
                        if not first_label:
                            first_label = b['label']['value']
                else:
                    if 'acro' in b and 'value' in b['acro']:
                        first_label = b['acro']['value'] + ". " + b['label']['value']
                    else:
                        first_label = b['label']['value']
                if lang in tmp_dict or first_label:
                    if lang in tmp_dict:
                        label = tmp_dict[lang]
                    else:
                        label = first_label
                    res_dict[b['uri']['value']] = label
    return res_dict

    
    
def fill_label_for_model(model, property_uri, scheme_uri):
    query = """
PREFIX skos:<http://www.w3.org/2004/02/skos/core#>
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl:<http://www.w3.org/2002/07/owl#>
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#>
SELECT ?uri ?label 
WHERE {
    ?uri skos:inScheme <%s> .
    ?uri skos:prefLabel|skos:label ?label .
    FILTER (?uri = $root)
}
"""
    # Loads Models label from sparkl query
    objs = model.objects.filter(label=None)
    total_objs = len(objs)
    writer = None
    i = 0
    found = 0
    for o in objs:
        i += 1
        res = requests.get(
            settings.SPARQL_QUERY_ENDPOINT,
            params={'query':query % scheme_uri, 'timeout':10, '$root' : "<"+getattr(o, property_uri)+">"},
            headers={'accept':'application/sparql-results+json'},
        )
        if not res.ok:
            continue
        elif res.text:
            json_res = res.json()
            if 'results' in json_res and 'bindings' in json_res['results'] and len(json_res['results']['bindings'])>0:
                # json_res['results']['bindings'] has several languages. If we find french, we save the french label.
                # If not, we save the first one.
                tmp_dict = {}
                first_label = None
                # We create a temporary dict with the lang code and the label
                for b in json_res['results']['bindings']:
                    if 'label' in b and 'value' in b['label'] and 'xml:lang' in b['label']:
                        tmp_dict[b['label']['xml:lang']] = b['label']['value']
                        if not first_label:
                            first_label = b['label']['value']
                if 'fr' in tmp_dict or first_label:
                    if 'fr' in tmp_dict:
                        o.label = tmp_dict['fr']
                    else:
                        o.label = first_label
                    l = o.label
                    o.save()
                    found += 1
                    writer = show_progress(i, total_objs, l, 50, writer=writer)
    print("Processing Sparql Done. %d found on %d" % (found, total_objs))

def strip_accents(value):
    return ''.join(c for c in unicodedata.normalize('NFD', value)
                  if unicodedata.category(c) != 'Mn')
    
    
def safe_cache_key(value):
    '''Returns an md5 hexdigest of value if len(value) > 250. Replaces invalid memcache
       control characters with an underscore. Also adds the CACHE_MIDDLEWARE_KEY_PREFIX
       to your keys automatically.
    '''
    value = urlquote_plus(value)
    for char in value:
        if ord(char) < 33:
            value = value.replace(char, '_')
    
    value = "%s_%s" % (settings.CACHE_MIDDLEWARE_KEY_PREFIX, value)
    
    if len(value) <= 250:
        return value
    
    return hashlib.md5(value).hexdigest()


class P4lPaginator(Paginator):

    def validate_number(self, number):
        "Validates the given 1-based page number."
        try:
            number = int(number)
        except (TypeError, ValueError):
            raise PageNotAnInteger('That page number is not an integer')
        if number < 1:
            return 1
        if number > self.num_pages:
            if number == 1 and self.allow_empty_first_page:
                return 1
            elif self.num_pages > 0: 
                return self.num_pages
            else:
                raise EmptyPage('That page contains no results')
        return number

    
    def page(self, number):        
        page = super(P4lPaginator, self).page(number)
        return P4lPage(page.object_list, page.number, self)


class P4lPage(Page):
    
    visible_range = getattr(settings, 'PAGINATOR_VISIBLE_RANGE', 5)
    start_range = getattr(settings, 'PAGINATOR_START_RANGE', visible_range/2)
        
    def __get_start_range(self):
        return max(1,self.number-self.visible_range/2)
    
    def __get_end_range(self):
        return min(self.paginator.num_pages, self.number+self.visible_range/2) + 1
    
    def visible_page_range(self):        
        start = self.__get_start_range()
        end = self.__get_end_range()
                
        ranges = set(range(1,self.start_range+1)) | set(range(start , end)) | set(range(self.paginator.num_pages - self.start_range +1, self.paginator.num_pages+1))
        prev = None
        res = []
        for i in sorted(ranges):
            if 1 <= i <= self.paginator.num_pages:
                if prev and i-prev > 1:
                    res.append(0)
                res.append(i)
                prev = i
        return res