diff -r 3d54acec55d6 -r 27f71b0a772d src/hdabo/utils.py --- a/src/hdabo/utils.py Wed Dec 10 10:05:38 2014 +0100 +++ b/src/hdabo/utils.py Wed Feb 18 01:53:34 2015 +0100 @@ -1,9 +1,13 @@ # -*- coding: utf-8 -*- +import codecs import collections +import math +import re +import sys import unicodedata -import sys -import math -import codecs + +import unidecode + ### # allow to declare a property as a decorator @@ -342,12 +346,27 @@ return ItemsView(self) ## end of http://code.activestate.com/recipes/576693/ }}} -def remove_accents(str): - nkfd_form = unicodedata.normalize('NFKD', unicode(str)) +def remove_accents(lne): + nkfd_form = unicodedata.normalize('NFKD', unicode(lne)) return u"".join([c for c in nkfd_form if not unicodedata.combining(c)]) -def normalize(str): - return remove_accents(str).lower().replace(u"œ",u"oe") +def normalize(lne): + return remove_accents(lne).lower().replace(u"œ",u"oe") + +def sanitize(line, separator = '-', ascii_only = True): + + if not line: + return '' + + #Transliterate non-ASCII characters + line = unidecode.unidecode(line) + #Remove all characters that are not the separator, a-z, 0-9, or whitespace + line = re.sub('[^\%sa-z0-9\s]+'%separator, '', line.lower()) + #// Replace all separator characters and whitespace by a single separator + line = re.sub('[\%s\s]+' % separator, separator, line) + + return line.strip(separator) + def show_progress(current_line, total_line, label, width, writer=None):