web/lib/django/utils/text.py
changeset 38 77b6da96e6f1
equal deleted inserted replaced
37:8d941af65caf 38:77b6da96e6f1
       
     1 import re
       
     2 from django.utils.encoding import force_unicode
       
     3 from django.utils.functional import allow_lazy
       
     4 from django.utils.translation import ugettext_lazy
       
     5 from htmlentitydefs import name2codepoint
       
     6 
       
     7 # Capitalizes the first letter of a string.
       
     8 capfirst = lambda x: x and force_unicode(x)[0].upper() + force_unicode(x)[1:]
       
     9 capfirst = allow_lazy(capfirst, unicode)
       
    10 
       
    11 def wrap(text, width):
       
    12     """
       
    13     A word-wrap function that preserves existing line breaks and most spaces in
       
    14     the text. Expects that existing line breaks are posix newlines.
       
    15     """
       
    16     text = force_unicode(text)
       
    17     def _generator():
       
    18         it = iter(text.split(' '))
       
    19         word = it.next()
       
    20         yield word
       
    21         pos = len(word) - word.rfind('\n') - 1
       
    22         for word in it:
       
    23             if "\n" in word:
       
    24                 lines = word.split('\n')
       
    25             else:
       
    26                 lines = (word,)
       
    27             pos += len(lines[0]) + 1
       
    28             if pos > width:
       
    29                 yield '\n'
       
    30                 pos = len(lines[-1])
       
    31             else:
       
    32                 yield ' '
       
    33                 if len(lines) > 1:
       
    34                     pos = len(lines[-1])
       
    35             yield word
       
    36     return u''.join(_generator())
       
    37 wrap = allow_lazy(wrap, unicode)
       
    38 
       
    39 def truncate_words(s, num, end_text='...'):
       
    40     """Truncates a string after a certain number of words. Takes an optional
       
    41     argument of what should be used to notify that the string has been
       
    42     truncated, defaults to ellipsis (...)"""
       
    43     s = force_unicode(s)
       
    44     length = int(num)
       
    45     words = s.split()
       
    46     if len(words) > length:
       
    47         words = words[:length]
       
    48         if not words[-1].endswith(end_text):
       
    49             words.append(end_text)
       
    50     return u' '.join(words)
       
    51 truncate_words = allow_lazy(truncate_words, unicode)
       
    52 
       
    53 def truncate_html_words(s, num, end_text='...'):
       
    54     """Truncates html to a certain number of words (not counting tags and
       
    55     comments). Closes opened tags if they were correctly closed in the given
       
    56     html. Takes an optional argument of what should be used to notify that the
       
    57     string has been truncated, defaults to ellipsis (...)."""
       
    58     s = force_unicode(s)
       
    59     length = int(num)
       
    60     if length <= 0:
       
    61         return u''
       
    62     html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
       
    63     # Set up regular expressions
       
    64     re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
       
    65     re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
       
    66     # Count non-HTML words and keep note of open tags
       
    67     pos = 0
       
    68     end_text_pos = 0
       
    69     words = 0
       
    70     open_tags = []
       
    71     while words <= length:
       
    72         m = re_words.search(s, pos)
       
    73         if not m:
       
    74             # Checked through whole string
       
    75             break
       
    76         pos = m.end(0)
       
    77         if m.group(1):
       
    78             # It's an actual non-HTML word
       
    79             words += 1
       
    80             if words == length:
       
    81                 end_text_pos = pos
       
    82             continue
       
    83         # Check for tag
       
    84         tag = re_tag.match(m.group(0))
       
    85         if not tag or end_text_pos:
       
    86             # Don't worry about non tags or tags after our truncate point
       
    87             continue
       
    88         closing_tag, tagname, self_closing = tag.groups()
       
    89         tagname = tagname.lower()  # Element names are always case-insensitive
       
    90         if self_closing or tagname in html4_singlets:
       
    91             pass
       
    92         elif closing_tag:
       
    93             # Check for match in open tags list
       
    94             try:
       
    95                 i = open_tags.index(tagname)
       
    96             except ValueError:
       
    97                 pass
       
    98             else:
       
    99                 # SGML: An end tag closes, back to the matching start tag, all unclosed intervening start tags with omitted end tags
       
   100                 open_tags = open_tags[i+1:]
       
   101         else:
       
   102             # Add it to the start of the open tags list
       
   103             open_tags.insert(0, tagname)
       
   104     if words <= length:
       
   105         # Don't try to close tags if we don't need to truncate
       
   106         return s
       
   107     out = s[:end_text_pos]
       
   108     if end_text:
       
   109         out += ' ' + end_text
       
   110     # Close any tags still open
       
   111     for tag in open_tags:
       
   112         out += '</%s>' % tag
       
   113     # Return string
       
   114     return out
       
   115 truncate_html_words = allow_lazy(truncate_html_words, unicode)
       
   116 
       
   117 def get_valid_filename(s):
       
   118     """
       
   119     Returns the given string converted to a string that can be used for a clean
       
   120     filename. Specifically, leading and trailing spaces are removed; other
       
   121     spaces are converted to underscores; and anything that is not a unicode
       
   122     alphanumeric, dash, underscore, or dot, is removed.
       
   123     >>> get_valid_filename("john's portrait in 2004.jpg")
       
   124     u'johns_portrait_in_2004.jpg'
       
   125     """
       
   126     s = force_unicode(s).strip().replace(' ', '_')
       
   127     return re.sub(r'(?u)[^-\w.]', '', s)
       
   128 get_valid_filename = allow_lazy(get_valid_filename, unicode)
       
   129 
       
   130 def get_text_list(list_, last_word=ugettext_lazy(u'or')):
       
   131     """
       
   132     >>> get_text_list(['a', 'b', 'c', 'd'])
       
   133     u'a, b, c or d'
       
   134     >>> get_text_list(['a', 'b', 'c'], 'and')
       
   135     u'a, b and c'
       
   136     >>> get_text_list(['a', 'b'], 'and')
       
   137     u'a and b'
       
   138     >>> get_text_list(['a'])
       
   139     u'a'
       
   140     >>> get_text_list([])
       
   141     u''
       
   142     """
       
   143     if len(list_) == 0: return u''
       
   144     if len(list_) == 1: return force_unicode(list_[0])
       
   145     return u'%s %s %s' % (', '.join([force_unicode(i) for i in list_][:-1]), force_unicode(last_word), force_unicode(list_[-1]))
       
   146 get_text_list = allow_lazy(get_text_list, unicode)
       
   147 
       
   148 def normalize_newlines(text):
       
   149     return force_unicode(re.sub(r'\r\n|\r|\n', '\n', text))
       
   150 normalize_newlines = allow_lazy(normalize_newlines, unicode)
       
   151 
       
   152 def recapitalize(text):
       
   153     "Recapitalizes text, placing caps after end-of-sentence punctuation."
       
   154     text = force_unicode(text).lower()
       
   155     capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
       
   156     text = capsRE.sub(lambda x: x.group(1).upper(), text)
       
   157     return text
       
   158 recapitalize = allow_lazy(recapitalize)
       
   159 
       
   160 def phone2numeric(phone):
       
   161     "Converts a phone number with letters into its numeric equivalent."
       
   162     letters = re.compile(r'[A-Z]', re.I)
       
   163     char2number = lambda m: {'a': '2', 'b': '2', 'c': '2', 'd': '3', 'e': '3',
       
   164          'f': '3', 'g': '4', 'h': '4', 'i': '4', 'j': '5', 'k': '5', 'l': '5',
       
   165          'm': '6', 'n': '6', 'o': '6', 'p': '7', 'q': '7', 'r': '7', 's': '7',
       
   166          't': '8', 'u': '8', 'v': '8', 'w': '9', 'x': '9', 'y': '9', 'z': '9',
       
   167         }.get(m.group(0).lower())
       
   168     return letters.sub(char2number, phone)
       
   169 phone2numeric = allow_lazy(phone2numeric)
       
   170 
       
   171 # From http://www.xhaus.com/alan/python/httpcomp.html#gzip
       
   172 # Used with permission.
       
   173 def compress_string(s):
       
   174     import cStringIO, gzip
       
   175     zbuf = cStringIO.StringIO()
       
   176     zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
       
   177     zfile.write(s)
       
   178     zfile.close()
       
   179     return zbuf.getvalue()
       
   180 
       
   181 ustring_re = re.compile(u"([\u0080-\uffff])")
       
   182 
       
   183 def javascript_quote(s, quote_double_quotes=False):
       
   184 
       
   185     def fix(match):
       
   186         return r"\u%04x" % ord(match.group(1))
       
   187 
       
   188     if type(s) == str:
       
   189         s = s.decode('utf-8')
       
   190     elif type(s) != unicode:
       
   191         raise TypeError(s)
       
   192     s = s.replace('\\', '\\\\')
       
   193     s = s.replace('\r', '\\r')
       
   194     s = s.replace('\n', '\\n')
       
   195     s = s.replace('\t', '\\t')
       
   196     s = s.replace("'", "\\'")
       
   197     if quote_double_quotes:
       
   198         s = s.replace('"', '&quot;')
       
   199     return str(ustring_re.sub(fix, s))
       
   200 javascript_quote = allow_lazy(javascript_quote, unicode)
       
   201 
       
   202 # Expression to match some_token and some_token="with spaces" (and similarly
       
   203 # for single-quoted strings).
       
   204 smart_split_re = re.compile(r"""
       
   205     ((?:
       
   206         [^\s'"]*
       
   207         (?:
       
   208             (?:"(?:[^"\\]|\\.)*" | '(?:[^'\\]|\\.)*')
       
   209             [^\s'"]*
       
   210         )+
       
   211     ) | \S+)
       
   212 """, re.VERBOSE)
       
   213 
       
   214 def smart_split(text):
       
   215     r"""
       
   216     Generator that splits a string by spaces, leaving quoted phrases together.
       
   217     Supports both single and double quotes, and supports escaping quotes with
       
   218     backslashes. In the output, strings will keep their initial and trailing
       
   219     quote marks and escaped quotes will remain escaped (the results can then
       
   220     be further processed with unescape_string_literal()).
       
   221 
       
   222     >>> list(smart_split(r'This is "a person\'s" test.'))
       
   223     [u'This', u'is', u'"a person\\\'s"', u'test.']
       
   224     >>> list(smart_split(r"Another 'person\'s' test."))
       
   225     [u'Another', u"'person\\'s'", u'test.']
       
   226     >>> list(smart_split(r'A "\"funky\" style" test.'))
       
   227     [u'A', u'"\\"funky\\" style"', u'test.']
       
   228     """
       
   229     text = force_unicode(text)
       
   230     for bit in smart_split_re.finditer(text):
       
   231         yield bit.group(0)
       
   232 smart_split = allow_lazy(smart_split, unicode)
       
   233 
       
   234 def _replace_entity(match):
       
   235     text = match.group(1)
       
   236     if text[0] == u'#':
       
   237         text = text[1:]
       
   238         try:
       
   239             if text[0] in u'xX':
       
   240                 c = int(text[1:], 16)
       
   241             else:
       
   242                 c = int(text)
       
   243             return unichr(c)
       
   244         except ValueError:
       
   245             return match.group(0)
       
   246     else:
       
   247         try:
       
   248             return unichr(name2codepoint[text])
       
   249         except (ValueError, KeyError):
       
   250             return match.group(0)
       
   251 
       
   252 _entity_re = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
       
   253 
       
   254 def unescape_entities(text):
       
   255     return _entity_re.sub(_replace_entity, text)
       
   256 unescape_entities = allow_lazy(unescape_entities, unicode)
       
   257 
       
   258 def unescape_string_literal(s):
       
   259     r"""
       
   260     Convert quoted string literals to unquoted strings with escaped quotes and
       
   261     backslashes unquoted::
       
   262 
       
   263         >>> unescape_string_literal('"abc"')
       
   264         'abc'
       
   265         >>> unescape_string_literal("'abc'")
       
   266         'abc'
       
   267         >>> unescape_string_literal('"a \"bc\""')
       
   268         'a "bc"'
       
   269         >>> unescape_string_literal("'\'ab\' c'")
       
   270         "'ab' c"
       
   271     """
       
   272     if s[0] not in "\"'" or s[-1] != s[0]:
       
   273         raise ValueError("Not a string literal: %r" % s)
       
   274     quote = s[0]
       
   275     return s[1:-1].replace(r'\%s' % quote, quote).replace(r'\\', '\\')
       
   276 unescape_string_literal = allow_lazy(unescape_string_literal)