web/lib/django/utils/text.py
changeset 0 0d40e90630ef
child 29 cc9b7e14412b
       
import re
from django.conf import settings
from django.utils.encoding import force_unicode
from django.utils.functional import allow_lazy
from django.utils.translation import ugettext_lazy
from htmlentitydefs import name2codepoint

# Capitalizes the first letter of a string.
capfirst = lambda x: x and force_unicode(x)[0].upper() + force_unicode(x)[1:]
capfirst = allow_lazy(capfirst, unicode)

def wrap(text, width):
    """
    A word-wrap function that preserves existing line breaks and most spaces in
    the text. Expects that existing line breaks are posix newlines.
    """
    text = force_unicode(text)
    def _generator():
        it = iter(text.split(' '))
        word = it.next()
        yield word
        pos = len(word) - word.rfind('\n') - 1
        for word in it:
            if "\n" in word:
                lines = word.split('\n')
            else:
                lines = (word,)
            pos += len(lines[0]) + 1
            if pos > width:
                yield '\n'
                pos = len(lines[-1])
            else:
                yield ' '
                if len(lines) > 1:
                    pos = len(lines[-1])
            yield word
    return u''.join(_generator())
wrap = allow_lazy(wrap, unicode)
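
# Illustrative usage of wrap() (a sketch, not part of the original module):
#
#     >>> wrap(u'short words only here', 10)
#     u'short\nwords only\nhere'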
       

def truncate_words(s, num):
    "Truncates a string after a certain number of words."
    s = force_unicode(s)
    length = int(num)
    words = s.split()
    if len(words) > length:
        words = words[:length]
        if not words[-1].endswith('...'):
            words.append('...')
    return u' '.join(words)
truncate_words = allow_lazy(truncate_words, unicode)
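
# Illustrative usage of truncate_words() (a sketch, not part of the original
# module):
#
#     >>> truncate_words(u'The quick brown fox jumped over', 3)
#     u'The quick brown ...'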
       

def truncate_html_words(s, num):
    """
    Truncates html to a certain number of words (not counting tags and
    comments). Closes opened tags if they were correctly closed in the given
    html.
    """
    s = force_unicode(s)
    length = int(num)
    if length <= 0:
        return u''
    html4_singlets = ('br', 'col', 'link', 'base', 'img', 'param', 'area', 'hr', 'input')
    # Set up regular expressions
    re_words = re.compile(r'&.*?;|<.*?>|(\w[\w-]*)', re.U)
    re_tag = re.compile(r'<(/)?([^ ]+?)(?: (/)| .*?)?>')
    # Count non-HTML words and keep note of open tags
    pos = 0
    ellipsis_pos = 0
    words = 0
    open_tags = []
    while words <= length:
        m = re_words.search(s, pos)
        if not m:
            # Checked through whole string
            break
        pos = m.end(0)
        if m.group(1):
            # It's an actual non-HTML word
            words += 1
            if words == length:
                ellipsis_pos = pos
            continue
        # Check for tag
        tag = re_tag.match(m.group(0))
        if not tag or ellipsis_pos:
            # Don't worry about non tags or tags after our truncate point
            continue
        closing_tag, tagname, self_closing = tag.groups()
        tagname = tagname.lower()  # Element names are always case-insensitive
        if self_closing or tagname in html4_singlets:
            pass
        elif closing_tag:
            # Check for match in open tags list
            try:
                i = open_tags.index(tagname)
            except ValueError:
                pass
            else:
                # SGML: An end tag closes, back to the matching start tag, all
                # unclosed intervening start tags with omitted end tags
                open_tags = open_tags[i+1:]
        else:
            # Add it to the start of the open tags list
            open_tags.insert(0, tagname)
    if words <= length:
        # Don't try to close tags if we don't need to truncate
        return s
    out = s[:ellipsis_pos] + ' ...'
    # Close any tags still open
    for tag in open_tags:
        out += '</%s>' % tag
    # Return string
    return out
truncate_html_words = allow_lazy(truncate_html_words, unicode)
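
# Illustrative usage of truncate_html_words() (a sketch, not part of the
# original module): tags are not counted as words, and tags left open at the
# truncation point are closed.
#
#     >>> truncate_html_words(u'<p>one two three four</p>', 2)
#     u'<p>one two ...</p>'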
       

def get_valid_filename(s):
    """
    Returns the given string converted to a string that can be used for a clean
    filename. Specifically, leading and trailing spaces are removed; other
    spaces are converted to underscores; and anything that is not a unicode
    alphanumeric, dash, underscore, or dot, is removed.

    >>> get_valid_filename("john's portrait in 2004.jpg")
    u'johns_portrait_in_2004.jpg'
    """
    s = force_unicode(s).strip().replace(' ', '_')
    return re.sub(r'(?u)[^-\w.]', '', s)
get_valid_filename = allow_lazy(get_valid_filename, unicode)

def get_text_list(list_, last_word=ugettext_lazy(u'or')):
    """
    >>> get_text_list(['a', 'b', 'c', 'd'])
    u'a, b, c or d'
    >>> get_text_list(['a', 'b', 'c'], 'and')
    u'a, b and c'
    >>> get_text_list(['a', 'b'], 'and')
    u'a and b'
    >>> get_text_list(['a'])
    u'a'
    >>> get_text_list([])
    u''
    """
    if len(list_) == 0: return u''
    if len(list_) == 1: return force_unicode(list_[0])
    return u'%s %s %s' % (
        ', '.join([force_unicode(i) for i in list_][:-1]),
        force_unicode(last_word), force_unicode(list_[-1]))
get_text_list = allow_lazy(get_text_list, unicode)

def normalize_newlines(text):
    "Normalizes CRLF and CR line endings in the given text to LF."
    return force_unicode(re.sub(r'\r\n|\r|\n', '\n', text))
normalize_newlines = allow_lazy(normalize_newlines, unicode)
       

def recapitalize(text):
    "Recapitalizes text, placing caps after end-of-sentence punctuation."
    text = force_unicode(text).lower()
    capsRE = re.compile(r'(?:^|(?<=[\.\?\!] ))([a-z])')
    text = capsRE.sub(lambda x: x.group(1).upper(), text)
    return text
recapitalize = allow_lazy(recapitalize)
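
# Illustrative usage of recapitalize() (a sketch, not part of the original
# module): the text is lowercased first, so existing mid-sentence capitals
# are lost.
#
#     >>> recapitalize(u'this is ONE sentence. here is another.')
#     u'This is one sentence. Here is another.'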
       

def phone2numeric(phone):
    "Converts a phone number with letters into its numeric equivalent."
    letters = re.compile(r'[A-PR-Y]', re.I)
    char2number = lambda m: {'a': '2', 'c': '2', 'b': '2', 'e': '3',
         'd': '3', 'g': '4', 'f': '3', 'i': '4', 'h': '4', 'k': '5',
         'j': '5', 'm': '6', 'l': '5', 'o': '6', 'n': '6', 'p': '7',
         's': '7', 'r': '7', 'u': '8', 't': '8', 'w': '9', 'v': '8',
         'y': '9', 'x': '9'}.get(m.group(0).lower())
    return letters.sub(char2number, phone)
phone2numeric = allow_lazy(phone2numeric)
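
# Illustrative usage of phone2numeric() (a sketch, not part of the original
# module): only the letters A-P and R-Y are translated, matching keypads
# without Q and Z.
#
#     >>> phone2numeric(u'1-800-COLLECT')
#     u'1-800-2655328'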
       

# From http://www.xhaus.com/alan/python/httpcomp.html#gzip
# Used with permission.
def compress_string(s):
    "Gzip-compresses the given bytestring and returns the compressed data."
    import cStringIO, gzip
    zbuf = cStringIO.StringIO()
    zfile = gzip.GzipFile(mode='wb', compresslevel=6, fileobj=zbuf)
    zfile.write(s)
    zfile.close()
    return zbuf.getvalue()
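
# Illustrative round trip for compress_string() (a sketch, not part of the
# original module): the return value is a complete gzip stream that the
# standard library can decompress.
#
#     >>> import cStringIO, gzip
#     >>> data = 'repeat me ' * 100
#     >>> gzip.GzipFile(fileobj=cStringIO.StringIO(compress_string(data))).read() == data
#     True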
       

ustring_re = re.compile(u"([\u0080-\uffff])")

def javascript_quote(s, quote_double_quotes=False):
    """
    Escapes a string for use inside a JavaScript string literal: backslashes,
    carriage returns, newlines, tabs and single quotes are backslash-escaped,
    double quotes are optionally replaced with &quot;, and non-ASCII
    characters are converted to \\uXXXX escapes.
    """

    def fix(match):
        return r"\u%04x" % ord(match.group(1))

    if type(s) == str:
        s = s.decode('utf-8')
    elif type(s) != unicode:
        raise TypeError, s
    s = s.replace('\\', '\\\\')
    s = s.replace('\r', '\\r')
    s = s.replace('\n', '\\n')
    s = s.replace('\t', '\\t')
    s = s.replace("'", "\\'")
    if quote_double_quotes:
        s = s.replace('"', '&quot;')
    return str(ustring_re.sub(fix, s))
javascript_quote = allow_lazy(javascript_quote, unicode)
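
# Illustrative usage of javascript_quote() (a sketch, not part of the original
# module): the result is a bytestring with quotes and newlines escaped.
#
#     >>> javascript_quote(u"it's\n")
#     "it\\'s\\n"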
       

# Expression to match some_token and some_token="with spaces" (and similarly
# for single-quoted strings).
smart_split_re = re.compile(r"""
    ([^\s"]*"(?:[^"\\]*(?:\\.[^"\\]*)*)"\S*|
     [^\s']*'(?:[^'\\]*(?:\\.[^'\\]*)*)'\S*|
     \S+)""", re.VERBOSE)

def smart_split(text):
    r"""
    Generator that splits a string by spaces, leaving quoted phrases together.
    Supports both single and double quotes, and supports escaping quotes with
    backslashes. In the output, strings will keep their initial and trailing
    quote marks and escaped quotes will remain escaped (the results can then
    be further processed with unescape_string_literal()).

    >>> list(smart_split(r'This is "a person\'s" test.'))
    [u'This', u'is', u'"a person\\\'s"', u'test.']
    >>> list(smart_split(r"Another 'person\'s' test."))
    [u'Another', u"'person\\'s'", u'test.']
    >>> list(smart_split(r'A "\"funky\" style" test.'))
    [u'A', u'"\\"funky\\" style"', u'test.']
    """
    text = force_unicode(text)
    for bit in smart_split_re.finditer(text):
        yield bit.group(0)
smart_split = allow_lazy(smart_split, unicode)
       

def _replace_entity(match):
    # Resolves a single entity or character reference matched by _entity_re;
    # unknown or malformed references are returned unchanged.
    text = match.group(1)
    if text[0] == u'#':
        text = text[1:]
        try:
            if text[0] in u'xX':
                c = int(text[1:], 16)
            else:
                c = int(text)
            return unichr(c)
        except ValueError:
            return match.group(0)
    else:
        try:
            return unichr(name2codepoint[text])
        except (ValueError, KeyError):
            return match.group(0)

_entity_re = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape_entities(text):
    "Replaces HTML entities and character references with unicode characters."
    return _entity_re.sub(_replace_entity, text)
unescape_entities = allow_lazy(unescape_entities, unicode)
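
# Illustrative usage of unescape_entities() (a sketch, not part of the
# original module): named, decimal and hexadecimal references are resolved;
# unknown entities are left untouched.
#
#     >>> unescape_entities(u'&amp; &#65; &#x42; &bogus12;')
#     u'& A B &bogus12;'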
       

def unescape_string_literal(s):
    r"""
    Convert quoted string literals to unquoted strings with escaped quotes and
    backslashes unquoted::

        >>> unescape_string_literal('"abc"')
        'abc'
        >>> unescape_string_literal("'abc'")
        'abc'
        >>> unescape_string_literal('"a \"bc\""')
        'a "bc"'
        >>> unescape_string_literal("'\'ab\' c'")
        "'ab' c"
    """
    if s[0] not in "\"'" or s[-1] != s[0]:
        raise ValueError("Not a string literal: %r" % s)
    quote = s[0]
    return s[1:-1].replace(r'\%s' % quote, quote).replace(r'\\', '\\')
unescape_string_literal = allow_lazy(unescape_string_literal)