web/lib/django/http/multipartparser.py
changeset 38:77b6da96e6f1 (parent 0:0d40e90630ef)
       
     1 """
       
     2 Multi-part parsing for file uploads.
       
     3 
       
     4 Exposes one class, ``MultiPartParser``, which feeds chunks of uploaded data to
       
     5 file upload handlers for processing.
       
     6 """
       
     7 
       
     8 import cgi
       
     9 from django.conf import settings
       
    10 from django.core.exceptions import SuspiciousOperation
       
    11 from django.utils.datastructures import MultiValueDict
       
    12 from django.utils.encoding import force_unicode
       
    13 from django.utils.text import unescape_entities
       
    14 from django.core.files.uploadhandler import StopUpload, SkipFile, StopFutureHandlers
       
    15 
       
    16 __all__ = ('MultiPartParser', 'MultiPartParserError', 'InputStreamExhausted')
       
class MultiPartParserError(Exception):
    pass

class InputStreamExhausted(Exception):
    """
    No more reads are allowed from this device.
    """
    pass

RAW = "raw"
FILE = "file"
FIELD = "field"
       
class MultiPartParser(object):
    """
    An RFC 2388 multipart/form-data parser.

    ``MultiPartParser.parse()`` reads the input stream in ``chunk_size`` chunks
    and returns a tuple of ``(MultiValueDict(POST), MultiValueDict(FILES))``.
    """
       
    def __init__(self, META, input_data, upload_handlers, encoding=None):
        """
        Initialize the MultiPartParser object.

        :META:
            The standard ``META`` dictionary in Django request objects.
        :input_data:
            The raw post data, as a file-like object.
        :upload_handlers:
            A list of UploadHandler instances that perform operations on the
            uploaded data.
        :encoding:
            The encoding with which to treat the incoming data.
        """

        #
        # Content-Type should contain multipart and the boundary information.
        #

        content_type = META.get('HTTP_CONTENT_TYPE', META.get('CONTENT_TYPE', ''))
        if not content_type.startswith('multipart/'):
            raise MultiPartParserError('Invalid Content-Type: %s' % content_type)

        # Parse the header to get the boundary to split the parts.
        ctypes, opts = parse_header(content_type)
        boundary = opts.get('boundary')
        if not boundary or not cgi.valid_boundary(boundary):
            raise MultiPartParserError('Invalid boundary in multipart: %s' % boundary)


        #
        # Content-Length should contain the length of the body we are about
        # to receive.
        #
        try:
            content_length = int(META.get('HTTP_CONTENT_LENGTH', META.get('CONTENT_LENGTH', 0)))
        except (ValueError, TypeError):
            # For now set it to 0; the check below will reject it.
            content_length = 0

        if content_length <= 0:
            # This means we shouldn't continue...raise an error.
            raise MultiPartParserError("Invalid content length: %r" % content_length)

        self._boundary = boundary
        self._input_data = input_data

        # For compatibility with low-level network APIs (with 32-bit integers),
        # the chunk size should be < 2^31, but still divisible by 4.
        possible_sizes = [x.chunk_size for x in upload_handlers if x.chunk_size]
        self._chunk_size = min([2**31 - 4] + possible_sizes)

        self._meta = META
        self._encoding = encoding or settings.DEFAULT_CHARSET
        self._content_length = content_length
        self._upload_handlers = upload_handlers
       
    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Returns a tuple containing the POST and FILES dictionary, respectively.
        """
        # We have to import QueryDict down here to avoid a circular import.
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        limited_input_data = LimitBytes(self._input_data, self._content_length)

        # See if the handler will want to take care of the parsing.
        # This allows overriding everything if somebody wants it.
        for handler in handlers:
            result = handler.handle_raw_input(limited_input_data,
                                              self._meta,
                                              self._content_length,
                                              self._boundary,
                                              encoding)
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict('', mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                field_name = force_unicode(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # This is a post field; we can just set it in the POST data.
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except Exception:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self._post.appendlist(field_name,
                                          force_unicode(data, encoding, errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if not file_name:
                        continue
                    file_name = force_unicode(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type', ('',))[0].strip()
                    try:
                        charset = meta_data.get('content-type', (0, {}))[1].get('charset', None)
                    except Exception:
                        charset = None

                    try:
                        content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(field_name, file_name,
                                                 content_type, content_length,
                                                 charset)
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding.
                                try:
                                    chunk = str(chunk).decode('base64')
                                except Exception, e:
                                    # Since this is only a chunk, any error is an unfixable error.
                                    raise MultiPartParserError("Could not decode base64 data: %r" % e)

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk,
                                                                   counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # If the chunk received by the handler is None, then don't continue.
                                    break

                    except SkipFile, e:
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD nor a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload, e:
            if not e.connection_reset:
                exhaust(limited_input_data)
        else:
            # Make sure that the request data is all fed.
            exhaust(limited_input_data)

        # Signal that the upload has completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        return self._post, self._files
       
    def handle_file_complete(self, old_field_name, counters):
        """
        Handle all the signalling that takes place when a file is complete.
        """
        for i, handler in enumerate(self._upload_handlers):
            file_obj = handler.file_complete(counters[i])
            if file_obj:
                # If it returns a file object, then set the files dict.
                self._files.appendlist(force_unicode(old_field_name,
                                                     self._encoding,
                                                     errors='replace'),
                                       file_obj)
                break
       
    def IE_sanitize(self, filename):
        """Cleanup filename from Internet Explorer full paths."""
        return filename and filename[filename.rfind("\\")+1:].strip()
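
# A minimal usage sketch, not part of the original module. It shows how this
# class is typically driven; Django's request machinery normally supplies the
# META dict, the file-like body, and the configured upload handlers. The
# helper name and the choice of MemoryFileUploadHandler are assumptions made
# here for illustration only.
def _example_parse(META, body):
    """Hypothetical helper: parse a raw multipart body into (POST, FILES)."""
    from django.core.files.uploadhandler import MemoryFileUploadHandler
    parser = MultiPartParser(META, body, [MemoryFileUploadHandler()], 'utf-8')
    # Returns (QueryDict of POST data, MultiValueDict of FILES).
    return parser.parse()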
       
class LazyStream(object):
    """
    The LazyStream wrapper allows one to get and "unget" bytes from a stream.

    Given a producer object (an iterator that yields bytestrings), the
    LazyStream object will support iteration, reading, and keeping a "look-back"
    variable in case you need to "unget" some bytes.
    """
    def __init__(self, producer, length=None):
        """
        Every LazyStream must have a producer when instantiated.

        A producer is an iterable that returns a string each time it
        is called.
        """
        self._producer = producer
        self._empty = False
        self._leftover = ''
        self.length = length
        self.position = 0
        self._remaining = length
        self._unget_history = []

    def tell(self):
        return self.position

    def read(self, size=None):
        def parts():
            remaining = (size is not None and [size] or [self._remaining])[0]
            # do the whole thing in one shot if no limit was provided.
            if remaining is None:
                yield ''.join(self)
                return

            # otherwise do some bookkeeping to return exactly enough
            # of the stream, stashing any extra content we get from
            # the producer
            while remaining != 0:
                assert remaining > 0, 'remaining bytes to read should never go negative'

                chunk = self.next()

                emitting = chunk[:remaining]
                self.unget(chunk[remaining:])
                remaining -= len(emitting)
                yield emitting

        out = ''.join(parts())
        return out

    def next(self):
        """
        Used when the exact number of bytes to read is unimportant.

        This procedure just returns whatever chunk the iterator happens to
        return. Useful to avoid unnecessary bookkeeping if performance is an
        issue.
        """
        if self._leftover:
            output = self._leftover
            self._leftover = ''
        else:
            output = self._producer.next()
            self._unget_history = []
        self.position += len(output)
        return output

    def close(self):
        """
        Used to invalidate/disable this lazy stream.

        Replaces the producer with an empty list. Any leftover bytes that have
        already been read will still be reported upon read() and/or next().
        """
        self._producer = []

    def __iter__(self):
        return self

    def unget(self, bytes):
        """
        Places bytes back onto the front of the lazy stream.

        Future calls to read() will return those bytes first. The
        stream position and thus tell() will be rewound.
        """
        if not bytes:
            return
        self._update_unget_history(len(bytes))
        self.position -= len(bytes)
        self._leftover = ''.join([bytes, self._leftover])

    def _update_unget_history(self, num_bytes):
        """
        Updates the unget history as a sanity check to see if we've pushed
        back the same number of bytes in one chunk. If we keep ungetting the
        same number of bytes many times (here, 50), we're most likely in an
        infinite loop of some sort. This is usually caused by a
        maliciously-malformed MIME request.
        """
        self._unget_history = [num_bytes] + self._unget_history[:49]
        number_equal = len([current_number for current_number in self._unget_history
                            if current_number == num_bytes])

        if number_equal > 40:
            raise SuspiciousOperation(
                "The multipart parser got stuck, which shouldn't happen with"
                " normal uploaded files. Check for malicious upload activity;"
                " if there is none, report this to the Django developers."
            )
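
# A minimal sketch, not part of the original module, demonstrating the
# read/unget contract described above with a simple in-memory producer. The
# function name is hypothetical.
def _demo_lazystream():
    stream = LazyStream(iter(['abcdef', 'ghi']))
    assert stream.read(4) == 'abcd'    # consumes 4 bytes, ungets the extra 'ef'
    stream.unget('cd')                 # push two bytes back; tell() rewinds by 2
    assert stream.read() == 'cdefghi'  # leftover bytes are returned first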
       
class ChunkIter(object):
    """
    An iterable that will yield chunks of data. Given a file-like object as the
    constructor argument, this object will yield chunks of read operations from
    that object.
    """
    def __init__(self, flo, chunk_size=64 * 1024):
        self.flo = flo
        self.chunk_size = chunk_size

    def next(self):
        try:
            data = self.flo.read(self.chunk_size)
        except InputStreamExhausted:
            raise StopIteration()
        if data:
            return data
        else:
            raise StopIteration()

    def __iter__(self):
        return self
       
class LimitBytes(object):
    """Limit bytes for a file object."""
    def __init__(self, fileobject, length):
        self._file = fileobject
        self.remaining = length

    def read(self, num_bytes=None):
        """
        Read data from the underlying file.
        If you ask for too much or there isn't anything left,
        this will raise an InputStreamExhausted error.
        """
        if self.remaining <= 0:
            raise InputStreamExhausted()
        if num_bytes is None:
            num_bytes = self.remaining
        else:
            num_bytes = min(num_bytes, self.remaining)
        self.remaining -= num_bytes
        return self._file.read(num_bytes)
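
# A minimal sketch, not part of the original module, of the same
# LimitBytes -> ChunkIter -> LazyStream pipeline that parse() builds, fed
# from a StringIO instead of a network socket. The function name is
# hypothetical.
def _demo_limited_pipeline():
    from StringIO import StringIO
    raw = StringIO('x' * 100)
    limited = LimitBytes(raw, 10)      # never read past Content-Length
    stream = LazyStream(ChunkIter(limited, chunk_size=4))
    assert stream.read() == 'x' * 10   # 4 + 4 + 2 bytes, then exhausted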
       
class InterBoundaryIter(object):
    """
    A Producer that will iterate over boundaries.
    """
    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary

    def __iter__(self):
        return self

    def next(self):
        try:
            return LazyStream(BoundaryIter(self._stream, self._boundary))
        except InputStreamExhausted:
            raise StopIteration()
       
class BoundaryIter(object):
    """
    A Producer that is sensitive to boundaries.

    Will happily yield bytes until a boundary is found. Will yield the bytes
    before the boundary, throw away the boundary bytes themselves, and push the
    post-boundary bytes back on the stream.

    Future calls to .next() after locating the boundary will raise a
    StopIteration exception.
    """

    def __init__(self, stream, boundary):
        self._stream = stream
        self._boundary = boundary
        self._done = False
        # rollback an additional six bytes because the format is like
        # this: CRLF<boundary>[--CRLF]
        self._rollback = len(boundary) + 6

        # Try to use mx fast string search if available. Otherwise
        # use Python find. Wrap the latter for consistency.
        unused_char = self._stream.read(1)
        if not unused_char:
            raise InputStreamExhausted()
        self._stream.unget(unused_char)
        try:
            from mx.TextTools import FS
            self._fs = FS(boundary).find
        except ImportError:
            self._fs = lambda data: data.find(boundary)

    def __iter__(self):
        return self

    def next(self):
        if self._done:
            raise StopIteration()

        stream = self._stream
        rollback = self._rollback

        bytes_read = 0
        chunks = []
        for bytes in stream:
            bytes_read += len(bytes)
            chunks.append(bytes)
            if bytes_read > rollback:
                break
            if not bytes:
                break
        else:
            self._done = True

        if not chunks:
            raise StopIteration()

        chunk = ''.join(chunks)
        boundary = self._find_boundary(chunk, len(chunk) < self._rollback)

        if boundary:
            end, next = boundary
            stream.unget(chunk[next:])
            self._done = True
            return chunk[:end]
        else:
            # make sure we don't treat a partial boundary (and
            # its separators) as data
            if not chunk[:-rollback]:
                # There's nothing left; just return it and mark as done.
                self._done = True
                return chunk
            else:
                stream.unget(chunk[-rollback:])
                return chunk[:-rollback]

    def _find_boundary(self, data, eof=False):
        """
        Finds a multipart boundary in data.

        Should no boundary exist in the data, None is returned instead.
        Otherwise a tuple containing the indices of the following are returned:

         * the end of current encapsulation
         * the start of the next encapsulation
        """
        index = self._fs(data)
        if index < 0:
            return None
        else:
            end = index
            next = index + len(self._boundary)
            # backup over CRLF
            if data[max(0, end-1)] == '\n':
                end -= 1
            if data[max(0, end-1)] == '\r':
                end -= 1
            return end, next
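
# A minimal sketch, not part of the original module: BoundaryIter yields the
# bytes before the separator, eats the separator itself, and ungets whatever
# follows. The rollback of len(boundary) + 6 covers a separator split across
# chunks: up to 2 bytes of CRLF before it, the boundary, then up to '--' and
# CRLF after it. The function name and 'frontier' boundary are hypothetical.
def _demo_boundaryiter():
    source = LazyStream(iter(['first part\r\n--frontier\r\nsecond part']))
    part = ''.join(BoundaryIter(source, '--frontier'))
    assert part == 'first part'              # CRLF before the boundary is trimmed
    assert source.read() == '\r\nsecond part'  # bytes after the separator are ungot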
       
def exhaust(stream_or_iterable):
    """
    Completely exhausts an iterator or stream.

    Raise a MultiPartParserError if the argument is not a stream or an iterable.
    """
    iterator = None
    try:
        iterator = iter(stream_or_iterable)
    except TypeError:
        iterator = ChunkIter(stream_or_iterable, 16384)

    if iterator is None:
        raise MultiPartParserError('multipartparser.exhaust() was passed a non-iterable or stream parameter')

    for __ in iterator:
        pass
       
def parse_boundary_stream(stream, max_header_size):
    """
    Parses one and exactly one stream that encapsulates a boundary.
    """
    # Stream at beginning of header, look for end of header
    # and parse it if found. The header must fit within one
    # chunk.
    chunk = stream.read(max_header_size)

    # 'find' returns the start of these four bytes, so we'll
    # need to munch them later to prevent them from polluting
    # the payload.
    header_end = chunk.find('\r\n\r\n')

    def _parse_header(line):
        main_value_pair, params = parse_header(line)
        try:
            name, value = main_value_pair.split(':', 1)
        except ValueError:
            raise ValueError("Invalid header: %r" % line)
        return name, (value, params)

    if header_end == -1:
        # we find no header, so we just mark this fact and pass on
        # the stream verbatim
        stream.unget(chunk)
        return (RAW, {}, stream)

    header = chunk[:header_end]

    # here we place any excess chunk back onto the stream, as
    # well as throwing away the CRLFCRLF bytes from above.
    stream.unget(chunk[header_end + 4:])

    TYPE = RAW
    outdict = {}

    # Eliminate blank lines
    for line in header.split('\r\n'):
        # This terminology ("main value" and "dictionary of
        # parameters") is from the Python docs.
        try:
            name, (value, params) = _parse_header(line)
        except ValueError:
            continue

        if name == 'content-disposition':
            TYPE = FIELD
            if params.get('filename'):
                TYPE = FILE

        outdict[name] = value, params

    if TYPE == RAW:
        stream.unget(chunk)

    return (TYPE, outdict, stream)
       
class Parser(object):
    def __init__(self, stream, boundary):
        self._stream = stream
        self._separator = '--' + boundary

    def __iter__(self):
        boundarystream = InterBoundaryIter(self._stream, self._separator)
        for sub_stream in boundarystream:
            # Iterate over each part
            yield parse_boundary_stream(sub_stream, 1024)
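
# A minimal end-to-end sketch, not part of the original module: run the
# Parser over a tiny handcrafted multipart body and inspect each part. Note
# that the preamble before the first boundary and the trailing '--\r\n' come
# through as RAW items, which callers (like parse() above) simply skip. The
# function name and 'frontier' boundary are hypothetical.
def _demo_parser():
    body = ('--frontier\r\n'
            'Content-Disposition: form-data; name="title"\r\n'
            '\r\n'
            'hello\r\n'
            '--frontier--\r\n')
    stream = LazyStream(iter([body]))
    for item_type, meta_data, field_stream in Parser(stream, 'frontier'):
        if item_type == FIELD:
            name = meta_data['content-disposition'][1]['name']
            assert (name, field_stream.read()) == ('title', 'hello')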
       
def parse_header(line):
    """Parse the header into a key-value pair."""
    plist = _parse_header_params(';' + line)
    key = plist.pop(0).lower()
    pdict = {}
    for p in plist:
        i = p.find('=')
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i+1:].strip()
            if len(value) >= 2 and value[0] == value[-1] == '"':
                value = value[1:-1]
                value = value.replace('\\\\', '\\').replace('\\"', '"')
            pdict[name] = value
    return key, pdict

def _parse_header_params(s):
    plist = []
    while s[:1] == ';':
        s = s[1:]
        end = s.find(';')
        while end > 0 and s.count('"', 0, end) % 2:
            end = s.find(';', end + 1)
        if end < 0:
            end = len(s)
        f = s[:end]
        plist.append(f.strip())
        s = s[end:]
    return plist
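
# A minimal sketch, not part of the original module: parse_header splits a
# header value into its main value and a dict of parameters, unquoting quoted
# strings along the way. The function name is hypothetical.
def _demo_parse_header():
    key, pdict = parse_header('multipart/form-data; boundary="frontier"')
    assert key == 'multipart/form-data'
    assert pdict == {'boundary': 'frontier'}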