web/wp-includes/SimplePie/Sanitize.php
changeset 204 09a1c134465b
equal deleted inserted replaced
203:f507feede89a 204:09a1c134465b
       
     1 <?php
       
     2 /**
       
     3  * SimplePie
       
     4  *
       
     5  * A PHP-Based RSS and Atom Feed Framework.
       
     6  * Takes the hard work out of managing a complete RSS/Atom solution.
       
     7  *
       
     8  * Copyright (c) 2004-2012, Ryan Parman, Geoffrey Sneddon, Ryan McCue, and contributors
       
     9  * All rights reserved.
       
    10  *
       
    11  * Redistribution and use in source and binary forms, with or without modification, are
       
    12  * permitted provided that the following conditions are met:
       
    13  *
       
    14  * 	* Redistributions of source code must retain the above copyright notice, this list of
       
    15  * 	  conditions and the following disclaimer.
       
    16  *
       
    17  * 	* Redistributions in binary form must reproduce the above copyright notice, this list
       
    18  * 	  of conditions and the following disclaimer in the documentation and/or other materials
       
    19  * 	  provided with the distribution.
       
    20  *
       
    21  * 	* Neither the name of the SimplePie Team nor the names of its contributors may be used
       
    22  * 	  to endorse or promote products derived from this software without specific prior
       
    23  * 	  written permission.
       
    24  *
       
    25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS
       
    26  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
       
    27  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS
       
    28  * AND CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
       
    29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
       
    30  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
       
    31  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
       
    32  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       
    33  * POSSIBILITY OF SUCH DAMAGE.
       
    34  *
       
    35  * @package SimplePie
       
    36  * @version 1.3.1
       
    37  * @copyright 2004-2012 Ryan Parman, Geoffrey Sneddon, Ryan McCue
       
    38  * @author Ryan Parman
       
    39  * @author Geoffrey Sneddon
       
    40  * @author Ryan McCue
       
    41  * @link http://simplepie.org/ SimplePie
       
    42  * @license http://www.opensource.org/licenses/bsd-license.php BSD License
       
    43  */
       
    44 
       
    45 /**
       
    46  * Used for data cleanup and post-processing
       
    47  *
       
    48  *
       
    49  * This class can be overloaded with {@see SimplePie::set_sanitize_class()}
       
    50  *
       
    51  * @package SimplePie
       
    52  * @todo Move to using an actual HTML parser (this will allow tags to be properly stripped, and to switch between HTML and XHTML), this will also make it easier to shorten a string while preserving HTML tags
       
    53  */
       
    54 class SimplePie_Sanitize
       
    55 {
       
    56 	// Private vars
       
    57 	var $base;
       
    58 
       
    59 	// Options
       
    60 	var $remove_div = true;
       
    61 	var $image_handler = '';
       
    62 	var $strip_htmltags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style');
       
    63 	var $encode_instead_of_strip = false;
       
    64 	var $strip_attributes = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc');
       
    65 	var $strip_comments = false;
       
    66 	var $output_encoding = 'UTF-8';
       
    67 	var $enable_cache = true;
       
    68 	var $cache_location = './cache';
       
    69 	var $cache_name_function = 'md5';
       
    70 	var $timeout = 10;
       
    71 	var $useragent = '';
       
    72 	var $force_fsockopen = false;
       
    73 	var $replace_url_attributes = null;
       
    74 
       
    75 	public function __construct()
       
    76 	{
       
    77 		// Set defaults
       
    78 		$this->set_url_replacements(null);
       
    79 	}
       
    80 
       
    81 	public function remove_div($enable = true)
       
    82 	{
       
    83 		$this->remove_div = (bool) $enable;
       
    84 	}
       
    85 
       
    86 	public function set_image_handler($page = false)
       
    87 	{
       
    88 		if ($page)
       
    89 		{
       
    90 			$this->image_handler = (string) $page;
       
    91 		}
       
    92 		else
       
    93 		{
       
    94 			$this->image_handler = false;
       
    95 		}
       
    96 	}
       
    97 
       
    98 	public function set_registry(SimplePie_Registry $registry)
       
    99 	{
       
   100 		$this->registry = $registry;
       
   101 	}
       
   102 
       
   103 	public function pass_cache_data($enable_cache = true, $cache_location = './cache', $cache_name_function = 'md5', $cache_class = 'SimplePie_Cache')
       
   104 	{
       
   105 		if (isset($enable_cache))
       
   106 		{
       
   107 			$this->enable_cache = (bool) $enable_cache;
       
   108 		}
       
   109 
       
   110 		if ($cache_location)
       
   111 		{
       
   112 			$this->cache_location = (string) $cache_location;
       
   113 		}
       
   114 
       
   115 		if ($cache_name_function)
       
   116 		{
       
   117 			$this->cache_name_function = (string) $cache_name_function;
       
   118 		}
       
   119 	}
       
   120 
       
   121 	public function pass_file_data($file_class = 'SimplePie_File', $timeout = 10, $useragent = '', $force_fsockopen = false)
       
   122 	{
       
   123 		if ($timeout)
       
   124 		{
       
   125 			$this->timeout = (string) $timeout;
       
   126 		}
       
   127 
       
   128 		if ($useragent)
       
   129 		{
       
   130 			$this->useragent = (string) $useragent;
       
   131 		}
       
   132 
       
   133 		if ($force_fsockopen)
       
   134 		{
       
   135 			$this->force_fsockopen = (string) $force_fsockopen;
       
   136 		}
       
   137 	}
       
   138 
       
   139 	public function strip_htmltags($tags = array('base', 'blink', 'body', 'doctype', 'embed', 'font', 'form', 'frame', 'frameset', 'html', 'iframe', 'input', 'marquee', 'meta', 'noscript', 'object', 'param', 'script', 'style'))
       
   140 	{
       
   141 		if ($tags)
       
   142 		{
       
   143 			if (is_array($tags))
       
   144 			{
       
   145 				$this->strip_htmltags = $tags;
       
   146 			}
       
   147 			else
       
   148 			{
       
   149 				$this->strip_htmltags = explode(',', $tags);
       
   150 			}
       
   151 		}
       
   152 		else
       
   153 		{
       
   154 			$this->strip_htmltags = false;
       
   155 		}
       
   156 	}
       
   157 
       
   158 	public function encode_instead_of_strip($encode = false)
       
   159 	{
       
   160 		$this->encode_instead_of_strip = (bool) $encode;
       
   161 	}
       
   162 
       
   163 	public function strip_attributes($attribs = array('bgsound', 'class', 'expr', 'id', 'style', 'onclick', 'onerror', 'onfinish', 'onmouseover', 'onmouseout', 'onfocus', 'onblur', 'lowsrc', 'dynsrc'))
       
   164 	{
       
   165 		if ($attribs)
       
   166 		{
       
   167 			if (is_array($attribs))
       
   168 			{
       
   169 				$this->strip_attributes = $attribs;
       
   170 			}
       
   171 			else
       
   172 			{
       
   173 				$this->strip_attributes = explode(',', $attribs);
       
   174 			}
       
   175 		}
       
   176 		else
       
   177 		{
       
   178 			$this->strip_attributes = false;
       
   179 		}
       
   180 	}
       
   181 
       
   182 	public function strip_comments($strip = false)
       
   183 	{
       
   184 		$this->strip_comments = (bool) $strip;
       
   185 	}
       
   186 
       
   187 	public function set_output_encoding($encoding = 'UTF-8')
       
   188 	{
       
   189 		$this->output_encoding = (string) $encoding;
       
   190 	}
       
   191 
       
   192 	/**
       
   193 	 * Set element/attribute key/value pairs of HTML attributes
       
   194 	 * containing URLs that need to be resolved relative to the feed
       
   195 	 *
       
   196 	 * Defaults to |a|@href, |area|@href, |blockquote|@cite, |del|@cite,
       
   197 	 * |form|@action, |img|@longdesc, |img|@src, |input|@src, |ins|@cite,
       
   198 	 * |q|@cite
       
   199 	 *
       
   200 	 * @since 1.0
       
   201 	 * @param array|null $element_attribute Element/attribute key/value pairs, null for default
       
   202 	 */
       
   203 	public function set_url_replacements($element_attribute = null)
       
   204 	{
       
   205 		if ($element_attribute === null)
       
   206 		{
       
   207 			$element_attribute = array(
       
   208 				'a' => 'href',
       
   209 				'area' => 'href',
       
   210 				'blockquote' => 'cite',
       
   211 				'del' => 'cite',
       
   212 				'form' => 'action',
       
   213 				'img' => array(
       
   214 					'longdesc',
       
   215 					'src'
       
   216 				),
       
   217 				'input' => 'src',
       
   218 				'ins' => 'cite',
       
   219 				'q' => 'cite'
       
   220 			);
       
   221 		}
       
   222 		$this->replace_url_attributes = (array) $element_attribute;
       
   223 	}
       
   224 
       
   225 	public function sanitize($data, $type, $base = '')
       
   226 	{
       
   227 		$data = trim($data);
       
   228 		if ($data !== '' || $type & SIMPLEPIE_CONSTRUCT_IRI)
       
   229 		{
       
   230 			if ($type & SIMPLEPIE_CONSTRUCT_MAYBE_HTML)
       
   231 			{
       
   232 				if (preg_match('/(&(#(x[0-9a-fA-F]+|[0-9]+)|[a-zA-Z0-9]+)|<\/[A-Za-z][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E]*' . SIMPLEPIE_PCRE_HTML_ATTRIBUTE . '>)/', $data))
       
   233 				{
       
   234 					$type |= SIMPLEPIE_CONSTRUCT_HTML;
       
   235 				}
       
   236 				else
       
   237 				{
       
   238 					$type |= SIMPLEPIE_CONSTRUCT_TEXT;
       
   239 				}
       
   240 			}
       
   241 
       
   242 			if ($type & SIMPLEPIE_CONSTRUCT_BASE64)
       
   243 			{
       
   244 				$data = base64_decode($data);
       
   245 			}
       
   246 
       
   247 			if ($type & (SIMPLEPIE_CONSTRUCT_HTML | SIMPLEPIE_CONSTRUCT_XHTML))
       
   248 			{
       
   249 
       
   250 				if (!class_exists('DOMDocument'))
       
   251 				{
       
   252 					$this->registry->call('Misc', 'error', array('DOMDocument not found, unable to use sanitizer', E_USER_WARNING, __FILE__, __LINE__));
       
   253 					return '';
       
   254 				}
       
   255 				$document = new DOMDocument();
       
   256 				$document->encoding = 'UTF-8';
       
   257 				$data = $this->preprocess($data, $type);
       
   258 
       
   259 				set_error_handler(array('SimplePie_Misc', 'silence_errors'));
       
   260 				$document->loadHTML($data);
       
   261 				restore_error_handler();
       
   262 
       
   263 				// Strip comments
       
   264 				if ($this->strip_comments)
       
   265 				{
       
   266 					$xpath = new DOMXPath($document);
       
   267 					$comments = $xpath->query('//comment()');
       
   268 
       
   269 					foreach ($comments as $comment)
       
   270 					{
       
   271 						$comment->parentNode->removeChild($comment);
       
   272 					}
       
   273 				}
       
   274 
       
   275 				// Strip out HTML tags and attributes that might cause various security problems.
       
   276 				// Based on recommendations by Mark Pilgrim at:
       
   277 				// http://diveintomark.org/archives/2003/06/12/how_to_consume_rss_safely
       
   278 				if ($this->strip_htmltags)
       
   279 				{
       
   280 					foreach ($this->strip_htmltags as $tag)
       
   281 					{
       
   282 						$this->strip_tag($tag, $document, $type);
       
   283 					}
       
   284 				}
       
   285 
       
   286 				if ($this->strip_attributes)
       
   287 				{
       
   288 					foreach ($this->strip_attributes as $attrib)
       
   289 					{
       
   290 						$this->strip_attr($attrib, $document);
       
   291 					}
       
   292 				}
       
   293 
       
   294 				// Replace relative URLs
       
   295 				$this->base = $base;
       
   296 				foreach ($this->replace_url_attributes as $element => $attributes)
       
   297 				{
       
   298 					$this->replace_urls($document, $element, $attributes);
       
   299 				}
       
   300 
       
   301 				// If image handling (caching, etc.) is enabled, cache and rewrite all the image tags.
       
   302 				if (isset($this->image_handler) && ((string) $this->image_handler) !== '' && $this->enable_cache)
       
   303 				{
       
   304 					$images = $document->getElementsByTagName('img');
       
   305 					foreach ($images as $img)
       
   306 					{
       
   307 						if ($img->hasAttribute('src'))
       
   308 						{
       
   309 							$image_url = call_user_func($this->cache_name_function, $img->getAttribute('src'));
       
   310 							$cache = $this->registry->call('Cache', 'get_handler', array($this->cache_location, $image_url, 'spi'));
       
   311 
       
   312 							if ($cache->load())
       
   313 							{
       
   314 								$img->setAttribute('src', $this->image_handler . $image_url);
       
   315 							}
       
   316 							else
       
   317 							{
       
   318 								$file = $this->registry->create('File', array($img['attribs']['src']['data'], $this->timeout, 5, array('X-FORWARDED-FOR' => $_SERVER['REMOTE_ADDR']), $this->useragent, $this->force_fsockopen));
       
   319 								$headers = $file->headers;
       
   320 
       
   321 								if ($file->success && ($file->method & SIMPLEPIE_FILE_SOURCE_REMOTE === 0 || ($file->status_code === 200 || $file->status_code > 206 && $file->status_code < 300)))
       
   322 								{
       
   323 									if ($cache->save(array('headers' => $file->headers, 'body' => $file->body)))
       
   324 									{
       
   325 										$img->setAttribute('src', $this->image_handler . $image_url);
       
   326 									}
       
   327 									else
       
   328 									{
       
   329 										trigger_error("$this->cache_location is not writeable. Make sure you've set the correct relative or absolute path, and that the location is server-writable.", E_USER_WARNING);
       
   330 									}
       
   331 								}
       
   332 							}
       
   333 						}
       
   334 					}
       
   335 				}
       
   336 
       
   337 				// Remove the DOCTYPE
       
   338 				// Seems to cause segfaulting if we don't do this
       
   339 				if ($document->firstChild instanceof DOMDocumentType)
       
   340 				{
       
   341 					$document->removeChild($document->firstChild);
       
   342 				}
       
   343 
       
   344 				// Move everything from the body to the root
       
   345 				$real_body = $document->getElementsByTagName('body')->item(0)->childNodes->item(0);
       
   346 				$document->replaceChild($real_body, $document->firstChild);
       
   347 
       
   348 				// Finally, convert to a HTML string
       
   349 				$data = trim($document->saveHTML());
       
   350 
       
   351 				if ($this->remove_div)
       
   352 				{
       
   353 					$data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '', $data);
       
   354 					$data = preg_replace('/<\/div>$/', '', $data);
       
   355 				}
       
   356 				else
       
   357 				{
       
   358 					$data = preg_replace('/^<div' . SIMPLEPIE_PCRE_XML_ATTRIBUTE . '>/', '<div>', $data);
       
   359 				}
       
   360 			}
       
   361 
       
   362 			if ($type & SIMPLEPIE_CONSTRUCT_IRI)
       
   363 			{
       
   364 				$absolute = $this->registry->call('Misc', 'absolutize_url', array($data, $base));
       
   365 				if ($absolute !== false)
       
   366 				{
       
   367 					$data = $absolute;
       
   368 				}
       
   369 			}
       
   370 
       
   371 			if ($type & (SIMPLEPIE_CONSTRUCT_TEXT | SIMPLEPIE_CONSTRUCT_IRI))
       
   372 			{
       
   373 				$data = htmlspecialchars($data, ENT_COMPAT, 'UTF-8');
       
   374 			}
       
   375 
       
   376 			if ($this->output_encoding !== 'UTF-8')
       
   377 			{
       
   378 				$data = $this->registry->call('Misc', 'change_encoding', array($data, 'UTF-8', $this->output_encoding));
       
   379 			}
       
   380 		}
       
   381 		return $data;
       
   382 	}
       
   383 
       
   384 	protected function preprocess($html, $type)
       
   385 	{
       
   386 		$ret = '';
       
   387 		if ($type & ~SIMPLEPIE_CONSTRUCT_XHTML)
       
   388 		{
       
   389 			// Atom XHTML constructs are wrapped with a div by default
       
   390 			// Note: No protection if $html contains a stray </div>!
       
   391 			$html = '<div>' . $html . '</div>';
       
   392 			$ret .= '<!DOCTYPE html>';
       
   393 			$content_type = 'text/html';
       
   394 		}
       
   395 		else
       
   396 		{
       
   397 			$ret .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
       
   398 			$content_type = 'application/xhtml+xml';
       
   399 		}
       
   400 
       
   401 		$ret .= '<html><head>';
       
   402 		$ret .= '<meta http-equiv="Content-Type" content="' . $content_type . '; charset=utf-8" />';
       
   403 		$ret .= '</head><body>' . $html . '</body></html>';
       
   404 		return $ret;
       
   405 	}
       
   406 
       
   407 	public function replace_urls($document, $tag, $attributes)
       
   408 	{
       
   409 		if (!is_array($attributes))
       
   410 		{
       
   411 			$attributes = array($attributes);
       
   412 		}
       
   413 
       
   414 		if (!is_array($this->strip_htmltags) || !in_array($tag, $this->strip_htmltags))
       
   415 		{
       
   416 			$elements = $document->getElementsByTagName($tag);
       
   417 			foreach ($elements as $element)
       
   418 			{
       
   419 				foreach ($attributes as $attribute)
       
   420 				{
       
   421 					if ($element->hasAttribute($attribute))
       
   422 					{
       
   423 						$value = $this->registry->call('Misc', 'absolutize_url', array($element->getAttribute($attribute), $this->base));
       
   424 						if ($value !== false)
       
   425 						{
       
   426 							$element->setAttribute($attribute, $value);
       
   427 						}
       
   428 					}
       
   429 				}
       
   430 			}
       
   431 		}
       
   432 	}
       
   433 
       
   434 	public function do_strip_htmltags($match)
       
   435 	{
       
   436 		if ($this->encode_instead_of_strip)
       
   437 		{
       
   438 			if (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
       
   439 			{
       
   440 				$match[1] = htmlspecialchars($match[1], ENT_COMPAT, 'UTF-8');
       
   441 				$match[2] = htmlspecialchars($match[2], ENT_COMPAT, 'UTF-8');
       
   442 				return "&lt;$match[1]$match[2]&gt;$match[3]&lt;/$match[1]&gt;";
       
   443 			}
       
   444 			else
       
   445 			{
       
   446 				return htmlspecialchars($match[0], ENT_COMPAT, 'UTF-8');
       
   447 			}
       
   448 		}
       
   449 		elseif (isset($match[4]) && !in_array(strtolower($match[1]), array('script', 'style')))
       
   450 		{
       
   451 			return $match[4];
       
   452 		}
       
   453 		else
       
   454 		{
       
   455 			return '';
       
   456 		}
       
   457 	}
       
   458 
       
   459 	protected function strip_tag($tag, $document, $type)
       
   460 	{
       
   461 		$xpath = new DOMXPath($document);
       
   462 		$elements = $xpath->query('body//' . $tag);
       
   463 		if ($this->encode_instead_of_strip)
       
   464 		{
       
   465 			foreach ($elements as $element)
       
   466 			{
       
   467 				$fragment = $document->createDocumentFragment();
       
   468 
       
   469 				// For elements which aren't script or style, include the tag itself
       
   470 				if (!in_array($tag, array('script', 'style')))
       
   471 				{
       
   472 					$text = '<' . $tag;
       
   473 					if ($element->hasAttributes())
       
   474 					{
       
   475 						$attrs = array();
       
   476 						foreach ($element->attributes as $name => $attr)
       
   477 						{
       
   478 							$value = $attr->value;
       
   479 
       
   480 							// In XHTML, empty values should never exist, so we repeat the value
       
   481 							if (empty($value) && ($type & SIMPLEPIE_CONSTRUCT_XHTML))
       
   482 							{
       
   483 								$value = $name;
       
   484 							}
       
   485 							// For HTML, empty is fine
       
   486 							elseif (empty($value) && ($type & SIMPLEPIE_CONSTRUCT_HTML))
       
   487 							{
       
   488 								$attrs[] = $name;
       
   489 								continue;
       
   490 							}
       
   491 
       
   492 							// Standard attribute text
       
   493 							$attrs[] = $name . '="' . $attr->value . '"';
       
   494 						}
       
   495 						$text .= ' ' . implode(' ', $attrs);
       
   496 					}
       
   497 					$text .= '>';
       
   498 					$fragment->appendChild(new DOMText($text));
       
   499 				}
       
   500 
       
   501 				$number = $element->childNodes->length;
       
   502 				for ($i = $number; $i > 0; $i--)
       
   503 				{
       
   504 					$child = $element->childNodes->item(0);
       
   505 					$fragment->appendChild($child);
       
   506 				}
       
   507 
       
   508 				if (!in_array($tag, array('script', 'style')))
       
   509 				{
       
   510 					$fragment->appendChild(new DOMText('</' . $tag . '>'));
       
   511 				}
       
   512 
       
   513 				$element->parentNode->replaceChild($fragment, $element);
       
   514 			}
       
   515 
       
   516 			return;
       
   517 		}
       
   518 		elseif (in_array($tag, array('script', 'style')))
       
   519 		{
       
   520 			foreach ($elements as $element)
       
   521 			{
       
   522 				$element->parentNode->removeChild($element);
       
   523 			}
       
   524 
       
   525 			return;
       
   526 		}
       
   527 		else
       
   528 		{
       
   529 			foreach ($elements as $element)
       
   530 			{
       
   531 				$fragment = $document->createDocumentFragment();
       
   532 				$number = $element->childNodes->length;
       
   533 				for ($i = $number; $i > 0; $i--)
       
   534 				{
       
   535 					$child = $element->childNodes->item(0);
       
   536 					$fragment->appendChild($child);
       
   537 				}
       
   538 
       
   539 				$element->parentNode->replaceChild($fragment, $element);
       
   540 			}
       
   541 		}
       
   542 	}
       
   543 
       
   544 	protected function strip_attr($attrib, $document)
       
   545 	{
       
   546 		$xpath = new DOMXPath($document);
       
   547 		$elements = $xpath->query('//*[@' . $attrib . ']');
       
   548 
       
   549 		foreach ($elements as $element)
       
   550 		{
       
   551 			$element->removeAttribute($attrib);
       
   552 		}
       
   553 	}
       
   554 }