vendor/symfony/src/Symfony/Component/CssSelector/Tokenizer.php
changeset 0 7f95f8617b0b
equal deleted inserted replaced
-1:000000000000 0:7f95f8617b0b
       
     1 <?php
       
     2 
       
     3 /*
       
     4  * This file is part of the Symfony package.
       
     5  *
       
     6  * (c) Fabien Potencier <fabien@symfony.com>
       
     7  *
       
     8  * For the full copyright and license information, please view the LICENSE
       
     9  * file that was distributed with this source code.
       
    10  */
       
    11 
       
    12 namespace Symfony\Component\CssSelector;
       
    13 
       
    14 use Symfony\Component\CssSelector\Exception\ParseException;
       
    15 
       
    16 /**
       
    17  * Tokenizer lexes a CSS Selector to tokens.
       
    18  *
       
    19  * This component is a port of the Python lxml library,
       
    20  * which is copyright Infrae and distributed under the BSD license.
       
    21  *
       
    22  * @author Fabien Potencier <fabien@symfony.com>
       
    23  */
       
    24 class Tokenizer
       
    25 {
       
    26     /**
       
    27      * Takes a CSS selector and returns an array holding the Tokens
       
    28      * it contains.
       
    29      *
       
    30      * @param  string $s The selector to lex.
       
    31      *
       
    32      * @return array Token[]
       
    33      */
       
    34     public function tokenize($s)
       
    35     {
       
    36         if (function_exists('mb_internal_encoding') && ((int) ini_get('mbstring.func_overload')) & 2) {
       
    37             $mbEncoding = mb_internal_encoding();
       
    38             mb_internal_encoding('ASCII');
       
    39         }
       
    40 
       
    41         $tokens = array();
       
    42         $pos = 0;
       
    43         $s = preg_replace('#/\*.*?\*/#s', '', $s);
       
    44 
       
    45         while (true) {
       
    46             if (preg_match('#\s+#A', $s, $match, 0, $pos)) {
       
    47                 $precedingWhitespacePos = $pos;
       
    48                 $pos += strlen($match[0]);
       
    49             } else {
       
    50                 $precedingWhitespacePos = 0;
       
    51             }
       
    52 
       
    53             if ($pos >= strlen($s)) {
       
    54                 if (isset($mbEncoding)) {
       
    55                     mb_internal_encoding($mbEncoding);
       
    56                 }
       
    57 
       
    58                 return $tokens;
       
    59             }
       
    60 
       
    61             if (preg_match('#[+-]?\d*n(?:[+-]\d+)?#A', $s, $match, 0, $pos) && 'n' !== $match[0]) {
       
    62                 $sym = substr($s, $pos, strlen($match[0]));
       
    63                 $tokens[] = new Token('Symbol', $sym, $pos);
       
    64                 $pos += strlen($match[0]);
       
    65 
       
    66                 continue;
       
    67             }
       
    68 
       
    69             $c = $s[$pos];
       
    70             $c2 = substr($s, $pos, 2);
       
    71             if (in_array($c2, array('~=', '|=', '^=', '$=', '*=', '::', '!='))) {
       
    72                 $tokens[] = new Token('Token', $c2, $pos);
       
    73                 $pos += 2;
       
    74 
       
    75                 continue;
       
    76             }
       
    77 
       
    78             if (in_array($c, array('>', '+', '~', ',', '.', '*', '=', '[', ']', '(', ')', '|', ':', '#'))) {
       
    79                 if (in_array($c, array('.', '#', '[')) && $precedingWhitespacePos > 0) {
       
    80                     $tokens[] = new Token('Token', ' ', $precedingWhitespacePos);
       
    81                 }
       
    82                 $tokens[] = new Token('Token', $c, $pos);
       
    83                 ++$pos;
       
    84 
       
    85                 continue;
       
    86             }
       
    87 
       
    88             if ('"' === $c || "'" === $c) {
       
    89                 // Quoted string
       
    90                 $oldPos = $pos;
       
    91                 list($sym, $pos) = $this->tokenizeEscapedString($s, $pos);
       
    92 
       
    93                 $tokens[] = new Token('String', $sym, $oldPos);
       
    94 
       
    95                 continue;
       
    96             }
       
    97 
       
    98             $oldPos = $pos;
       
    99             list($sym, $pos) = $this->tokenizeSymbol($s, $pos);
       
   100 
       
   101             $tokens[] = new Token('Symbol', $sym, $oldPos);
       
   102 
       
   103             continue;
       
   104         }
       
   105     }
       
   106 
       
   107     /**
       
   108      * Tokenizes a quoted string (i.e. 'A string quoted with \' characters'),
       
   109      * and returns an array holding the unquoted string contained by $s and
       
   110      * the new position from which tokenizing should take over.
       
   111      *
       
   112      * @throws ParseException When expected closing is not found
       
   113      *
       
   114      * @param  string  $s   The selector string containing the quoted string.
       
   115      * @param  integer $pos The starting position for the quoted string.
       
   116      *
       
   117      * @return array
       
   118      */
       
   119     private function tokenizeEscapedString($s, $pos)
       
   120     {
       
   121         $quote = $s[$pos];
       
   122 
       
   123         $pos = $pos + 1;
       
   124         $start = $pos;
       
   125         while (true) {
       
   126             $next = strpos($s, $quote, $pos);
       
   127             if (false === $next) {
       
   128                 throw new ParseException(sprintf('Expected closing %s for string in: %s', $quote, substr($s, $start)));
       
   129             }
       
   130 
       
   131             $result = substr($s, $start, $next - $start);
       
   132             if ('\\' === $result[strlen($result) - 1]) {
       
   133                 // next quote character is escaped
       
   134                 $pos = $next + 1;
       
   135                 continue;
       
   136             }
       
   137 
       
   138             if (false !== strpos($result, '\\')) {
       
   139                 $result = $this->unescapeStringLiteral($result);
       
   140             }
       
   141 
       
   142             return array($result, $next + 1);
       
   143         }
       
   144     }
       
   145 
       
   146     /**
       
   147      * Unescapes a string literal and returns the unescaped string.
       
   148      *
       
   149      * @throws ParseException When invalid escape sequence is found
       
   150      *
       
   151      * @param  string $literal The string literal to unescape.
       
   152      *
       
   153      * @return string
       
   154      */
       
   155     private function unescapeStringLiteral($literal)
       
   156     {
       
   157         return preg_replace_callback('#(\\\\(?:[A-Fa-f0-9]{1,6}(?:\r\n|\s)?|[^A-Fa-f0-9]))#', function ($matches) use ($literal)
       
   158         {
       
   159             if ($matches[0][0] == '\\' && strlen($matches[0]) > 1) {
       
   160                 $matches[0] = substr($matches[0], 1);
       
   161                 if (in_array($matches[0][0], array('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'a', 'b', 'c', 'd', 'e', 'f'))) {
       
   162                     return chr(trim($matches[0]));
       
   163                 }
       
   164             } else {
       
   165                 throw new ParseException(sprintf('Invalid escape sequence %s in string %s', $matches[0], $literal));
       
   166             }
       
   167         }, $literal);
       
   168     }
       
   169 
       
   170     /**
       
   171      * Lexes selector $s and returns an array holding the name of the symbol
       
   172      * contained in it and the new position from which tokenizing should take
       
   173      * over.
       
   174      *
       
   175      * @throws ParseException When Unexpected symbol is found
       
   176      *
       
   177      * @param  string  $s   The selector string.
       
   178      * @param  integer $pos The position in $s at which the symbol starts.
       
   179      *
       
   180      * @return array
       
   181      */
       
   182     private function tokenizeSymbol($s, $pos)
       
   183     {
       
   184         $start = $pos;
       
   185 
       
   186         if (!preg_match('#[^\w\-]#', $s, $match, PREG_OFFSET_CAPTURE, $pos)) {
       
   187             // Goes to end of s
       
   188             return array(substr($s, $start), strlen($s));
       
   189         }
       
   190 
       
   191         $matchStart = $match[0][1];
       
   192 
       
   193         if ($matchStart == $pos) {
       
   194             throw new ParseException(sprintf('Unexpected symbol: %s at %s', $s[$pos], $pos));
       
   195         }
       
   196 
       
   197         $result = substr($s, $start, $matchStart - $start);
       
   198         $pos = $matchStart;
       
   199 
       
   200         return array($result, $pos);
       
   201     }
       
   202 }