|
0
|
1 |
<?php |
|
|
2 |
|
|
|
3 |
/*
 * This file is part of the Symfony package.
 *
 * (c) Fabien Potencier <fabien@symfony.com>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */
|
|
11 |
|
|
|
12 |
namespace Symfony\Component\CssSelector; |
|
|
13 |
|
|
|
14 |
use Symfony\Component\CssSelector\Exception\ParseException; |
|
|
15 |
|
|
|
16 |
/**
 * Tokenizer lexes a CSS Selector to tokens.
 *
 * This component is a port of the Python lxml library,
 * which is copyright Infrae and distributed under the BSD license.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 */
class Tokenizer
{
    /**
     * Takes a CSS selector and returns an array holding the Tokens
     * it contains.
     *
     * When mbstring function overloading is active for string functions,
     * the internal encoding is temporarily switched to ASCII so that
     * strlen()/substr()/strpos() operate on bytes. The previous encoding is
     * restored before returning, including when lexing fails.
     *
     * @param string $s The selector to lex.
     *
     * @return array Token[]
     *
     * @throws ParseException When the selector cannot be lexed
     */
    public function tokenize($s)
    {
        $mbEncoding = null;
        if (function_exists('mb_internal_encoding') && ((int) ini_get('mbstring.func_overload')) & 2) {
            $mbEncoding = mb_internal_encoding();
            mb_internal_encoding('ASCII');
        }

        try {
            // CSS comments are stripped before lexing
            $tokens = $this->lex(preg_replace('#/\*.*?\*/#s', '', $s));
        } catch (\Exception $e) {
            // no "finally" here: restore the caller's encoding by hand, then rethrow
            if (null !== $mbEncoding) {
                mb_internal_encoding($mbEncoding);
            }

            throw $e;
        }

        if (null !== $mbEncoding) {
            mb_internal_encoding($mbEncoding);
        }

        return $tokens;
    }

    /**
     * Lexes a comment-free selector into a list of Tokens.
     *
     * Produces 'Symbol' tokens for identifiers and an+b expressions,
     * 'String' tokens for quoted strings and 'Token' tokens for operators.
     *
     * @param string $s The selector, with comments already removed.
     *
     * @return array Token[]
     *
     * @throws ParseException When an unexpected character or an unterminated string is found
     */
    private function lex($s)
    {
        $tokens = array();
        $pos = 0;
        $length = strlen($s);

        while (true) {
            // remember where a run of whitespace started; 0 means "none"
            // (whitespace at offset 0 is leading whitespace and is ignored too)
            $precedingWhitespacePos = 0;
            if (preg_match('#\s+#A', $s, $match, 0, $pos)) {
                $precedingWhitespacePos = $pos;
                $pos += strlen($match[0]);
            }

            if ($pos >= $length) {
                return $tokens;
            }

            // an+b style expression; a bare "n" is left to the symbol lexer
            if (preg_match('#[+-]?\d*n(?:[+-]\d+)?#A', $s, $match, 0, $pos) && 'n' !== $match[0]) {
                $tokens[] = new Token('Symbol', $match[0], $pos);
                $pos += strlen($match[0]);

                continue;
            }

            $c = $s[$pos];
            $c2 = substr($s, $pos, 2);

            // two-character operators take precedence over single-character ones
            if (in_array($c2, array('~=', '|=', '^=', '$=', '*=', '::', '!='), true)) {
                $tokens[] = new Token('Token', $c2, $pos);
                $pos += 2;

                continue;
            }

            if (in_array($c, array('>', '+', '~', ',', '.', '*', '=', '[', ']', '(', ')', '|', ':', '#'), true)) {
                // whitespace before ".", "#" or "[" is significant: emit it as its own token
                if ($precedingWhitespacePos > 0 && in_array($c, array('.', '#', '['), true)) {
                    $tokens[] = new Token('Token', ' ', $precedingWhitespacePos);
                }
                $tokens[] = new Token('Token', $c, $pos);
                ++$pos;

                continue;
            }

            $oldPos = $pos;

            if ('"' === $c || "'" === $c) {
                // Quoted string
                list($sym, $pos) = $this->tokenizeEscapedString($s, $pos);
                $tokens[] = new Token('String', $sym, $oldPos);

                continue;
            }

            list($sym, $pos) = $this->tokenizeSymbol($s, $pos);
            $tokens[] = new Token('Symbol', $sym, $oldPos);
        }
    }

    /**
     * Tokenizes a quoted string (i.e. 'A string quoted with \' characters'),
     * and returns an array holding the unquoted string contained by $s and
     * the new position from which tokenizing should take over.
     *
     * A quote character is considered escaped only when it is preceded by an
     * odd number of backslashes, so a string ending in an escaped backslash
     * (e.g. "a\\") terminates correctly. Empty strings ("") are supported.
     *
     * @param string $s   The selector string containing the quoted string.
     * @param int    $pos The position of the opening quote in $s.
     *
     * @return array The unescaped string content and the position just after the closing quote
     *
     * @throws ParseException When expected closing is not found
     */
    private function tokenizeEscapedString($s, $pos)
    {
        $quote = $s[$pos];
        $start = $pos + 1;
        $searchFrom = $start;

        while (false !== ($next = strpos($s, $quote, $searchFrom))) {
            // count the run of backslashes directly in front of the candidate quote
            $backslashes = 0;
            for ($i = $next - 1; $i >= $start && '\\' === $s[$i]; --$i) {
                ++$backslashes;
            }

            if (1 === $backslashes % 2) {
                // this quote character is escaped, keep looking
                $searchFrom = $next + 1;

                continue;
            }

            $result = substr($s, $start, $next - $start);
            if (false !== strpos($result, '\\')) {
                $result = $this->unescapeStringLiteral($result);
            }

            return array($result, $next + 1);
        }

        throw new ParseException(sprintf('Expected closing %s for string in: %s', $quote, substr($s, $start)));
    }

    /**
     * Unescapes a string literal and returns the unescaped string.
     *
     * Two kinds of escapes are resolved:
     *  - a backslash followed by 1 to 6 hexadecimal digits (and one optional
     *    trailing whitespace) is replaced by the UTF-8 encoding of that code
     *    point; zero, surrogates and values above U+10FFFF become U+FFFD;
     *  - a backslash followed by any other character is replaced by that
     *    character.
     *
     * @param string $literal The string literal to unescape.
     *
     * @return string
     */
    private function unescapeStringLiteral($literal)
    {
        return preg_replace_callback(
            '#\\\\(?:([A-Fa-f0-9]{1,6})(?:\r\n|\s)?|([^A-Fa-f0-9]))#',
            function ($matches) {
                // group 2 is only present when the non-hexadecimal alternative matched
                if (isset($matches[2])) {
                    return $matches[2];
                }

                $codePoint = hexdec($matches[1]);
                if (0 === $codePoint || $codePoint > 0x10FFFF || ($codePoint >= 0xD800 && $codePoint <= 0xDFFF)) {
                    $codePoint = 0xFFFD;
                }

                // hand-rolled UTF-8 encoding, independent of optional extensions
                if ($codePoint < 0x80) {
                    return chr($codePoint);
                }
                if ($codePoint < 0x800) {
                    return chr(0xC0 | ($codePoint >> 6))
                        .chr(0x80 | ($codePoint & 0x3F));
                }
                if ($codePoint < 0x10000) {
                    return chr(0xE0 | ($codePoint >> 12))
                        .chr(0x80 | (($codePoint >> 6) & 0x3F))
                        .chr(0x80 | ($codePoint & 0x3F));
                }

                return chr(0xF0 | ($codePoint >> 18))
                    .chr(0x80 | (($codePoint >> 12) & 0x3F))
                    .chr(0x80 | (($codePoint >> 6) & 0x3F))
                    .chr(0x80 | ($codePoint & 0x3F));
            },
            $literal
        );
    }

    /**
     * Lexes selector $s and returns an array holding the name of the symbol
     * contained in it and the new position from which tokenizing should take
     * over.
     *
     * A symbol is the longest run of word characters and hyphens starting
     * at $pos.
     *
     * @param string $s   The selector string.
     * @param int    $pos The position in $s at which the symbol starts.
     *
     * @return array The symbol name and the position just after it
     *
     * @throws ParseException When Unexpected symbol is found
     */
    private function tokenizeSymbol($s, $pos)
    {
        // look for the first character that cannot be part of a symbol
        if (!preg_match('#[^\w\-]#', $s, $match, PREG_OFFSET_CAPTURE, $pos)) {
            // Goes to end of s
            return array(substr($s, $pos), strlen($s));
        }

        $end = $match[0][1];

        if ($end === $pos) {
            // the very first character is not a symbol character
            throw new ParseException(sprintf('Unexpected symbol: %s at %s', $s[$pos], $pos));
        }

        return array(substr($s, $pos, $end - $pos), $end);
    }
}