|
1 <?php |
|
2 |
|
3 /* |
|
4 * This file is part of the Symfony package. |
|
5 * |
|
6 * (c) Fabien Potencier <fabien@symfony.com> |
|
7 * |
|
8 * For the full copyright and license information, please view the LICENSE |
|
9 * file that was distributed with this source code. |
|
10 */ |
|
11 |
|
12 namespace Symfony\Component\CssSelector; |
|
13 |
|
14 use Symfony\Component\CssSelector\Exception\ParseException; |
|
15 |
|
/**
 * Tokenizer lexes a CSS Selector to tokens.
 *
 * This component is a port of the Python lxml library,
 * which is copyright Infrae and distributed under the BSD license.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 */
class Tokenizer
{
    /**
     * Takes a CSS selector and returns an array holding the Tokens
     * it contains.
     *
     * When mbstring function overloading is active for string functions, the
     * internal encoding is temporarily switched to ASCII so that strlen() and
     * substr() operate on bytes. The previous encoding is restored before
     * returning, including when lexing fails with an exception.
     *
     * @param string $s The selector to lex.
     *
     * @return array Token[]
     *
     * @throws ParseException When the selector cannot be lexed
     */
    public function tokenize($s)
    {
        $mbEncoding = null;
        if (function_exists('mb_internal_encoding') && ((int) ini_get('mbstring.func_overload')) & 2) {
            $mbEncoding = mb_internal_encoding();
            mb_internal_encoding('ASCII');
        }

        try {
            $tokens = $this->lex($s);
        } catch (\Exception $e) {
            // Do not leave the process-wide encoding switched to ASCII
            // when lexing aborts.
            if (null !== $mbEncoding) {
                mb_internal_encoding($mbEncoding);
            }

            throw $e;
        }

        if (null !== $mbEncoding) {
            mb_internal_encoding($mbEncoding);
        }

        return $tokens;
    }

    /**
     * Performs the actual lexing loop for tokenize().
     *
     * @param string $s The selector to lex.
     *
     * @return array Token[]
     *
     * @throws ParseException When the selector cannot be lexed
     */
    private function lex($s)
    {
        $tokens = array();
        $pos = 0;

        // Comments are removed up front so that the loop never sees them.
        $s = (string) preg_replace('#/\*.*?\*/#s', '', $s);
        $length = strlen($s);

        $pairs = array('~=', '|=', '^=', '$=', '*=', '::', '!=');
        $singles = array('>', '+', '~', ',', '.', '*', '=', '[', ']', '(', ')', '|', ':', '#');

        while (true) {
            // Skip whitespace, remembering where it started. 0 doubles as
            // "no significant whitespace": either none was found, or it was
            // leading whitespace at the very start of the selector.
            if (preg_match('#\s+#A', $s, $match, 0, $pos)) {
                $precedingWhitespacePos = $pos;
                $pos += strlen($match[0]);
            } else {
                $precedingWhitespacePos = 0;
            }

            if ($pos >= $length) {
                return $tokens;
            }

            // "an+b" style expressions (e.g. "2n+1", "-n+3") are emitted as a
            // single Symbol; a bare "n" is left to the generic symbol lexer.
            if (preg_match('#[+-]?\d*n(?:[+-]\d+)?#A', $s, $match, 0, $pos) && 'n' !== $match[0]) {
                $tokens[] = new Token('Symbol', $match[0], $pos);
                $pos += strlen($match[0]);

                continue;
            }

            $c = $s[$pos];
            $c2 = substr($s, $pos, 2);

            // Two-character operators take precedence over their one-character prefixes.
            if (in_array($c2, $pairs, true)) {
                $tokens[] = new Token('Token', $c2, $pos);
                $pos += 2;

                continue;
            }

            if (in_array($c, $singles, true)) {
                // Whitespace directly before ".", "#" or "[" is significant
                // and is emitted as an explicit " " token.
                if (in_array($c, array('.', '#', '['), true) && $precedingWhitespacePos > 0) {
                    $tokens[] = new Token('Token', ' ', $precedingWhitespacePos);
                }
                $tokens[] = new Token('Token', $c, $pos);
                ++$pos;

                continue;
            }

            $oldPos = $pos;

            if ('"' === $c || "'" === $c) {
                // Quoted string
                list($sym, $pos) = $this->tokenizeEscapedString($s, $pos);
                $tokens[] = new Token('String', $sym, $oldPos);

                continue;
            }

            list($sym, $pos) = $this->tokenizeSymbol($s, $pos);
            $tokens[] = new Token('Symbol', $sym, $oldPos);
        }
    }

    /**
     * Tokenizes a quoted string (i.e. 'A string quoted with \' characters'),
     * and returns an array holding the unquoted string contained by $s and
     * the new position from which tokenizing should take over.
     *
     * The string is scanned byte by byte; a backslash always consumes the
     * byte that follows it, so an escaped backslash directly before the
     * closing quote (e.g. "a\\") does not hide that quote.
     *
     * @param string  $s   The selector string containing the quoted string.
     * @param integer $pos The position of the opening quote.
     *
     * @return array The unescaped string content and the position after the closing quote
     *
     * @throws ParseException When expected closing is not found
     */
    private function tokenizeEscapedString($s, $pos)
    {
        $quote = $s[$pos];
        $start = $pos + 1;
        $length = strlen($s);

        for ($i = $start; $i < $length; ++$i) {
            if ('\\' === $s[$i]) {
                // Skip the escaped byte, whatever it is (quote, backslash, ...).
                ++$i;

                continue;
            }

            if ($quote !== $s[$i]) {
                continue;
            }

            $result = substr($s, $start, $i - $start);
            if (false !== strpos($result, '\\')) {
                $result = $this->unescapeStringLiteral($result);
            }

            return array($result, $i + 1);
        }

        throw new ParseException(sprintf('Expected closing %s for string in: %s', $quote, substr($s, $start)));
    }

    /**
     * Unescapes a string literal and returns the unescaped string.
     *
     * Two escape forms are handled:
     *  - a backslash followed by 1 to 6 hexadecimal digits and one optional
     *    whitespace character: replaced by the UTF-8 encoding of that code
     *    point (U+FFFD for zero, surrogates and values above U+10FFFF);
     *  - a backslash followed by any other character: replaced by that
     *    character itself.
     *
     * @param string $literal The string literal to unescape.
     *
     * @return string
     *
     * @throws ParseException When invalid escape sequence is found (a backslash with nothing after it)
     */
    private function unescapeStringLiteral($literal)
    {
        $pattern = '#\\\\(?:([A-Fa-f0-9]{1,6})(?:\r\n|\s)?|([^A-Fa-f0-9])|\z)#';

        return preg_replace_callback($pattern, function ($matches) use ($literal) {
            // Backslash followed by a non-hexadecimal character: keep the character.
            if (isset($matches[2]) && '' !== $matches[2]) {
                return $matches[2];
            }

            // Neither alternative captured anything: dangling backslash at end of input.
            if (!isset($matches[1]) || '' === $matches[1]) {
                throw new ParseException(sprintf('Invalid escape sequence %s in string %s', $matches[0], $literal));
            }

            $codePoint = hexdec($matches[1]);

            if (0 === $codePoint || $codePoint > 0x10FFFF || ($codePoint >= 0xD800 && $codePoint <= 0xDFFF)) {
                // U+FFFD REPLACEMENT CHARACTER
                return "\xEF\xBF\xBD";
            }
            if ($codePoint < 0x80) {
                return chr($codePoint);
            }
            if ($codePoint < 0x800) {
                return chr(0xC0 | ($codePoint >> 6))
                    .chr(0x80 | ($codePoint & 0x3F));
            }
            if ($codePoint < 0x10000) {
                return chr(0xE0 | ($codePoint >> 12))
                    .chr(0x80 | (($codePoint >> 6) & 0x3F))
                    .chr(0x80 | ($codePoint & 0x3F));
            }

            return chr(0xF0 | ($codePoint >> 18))
                .chr(0x80 | (($codePoint >> 12) & 0x3F))
                .chr(0x80 | (($codePoint >> 6) & 0x3F))
                .chr(0x80 | ($codePoint & 0x3F));
        }, $literal);
    }

    /**
     * Lexes selector $s and returns an array holding the name of the symbol
     * contained in it and the new position from which tokenizing should take
     * over.
     *
     * A symbol is the longest run of word characters and hyphens starting
     * at $pos.
     *
     * @param string  $s   The selector string.
     * @param integer $pos The position in $s at which the symbol starts.
     *
     * @return array The symbol name and the position of the first character after it
     *
     * @throws ParseException When Unexpected symbol is found
     */
    private function tokenizeSymbol($s, $pos)
    {
        if (!preg_match('#[^\w\-]#', $s, $match, PREG_OFFSET_CAPTURE, $pos)) {
            // No terminating character: the symbol runs to the end of $s.
            return array(substr($s, $pos), strlen($s));
        }

        $end = $match[0][1];

        if ($end === $pos) {
            // The very first character cannot be part of a symbol.
            throw new ParseException(sprintf('Unexpected symbol: %s at %s', $s[$pos], $pos));
        }

        return array(substr($s, $pos, $end - $pos), $end);
    }
}