|
1 <?php |
|
2 |
|
3 /* |
|
4 * This file is part of the Symfony package. |
|
5 * |
|
6 * (c) Fabien Potencier <fabien@symfony.com> |
|
7 * |
|
8 * For the full copyright and license information, please view the LICENSE |
|
9 * file that was distributed with this source code. |
|
10 */ |
|
11 |
|
12 namespace Symfony\Component\DomCrawler; |
|
13 |
|
14 use Symfony\Component\CssSelector\CssSelector; |
|
15 |
|
/**
 * Crawler eases navigation of a list of \DOMNode objects.
 *
 * The nodes are kept in the underlying \SplObjectStorage, so a Crawler can be
 * counted and iterated like any other object storage. Most traversal methods
 * return a new Crawler (created with "new static") instead of modifying the
 * current one.
 *
 * @author Fabien Potencier <fabien@symfony.com>
 *
 * @api
 */
class Crawler extends \SplObjectStorage
{
    /**
     * @var string|null The current URI or the base href value
     */
    private $uri;

    /**
     * Constructor.
     *
     * @param mixed  $node A Node to use as the base for the crawling
     * @param string $uri  The current URI or the base href value
     *
     * @api
     */
    public function __construct($node = null, $uri = null)
    {
        $this->uri = $uri;

        $this->add($node);
    }

    /**
     * Removes all the nodes.
     *
     * @api
     */
    public function clear()
    {
        $this->removeAll($this);
    }

    /**
     * Adds a node to the current list of nodes.
     *
     * This method uses the appropriate specialized add*() method based
     * on the type of the argument. Values that are neither a \DOMNodeList,
     * an array, a string nor an object (null for instance) are ignored.
     *
     * @param null|\DOMNodeList|array|string|\DOMNode $node A node
     *
     * @api
     */
    public function add($node)
    {
        if ($node instanceof \DOMNodeList) {
            $this->addNodeList($node);
        } elseif (is_array($node)) {
            $this->addNodes($node);
        } elseif (is_string($node)) {
            $this->addContent($node);
        } elseif (is_object($node)) {
            $this->addNode($node);
        }
    }

    /**
     * Adds HTML/XML content.
     *
     * The content type decides which parser is used: a type matching "xml"
     * (but not "html") is parsed as XML, a type matching "html" is parsed as
     * HTML, and any other type is ignored. When the type carries a "charset="
     * parameter, its value is passed on as the document charset; otherwise
     * ISO-8859-1 is used.
     *
     * @param string $content A string to parse as HTML/XML
     * @param string $type    The content type of the string (defaults to text/html)
     *
     * @return null
     */
    public function addContent($content, $type = null)
    {
        if (empty($type)) {
            $type = 'text/html';
        }

        // DOM only for HTML/XML content
        if (!preg_match('/(x|ht)ml/i', $type, $matches)) {
            return null;
        }

        $charset = 'ISO-8859-1';
        // parameter names of a content type are case-insensitive
        if (false !== $pos = stripos($type, 'charset=')) {
            $charset = substr($type, $pos + 8);
            if (false !== $pos = strpos($charset, ';')) {
                $charset = substr($charset, 0, $pos);
            }
        }

        // the pattern above is case-insensitive, so the captured group must be
        // normalized before comparing ("application/XML" is XML, not HTML)
        if ('x' === strtolower($matches[1])) {
            $this->addXmlContent($content, $charset);
        } else {
            $this->addHtmlContent($content, $charset);
        }
    }

    /**
     * Adds an HTML content to the list of nodes.
     *
     * When the parsed content contains a base element with a non-empty href
     * attribute, that value replaces the URI of this Crawler.
     *
     * @param string $content The HTML content
     * @param string $charset The charset
     *
     * @api
     */
    public function addHtmlContent($content, $charset = 'UTF-8')
    {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        // parser warnings are deliberately silenced
        @$dom->loadHTML($content);
        $this->addDocument($dom);

        // XPath form of the "base" CSS selector: parsing HTML must not depend
        // on the optional CssSelector component being installed
        $hrefs = $this->filterXPath('descendant-or-self::base')->extract(array('href'));

        // a base element without href must not wipe out the current URI
        if (count($hrefs) && '' !== $hrefs[0]) {
            $this->uri = $hrefs[0];
        }
    }

    /**
     * Adds an XML content to the list of nodes.
     *
     * Every occurrence of "xmlns" in the content is rewritten to "ns" before
     * parsing, which drops namespace declarations so that XPath expressions
     * do not need prefixes.
     *
     * @param string $content The XML content
     * @param string $charset The charset
     *
     * @api
     */
    public function addXmlContent($content, $charset = 'UTF-8')
    {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        // remove the default namespace to make XPath expressions simpler;
        // LIBXML_NONET keeps the parser from fetching anything over the network
        @$dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET);
        $this->addDocument($dom);
    }

    /**
     * Adds a \DOMDocument to the list of nodes.
     *
     * Only the document element is added; a document without one is ignored.
     *
     * @param \DOMDocument $dom A \DOMDocument instance
     *
     * @api
     */
    public function addDocument(\DOMDocument $dom)
    {
        if ($dom->documentElement) {
            $this->addNode($dom->documentElement);
        }
    }

    /**
     * Adds a \DOMNodeList to the list of nodes.
     *
     * @param \DOMNodeList $nodes A \DOMNodeList instance
     *
     * @api
     */
    public function addNodeList(\DOMNodeList $nodes)
    {
        foreach ($nodes as $node) {
            $this->addNode($node);
        }
    }

    /**
     * Adds an array of \DOMNode instances to the list of nodes.
     *
     * Each entry goes through add(), so nested arrays, node lists and
     * strings are accepted as well.
     *
     * @param array $nodes An array of \DOMNode instances
     *
     * @api
     */
    public function addNodes(array $nodes)
    {
        foreach ($nodes as $node) {
            $this->add($node);
        }
    }

    /**
     * Adds a \DOMNode instance to the list of nodes.
     *
     * A \DOMDocument is replaced by its document element; an empty document
     * (no document element) adds nothing.
     *
     * @param \DOMNode $node A \DOMNode instance
     *
     * @api
     */
    public function addNode(\DOMNode $node)
    {
        if ($node instanceof \DOMDocument) {
            $node = $node->documentElement;
        }

        // an empty document has no element to attach
        if (null !== $node) {
            $this->attach($node);
        }
    }

    /**
     * Returns a node given its position in the node list.
     *
     * @param integer $position The position
     *
     * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
     *
     * @api
     */
    public function eq($position)
    {
        foreach ($this as $i => $node) {
            if ($i == $position) {
                return new static($node, $this->uri);
            }
        }

        return new static(null, $this->uri);
    }

    /**
     * Calls an anonymous function on each node of the list.
     *
     * The anonymous function receives the node and the position as arguments.
     *
     * Example:
     *
     *     $crawler->filter('h1')->each(function ($node, $i)
     *     {
     *         return $node->nodeValue;
     *     });
     *
     * @param \Closure $closure An anonymous function
     *
     * @return array An array of values returned by the anonymous function
     *
     * @api
     */
    public function each(\Closure $closure)
    {
        $data = array();
        foreach ($this as $i => $node) {
            $data[] = $closure($node, $i);
        }

        return $data;
    }

    /**
     * Reduces the list of nodes by calling an anonymous function.
     *
     * To remove a node from the list, the anonymous function must return false.
     * Any other return value (including null) keeps the node.
     *
     * @param \Closure $closure An anonymous function
     *
     * @return Crawler A Crawler instance with the selected nodes.
     *
     * @api
     */
    public function reduce(\Closure $closure)
    {
        $nodes = array();
        foreach ($this as $i => $node) {
            if (false !== $closure($node, $i)) {
                $nodes[] = $node;
            }
        }

        return new static($nodes, $this->uri);
    }

    /**
     * Returns the first node of the current selection
     *
     * @return Crawler A Crawler instance with the first selected node
     *
     * @api
     */
    public function first()
    {
        return $this->eq(0);
    }

    /**
     * Returns the last node of the current selection
     *
     * @return Crawler A Crawler instance with the last selected node
     *
     * @api
     */
    public function last()
    {
        return $this->eq(count($this) - 1);
    }

    /**
     * Returns the siblings nodes of the current selection
     *
     * Only element nodes are returned, and the first node of the current
     * selection is not part of the result.
     *
     * @return Crawler A Crawler instance with the sibling nodes
     *
     * @throws \InvalidArgumentException When current node is empty
     *
     * @api
     */
    public function siblings()
    {
        if (!count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        // walk the parent's children from the first one onwards
        return new static($this->sibling($this->getNode(0)->parentNode->firstChild), $this->uri);
    }

    /**
     * Returns the next siblings nodes of the current selection
     *
     * @return Crawler A Crawler instance with the next sibling nodes
     *
     * @throws \InvalidArgumentException When current node is empty
     *
     * @api
     */
    public function nextAll()
    {
        if (!count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        return new static($this->sibling($this->getNode(0)), $this->uri);
    }

    /**
     * Returns the previous sibling nodes of the current selection
     *
     * The nodes are listed from the closest sibling to the farthest one.
     *
     * @return Crawler A Crawler instance with the previous sibling nodes
     *
     * @throws \InvalidArgumentException When current node is empty
     *
     * @api
     */
    public function previousAll()
    {
        if (!count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        return new static($this->sibling($this->getNode(0), 'previousSibling'), $this->uri);
    }

    /**
     * Returns the parents nodes of the current selection
     *
     * The ancestors are listed from the closest one to the farthest one.
     *
     * @return Crawler A Crawler instance with the parents nodes of the current selection
     *
     * @throws \InvalidArgumentException When current node is empty
     *
     * @api
     */
    public function parents()
    {
        if (!count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $node = $this->getNode(0);
        $nodes = array();

        while ($node = $node->parentNode) {
            // keep elements only, and hide the synthetic "_root" wrapper
            // that filterXPath() creates around the nodes it queries
            if (XML_ELEMENT_NODE === $node->nodeType && '_root' !== $node->nodeName) {
                $nodes[] = $node;
            }
        }

        return new static($nodes, $this->uri);
    }

    /**
     * Returns the children nodes of the current selection
     *
     * Only element nodes are returned; a node without children yields an
     * empty Crawler.
     *
     * @return Crawler A Crawler instance with the children nodes
     *
     * @throws \InvalidArgumentException When current node is empty
     *
     * @api
     */
    public function children()
    {
        if (!count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        return new static($this->sibling($this->getNode(0)->firstChild), $this->uri);
    }

    /**
     * Returns the attribute value of the first node of the list.
     *
     * @param string $attribute The attribute name
     *
     * @return string The attribute value
     *
     * @throws \InvalidArgumentException When current node is empty
     *
     * @api
     */
    public function attr($attribute)
    {
        if (!count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        return $this->getNode(0)->getAttribute($attribute);
    }

    /**
     * Returns the node value of the first node of the list.
     *
     * @return string The node value
     *
     * @throws \InvalidArgumentException When current node is empty
     *
     * @api
     */
    public function text()
    {
        if (!count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        return $this->getNode(0)->nodeValue;
    }

    /**
     * Extracts information from the list of nodes.
     *
     * You can extract attributes or/and the node value (_text).
     *
     * Example:
     *
     *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
     *
     * When exactly one attribute is requested, each entry of the result is the
     * extracted value itself; otherwise each entry is an array of values in
     * the order of the requested attributes (an empty array when no attribute
     * is requested).
     *
     * @param array $attributes An array of attributes
     *
     * @return array An array of extracted values
     *
     * @api
     */
    public function extract($attributes)
    {
        $attributes = (array) $attributes;
        $single = 1 === count($attributes);

        $data = array();
        foreach ($this as $node) {
            $elements = array();
            foreach ($attributes as $attribute) {
                if ('_text' === $attribute) {
                    $elements[] = $node->nodeValue;
                } else {
                    $elements[] = $node->getAttribute($attribute);
                }
            }

            // $elements[0] only exists when an attribute was requested
            $data[] = $single ? $elements[0] : $elements;
        }

        return $data;
    }

    /**
     * Filters the list of nodes with an XPath expression.
     *
     * The nodes are deep-copied into a fresh document, under a synthetic
     * "_root" element, and the expression is evaluated against that document.
     * The returned nodes therefore belong to the copy, not to the original
     * documents.
     *
     * @param string $xpath An XPath expression
     *
     * @return Crawler A new instance of Crawler with the filtered list of nodes
     *
     * @api
     */
    public function filterXPath($xpath)
    {
        $document = new \DOMDocument('1.0', 'UTF-8');
        $root = $document->appendChild($document->createElement('_root'));
        foreach ($this as $node) {
            $root->appendChild($document->importNode($node, true));
        }

        $domxpath = new \DOMXPath($document);

        return new static($domxpath->query($xpath), $this->uri);
    }

    /**
     * Filters the list of nodes with a CSS selector.
     *
     * This method only works if you have installed the CssSelector Symfony Component.
     *
     * @param string $selector A CSS selector
     *
     * @return Crawler A new instance of Crawler with the filtered list of nodes
     *
     * @throws \RuntimeException if the CssSelector Component is not available
     *
     * @api
     */
    public function filter($selector)
    {
        if (!class_exists('Symfony\\Component\\CssSelector\\CssSelector')) {
            // @codeCoverageIgnoreStart
            throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
            // @codeCoverageIgnoreEnd
        }

        return $this->filterXPath(CssSelector::toXPath($selector));
    }

    /**
     * Selects links by name or alt value for clickable images.
     *
     * A link matches when its whitespace-normalized text, or the alt text of
     * an image it contains, includes the given value as a whole,
     * space-delimited token sequence.
     *
     * @param string $value The link text
     *
     * @return Crawler A new instance of Crawler with the filtered list of nodes
     *
     * @api
     */
    public function selectLink($value)
    {
        // surrounding spaces make the match word-delimited
        $padded = static::xpathLiteral(' '.$value.' ');

        $xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', $padded).
                 sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', $padded);

        return $this->filterXPath($xpath);
    }

    /**
     * Selects a button by name or alt value for images.
     *
     * Inputs of type submit/button are matched on their value, inputs of type
     * image on their alt text, and button elements on their text. Any input
     * or button whose id or name equals the given value matches as well.
     *
     * @param string $value The button text
     *
     * @return Crawler A new instance of Crawler with the filtered list of nodes
     *
     * @api
     */
    public function selectButton($value)
    {
        // surrounding spaces make the text match word-delimited
        $padded = static::xpathLiteral(' '.$value.' ');
        // id/name are compared as a whole; the value is escaped so that quotes
        // in it can neither break nor alter the expression
        $exact = static::xpathLiteral($value);

        $xpath = sprintf('//input[((@type="submit" or @type="button") and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $padded).
                 sprintf('or (@type="image" and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $padded, $exact, $exact).
                 sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $padded, $exact, $exact);

        return $this->filterXPath($xpath);
    }

    /**
     * Returns a Link object for the first node in the list.
     *
     * @param string $method The method for the link (get by default)
     *
     * @return Link A Link instance
     *
     * @throws \InvalidArgumentException If the current node list is empty
     *
     * @api
     */
    public function link($method = 'get')
    {
        if (!count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $node = $this->getNode(0);

        return new Link($node, $this->uri, $method);
    }

    /**
     * Returns an array of Link objects for the nodes in the list.
     *
     * @return array An array of Link instances
     *
     * @api
     */
    public function links()
    {
        $links = array();
        foreach ($this as $node) {
            $links[] = new Link($node, $this->uri, 'get');
        }

        return $links;
    }

    /**
     * Returns a Form object for the first node in the list.
     *
     * @param array  $values An array of values for the form fields
     * @param string $method The method for the form
     *
     * @return Form A Form instance
     *
     * @throws \InvalidArgumentException If the current node list is empty
     *
     * @api
     */
    public function form(array $values = null, $method = null)
    {
        if (!count($this)) {
            throw new \InvalidArgumentException('The current node list is empty.');
        }

        $form = new Form($this->getNode(0), $this->uri, $method);

        if (null !== $values) {
            $form->setValues($values);
        }

        return $form;
    }

    /**
     * Converts a string to an XPath literal.
     *
     * XPath 1.0 string literals have no escape character. A string without
     * single quotes is wrapped in single quotes, a string without double
     * quotes is wrapped in double quotes, and a string containing both is
     * rebuilt with concat() from single-quoted chunks joined by "'" pieces.
     *
     * Examples:
     *
     *     foo       => 'foo'
     *     a'b       => "a'b"
     *     a'b"c     => concat('a', "'", 'b"c')
     *
     * @param string $s The string to convert
     *
     * @return string An XPath expression that evaluates to the given string
     */
    public static function xpathLiteral($s)
    {
        if (false === strpos($s, "'")) {
            return sprintf("'%s'", $s);
        }

        if (false === strpos($s, '"')) {
            return sprintf('"%s"', $s);
        }

        $string = $s;
        $parts = array();
        while (true) {
            if (false !== $pos = strpos($string, "'")) {
                // chunk before the quote, then the quote itself
                $parts[] = sprintf("'%s'", substr($string, 0, $pos));
                $parts[] = "\"'\"";
                $string = substr($string, $pos + 1);
            } else {
                $parts[] = sprintf("'%s'", $string);
                break;
            }
        }

        // separator first: the legacy (array, separator) order is rejected by PHP 8
        return sprintf("concat(%s)", implode(', ', $parts));
    }

    /**
     * Returns the node stored at the given position.
     *
     * @param integer $position The position
     *
     * @return \DOMNode|null The node, or null when the position does not exist
     */
    private function getNode($position)
    {
        foreach ($this as $i => $node) {
            if ($i == $position) {
                return $node;
            }
        // @codeCoverageIgnoreStart
        }

        return null;
        // @codeCoverageIgnoreEnd
    }

    /**
     * Collects the element nodes reachable from a node in one direction.
     *
     * The walk starts at the given node (included) and follows the given
     * property until it runs out of nodes. Non-element nodes and the first
     * node of the current selection are skipped.
     *
     * @param \DOMNode|null $node       The node to start from (null yields an empty array)
     * @param string        $siblingDir The property to follow: nextSibling or previousSibling
     *
     * @return array An array of \DOMNode instances
     */
    private function sibling($node, $siblingDir = 'nextSibling')
    {
        $nodes = array();
        // invariant for the whole walk, so it is looked up once
        $current = $this->getNode(0);

        // testing the node first lets callers pass a missing firstChild
        while ($node) {
            if ($node !== $current && XML_ELEMENT_NODE === $node->nodeType) {
                $nodes[] = $node;
            }

            $node = $node->$siblingDir;
        }

        return $nodes;
    }
}