|
1 <?php |
|
2 /** |
|
3 * REST API: WP_REST_URL_Details_Controller class |
|
4 * |
|
5 * @package WordPress |
|
6 * @subpackage REST_API |
|
7 * @since 5.9.0 |
|
8 */ |
|
9 |
|
10 /** |
|
11 * Controller which provides REST endpoint for retrieving information |
|
12 * from a remote site's HTML response. |
|
13 * |
|
14 * @since 5.9.0 |
|
15 * |
|
16 * @see WP_REST_Controller |
|
17 */ |
|
18 class WP_REST_URL_Details_Controller extends WP_REST_Controller { |
|
19 |
|
/**
 * Sets the route namespace and route base used by this controller.
 *
 * @since 5.9.0
 */
public function __construct() {
	// Together these form the route path: `/{namespace}/{rest_base}`.
	$this->rest_base = 'url-details';
	$this->namespace = 'wp-block-editor/v1';
}
|
29 |
|
/**
 * Registers the REST API route served by this controller.
 *
 * A single readable endpoint is registered at `/{namespace}/{rest_base}`.
 * It requires a `url` argument, is guarded by `permissions_check()` and is
 * handled by `parse_url_details()`.
 *
 * @since 5.9.0
 */
public function register_routes() {
	// The only argument the endpoint accepts: the remote URL to inspect.
	$url_argument = array(
		'required'          => true,
		'description'       => __( 'The URL to process.' ),
		'validate_callback' => 'wp_http_validate_url',
		'sanitize_callback' => 'esc_url_raw',
		'type'              => 'string',
		'format'            => 'uri',
	);

	$read_endpoint = array(
		'methods'             => WP_REST_Server::READABLE,
		'callback'            => array( $this, 'parse_url_details' ),
		'args'                => array(
			'url' => $url_argument,
		),
		'permission_callback' => array( $this, 'permissions_check' ),
		'schema'              => array( $this, 'get_public_item_schema' ),
	);

	register_rest_route(
		$this->namespace,
		'/' . $this->rest_base,
		array( $read_endpoint )
	);
}
|
59 |
|
/**
 * Returns the JSON Schema describing the URL details response.
 *
 * The schema is built on the first call and stored on the controller;
 * later calls reuse the stored copy. Additional registered fields are
 * merged in on every call.
 *
 * @since 5.9.0
 *
 * @return array Item schema data.
 */
public function get_item_schema() {
	if ( ! $this->schema ) {
		// Every property is exposed in the same contexts.
		$contexts = array( 'view', 'edit', 'embed' );

		$properties = array();

		$properties['title'] = array(
			'description' => sprintf(
				/* translators: %s: HTML title tag. */
				__( 'The contents of the %s element from the URL.' ),
				'<title>'
			),
			'type'        => 'string',
			'context'     => $contexts,
			'readonly'    => true,
		);

		$properties['icon'] = array(
			'description' => sprintf(
				/* translators: %s: HTML link tag. */
				__( 'The favicon image link of the %s element from the URL.' ),
				'<link rel="icon">'
			),
			'type'        => 'string',
			'format'      => 'uri',
			'context'     => $contexts,
			'readonly'    => true,
		);

		$properties['description'] = array(
			'description' => sprintf(
				/* translators: %s: HTML meta tag. */
				__( 'The content of the %s element from the URL.' ),
				'<meta name="description">'
			),
			'type'        => 'string',
			'context'     => $contexts,
			'readonly'    => true,
		);

		$properties['image'] = array(
			'description' => sprintf(
				/* translators: 1: HTML meta tag, 2: HTML meta tag. */
				__( 'The Open Graph image link of the %1$s or %2$s element from the URL.' ),
				'<meta property="og:image">',
				'<meta property="og:image:url">'
			),
			'type'        => 'string',
			'format'      => 'uri',
			'context'     => $contexts,
			'readonly'    => true,
		);

		$this->schema = array(
			'$schema'    => 'http://json-schema.org/draft-04/schema#',
			'title'      => 'url-details',
			'type'       => 'object',
			'properties' => $properties,
		);
	}

	return $this->add_additional_fields_schema( $this->schema );
}
|
125 |
|
/**
 * Fetches a remote URL and returns the details parsed from its HTML.
 *
 * The response body is looked up in the cache first and only requested from
 * the remote site on a cache miss. The title, icon, description and Open
 * Graph image are then extracted from the document head.
 *
 * @since 5.9.0
 *
 * @param WP_REST_Request $request Full details about the request.
 * @return WP_REST_Response|WP_Error The parsed details as a response object. WP_Error if there are errors.
 */
public function parse_url_details( $request ) {
	$url = untrailingslashit( $request['url'] );

	if ( empty( $url ) ) {
		return new WP_Error( 'rest_invalid_url', __( 'Invalid URL' ), array( 'status' => 404 ) );
	}

	// One cache entry per URL.
	$cache_key           = $this->build_cache_key_for_url( $url );
	$remote_url_response = $this->get_cache( $cache_key );

	// Cache miss: request the document from the remote site.
	if ( empty( $remote_url_response ) ) {
		$remote_url_response = $this->get_remote_url( $url );

		// Hand errors (or an empty result) straight back without caching them.
		if ( is_wp_error( $remote_url_response ) || empty( $remote_url_response ) ) {
			return $remote_url_response;
		}

		$this->set_cache( $cache_key, $remote_url_response );
	}

	// All parsing is limited to the head section of the document.
	$html_head     = $this->get_document_head( $remote_url_response );
	$meta_elements = $this->get_meta_with_content_elements( $html_head );

	$details = array(
		'title'       => $this->get_title( $html_head ),
		'icon'        => $this->get_icon( $html_head, $url ),
		'description' => $this->get_description( $meta_elements ),
		'image'       => $this->get_image( $meta_elements, $url ),
	);

	$response = rest_ensure_response( $this->add_additional_fields_to_object( $details, $request ) );

	/**
	 * Filters the URL data for the response.
	 *
	 * @since 5.9.0
	 *
	 * @param WP_REST_Response $response            The response object.
	 * @param string           $url                 The requested URL.
	 * @param WP_REST_Request  $request             Request object.
	 * @param string           $remote_url_response HTTP response body from the remote URL.
	 */
	return apply_filters( 'rest_prepare_url_details', $response, $url, $request, $remote_url_response );
}
|
189 |
|
/**
 * Checks whether the current user may have remote URLs processed.
 *
 * Access is granted to users who can `edit_posts`, or who hold the
 * edit-posts capability of any post type that is shown in REST.
 *
 * @since 5.9.0
 *
 * @return WP_Error|bool True if the request has permission, else WP_Error.
 */
public function permissions_check() {
	$is_allowed = current_user_can( 'edit_posts' );

	if ( ! $is_allowed ) {
		$rest_post_types = get_post_types( array( 'show_in_rest' => true ), 'objects' );

		// Fall back to the per-post-type capability; one match is enough.
		foreach ( $rest_post_types as $post_type ) {
			if ( current_user_can( $post_type->cap->edit_posts ) ) {
				$is_allowed = true;
				break;
			}
		}
	}

	if ( $is_allowed ) {
		return true;
	}

	return new WP_Error(
		'rest_cannot_view_url_details',
		__( 'Sorry, you are not allowed to process remote URLs.' ),
		array( 'status' => rest_authorization_required_code() )
	);
}
|
214 |
|
/**
 * Requests a remote URL and returns the body of its HTTP response.
 *
 * @since 5.9.0
 *
 * @param string $url The website URL whose HTML to access.
 * @return string|WP_Error The HTTP response body from the remote URL on success.
 *                         WP_Error if the status code is not 200 or the body is empty.
 */
private function get_remote_url( $url ) {
	/*
	 * Send a custom user agent instead of the WP HTTP API default.
	 *
	 * The default UA contains `WordPress/`, which closely resembles the UA used for
	 * pingback requests. Sites that block pingbacks therefore also block requests from
	 * this endpoint by mistake. Identifying as "WP" keeps the WordPress attribution
	 * while avoiding that misidentification.
	 *
	 * Example UA string: `WP-URLDetails/5.9-alpha-51389 (+http://localhost:8888)`.
	 */
	$site_version = get_bloginfo( 'version' );
	$site_url     = get_bloginfo( 'url' );
	$user_agent   = 'WP-URLDetails/' . $site_version . ' (+' . $site_url . ')';

	/**
	 * Filters the HTTP request args for URL data retrieval.
	 *
	 * Can be used to adjust response size limit and other WP_Http::request() args.
	 *
	 * @since 5.9.0
	 *
	 * @param array  $args Arguments used for the HTTP request.
	 * @param string $url  The attempted URL.
	 */
	$args = apply_filters(
		'rest_url_details_http_request_args',
		array(
			'limit_response_size' => 150 * KB_IN_BYTES,
			'user-agent'          => $user_agent,
		),
		$url
	);

	$response    = wp_safe_remote_get( $url, $args );
	$status_code = wp_remote_retrieve_response_code( $response );

	// Anything other than a 200 is reported to the client as "not found".
	if ( WP_Http::OK !== $status_code ) {
		return new WP_Error(
			'no_response',
			__( 'URL not found. Response returned a non-200 status code for this URL.' ),
			array( 'status' => WP_Http::NOT_FOUND )
		);
	}

	$remote_body = wp_remote_retrieve_body( $response );

	if ( ! empty( $remote_body ) ) {
		return $remote_body;
	}

	return new WP_Error(
		'no_content',
		__( 'Unable to retrieve body from response at this URL.' ),
		array( 'status' => WP_Http::NOT_FOUND )
	);
}
|
277 |
|
/**
 * Parses the title tag contents from the provided HTML.
 *
 * The first `<title>` element is used. Its contents are trimmed and then
 * passed through `prepare_metadata_for_output()`.
 *
 * @since 5.9.0
 *
 * @param string $html The HTML from the remote website at URL.
 * @return string The title tag contents on success. Empty string if not found.
 */
private function get_title( $html ) {
	$pattern = '#<title[^>]*>(.*?)<\s*/\s*title>#is';

	// No title element found, or the pattern could not be applied.
	if ( 1 !== preg_match( $pattern, $html, $match_title ) ) {
		return '';
	}

	$title = trim( $match_title[1] );

	/*
	 * Compare against the empty string instead of using empty():
	 * empty() also rejects the string "0", which is a legitimate title.
	 */
	if ( '' === $title ) {
		return '';
	}

	return $this->prepare_metadata_for_output( $title );
}
|
298 |
|
/**
 * Parses the site icon from the provided HTML.
 *
 * Finds the first `<link>` element whose `rel` is `icon`, `shortcut icon` or
 * `icon shortcut` and reads its `href`. Data URLs are returned untouched;
 * other values are made absolute against the scheme and host of `$url`.
 *
 * @since 5.9.0
 *
 * @param string $html The HTML from the remote website at URL.
 * @param string $url  The target website URL.
 * @return string The icon URI on success. Empty string if not found.
 */
private function get_icon( $html, $url ) {
	// Step 1: locate the icon's link element.
	$link_pattern = '#<link\s[^>]*rel=(?:[\"\']??)\s*(?:icon|shortcut icon|icon shortcut)\s*(?:[\"\']??)[^>]*\/?>#isU';
	preg_match( $link_pattern, $html, $link_match );

	if ( empty( $link_match[0] ) || ! is_string( $link_match[0] ) ) {
		return '';
	}

	$link_element = trim( $link_match[0] );

	// Step 2: read the href attribute of that element.
	$href_pattern = '#href=([\"\']??)([^\" >]*?)\\1[^>]*#isU';
	preg_match( $href_pattern, $link_element, $href_match );

	if ( empty( $href_match[2] ) || ! is_string( $href_match[2] ) ) {
		return '';
	}

	$icon = trim( $href_match[2] );

	// Step 3: data URLs are complete as they are, and without a target URL nothing can be resolved.
	$icon_parts  = parse_url( $icon );
	$is_data_url = isset( $icon_parts['scheme'] ) && 'data' === $icon_parts['scheme'];

	if ( $is_data_url || ! is_string( $url ) || '' === $url ) {
		return $icon;
	}

	// Step 4: resolve the icon against the root of the target site.
	$url_parts = parse_url( $url );

	if ( ! isset( $url_parts['scheme'], $url_parts['host'] ) ) {
		return $icon;
	}

	$root_url = $url_parts['scheme'] . '://' . $url_parts['host'] . '/';

	return WP_Http::make_absolute_url( $icon, $root_url );
}
|
343 |
|
/**
 * Parses the meta description from the provided meta elements.
 *
 * Looks for a meta element whose `name` is `description` or `og:description`
 * and returns its content, prepared for output.
 *
 * @since 5.9.0
 *
 * @param array $meta_elements {
 *     A multi-dimensional indexed array on success, else empty array.
 *
 *     @type string[] $0 Meta elements with a content attribute.
 *     @type string[] $1 Content attribute's opening quotation mark.
 *     @type string[] $2 Content attribute's value for each meta element.
 * }
 * @return string The meta description contents on success. Empty string if not found.
 */
private function get_description( $meta_elements ) {
	// Nothing to search when no meta elements were found.
	if ( empty( $meta_elements[0] ) ) {
		return '';
	}

	$raw_description = $this->get_metadata_from_meta_element(
		$meta_elements,
		'name',
		'(?:description|og:description)'
	);

	// Only a description that was actually found is prepared for output.
	return '' === $raw_description
		? ''
		: $this->prepare_metadata_for_output( $raw_description );
}
|
377 |
|
/**
 * Parses the Open Graph (OG) image from the provided meta elements.
 *
 * Looks for a meta element whose `property` is `og:image` or `og:image:url`.
 * When `$url` has a scheme and host, the image is made absolute against the
 * root of that site.
 *
 * See: https://ogp.me/.
 *
 * @since 5.9.0
 *
 * @param array  $meta_elements {
 *     A multi-dimensional indexed array on success, else empty array.
 *
 *     @type string[] $0 Meta elements with a content attribute.
 *     @type string[] $1 Content attribute's opening quotation mark.
 *     @type string[] $2 Content attribute's value for each meta element.
 * }
 * @param string $url           The target website URL.
 * @return string The OG image on success. Empty string if not found.
 */
private function get_image( $meta_elements, $url ) {
	$image = $this->get_metadata_from_meta_element(
		$meta_elements,
		'property',
		'(?:og:image|og:image:url)'
	);

	if ( '' !== $image ) {
		// Resolve a possibly relative image URL against the root of the target site.
		$url_parts = parse_url( $url );

		if ( isset( $url_parts['scheme'], $url_parts['host'] ) ) {
			$site_root = $url_parts['scheme'] . '://' . $url_parts['host'] . '/';
			$image     = WP_Http::make_absolute_url( $image, $site_root );
		}
	}

	return $image;
}
|
416 |
|
/**
 * Prepares metadata for output.
 *
 * HTML entities (including both quote styles) are decoded using the site
 * charset, and all HTML tags are then stripped. Because decoding happens
 * first, entity-encoded tags are removed as well.
 *
 * @since 5.9.0
 *
 * @param string $metadata The metadata content to prepare.
 * @return string The prepared metadata.
 */
private function prepare_metadata_for_output( $metadata ) {
	$charset = get_bloginfo( 'charset' );
	$decoded = html_entity_decode( $metadata, ENT_QUOTES, $charset );

	return wp_strip_all_tags( $decoded );
}
|
432 |
|
/**
 * Builds the cache key under which the response for a URL is stored.
 *
 * The key is a fixed prefix followed by the MD5 hash of the URL.
 *
 * @since 5.9.0
 *
 * @param string $url The URL for which to build a cache key.
 * @return string The cache key.
 */
private function build_cache_key_for_url( $url ) {
	$url_hash = md5( $url );

	return 'g_url_details_response_' . $url_hash;
}
|
444 |
|
/**
 * Reads the value stored in the site transient cache under the given key.
 *
 * @since 5.9.0
 *
 * @param string $key The cache key.
 * @return mixed The value from the cache.
 */
private function get_cache( $key ) {
	$cached_value = get_site_transient( $key );

	return $cached_value;
}
|
456 |
|
/**
 * Stores data in the site transient cache under the given key.
 *
 * The lifetime defaults to one hour and can be changed through the
 * `rest_url_details_cache_expiration` filter.
 *
 * @since 5.9.0
 *
 * @param string $key  The cache key under which to store the value.
 * @param string $data The data to be stored at the given cache key.
 * @return bool True when transient set. False if not set.
 */
private function set_cache( $key, $data = '' ) {
	/**
	 * Filters the cache expiration.
	 *
	 * Can be used to adjust the time until expiration in seconds for the cache
	 * of the data retrieved for the given URL.
	 *
	 * @since 5.9.0
	 *
	 * @param int $ttl The time until cache expiration in seconds.
	 */
	$expires_in = apply_filters( 'rest_url_details_cache_expiration', HOUR_IN_SECONDS );

	return set_site_transient( $key, $data, $expires_in );
}
|
483 |
|
/**
 * Retrieves the head element section.
 *
 * The section runs from the first `<head` up to the next `</head>`. When the
 * closing tag is missing, the next `<body` marks the end instead. Tag names
 * are matched case-insensitively, as HTML allows either case.
 *
 * @since 5.9.0
 *
 * @param string $html The string of HTML to parse.
 * @return string The `<head>..</head>` section on success. Given `$html` if not found.
 */
private function get_document_head( $html ) {
	// Find the opening `<head>` tag.
	$head_start = stripos( $html, '<head' );
	if ( false === $head_start ) {
		// Didn't find it. Return the original HTML.
		return $html;
	}

	/*
	 * Find where the head ends. Searching starts at the opening tag so that
	 * a marker appearing earlier in the document can never be picked up.
	 */
	$head_end = stripos( $html, '</head>', $head_start );
	if ( false === $head_end ) {
		// No closing tag. Fall back to the opening `<body>` tag.
		$head_end = stripos( $html, '<body', $head_start );

		// Didn't find it either. Return the original HTML.
		if ( false === $head_end ) {
			return $html;
		}
	}

	/*
	 * substr() takes a length, not an end offset, so the length is the
	 * distance between the two positions. The closing tag is then re-added.
	 */
	return substr( $html, $head_start, $head_end - $head_start ) . '</head>';
}
|
520 |
|
/**
 * Collects every meta element that carries a `content` attribute.
 *
 * @since 5.9.0
 *
 * @param string $html The string of HTML to be parsed.
 * @return array {
 *     A multi-dimensional indexed array on success, else empty array.
 *
 *     @type string[] $0 Meta elements with a content attribute.
 *     @type string[] $1 Content attribute's opening quotation mark.
 *     @type string[] $2 Content attribute's value for each meta element.
 * }
 */
private function get_meta_with_content_elements( $html ) {
	/*
	 * Every meta element with a content attribute is matched first; the caller
	 * picks out the element it wants (e.g. name=description) afterwards.
	 *
	 * Why not match name=description directly?
	 * A content value may legitimately contain a > symbol, or even HTML. A pattern
	 * that starts from the name attribute would treat that > as the end of the
	 * element, because a regex cannot tell that it sits inside quotation marks.
	 * The element and its content would be cut short.
	 *
	 * Why not match name=description followed by content="(.*)"?
	 * The attributes may appear in the opposite order, and other attributes may
	 * sit between them.
	 *
	 * Why not use a lookahead?
	 * A lookahead is not confined to the current element. It could be satisfied
	 * by a name or content attribute that belongs to an element further down.
	 */
	$pattern_parts = array(
		// Opening delimiter and the start of a meta element.
		'#<meta\s',

		// Any attributes ahead of the content attribute: anything except a > symbol.
		'[^>]*',

		/*
		 * The content attribute. Its opening quotation mark (if any) is captured and
		 * required again, via \1, as the closing mark. That way a different quotation
		 * mark inside the value, such as an apostrophe, does not end the value early.
		 * The value itself is captured by (.*) and may contain whitespace.
		 */
		'content=(["\']??)(.*)\1',

		// Any attributes after the content attribute: anything except a > symbol.
		'[^>]*',

		// The end of the element, written as either /> or >, and the closing delimiter.
		'\/?>#',

		/*
		 * Modifiers:
		 * - i : case insensitive.
		 * - s : the . also matches newlines (needed for multiline elements).
		 * - U : non-greedy matching.
		 */
		'isU',
	);

	preg_match_all( implode( '', $pattern_parts ), $html, $elements );

	return $elements;
}
|
601 |
|
/**
 * Gets the metadata from a target meta element.
 *
 * The first meta element whose `$attr` attribute matches `$attr_value`
 * is selected, and the trimmed value of its content attribute is returned.
 *
 * @since 5.9.0
 *
 * @param array  $meta_elements {
 *     A multi-dimensional indexed array on success, else empty array.
 *
 *     @type string[] $0 Meta elements with a content attribute.
 *     @type string[] $1 Content attribute's opening quotation mark.
 *     @type string[] $2 Content attribute's value for each meta element.
 * }
 * @param string $attr       Attribute that identifies the element with the target metadata.
 * @param string $attr_value The attribute's value that identifies the element with the target metadata.
 * @return string The metadata on success. Empty string if not found.
 */
private function get_metadata_from_meta_element( $meta_elements, $attr, $attr_value ) {
	// Nothing to search when no meta elements were found.
	if ( empty( $meta_elements[0] ) ) {
		return '';
	}

	/*
	 * Matches the identifying attribute and its value.
	 *
	 * The value may be unquoted, single quoted or double quoted, and may be padded
	 * with whitespace. The opening quotation mark is captured and required again,
	 * via \1, as the closing mark so that the two always agree.
	 *
	 * Modifiers:
	 * - i : case insensitive.
	 * - s : the . also matches newlines (needed for multiline elements).
	 * - U : non-greedy matching.
	 */
	$pattern = '#' . $attr . '=([\"\']??)\s*' . $attr_value . '\s*\1' . '#isU';

	foreach ( $meta_elements[0] as $position => $markup ) {
		preg_match( $pattern, $markup, $hit );

		// Not the element being looked for. Try the next one.
		if ( empty( $hit ) ) {
			continue;
		}

		// Only the first matching element is considered; its content sits at the same index.
		$has_content = isset( $meta_elements[2][ $position ] ) && is_string( $meta_elements[2][ $position ] );

		return $has_content ? trim( $meta_elements[2][ $position ] ) : '';
	}

	return '';
}
|
668 } |