author | ymh <ymh.work@gmail.com> |
Fri, 05 Sep 2025 18:52:52 +0200 | |
changeset 22 | 8c2e4d02f4ef |
parent 21 | 48c4eec2b7e6 |
permissions | -rw-r--r-- |
19 | 1 |
<?php |
2 |
/** |
|
3 |
* REST API: WP_REST_URL_Details_Controller class |
|
4 |
* |
|
5 |
* @package WordPress |
|
6 |
* @subpackage REST_API |
|
7 |
* @since 5.9.0 |
|
8 |
*/ |
|
9 |
||
10 |
/** |
|
11 |
* Controller which provides REST endpoint for retrieving information |
|
12 |
* from a remote site's HTML response. |
|
13 |
* |
|
14 |
* @since 5.9.0 |
|
15 |
* |
|
16 |
* @see WP_REST_Controller |
|
17 |
*/ |
|
18 |
class WP_REST_URL_Details_Controller extends WP_REST_Controller { |
|
19 |
||
20 |
/**
 * Sets up the REST namespace and base path served by this controller.
 *
 * @since 5.9.0
 */
public function __construct() {
	$this->rest_base = 'url-details';
	$this->namespace = 'wp-block-editor/v1';
}
|
29 |
||
30 |
/**
 * Registers the single readable route exposed by this controller.
 *
 * The route accepts one required `url` argument, which is validated with
 * `wp_http_validate_url` and sanitized with `sanitize_url` before the
 * callback runs.
 *
 * @since 5.9.0
 */
public function register_routes() {
	// Definition of the `url` query argument.
	$url_argument = array(
		'required'          => true,
		'description'       => __( 'The URL to process.' ),
		'validate_callback' => 'wp_http_validate_url',
		'sanitize_callback' => 'sanitize_url',
		'type'              => 'string',
		'format'            => 'uri',
	);

	// Handler for read requests.
	$read_endpoint = array(
		'methods'             => WP_REST_Server::READABLE,
		'callback'            => array( $this, 'parse_url_details' ),
		'args'                => array(
			'url' => $url_argument,
		),
		'permission_callback' => array( $this, 'permissions_check' ),
		'schema'              => array( $this, 'get_public_item_schema' ),
	);

	register_rest_route(
		$this->namespace,
		'/' . $this->rest_base,
		array( $read_endpoint )
	);
}
|
59 |
||
60 |
/**
 * Retrieves the item's schema, conforming to JSON Schema.
 *
 * The schema is built on first use and stored on `$this->schema`; later
 * calls reuse the stored copy. Additional fields are merged in on every call.
 *
 * @since 5.9.0
 *
 * @return array Item schema data.
 */
public function get_item_schema() {
	if ( ! $this->schema ) {
		// Every property is read-only and exposed in the same contexts.
		$contexts = array( 'view', 'edit', 'embed' );

		$title_property = array(
			'description' => sprintf(
				/* translators: %s: HTML title tag. */
				__( 'The contents of the %s element from the URL.' ),
				'<title>'
			),
			'type'        => 'string',
			'context'     => $contexts,
			'readonly'    => true,
		);

		$icon_property = array(
			'description' => sprintf(
				/* translators: %s: HTML link tag. */
				__( 'The favicon image link of the %s element from the URL.' ),
				'<link rel="icon">'
			),
			'type'        => 'string',
			'format'      => 'uri',
			'context'     => $contexts,
			'readonly'    => true,
		);

		$description_property = array(
			'description' => sprintf(
				/* translators: %s: HTML meta tag. */
				__( 'The content of the %s element from the URL.' ),
				'<meta name="description">'
			),
			'type'        => 'string',
			'context'     => $contexts,
			'readonly'    => true,
		);

		$image_property = array(
			'description' => sprintf(
				/* translators: 1: HTML meta tag, 2: HTML meta tag. */
				__( 'The Open Graph image link of the %1$s or %2$s element from the URL.' ),
				'<meta property="og:image">',
				'<meta property="og:image:url">'
			),
			'type'        => 'string',
			'format'      => 'uri',
			'context'     => $contexts,
			'readonly'    => true,
		);

		$this->schema = array(
			'$schema'    => 'http://json-schema.org/draft-04/schema#',
			'title'      => 'url-details',
			'type'       => 'object',
			'properties' => array(
				'title'       => $title_property,
				'icon'        => $icon_property,
				'description' => $description_property,
				'image'       => $image_property,
			),
		);
	}

	return $this->add_additional_fields_schema( $this->schema );
}
|
125 |
||
126 |
/**
 * Builds the URL details response (title, icon, description and image)
 * for the requested remote URL.
 *
 * The remote HTML body is read from the per-URL cache when available;
 * otherwise it is fetched and, if the fetch succeeded, cached. Failed
 * fetches are returned to the caller as-is and are not cached.
 *
 * @since 5.9.0
 *
 * @param WP_REST_Request $request Full details about the request.
 * @return WP_REST_Response|WP_Error The parsed details as a response object. WP_Error if there are errors.
 */
public function parse_url_details( $request ) {
	$url = untrailingslashit( $request['url'] );

	if ( empty( $url ) ) {
		return new WP_Error( 'rest_invalid_url', __( 'Invalid URL' ), array( 'status' => 404 ) );
	}

	// One cache entry per URL.
	$cache_key = $this->build_cache_key_for_url( $url );

	// Start from the cached body; fall back to a live request on a miss.
	$remote_url_response = $this->get_cache( $cache_key );

	if ( empty( $remote_url_response ) ) {
		$remote_url_response = $this->get_remote_url( $url );

		// Bail out with whatever came back if it is an error or has no body.
		if ( is_wp_error( $remote_url_response ) || empty( $remote_url_response ) ) {
			return $remote_url_response;
		}

		// Only successful bodies are cached.
		$this->set_cache( $cache_key, $remote_url_response );
	}

	// All parsing is restricted to the document head.
	$html_head     = $this->get_document_head( $remote_url_response );
	$meta_elements = $this->get_meta_with_content_elements( $html_head );

	$details = array(
		'title'       => $this->get_title( $html_head ),
		'icon'        => $this->get_icon( $html_head, $url ),
		'description' => $this->get_description( $meta_elements ),
		'image'       => $this->get_image( $meta_elements, $url ),
	);

	$data = $this->add_additional_fields_to_object( $details, $request );

	// Wrap the data in a response object.
	$response = rest_ensure_response( $data );

	/**
	 * Filters the URL data for the response.
	 *
	 * @since 5.9.0
	 *
	 * @param WP_REST_Response $response            The response object.
	 * @param string           $url                 The requested URL.
	 * @param WP_REST_Request  $request             Request object.
	 * @param string           $remote_url_response HTTP response body from the remote URL.
	 */
	return apply_filters( 'rest_prepare_url_details', $response, $url, $request, $remote_url_response );
}
|
189 |
||
190 |
/**
 * Checks whether the current user may ask the site to read remote URLs.
 *
 * Access is granted to users who can `edit_posts`, or who hold the
 * `edit_posts` capability of any post type that is shown in REST.
 *
 * @since 5.9.0
 *
 * @return true|WP_Error True if the request has permission, else WP_Error.
 */
public function permissions_check() {
	if ( current_user_can( 'edit_posts' ) ) {
		return true;
	}

	$rest_post_types = get_post_types( array( 'show_in_rest' => true ), 'objects' );

	foreach ( $rest_post_types as $post_type_object ) {
		$edit_capability = $post_type_object->cap->edit_posts;

		if ( current_user_can( $edit_capability ) ) {
			return true;
		}
	}

	$error_data = array( 'status' => rest_authorization_required_code() );

	return new WP_Error(
		'rest_cannot_view_url_details',
		__( 'Sorry, you are not allowed to process remote URLs.' ),
		$error_data
	);
}
|
214 |
||
215 |
/**
 * Fetches a remote URL and returns the body of its HTTP response.
 *
 * @since 5.9.0
 *
 * @param string $url The website URL whose HTML to access.
 * @return string|WP_Error The HTTP response body from the remote URL on success.
 *                         WP_Error if the status code is not 200 or the body is empty.
 */
private function get_remote_url( $url ) {

	/*
	 * Provide a modified UA string to workaround web properties which block WordPress "Pingbacks".
	 * Why? The UA string used for pingback requests contains `WordPress/` which is very similar
	 * to that used as the default UA string by the WP HTTP API. Therefore requests from this
	 * REST endpoint are being unintentionally blocked as they are misidentified as pingback requests.
	 * By slightly modifying the UA string, but still retaining the "WordPress" identification (via "WP")
	 * we are able to work around this issue.
	 * Example UA string: `WP-URLDetails/5.9-alpha-51389 (+http://localhost:8888)`.
	 */
	$modified_user_agent = 'WP-URLDetails/' . get_bloginfo( 'version' ) . ' (+' . get_bloginfo( 'url' ) . ')';

	$default_args = array(
		'limit_response_size' => 150 * KB_IN_BYTES,
		'user-agent'          => $modified_user_agent,
	);

	/**
	 * Filters the HTTP request args for URL data retrieval.
	 *
	 * Can be used to adjust response size limit and other WP_Http::request() args.
	 *
	 * @since 5.9.0
	 *
	 * @param array  $args Arguments used for the HTTP request.
	 * @param string $url  The attempted URL.
	 */
	$request_args = apply_filters( 'rest_url_details_http_request_args', $default_args, $url );

	$http_response = wp_safe_remote_get( $url, $request_args );
	$status_code   = wp_remote_retrieve_response_code( $http_response );

	if ( WP_Http::OK !== $status_code ) {
		// Not saving the error response to cache since the error might be temporary.
		return new WP_Error(
			'no_response',
			__( 'URL not found. Response returned a non-200 status code for this URL.' ),
			array( 'status' => WP_Http::NOT_FOUND )
		);
	}

	$body = wp_remote_retrieve_body( $http_response );

	if ( ! empty( $body ) ) {
		return $body;
	}

	return new WP_Error(
		'no_content',
		__( 'Unable to retrieve body from response at this URL.' ),
		array( 'status' => WP_Http::NOT_FOUND )
	);
}
|
277 |
||
278 |
/**
 * Extracts the contents of the first `<title>` element in the given HTML.
 *
 * @since 5.9.0
 *
 * @param string $html The HTML from the remote website at URL.
 * @return string The title tag contents on success. Empty string if not found.
 */
private function get_title( $html ) {
	$title_matches = array();

	// Case-insensitive, dot matches newlines; tolerates whitespace inside the closing tag.
	preg_match( '#<title[^>]*>(.*?)<\s*/\s*title>#is', $html, $title_matches );

	$has_title = ! empty( $title_matches[1] ) && is_string( $title_matches[1] );

	if ( ! $has_title ) {
		return '';
	}

	return $this->prepare_metadata_for_output( trim( $title_matches[1] ) );
}
|
298 |
||
299 |
/**
 * Parses the site icon from the provided HTML.
 *
 * Looks for the first `<link>` element whose `rel` is `icon`,
 * `shortcut icon` or `icon shortcut`, and returns its `href`. Data URLs
 * are returned untouched; other values are made absolute against the
 * root (scheme, host and, when present, port) of the requested URL.
 *
 * @since 5.9.0
 *
 * @param string $html The HTML from the remote website at URL.
 * @param string $url  The target website URL.
 * @return string The icon URI on success. Empty string if not found.
 */
private function get_icon( $html, $url ) {
	// Grab the icon's link element.
	$pattern = '#<link\s[^>]*rel=(?:[\"\']??)\s*(?:icon|shortcut icon|icon shortcut)\s*(?:[\"\']??)[^>]*\/?>#isU';
	preg_match( $pattern, $html, $element );
	if ( empty( $element[0] ) || ! is_string( $element[0] ) ) {
		return '';
	}
	$element = trim( $element[0] );

	// Get the icon's href value.
	$pattern = '#href=([\"\']??)([^\" >]*?)\\1[^>]*#isU';
	preg_match( $pattern, $element, $icon );
	if ( empty( $icon[2] ) || ! is_string( $icon[2] ) ) {
		return '';
	}
	$icon = trim( $icon[2] );

	// If the icon is a data URL, return it.
	$parsed_icon = parse_url( $icon );
	if ( isset( $parsed_icon['scheme'] ) && 'data' === $parsed_icon['scheme'] ) {
		return $icon;
	}

	// Attempt to convert relative URLs to absolute.
	if ( ! is_string( $url ) || '' === $url ) {
		return $icon;
	}
	$parsed_url = parse_url( $url );
	if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
		$root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'];

		// Keep an explicit port, otherwise the icon would point at a different origin.
		if ( isset( $parsed_url['port'] ) ) {
			$root_url .= ':' . $parsed_url['port'];
		}

		$icon = WP_Http::make_absolute_url( $icon, $root_url . '/' );
	}

	return $icon;
}
|
343 |
||
344 |
/**
 * Picks the page description out of the collected meta elements.
 *
 * Matches a meta element whose `name` attribute is `description` or
 * `og:description` and returns its cleaned-up content value.
 *
 * @since 5.9.0
 *
 * @param array $meta_elements {
 *     A multidimensional indexed array on success, else empty array.
 *
 *     @type string[] $0 Meta elements with a content attribute.
 *     @type string[] $1 Content attribute's opening quotation mark.
 *     @type string[] $2 Content attribute's value for each meta element.
 * }
 * @return string The meta description contents on success. Empty string if not found.
 */
private function get_description( $meta_elements ) {
	// Nothing to search when no meta elements were collected.
	if ( empty( $meta_elements[0] ) ) {
		return '';
	}

	$raw_description = $this->get_metadata_from_meta_element(
		$meta_elements,
		'name',
		'(?:description|og:description)'
	);

	// An empty string means no matching element was found.
	return '' === $raw_description
		? ''
		: $this->prepare_metadata_for_output( $raw_description );
}
|
377 |
||
378 |
/**
 * Parses the Open Graph (OG) Image from the provided HTML.
 *
 * See: https://ogp.me/.
 *
 * Matches a meta element whose `property` attribute is `og:image` or
 * `og:image:url`. The value is made absolute against the root (scheme,
 * host and, when present, port) of the requested URL.
 *
 * @since 5.9.0
 *
 * @param array $meta_elements {
 *     A multidimensional indexed array on success, else empty array.
 *
 *     @type string[] $0 Meta elements with a content attribute.
 *     @type string[] $1 Content attribute's opening quotation mark.
 *     @type string[] $2 Content attribute's value for each meta element.
 * }
 * @param string $url The target website URL.
 * @return string The OG image on success. Empty string if not found.
 */
private function get_image( $meta_elements, $url ) {
	$image = $this->get_metadata_from_meta_element(
		$meta_elements,
		'property',
		'(?:og:image|og:image:url)'
	);

	// Bail out if image not found.
	if ( '' === $image ) {
		return '';
	}

	// Without a usable base URL the value can only be returned as found.
	if ( ! is_string( $url ) || '' === $url ) {
		return $image;
	}

	// Attempt to convert relative URLs to absolute.
	$parsed_url = parse_url( $url );
	if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
		$root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'];

		// Keep an explicit port, otherwise the image would point at a different origin.
		if ( isset( $parsed_url['port'] ) ) {
			$root_url .= ':' . $parsed_url['port'];
		}

		$image = WP_Http::make_absolute_url( $image, $root_url . '/' );
	}

	return $image;
}
|
416 |
||
417 |
/**
 * Cleans a metadata string for output.
 *
 * HTML entities (including single and double quotes) are first decoded
 * using the site charset, then the result is passed through
 * `wp_strip_all_tags()`.
 *
 * @since 5.9.0
 *
 * @param string $metadata The metadata content to prepare.
 * @return string The prepared metadata.
 */
private function prepare_metadata_for_output( $metadata ) {
	$decoded = html_entity_decode( $metadata, ENT_QUOTES, get_bloginfo( 'charset' ) );

	return wp_strip_all_tags( $decoded );
}
|
432 |
||
433 |
/**
 * Builds the cache key for a URL: a fixed prefix followed by the URL's MD5 hash.
 *
 * @since 5.9.0
 *
 * @param string $url The URL for which to build a cache key.
 * @return string The cache key.
 */
private function build_cache_key_for_url( $url ) {
	$url_hash = md5( $url );

	return 'g_url_details_response_' . $url_hash;
}
|
444 |
||
445 |
/**
 * Reads the site transient stored under the given cache key.
 *
 * @since 5.9.0
 *
 * @param string $key The cache key.
 * @return mixed The value from the cache.
 */
private function get_cache( $key ) {
	$cached_value = get_site_transient( $key );

	return $cached_value;
}
|
456 |
||
457 |
/**
 * Stores data in a site transient under the given cache key.
 *
 * The lifetime defaults to one hour and can be changed through the
 * `rest_url_details_cache_expiration` filter.
 *
 * @since 5.9.0
 *
 * @param string $key  The cache key under which to store the value.
 * @param string $data The data to be stored at the given cache key.
 * @return bool True when transient set. False if not set.
 */
private function set_cache( $key, $data = '' ) {
	/**
	 * Filters the cache expiration.
	 *
	 * Can be used to adjust the time until expiration in seconds for the cache
	 * of the data retrieved for the given URL.
	 *
	 * @since 5.9.0
	 *
	 * @param int $ttl The time until cache expiration in seconds.
	 */
	$expires_in = apply_filters( 'rest_url_details_cache_expiration', HOUR_IN_SECONDS );

	return set_site_transient( $key, $data, $expires_in );
}
|
483 |
||
484 |
/**
 * Retrieves the head element section.
 *
 * The section runs from the opening `<head` tag up to, but not including,
 * the closing `</head>` tag, or the opening `<body` tag when no closing
 * tag exists (for example in a truncated response). A closing `</head>`
 * is then appended. Tag names are matched case-insensitively.
 *
 * @since 5.9.0
 *
 * @param string $html The string of HTML to parse.
 * @return string The `<head>..</head>` section on success. Given `$html` if not found.
 */
private function get_document_head( $html ) {
	// Find the opening `<head>` tag.
	$head_start = stripos( $html, '<head' );
	if ( false === $head_start ) {
		// Didn't find it. Return the original HTML.
		return $html;
	}

	// Find the closing `</head>` tag, looking only after the opening tag.
	$head_end = stripos( $html, '</head>', $head_start );
	if ( false === $head_end ) {
		// Didn't find it. Find the opening `<body>` tag.
		$head_end = stripos( $html, '<body', $head_start );

		// Didn't find it. Return the original HTML.
		if ( false === $head_end ) {
			return $html;
		}
	}

	/*
	 * Extract the HTML from the opening tag up to the closing tag, then add
	 * the closing tag. substr() takes a length, not an end offset, so the
	 * start position has to be subtracted.
	 */
	return substr( $html, $head_start, $head_end - $head_start ) . '</head>';
}
|
520 |
||
521 |
/**
 * Collects every `<meta>` element in the HTML that carries a `content` attribute.
 *
 * @since 5.9.0
 *
 * @param string $html The string of HTML to be parsed.
 * @return array {
 *     A multidimensional indexed array on success, else empty array.
 *
 *     @type string[] $0 Meta elements with a content attribute.
 *     @type string[] $1 Content attribute's opening quotation mark.
 *     @type string[] $2 Content attribute's value for each meta element.
 * }
 */
private function get_meta_with_content_elements( $html ) {
	/*
	 * Why match on the content attribute first instead of going straight for
	 * the name=description element?
	 *
	 * The content value (i.e. the description) may legitimately contain HTML,
	 * including `>` or `/>`. A regex that anchors on name=description cannot
	 * tell that such a symbol sits inside a quoted value, so it would treat
	 * it as the end of the element and truncate both the element and the
	 * content.
	 *
	 * Why not match name=description followed by content="(.*)"?
	 * The attributes may appear in either order, and other attributes may sit
	 * between them.
	 *
	 * Why not a lookahead?
	 * A lookahead is not confined to the current element: the first `<meta`
	 * found may lack the name or content, and the lookahead could then match
	 * one belonging to a later element.
	 */
	$pattern_parts = array(
		// Opening delimiter and the start of a meta element.
		'#<meta\s',

		// Any attributes before the content attribute: anything other than a > symbol.
		'[^>]*',

		/*
		 * The content attribute, capturing its value (.*).
		 *
		 * Allows single or double quotes and whitespace in the value. The
		 * opening quotation mark is captured, i.e. (["\']), and
		 * backreferenced, i.e. \1, so the closing mark matches the opening
		 * one; the value itself may contain the other kind of quotation
		 * mark, such as an apostrophe.
		 */
		'content=(["\']??)(.*)\1',

		// Any attributes after the content attribute: anything other than a > symbol.
		'[^>]*',

		// The closing > symbol, in either /> or > form, then the closing delimiter.
		'\/?>#',

		/*
		 * These are the options:
		 * - i : case-insensitive
		 * - s : allows newline characters for the . match (needed for multiline elements)
		 * - U means non-greedy matching
		 */
		'isU',
	);

	$matches = array();
	preg_match_all( implode( '', $pattern_parts ), $html, $matches );

	return $matches;
}
|
601 |
||
602 |
/**
 * Returns the content value of the first meta element whose given
 * attribute matches the given value pattern.
 *
 * @since 5.9.0
 *
 * @param array  $meta_elements {
 *     A multi-dimensional indexed array on success, else empty array.
 *
 *     @type string[] $0 Meta elements with a content attribute.
 *     @type string[] $1 Content attribute's opening quotation mark.
 *     @type string[] $2 Content attribute's value for each meta element.
 * }
 * @param string $attr       Attribute that identifies the element with the target metadata.
 * @param string $attr_value The attribute's value that identifies the element with the target metadata.
 * @return string The metadata on success. Empty string if not found.
 */
private function get_metadata_from_meta_element( $meta_elements, $attr, $attr_value ) {
	// Bail out if there are no meta elements.
	if ( empty( $meta_elements[0] ) ) {
		return '';
	}

	/*
	 * Matches the identifying attribute and its value.
	 *
	 * Allows for no, single or double quotes, and whitespace around the
	 * value. The opening quotation mark is captured and backreferenced via
	 * \1 so the closing mark must match the opening one.
	 *
	 * Options:
	 * - i : case-insensitive
	 * - s : allows newline characters for the . match (needed for multiline elements)
	 * - U means non-greedy matching
	 */
	$identifier_pattern = '#' . $attr . '=([\"\']??)\s*' . $attr_value . '\s*\1' . '#isU';

	foreach ( $meta_elements[0] as $position => $meta_element ) {
		// Not the element we are after; try the next one.
		if ( 1 !== preg_match( $identifier_pattern, $meta_element ) ) {
			continue;
		}

		/*
		 * First matching element wins. Its content lives at the same index
		 * in the captured values array.
		 */
		$has_content = isset( $meta_elements[2][ $position ] ) && is_string( $meta_elements[2][ $position ] );

		return $has_content ? trim( $meta_elements[2][ $position ] ) : '';
	}

	return '';
}
|
668 |
} |