From cf0274e1ca419f467ed46d60287a6dd9908f3bc0 Mon Sep 17 00:00:00 2001
From: hellofromTonya
Date: Tue, 2 Nov 2021 12:48:00 +0000
Subject: [PATCH] REST API: Add URL Details endpoint.

Adds a new REST API endpoint (`/wp-block-editor/v1/url-details`) for retrieving information from an external URL.

Information retrieved:
* Title: content of the `<title>` element
* Icon: favicon image link
* Description: content of the `description` or `og:description` meta element
* Image: OG image link

This endpoint is used by the block editor for link previews.

Props get_dave, aduth, andraganescu, beaulebens, hellofromTonya, kevin940726, mamaduka, marekhrabe, mnelson4, noisysocks, obenland, ocean90, retrofox, shaunandrews, spacedmonkey, swissspidy, timothyblynjacobs, xknown, youknowriad.
Fixes #54358.

Built from https://develop.svn.wordpress.org/trunk@51973

git-svn-id: http://core.svn.wordpress.org/trunk@51562 1a063a9b-81f0-0310-95a4-ce76da25c4cd
---
 wp-includes/rest-api.php                      |   4 +
 .../class-wp-rest-url-details-controller.php  | 630 ++++++++++++++++++
 wp-includes/version.php                       |   2 +-
 wp-settings.php                               |   1 +
 4 files changed, 636 insertions(+), 1 deletion(-)
 create mode 100644 wp-includes/rest-api/endpoints/class-wp-rest-url-details-controller.php

diff --git a/wp-includes/rest-api.php b/wp-includes/rest-api.php
index 03860a915c..28187154e7 100644
--- a/wp-includes/rest-api.php
+++ b/wp-includes/rest-api.php
@@ -337,6 +337,10 @@ function create_initial_rest_routes() {
 	$site_health = WP_Site_Health::get_instance();
 	$controller  = new WP_REST_Site_Health_Controller( $site_health );
 	$controller->register_routes();
+
+	// URL Details.
+	$controller = new WP_REST_URL_Details_Controller();
+	$controller->register_routes();
 }
 
 /**
diff --git a/wp-includes/rest-api/endpoints/class-wp-rest-url-details-controller.php b/wp-includes/rest-api/endpoints/class-wp-rest-url-details-controller.php
new file mode 100644
index 0000000000..d89a420e8e
--- /dev/null
+++ b/wp-includes/rest-api/endpoints/class-wp-rest-url-details-controller.php
@@ -0,0 +1,630 @@
+<?php
+/**
+ * REST API: WP_REST_URL_Details_Controller class
+ *
+ * @package WordPress
+ * @subpackage REST_API
+ * @since 5.9.0
+ */
+
+/**
+ * Controller which provides REST endpoint for retrieving information
+ * from a remote site's HTML response.
+ *
+ * @since 5.9.0
+ *
+ * @see WP_REST_Controller
+ */
+class WP_REST_URL_Details_Controller extends WP_REST_Controller {
+
+	/**
+	 * Constructs the controller.
+	 *
+	 * @since 5.9.0
+	 */
+	public function __construct() {
+		$this->namespace = 'wp-block-editor/v1';
+		$this->rest_base = 'url-details';
+	}
+
+	/**
+	 * Registers the necessary REST API routes.
+	 *
+	 * @since 5.9.0
+	 */
+	public function register_routes() {
+		register_rest_route(
+			$this->namespace,
+			'/' . $this->rest_base,
+			array(
+				array(
+					'methods'             => WP_REST_Server::READABLE,
+					'callback'            => array( $this, 'parse_url_details' ),
+					'args'                => array(
+						'url' => array(
+							'required'          => true,
+							'description'       => __( 'The URL to process.' ),
+							'validate_callback' => 'wp_http_validate_url',
+							'sanitize_callback' => 'esc_url_raw',
+							'type'              => 'string',
+							'format'            => 'uri',
+						),
+					),
+					'permission_callback' => array( $this, 'permissions_check' ),
+					'schema'              => array( $this, 'get_public_item_schema' ),
+				),
+			)
+		);
+	}
+
+	/**
+	 * Retrieves the item's schema, conforming to JSON Schema.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @return array Item schema data.
+	 */
+	public function get_item_schema() {
+		if ( $this->schema ) {
+			return $this->add_additional_fields_schema( $this->schema );
+		}
+
+		$schema = array(
+			'$schema'    => 'http://json-schema.org/draft-04/schema#',
+			'title'      => 'url-details',
+			'type'       => 'object',
+			'properties' => array(
+				'title'       => array(
+					'description' => __( 'The contents of the <title> element from the URL.' ),
+					'type'        => 'string',
+					'context'     => array( 'view', 'edit', 'embed' ),
+					'readonly'    => true,
+				),
+				'icon'        => array(
+					'description' => __( 'The favicon image link of the <link rel="icon"> element from the URL.' ),
+					'type'        => 'string',
+					'format'      => 'uri',
+					'context'     => array( 'view', 'edit', 'embed' ),
+					'readonly'    => true,
+				),
+				'description' => array(
+					'description' => __( 'The content of the <meta name="description"> element from the URL.' ),
+					'type'        => 'string',
+					'context'     => array( 'view', 'edit', 'embed' ),
+					'readonly'    => true,
+				),
+				'image'       => array(
+					'description' => __( 'The OG image link of the <meta property="og:image"> or <meta property="og:image:url"> element from the URL.' ),
+					'type'        => 'string',
+					'format'      => 'uri',
+					'context'     => array( 'view', 'edit', 'embed' ),
+					'readonly'    => true,
+				),
+			),
+		);
+
+		$this->schema = $schema;
+
+		return $this->add_additional_fields_schema( $this->schema );
+	}
+
+	/**
+	 * Retrieves the title, icon, description and image parsed from the remote URL's HTML response.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param WP_REST_Request $request Full details about the request.
+	 * @return WP_REST_Response|WP_Error The parsed details as a response object, or an error.
+	 */
+	public function parse_url_details( $request ) {
+		$url = untrailingslashit( $request['url'] );
+
+		if ( empty( $url ) ) {
+			return new WP_Error( 'rest_invalid_url', __( 'Invalid URL' ), array( 'status' => 404 ) );
+		}
+
+		// Transient per URL.
+		$cache_key = $this->build_cache_key_for_url( $url );
+
+		// Attempt to retrieve cached response.
+		$cached_response = $this->get_cache( $cache_key );
+
+		if ( ! empty( $cached_response ) ) {
+			$remote_url_response = $cached_response;
+		} else {
+			$remote_url_response = $this->get_remote_url( $url );
+
+			// Exit if we don't have a valid body or it's empty.
+			if ( is_wp_error( $remote_url_response ) || empty( $remote_url_response ) ) {
+				return $remote_url_response;
+			}
+
+			// Cache the valid response.
+			$this->set_cache( $cache_key, $remote_url_response );
+		}
+
+		$html_head     = $this->get_document_head( $remote_url_response );
+		$meta_elements = $this->get_meta_with_content_elements( $html_head );
+
+		$data = $this->add_additional_fields_to_object(
+			array(
+				'title'       => $this->get_title( $html_head ),
+				'icon'        => $this->get_icon( $html_head, $url ),
+				'description' => $this->get_description( $meta_elements ),
+				'image'       => $this->get_image( $meta_elements, $url ),
+			),
+			$request
+		);
+
+		// Wrap the data in a response object.
+		$response = rest_ensure_response( $data );
+
+		/**
+		 * Filters the URL data for the response.
+		 *
+		 * @param WP_REST_Response $response            The response object.
+		 * @param string           $url                 The requested URL.
+		 * @param WP_REST_Request  $request             Request object.
+		 * @param string           $remote_url_response HTTP response body from the remote URL.
+		 */
+		return apply_filters( 'rest_prepare_url_details', $response, $url, $request, $remote_url_response );
+	}
+
+	/**
+	 * Checks whether a given request has permission to read remote urls.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @return WP_Error|bool True if the request has access, or WP_Error object.
+	 */
+	public function permissions_check() {
+		if ( current_user_can( 'edit_posts' ) ) {
+			return true;
+		}
+
+		foreach ( get_post_types( array( 'show_in_rest' => true ), 'objects' ) as $post_type ) {
+			if ( current_user_can( $post_type->cap->edit_posts ) ) {
+				return true;
+			}
+		}
+
+		return new WP_Error(
+			'rest_cannot_view_url_details',
+			__( 'Sorry, you are not allowed to process remote urls.' ),
+			array( 'status' => rest_authorization_required_code() )
+		);
+	}
+
+	/**
+	 * Retrieves the HTTP response body from a remote URL.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param string $url The website url whose HTML we want to access.
+	 * @return string|WP_Error The HTTP response from the remote URL, or an error.
+	 */
+	private function get_remote_url( $url ) {
+
+		/*
+		 * Provide a modified UA string to workaround web properties which block WordPress "Pingbacks".
+		 * Why? The UA string used for pingback requests contains `WordPress/` which is very similar
+		 * to that used as the default UA string by the WP HTTP API. Therefore requests from this
+		 * REST endpoint are being unintentionally blocked as they are misidentified as pingback requests.
+		 * By slightly modifying the UA string, but still retaining the "WordPress" identification (via "WP")
+		 * we are able to work around this issue.
+		 * Example UA string: `WP-URLDetails/5.9-alpha-51389 (+http://localhost:8888)`.
+		 */
+		$modified_user_agent = 'WP-URLDetails/' . get_bloginfo( 'version' ) . ' (+' . get_bloginfo( 'url' ) . ')';
+
+		$args = array(
+			'limit_response_size' => 150 * KB_IN_BYTES,
+			'user-agent'          => $modified_user_agent,
+		);
+
+		/**
+		 * Filters the HTTP request args for URL data retrieval.
+		 *
+		 * Can be used to adjust response size limit and other WP_Http::request args.
+		 *
+		 * @param array  $args Arguments used for the HTTP request
+		 * @param string $url  The attempted URL.
+		 */
+		$args = apply_filters( 'rest_url_details_http_request_args', $args, $url );
+
+		$response = wp_safe_remote_get( $url, $args );
+
+		if ( WP_Http::OK !== wp_remote_retrieve_response_code( $response ) ) {
+			// Not saving the error response to cache since the error might be temporary.
+			return new WP_Error( 'no_response', __( 'URL not found. Response returned a non-200 status code for this URL.' ), array( 'status' => WP_Http::NOT_FOUND ) );
+		}
+
+		$remote_body = wp_remote_retrieve_body( $response );
+
+		if ( empty( $remote_body ) ) {
+			return new WP_Error( 'no_content', __( 'Unable to retrieve body from response at this URL.' ), array( 'status' => WP_Http::NOT_FOUND ) );
+		}
+
+		return $remote_body;
+	}
+
+	/**
+	 * Parses the `<title>` contents from the provided HTML.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param string $html The HTML from the remote website at URL.
+	 * @return string The title tag contents on success, or an empty string.
+	 */
+	private function get_title( $html ) {
+		$pattern = '#<title[^>]*>(.*?)<\s*/\s*title>#is';
+		preg_match( $pattern, $html, $match_title );
+
+		$title = ! empty( $match_title[1] ) && is_string( $match_title[1] ) ? trim( $match_title[1] ) : '';
+
+		if ( empty( $title ) ) {
+			return '';
+		}
+
+		return $this->prepare_metadata_for_output( $title );
+	}
+
+	/**
+	 * Parses the site icon from the provided HTML.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param string $html The HTML from the remote website at URL.
+	 * @param string $url  The target website URL.
+	 * @return string The icon URI on success, or an empty string.
+	 */
+	private function get_icon( $html, $url ) {
+		// Grab the icon's link element.
+		$pattern = '#<link\s[^>]*rel=(?:[\"\']??)\s*(?:icon|shortcut icon|icon shortcut)\s*(?:[\"\']??)[^>]*\/?>#isU';
+		preg_match( $pattern, $html, $element );
+		$element = ! empty( $element[0] ) && is_string( $element[0] ) ? trim( $element[0] ) : '';
+		if ( empty( $element ) ) {
+			return '';
+		}
+
+		// Get the icon's href value.
+		$pattern = '#href=([\"\']??)([^\" >]*?)\\1[^>]*#isU';
+		preg_match( $pattern, $element, $icon );
+		$icon = ! empty( $icon[2] ) && is_string( $icon[2] ) ? trim( $icon[2] ) : '';
+		if ( empty( $icon ) ) {
+			return '';
+		}
+
+		// If the icon is a data URL, return it.
+		$parsed_icon = parse_url( $icon );
+		if ( isset( $parsed_icon['scheme'] ) && 'data' === $parsed_icon['scheme'] ) {
+			return $icon;
+		}
+
+		// Attempt to convert relative URLs to absolute.
+		if ( ! is_string( $url ) || '' === $url ) {
+			return $icon;
+		}
+		$parsed_url = parse_url( $url );
+		if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
+			$root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/';
+			$icon     = WP_Http::make_absolute_url( $icon, $root_url );
+		}
+
+		return $icon;
+	}
+
+	/**
+	 * Parses the meta description from the provided HTML.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param array $meta_elements {
+	 *     A multi-dimensional indexed array on success, or empty array.
+	 *
+	 *     @type string[] 0 Meta elements with a content attribute.
+	 *     @type string[] 1 Content attribute's opening quotation mark.
+	 *     @type string[] 2 Content attribute's value for each meta element.
+	 * }
+	 * @return string The meta description contents on success, or an empty string.
+	 */
+	private function get_description( $meta_elements ) {
+		// Bail out if there are no meta elements.
+		if ( empty( $meta_elements[0] ) ) {
+			return '';
+		}
+
+		$description = $this->get_metadata_from_meta_element( $meta_elements, 'name', '(?:description|og:description)' );
+
+		// Bail out if description not found.
+		if ( '' === $description ) {
+			return '';
+		}
+
+		return $this->prepare_metadata_for_output( $description );
+	}
+
+	/**
+	 * Parses the Open Graph Image from the provided HTML.
+	 *
+	 * See: https://ogp.me/.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param array  $meta_elements {
+	 *     A multi-dimensional indexed array on success, or empty array.
+	 *
+	 *     @type string[] 0 Meta elements with a content attribute.
+	 *     @type string[] 1 Content attribute's opening quotation mark.
+	 *     @type string[] 2 Content attribute's value for each meta element.
+	 * }
+	 * @param string $url The target website URL.
+	 * @return string The OG image on success, or empty string.
+	 */
+	private function get_image( $meta_elements, $url ) {
+		$image = $this->get_metadata_from_meta_element( $meta_elements, 'property', '(?:og:image|og:image:url)' );
+
+		// Bail out if image not found.
+		if ( '' === $image ) {
+			return '';
+		}
+
+		// Attempt to convert relative URLs to absolute.
+		$parsed_url = parse_url( $url );
+		if ( isset( $parsed_url['scheme'] ) && isset( $parsed_url['host'] ) ) {
+			$root_url = $parsed_url['scheme'] . '://' . $parsed_url['host'] . '/';
+			$image    = WP_Http::make_absolute_url( $image, $root_url );
+		}
+
+		return $image;
+	}
+
+	/**
+	 * Prepares the metadata by:
+	 *    - stripping all HTML tags and tag entities.
+	 *    - converting non-tag entities into characters.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param string $metadata The metadata content to prepare.
+	 * @return string The prepared metadata.
+	 */
+	private function prepare_metadata_for_output( $metadata ) {
+		$metadata = html_entity_decode( $metadata, ENT_QUOTES, get_bloginfo( 'charset' ) );
+		$metadata = wp_strip_all_tags( $metadata );
+		return $metadata;
+	}
+
+	/**
+	 * Utility function to build cache key for a given URL.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param string $url The URL for which to build a cache key.
+	 * @return string The cache key.
+	 */
+	private function build_cache_key_for_url( $url ) {
+		return 'g_url_details_response_' . md5( $url );
+	}
+
+	/**
+	 * Utility function to retrieve a value from the cache at a given key.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param string $key The cache key.
+	 * @return mixed The value from the cache.
+	 */
+	private function get_cache( $key ) {
+		return get_transient( $key );
+	}
+
+	/**
+	 * Utility function to cache a given data set at a given cache key.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param string $key  The cache key under which to store the value.
+	 * @param string $data The data to be stored at the given cache key.
+	 * @return bool True when transient set. False if fails.
+	 */
+	private function set_cache( $key, $data = '' ) {
+		$ttl = HOUR_IN_SECONDS;
+
+		/**
+		 * Filters the cache expiration.
+		 *
+		 * Can be used to adjust the time until expiration in seconds for the cache
+		 * of the data retrieved for the given URL.
+		 *
+		 * @param int $ttl the time until cache expiration in seconds.
+		 */
+		$cache_expiration = apply_filters( 'rest_url_details_cache_expiration', $ttl );
+
+		return set_transient( $key, $data, $cache_expiration );
+	}
+
+	/**
+	 * Retrieves the `<head>` section.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param string $html The string of HTML to parse.
+	 * @return string The `<head>..</head>` section on success, or original HTML.
+	 */
+	private function get_document_head( $html ) {
+		$head_html = $html;
+
+		// Find the opening `<head>` tag (case-insensitively, like the regexes used elsewhere in this class).
+		$head_start = stripos( $html, '<head' );
+		if ( false === $head_start ) {
+			// Didn't find it. Return the original HTML.
+			return $html;
+		}
+
+		// Find the closing `</head>` tag, searching only after the opening tag.
+		$head_end = stripos( $head_html, '</head>', $head_start );
+		if ( false === $head_end ) {
+			// Didn't find it. Find the opening `<body>` tag.
+			$head_end = stripos( $head_html, '<body', $head_start );
+
+			// Didn't find it. Return the original HTML.
+			if ( false === $head_end ) {
+				return $html;
+			}
+		}
+
+		// Extract from the opening tag up to the closing tag (substr() takes a length, not an end offset). Then add the closing tag.
+		$head_html  = substr( $head_html, $head_start, $head_end - $head_start );
+		$head_html .= '</head>';
+
+		return $head_html;
+	}
+
+	/**
+	 * Gets all the `<meta>` elements that have a `content` attribute.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param string $html The string of HTML to be parsed.
+	 * @return array {
+	 *     A multi-dimensional indexed array on success, or empty array.
+	 *
+	 *     @type string[] 0 Meta elements with a content attribute.
+	 *     @type string[] 1 Content attribute's opening quotation mark.
+	 *     @type string[] 2 Content attribute's value for each meta element.
+	 * }
+	 */
+	private function get_meta_with_content_elements( $html ) {
+		/*
+		 * Parse all meta elements with a content attribute.
+		 *
+		 * Why first search for the content attribute rather than directly searching for name=description element?
+		 * tl;dr The content attribute's value will be truncated when it contains a > symbol.
+		 *
+		 * The content attribute's value (i.e. the description to get) can have HTML in it and be well-formed as
+		 * it's a string to the browser. Imagine what happens when attempting to match for the name=description
+		 * first. Hmm, if a > or /> symbol is in the content attribute's value, then it terminates the match
+		 * as the element's closing symbol. But wait, it's in the content attribute and is not the end of the
+		 * element. This is a limitation of using regex. It can't determine "wait a minute this is inside of quotation".
+		 * If this happens, what gets matched is not the entire element or all of the content.
+		 *
+		 * Why not search for the name=description and then content="(.*)"?
+		 * The attribute order could be opposite. Plus, additional attributes may exist including being between
+		 * the name and content attributes.
+		 *
+		 * Why not lookahead?
+		 * Lookahead is not constrained to stay within the element. The first <meta it finds may not include
+		 * the name or content, but rather could be from a different element downstream.
+		 */
+		$pattern = '#<meta\s' .
+
+				/*
+				 * Allows for additional attributes before the content attribute.
+				 * Searches for anything other than > symbol.
+				 */
+				'[^>]*' .
+
+				/*
+				 * Find the content attribute. When found, capture its value (.*).
+				 *
+				 * Allows for (a) single or double quotes and (b) whitespace in the value.
+				 *
+				 * Why capture the opening quotation mark, i.e. (["\']), and then backreference,
+				 * i.e \1, for the closing quotation mark?
+				 * To ensure the closing quotation mark matches the opening one. Why? Attribute values
+				 * can contain quotation marks, such as an apostrophe in the content.
+				 */
+				'content=(["\']??)(.*)\1' .
+
+				/*
+				 * Allows for additional attributes after the content attribute.
+				 * Searches for anything other than > symbol.
+				 */
+				'[^>]*' .
+
+				/*
+				 * \/?> searches for the closing > symbol, which can be in either /> or > format.
+				 * # ends the pattern.
+				 */
+				'\/?>#' .
+
+				/*
+				 * These are the options:
+				 * - i : case insensitive
+				 * - s : allows newline characters for the . match (needed for multiline elements)
+				 * - U means non-greedy matching
+				 */
+				'isU';
+
+		preg_match_all( $pattern, $html, $elements );
+
+		return $elements;
+	}
+
+	/**
+	 * Gets the metadata from a target meta element.
+	 *
+	 * @since 5.9.0
+	 *
+	 * @param array  $meta_elements {
+	 *     A multi-dimensional indexed array on success, or empty array.
+	 *
+	 *     @type string[] 0 Meta elements with a content attribute.
+	 *     @type string[] 1 Content attribute's opening quotation mark.
+	 *     @type string[] 2 Content attribute's value for each meta element.
+	 * }
+	 * @param string $attr       Attribute that identifies the element with the target metadata.
+	 * @param string $attr_value The attribute's value that identifies the element with the target metadata.
+	 * @return string The metadata on success, or an empty string.
+	 */
+	private function get_metadata_from_meta_element( $meta_elements, $attr, $attr_value ) {
+		// Bail out if there are no meta elements.
+		if ( empty( $meta_elements[0] ) ) {
+			return '';
+		}
+
+		$metadata = '';
+		$pattern  = '#' .
+				/*
+				 * Target this attribute and value to find the metadata element.
+				 *
+				 * Allows for (a) no, single, double quotes and (b) whitespace in the value.
+				 *
+				 * Why capture the opening quotation mark, i.e. (["\']), and then backreference,
+				 * i.e \1, for the closing quotation mark?
+				 * To ensure the closing quotation mark matches the opening one. Why? Attribute values
+				 * can contain quotation marks, such as an apostrophe in the content.
+				 */
+				$attr . '=([\"\']??)\s*' . $attr_value . '\s*\1' .
+
+				/*
+				 * These are the options:
+				 * - i : case insensitive
+				 * - s : allows newline characters for the . match (needed for multiline elements)
+				 * - U means non-greedy matching
+				 */
+				'#isU';
+
+		// Find the metadata element.
+		foreach ( $meta_elements[0] as $index => $element ) {
+			preg_match( $pattern, $element, $match );
+
+			// This is not the metadata element. Skip it.
+			if ( empty( $match ) ) {
+				continue;
+			}
+
+			/*
+			 * Found the metadata element.
+			 * Get the metadata from its matching content array.
+			 */
+			if ( isset( $meta_elements[2][ $index ] ) && is_string( $meta_elements[2][ $index ] ) ) {
+				$metadata = trim( $meta_elements[2][ $index ] );
+			}
+
+			break;
+		}
+
+		return $metadata;
+	}
+}
diff --git a/wp-includes/version.php b/wp-includes/version.php
index 5971abcc5d..46dc06ac09 100644
--- a/wp-includes/version.php
+++ b/wp-includes/version.php
@@ -16,7 +16,7 @@
  *
  * @global string $wp_version
  */
-$wp_version = '5.9-alpha-51972';
+$wp_version = '5.9-alpha-51973';
 
 /**
  * Holds the WordPress DB revision, increments when changes are made to the WordPress DB schema.
diff --git a/wp-settings.php b/wp-settings.php
index 99e77f3868..c9b82dc39e 100644
--- a/wp-settings.php
+++ b/wp-settings.php
@@ -276,6 +276,7 @@ require ABSPATH . WPINC . '/rest-api/endpoints/class-wp-rest-sidebars-controller
 require ABSPATH . WPINC . '/rest-api/endpoints/class-wp-rest-widget-types-controller.php';
 require ABSPATH . WPINC . '/rest-api/endpoints/class-wp-rest-widgets-controller.php';
 require ABSPATH . WPINC . '/rest-api/endpoints/class-wp-rest-templates-controller.php';
+require ABSPATH . WPINC . '/rest-api/endpoints/class-wp-rest-url-details-controller.php';
 require ABSPATH . WPINC . '/rest-api/fields/class-wp-rest-meta-fields.php';
 require ABSPATH . WPINC . '/rest-api/fields/class-wp-rest-comment-meta-fields.php';
 require ABSPATH . WPINC . '/rest-api/fields/class-wp-rest-post-meta-fields.php';