From dff479af649005c819e16ed8e75278ab50c955b3 Mon Sep 17 00:00:00 2001
From: Andrew Dolgov
Date: Fri, 21 May 2021 15:39:41 +0300
Subject: [PATCH] feeditem_atom: support xml:base for enclosures and entry content

UrlHelper::rewrite_relative: use base URL path if relative url path is not absolute (experimental)
---
Review notes (this free-text area is ignored by `git am`):

What the patch does
 * FeedItem_Atom::rewrite_content_to_base() parses an HTML fragment and
   rewrites every href/src attribute through UrlHelper::rewrite_relative().
   It returns $content unchanged when $base or $content is empty, or when
   the fragment cannot be parsed.
 * get_content(), get_description() and get_enclosures() look up the nearest
   xml:base via "ancestor-or-self::*[@xml:base][1]/@xml:base" (the
   reverse-axis position 1 is the closest element) and pass it on.
 * UrlHelper::rewrite_relative() now prefixes a relative path that does not
   start with "/" with the base URL path instead of a bare "/".

Changes made during review (all line-count neutral)
 * rewrite_relative(): guard the read of $base_parts['path']; parse_url()
   leaves that key out when the base URL has no path component.
 * rewrite_content_to_base(): the string concatenated in front of $content
   was empty; it now carries an explicit UTF-8 declaration so loadHTML()
   does not fall back to ISO-8859-1. NOTE(review): confirm this matches the
   intended literal.
 * "a absolute" -> "an absolute" in the rewrite_relative() doc comment.

Known limitations left as-is
 * An element carrying both href and src only gets href rewritten
   (if / else if).
 * The base path is treated as a directory - presumably
   with_trailing_slash() appends "/" when missing, so a base of
   /dir/feed.xml would produce /dir/feed.xml/<rel>; verify against the helper.
 * NOTE(review): line breaks and tab indentation were restored from a
   flattened copy; blank context lines were placed to satisfy the hunk line
   counts. Verify against the target tree before applying.

 classes/feeditem/atom.php | 51 ++++++++++++++++++++++++++++++++++-----
 classes/urlhelper.php     | 21 ++++++++++------
 include/functions.php     |  4 +--
 3 files changed, 60 insertions(+), 16 deletions(-)

diff --git a/classes/feeditem/atom.php b/classes/feeditem/atom.php
index a03080981..51358f36c 100755
--- a/classes/feeditem/atom.php
+++ b/classes/feeditem/atom.php
@@ -60,43 +60,76 @@ class FeedItem_Atom extends FeedItem_Common {
 		}
 	}
 
+	/** $base is optional (returns $content if $base is null), $content is an HTML string */
+	private function rewrite_content_to_base($base, $content) {
+
+		if (!empty($base) && !empty($content)) {
+
+			$tmpdoc = new DOMDocument();
+			if (@$tmpdoc->loadHTML('<?xml encoding="UTF-8">' . $content)) {
+				$tmpxpath = new DOMXPath($tmpdoc);
+
+				$elems = $tmpxpath->query("(//*[@href]|//*[@src])");
+
+				foreach ($elems as $elem) {
+					if ($elem->hasAttribute("href")) {
+						$elem->setAttribute("href",
+							UrlHelper::rewrite_relative($base, $elem->getAttribute("href")));
+					} else if ($elem->hasAttribute("src")) {
+						$elem->setAttribute("src",
+							UrlHelper::rewrite_relative($base, $elem->getAttribute("src")));
+					}
+				}
+
+				return $tmpdoc->saveXML();
+			}
+		}
+
+		return $content;
+	}
+
 	function get_content() {
 		$content = $this->elem->getElementsByTagName("content")->item(0);
 
 		if ($content) {
+			$base = $this->xpath->evaluate("string(ancestor-or-self::*[@xml:base][1]/@xml:base)", $content);
+
 			if ($content->hasAttribute('type')) {
 				if ($content->getAttribute('type') == 'xhtml') {
 					for ($i = 0; $i < $content->childNodes->length; $i++) {
 						$child = $content->childNodes->item($i);
 
 						if ($child->hasChildNodes()) {
-							return $this->doc->saveHTML($child);
+							return $this->rewrite_content_to_base($base, $this->doc->saveHTML($child));
 						}
 					}
 				}
 			}
 
-			return $this->subtree_or_text($content);
+			return $this->rewrite_content_to_base($base, $this->subtree_or_text($content));
 		}
 	}
 
+	// TODO: duplicate code should be merged with get_content()
 	function get_description() {
 		$content = $this->elem->getElementsByTagName("summary")->item(0);
 
 		if ($content) {
+			$base = $this->xpath->evaluate("string(ancestor-or-self::*[@xml:base][1]/@xml:base)", $content);
+
 			if ($content->hasAttribute('type')) {
 				if ($content->getAttribute('type') == 'xhtml') {
 					for ($i = 0; $i < $content->childNodes->length; $i++) {
 						$child = $content->childNodes->item($i);
 
 						if ($child->hasChildNodes()) {
-							return $this->doc->saveHTML($child);
+							return $this->rewrite_content_to_base($base, $this->doc->saveHTML($child));
 						}
 					}
 				}
 			}
 
-			return $this->subtree_or_text($content);
+			return $this->rewrite_content_to_base($base, $this->subtree_or_text($content));
 		}
 	}
 
@@ -122,16 +155,22 @@ class FeedItem_Atom extends FeedItem_Common {
 	function get_enclosures() {
 		$links = $this->elem->getElementsByTagName("link");
 
-		$encs = array();
+		$encs = [];
 
 		foreach ($links as $link) {
 			if ($link && $link->hasAttribute("href") && $link->hasAttribute("rel")) {
+				$base = $this->xpath->evaluate("string(ancestor-or-self::*[@xml:base][1]/@xml:base)", $link);
+
 				if ($link->getAttribute("rel") == "enclosure") {
 					$enc = new FeedEnclosure();
 
 					$enc->type = clean($link->getAttribute("type"));
-					$enc->link = clean($link->getAttribute("href"));
 					$enc->length = clean($link->getAttribute("length"));
+					$enc->link = clean($link->getAttribute("href"));
+
+					if (!empty($base)) {
+						$enc->link = UrlHelper::rewrite_relative($base, $enc->link);
+					}
 
 					array_push($encs, $enc);
 				}
diff --git a/classes/urlhelper.php b/classes/urlhelper.php
index edfb2ad73..136046701 100644
--- a/classes/urlhelper.php
+++ b/classes/urlhelper.php
@@ -20,14 +20,14 @@ class UrlHelper {
 	}
 
 	/**
-	 * Converts a (possibly) relative URL to a absolute one.
+	 * Converts a (possibly) relative URL to an absolute one, using provided base URL.
 	 *
-	 * @param string $url     Base URL (i.e. from where the document is)
+	 * @param string $base_url     Base URL (i.e. from where the document is)
 	 * @param string $rel_url Possibly relative URL in the document
 	 *
 	 * @return string Absolute URL
 	 */
-	public static function rewrite_relative($url, $rel_url) {
+	public static function rewrite_relative($base_url, $rel_url) {
 
 		$rel_parts = parse_url($rel_url);
 
@@ -40,14 +40,19 @@ class UrlHelper {
 			# allow magnet links
 			return $rel_url;
 		} else {
-			$parts = parse_url($url);
+			$base_parts = parse_url($base_url);
 
-			$rel_parts['host'] = $parts['host'];
-			$rel_parts['scheme'] = $parts['scheme'];
+			$rel_parts['host'] = $base_parts['host'];
+			$rel_parts['scheme'] = $base_parts['scheme'];
 
 			if (isset($rel_parts['path'])) {
-				if (strpos($rel_parts['path'], '/') !== 0)
-					$rel_parts['path'] = '/' . $rel_parts['path'];
+
+				// experimental: if relative url path is not absolute (i.e. starting with /) concatenate it using base url path
+				// (i'm not sure if it's a good idea)
+
+				if (strpos($rel_parts['path'], '/') !== 0) {
+					$rel_parts['path'] = with_trailing_slash(isset($base_parts['path']) ? $base_parts['path'] : '') . $rel_parts['path'];
+				}
 
 				$rel_parts['path'] = str_replace("/./", "/", $rel_parts['path']);
 				$rel_parts['path'] = str_replace("//", "/", $rel_parts['path']);
diff --git a/include/functions.php b/include/functions.php
index e8f41d56a..694b8f398 100644
--- a/include/functions.php
+++ b/include/functions.php
@@ -193,8 +193,8 @@
 	}
 
 	/** function is @deprecated */
-	function rewrite_relative_url($url, $rel_url) {
-		return UrlHelper::rewrite_relative($url, $rel_url);
+	function rewrite_relative_url($base_url, $rel_url) {
+		return UrlHelper::rewrite_relative($base_url, $rel_url);
 	}
 
 	/** function is @deprecated */