feeditem_atom: support xml:base for enclosures and entry content

UrlHelper::rewrite_relative: use base URL path if relative url path is not absolute (experimental)
This commit is contained in:
Andrew Dolgov 2021-05-21 15:39:41 +03:00
parent d09a64d6f9
commit dff479af64
3 changed files with 60 additions and 16 deletions

View File

@ -60,43 +60,76 @@ class FeedItem_Atom extends FeedItem_Common {
} }
} }
/**
 * Rewrites the href and src attribute values found in an HTML string against a base URL.
 *
 * The content is parsed into a temporary DOMDocument, every element carrying an
 * href and/or src attribute has that value passed through UrlHelper::rewrite_relative(),
 * and the document is serialized back to a string.
 *
 * NOTE(review): the result comes from saveXML() on the whole temporary document, so it
 * is a full serialized document rather than the bare input fragment -- confirm that the
 * callers expect this before changing the serialization.
 *
 * @param string|null $base    Base URL; optional, $content is returned as-is when it is empty
 * @param string      $content HTML string
 *
 * @return string $content unchanged when $base or $content is empty or when parsing or
 *                serializing fails, otherwise the serialized document with rewritten URLs
 */
private function rewrite_content_to_base($base, $content) {
	if (empty($base) || empty($content)) {
		return $content;
	}

	$tmpdoc = new DOMDocument();

	// the leading XML encoding declaration is a hint for loadHTML() to read the input as
	// UTF-8; @ silences the warnings the parser raises on markup it does not accept
	if (!@$tmpdoc->loadHTML('<?xml encoding="UTF-8">' . $content)) {
		return $content;
	}

	$tmpxpath = new DOMXPath($tmpdoc);
	$elems = $tmpxpath->query("(//*[@href]|//*[@src])");

	foreach ($elems as $elem) {
		// check both attributes independently: an element carrying href *and* src
		// must have both rewritten, not just the first one found
		foreach (["href", "src"] as $attr) {
			if ($elem->hasAttribute($attr)) {
				$elem->setAttribute($attr,
					UrlHelper::rewrite_relative($base, $elem->getAttribute($attr)));
			}
		}
	}

	// saveXML() returns false on failure; fall back to the untouched input in that case
	$rewritten = $tmpdoc->saveXML();

	return $rewritten !== false ? $rewritten : $content;
}
function get_content() { function get_content() {
$content = $this->elem->getElementsByTagName("content")->item(0); $content = $this->elem->getElementsByTagName("content")->item(0);
if ($content) { if ($content) {
$base = $this->xpath->evaluate("string(ancestor-or-self::*[@xml:base][1]/@xml:base)", $content);
if ($content->hasAttribute('type')) { if ($content->hasAttribute('type')) {
if ($content->getAttribute('type') == 'xhtml') { if ($content->getAttribute('type') == 'xhtml') {
for ($i = 0; $i < $content->childNodes->length; $i++) { for ($i = 0; $i < $content->childNodes->length; $i++) {
$child = $content->childNodes->item($i); $child = $content->childNodes->item($i);
if ($child->hasChildNodes()) { if ($child->hasChildNodes()) {
return $this->doc->saveHTML($child); return $this->rewrite_content_to_base($base, $this->doc->saveHTML($child));
} }
} }
} }
} }
return $this->subtree_or_text($content); return $this->rewrite_content_to_base($base, $this->subtree_or_text($content));
} }
} }
// TODO: duplicate code should be merged with get_content()
function get_description() { function get_description() {
$content = $this->elem->getElementsByTagName("summary")->item(0); $content = $this->elem->getElementsByTagName("summary")->item(0);
if ($content) { if ($content) {
$base = $this->xpath->evaluate("string(ancestor-or-self::*[@xml:base][1]/@xml:base)", $content);
if ($content->hasAttribute('type')) { if ($content->hasAttribute('type')) {
if ($content->getAttribute('type') == 'xhtml') { if ($content->getAttribute('type') == 'xhtml') {
for ($i = 0; $i < $content->childNodes->length; $i++) { for ($i = 0; $i < $content->childNodes->length; $i++) {
$child = $content->childNodes->item($i); $child = $content->childNodes->item($i);
if ($child->hasChildNodes()) { if ($child->hasChildNodes()) {
return $this->doc->saveHTML($child); return $this->rewrite_content_to_base($base, $this->doc->saveHTML($child));
} }
} }
} }
} }
return $this->subtree_or_text($content); return $this->rewrite_content_to_base($base, $this->subtree_or_text($content));
} }
} }
@ -122,16 +155,22 @@ class FeedItem_Atom extends FeedItem_Common {
function get_enclosures() { function get_enclosures() {
$links = $this->elem->getElementsByTagName("link"); $links = $this->elem->getElementsByTagName("link");
$encs = array(); $encs = [];
foreach ($links as $link) { foreach ($links as $link) {
if ($link && $link->hasAttribute("href") && $link->hasAttribute("rel")) { if ($link && $link->hasAttribute("href") && $link->hasAttribute("rel")) {
$base = $this->xpath->evaluate("string(ancestor-or-self::*[@xml:base][1]/@xml:base)", $link);
if ($link->getAttribute("rel") == "enclosure") { if ($link->getAttribute("rel") == "enclosure") {
$enc = new FeedEnclosure(); $enc = new FeedEnclosure();
$enc->type = clean($link->getAttribute("type")); $enc->type = clean($link->getAttribute("type"));
$enc->link = clean($link->getAttribute("href"));
$enc->length = clean($link->getAttribute("length")); $enc->length = clean($link->getAttribute("length"));
$enc->link = clean($link->getAttribute("href"));
if (!empty($base)) {
$enc->link = UrlHelper::rewrite_relative($base, $enc->link);
}
array_push($encs, $enc); array_push($encs, $enc);
} }

View File

@ -20,14 +20,14 @@ class UrlHelper {
} }
/** /**
* Converts a (possibly) relative URL to an absolute one. * Converts a (possibly) relative URL to an absolute one, using provided base URL.
* *
* @param string $url Base URL (i.e. from where the document is) * @param string $base_url Base URL (i.e. from where the document is)
* @param string $rel_url Possibly relative URL in the document * @param string $rel_url Possibly relative URL in the document
* *
* @return string Absolute URL * @return string Absolute URL
*/ */
public static function rewrite_relative($url, $rel_url) { public static function rewrite_relative($base_url, $rel_url) {
$rel_parts = parse_url($rel_url); $rel_parts = parse_url($rel_url);
@ -40,14 +40,19 @@ class UrlHelper {
# allow magnet links # allow magnet links
return $rel_url; return $rel_url;
} else { } else {
$parts = parse_url($url); $base_parts = parse_url($base_url);
$rel_parts['host'] = $parts['host']; $rel_parts['host'] = $base_parts['host'];
$rel_parts['scheme'] = $parts['scheme']; $rel_parts['scheme'] = $base_parts['scheme'];
if (isset($rel_parts['path'])) { if (isset($rel_parts['path'])) {
if (strpos($rel_parts['path'], '/') !== 0)
$rel_parts['path'] = '/' . $rel_parts['path']; // experimental: if relative url path is not absolute (i.e. starting with /) concatenate it using base url path
// (i'm not sure if it's a good idea)
if (strpos($rel_parts['path'], '/') !== 0) {
$rel_parts['path'] = with_trailing_slash($base_parts['path']) . $rel_parts['path'];
}
$rel_parts['path'] = str_replace("/./", "/", $rel_parts['path']); $rel_parts['path'] = str_replace("/./", "/", $rel_parts['path']);
$rel_parts['path'] = str_replace("//", "/", $rel_parts['path']); $rel_parts['path'] = str_replace("//", "/", $rel_parts['path']);

View File

@ -193,8 +193,8 @@
} }
/** function is @deprecated */ /** function is @deprecated */
function rewrite_relative_url($url, $rel_url) { function rewrite_relative_url($base_url, $rel_url) {
return UrlHelper::rewrite_relative($url, $rel_url); return UrlHelper::rewrite_relative($base_url, $rel_url);
} }
/** function is @deprecated */ /** function is @deprecated */