get_feeds_from_html: remove XML preamble hack

move several related helper functions to Feeds class
This commit is contained in:
Andrew Dolgov 2019-06-20 07:51:48 +03:00
parent 270b39a337
commit 6d746453c7
4 changed files with 88 additions and 115 deletions

View File

@ -1135,9 +1135,9 @@ class Feeds extends Handler_Protected {
$pdo = Db::pdo();
$url = fix_url($url);
$url = Feeds::fix_url($url);
if (!$url || !validate_feed_url($url)) return array("code" => 2);
if (!$url || !Feeds::validate_feed_url($url)) return array("code" => 2);
$contents = @fetch_file_contents($url, false, $auth_login, $auth_pass);
@ -1153,8 +1153,8 @@ class Feeds extends Handler_Protected {
return array("code" => 5, "message" => $fetch_last_error);
}
if (mb_strpos($fetch_last_content_type, "html") !== FALSE && is_html($contents)) {
$feedUrls = get_feeds_from_html($url, $contents);
if (mb_strpos($fetch_last_content_type, "html") !== FALSE && Feeds::is_html($contents)) {
$feedUrls = Feeds::get_feeds_from_html($url, $contents);
if (count($feedUrls) == 0) {
return array("code" => 3);
@ -1923,5 +1923,88 @@ class Feeds extends Handler_Protected {
return $colormap[$sum];
}
static function get_feeds_from_html($url, $content) {
$url = Feeds::fix_url($url);
$baseUrl = substr($url, 0, strrpos($url, '/') + 1);
libxml_use_internal_errors(true);
$feedUrls = [];
$doc = new DOMDocument();
if ($doc->loadHTML($content)) {
$xpath = new DOMXPath($doc);
$entries = $xpath->query('/html/head/link[@rel="alternate" and '.
'(contains(@type,"rss") or contains(@type,"atom"))]|/html/head/link[@rel="feed"]');
foreach ($entries as $entry) {
if ($entry->hasAttribute('href')) {
$title = $entry->getAttribute('title');
if ($title == '') {
$title = $entry->getAttribute('type');
}
$feedUrl = rewrite_relative_url(
$baseUrl, $entry->getAttribute('href')
);
$feedUrls[$feedUrl] = $title;
}
}
}
return $feedUrls;
}
static function is_html($content) {
return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 8192)) !== 0;
}
static function validate_feed_url($url) {
$parts = parse_url($url);
return ($parts['scheme'] == 'http' || $parts['scheme'] == 'feed' || $parts['scheme'] == 'https');
}
/**
* Fixes incomplete URLs by prepending "http://".
* Also replaces feed:// with http://, and
* prepends a trailing slash if the url is a domain name only.
*
* @param string $url Possibly incomplete URL
*
* @return string Fixed URL.
*/
static function fix_url($url) {
// support schema-less urls
if (strpos($url, '//') === 0) {
$url = 'https:' . $url;
}
if (strpos($url, '://') === false) {
$url = 'http://' . $url;
} else if (substr($url, 0, 5) == 'feed:') {
$url = 'http:' . substr($url, 5);
}
//prepend slash if the URL has no slash in it
// "http://www.example" -> "http://www.example/"
if (strpos($url, '/', strpos($url, ':') + 3) === false) {
$url .= '/';
}
//convert IDNA hostname to punycode if possible
if (function_exists("idn_to_ascii")) {
$parts = parse_url($url);
if (mb_detect_encoding($parts['host']) != 'ASCII')
{
$parts['host'] = idn_to_ascii($parts['host']);
$url = build_url($parts);
}
}
if ($url != "http:///")
return $url;
else
return '';
}
}

View File

@ -728,18 +728,6 @@ class Handler_Public extends Handler {
}
}
/* function subtest() {
header("Content-type: text/plain; charset=utf-8");
$url = clean($_REQUEST["url"]);
print "$url\n\n";
print_r(get_feeds_from_html($url, fetch_file_contents($url)));
} */
function subscribe() {
if (SINGLE_USER_MODE) {
login_sequence();

View File

@ -1708,7 +1708,7 @@ class Pref_Feeds extends Handler_Protected {
foreach ($feeds as $feed) {
$feed = trim($feed);
if (validate_feed_url($feed)) {
if (Feeds::validate_feed_url($feed)) {
$this->pdo->beginTransaction();

View File

@ -2017,68 +2017,6 @@
return false;
}
/**
* Fixes incomplete URLs by prepending "http://".
* Also replaces feed:// with http://, and
* prepends a trailing slash if the url is a domain name only.
*
* @param string $url Possibly incomplete URL
*
* @return string Fixed URL.
*/
function fix_url($url) {
// support schema-less urls
if (strpos($url, '//') === 0) {
$url = 'https:' . $url;
}
if (strpos($url, '://') === false) {
$url = 'http://' . $url;
} else if (substr($url, 0, 5) == 'feed:') {
$url = 'http:' . substr($url, 5);
}
//prepend slash if the URL has no slash in it
// "http://www.example" -> "http://www.example/"
if (strpos($url, '/', strpos($url, ':') + 3) === false) {
$url .= '/';
}
//convert IDNA hostname to punycode if possible
if (function_exists("idn_to_ascii")) {
$parts = parse_url($url);
if (mb_detect_encoding($parts['host']) != 'ASCII')
{
$parts['host'] = idn_to_ascii($parts['host']);
$url = build_url($parts);
}
}
if ($url != "http:///")
return $url;
else
return '';
}
function validate_feed_url($url) {
$parts = parse_url($url);
return ($parts['scheme'] == 'http' || $parts['scheme'] == 'feed' || $parts['scheme'] == 'https');
}
/* function save_email_address($email) {
// FIXME: implement persistent storage of emails
if (!$_SESSION['stored_emails'])
$_SESSION['stored_emails'] = array();
if (!in_array($email, $_SESSION['stored_emails']))
array_push($_SESSION['stored_emails'], $email);
} */
function get_feed_access_key($feed_id, $is_cat, $owner_uid = false) {
if (!$owner_uid) $owner_uid = $_SESSION["uid"];
@ -2107,42 +2045,6 @@
}
}
function get_feeds_from_html($url, $content)
{
$url = fix_url($url);
$baseUrl = substr($url, 0, strrpos($url, '/') + 1);
libxml_use_internal_errors(true);
$doc = new DOMDocument();
$doc->loadHTML('<?xml encoding="UTF-8">' . $content);
$xpath = new DOMXPath($doc);
$entries = $xpath->query('/html/head/link[@rel="alternate" and '.
'(contains(@type,"rss") or contains(@type,"atom"))]|/html/head/link[@rel="feed"]');
$feedUrls = array();
foreach ($entries as $entry) {
if ($entry->hasAttribute('href')) {
$title = $entry->getAttribute('title');
if ($title == '') {
$title = $entry->getAttribute('type');
}
$feedUrl = rewrite_relative_url(
$baseUrl, $entry->getAttribute('href')
);
$feedUrls[$feedUrl] = $title;
}
}
return $feedUrls;
}
function is_html($content) {
return preg_match("/<html|DOCTYPE html/i", substr($content, 0, 8192)) !== 0;
}
function url_is_html($url, $login = false, $pass = false) {
return is_html(fetch_file_contents($url, false, $login, $pass));
}
function build_url($parts) {
return $parts['scheme'] . "://" . $parts['host'] . $parts['path'];
}