af_readability: add missing file

This commit is contained in:
Andrew Dolgov 2019-08-16 15:29:24 +03:00
parent 865c54abcb
commit 3e4701116d
11 changed files with 202 additions and 66 deletions

View File

@ -88,7 +88,7 @@ class Backend extends Handler {
} }
function help() { function help() {
$topic = basename(clean($_REQUEST["topic"])); // only one for now $topic = clean_filename($_REQUEST["topic"]); // only one for now
if ($topic == "main") { if ($topic == "main") {
$info = get_hotkeys_info(); $info = get_hotkeys_info();

View File

@ -1203,30 +1203,30 @@ class Handler_Public extends Handler {
public function pluginhandler() { public function pluginhandler() {
$host = new PluginHost(); $host = new PluginHost();
$plugin = basename(clean($_REQUEST["plugin"])); $plugin_name = clean_filename($_REQUEST["plugin"]);
$method = clean($_REQUEST["pmethod"]); $method = clean($_REQUEST["pmethod"]);
$host->load($plugin, PluginHost::KIND_USER, 0); $host->load($plugin_name, PluginHost::KIND_USER, 0);
$host->load_data(); $host->load_data();
$pclass = $host->get_plugin($plugin); $plugin = $host->get_plugin($plugin_name);
if ($pclass) { if ($plugin) {
if (method_exists($pclass, $method)) { if (method_exists($plugin, $method)) {
if ($pclass->is_public_method($method)) { if ($plugin->is_public_method($method)) {
$pclass->$method(); $plugin->$method();
} else { } else {
user_error("pluginhandler: Requested private method '$method' of plugin '$plugin'."); user_error("PluginHandler[PUBLIC]: Requested private method '$method' of plugin '$plugin_name'.", E_USER_WARNING);
header("Content-Type: text/json"); header("Content-Type: text/json");
print error_json(6); print error_json(6);
} }
} else { } else {
user_error("pluginhandler: Requested unknown method '$method' of plugin '$plugin'."); user_error("PluginHandler[PUBLIC]: Requested unknown method '$method' of plugin '$plugin_name'.", E_USER_WARNING);
header("Content-Type: text/json"); header("Content-Type: text/json");
print error_json(13); print error_json(13);
} }
} else { } else {
user_error("pluginhandler: Requested method '$method' of unknown plugin '$plugin'."); user_error("PluginHandler[PUBLIC]: Requested method '$method' of unknown plugin '$plugin_name'.", E_USER_WARNING);
header("Content-Type: text/json"); header("Content-Type: text/json");
print error_json(14); print error_json(14);
} }

View File

@ -5,15 +5,18 @@ class PluginHandler extends Handler_Protected {
} }
function catchall($method) { function catchall($method) {
$plugin = PluginHost::getInstance()->get_plugin(clean($_REQUEST["plugin"])); $plugin_name = clean($_REQUEST["plugin"]);
$plugin = PluginHost::getInstance()->get_plugin($plugin_name);
if ($plugin) { if ($plugin) {
if (method_exists($plugin, $method)) { if (method_exists($plugin, $method)) {
$plugin->$method(); $plugin->$method();
} else { } else {
user_error("PluginHandler: Requested unknown method '$method' of plugin '$plugin_name'.", E_USER_WARNING);
print error_json(13); print error_json(13);
} }
} else { } else {
user_error("PluginHandler: Requested method '$method' of unknown plugin '$plugin_name'.", E_USER_WARNING);
print error_json(14); print error_json(14);
} }
} }

View File

@ -186,7 +186,7 @@ class PluginHost {
foreach ($plugins as $class) { foreach ($plugins as $class) {
$class = trim($class); $class = trim($class);
$class_file = strtolower(basename($class)); $class_file = strtolower(clean_filename($class));
if (!is_dir(__DIR__."/../plugins/$class_file") && if (!is_dir(__DIR__."/../plugins/$class_file") &&
!is_dir(__DIR__."/../plugins.local/$class_file")) continue; !is_dir(__DIR__."/../plugins.local/$class_file")) continue;

View File

@ -572,7 +572,7 @@ class RPC extends Handler_Protected {
function log() { function log() {
$msg = clean($_REQUEST['msg']); $msg = clean($_REQUEST['msg']);
$file = basename(clean($_REQUEST['file'])); $file = clean_filename($_REQUEST['file']);
$line = (int) clean($_REQUEST['line']); $line = (int) clean($_REQUEST['line']);
$context = clean($_REQUEST['context']); $context = clean($_REQUEST['context']);

View File

@ -593,7 +593,7 @@
} }
function clean_filename($filename) { function clean_filename($filename) {
return basename(preg_replace("/\.\.|[\/\\\]/", "", $filename)); return basename(preg_replace("/\.\.|[\/\\\]/", "", clean($filename)));
} }
function make_password($length = 12) { function make_password($length = 12) {

View File

@ -166,32 +166,6 @@ class Configuration
return $this; return $this;
} }
/**
* @deprecated Use getCharThreshold. Will be removed in version 2.0
*
* @return int
*/
public function getWordThreshold()
{
@trigger_error('getWordThreshold was replaced with getCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);
return $this->charThreshold;
}
/**
* @param int $charThreshold
*
* @return $this
*/
public function setWordThreshold($charThreshold)
{
@trigger_error('setWordThreshold was replaced with setCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);
$this->charThreshold = $charThreshold;
return $this;
}
/** /**
* @return bool * @return bool
*/ */

View File

@ -0,0 +1,82 @@
<?php
namespace andreskrey\Readability\Nodes\DOM;

/**
 * Class DOMNodeList.
 *
 * A userland stand-in for PHP's native \DOMNodeList that can be appended to. The native class is populated
 * internally by the DOM extension and offers no way to insert nodes; this version exposes the same read API
 * (->length, item(), count(), iteration) and adds add() to push new nodes onto the list.
 *
 * It cannot extend the native \DOMNodeList because the behaviour behind the ->length property is hidden
 * from userland and cannot be extended, changed, or tweaked.
 */
class DOMNodeList implements \Countable, \IteratorAggregate
{
    /**
     * Nodes held by the list, in insertion order.
     *
     * @var array
     */
    protected $items = [];

    /**
     * Number of nodes in $items; incremented by add().
     *
     * @var int
     */
    protected $length = 0;

    /**
     * Exposes the read-only ->length property in the same way the native DOMNodeList does.
     *
     * Any other property name raises an "Undefined property" notice and evaluates to null.
     *
     * @param string $name
     *
     * @return int|null
     */
    public function __get($name)
    {
        switch ($name) {
            case 'length':
                return $this->length;
            default:
                trigger_error(sprintf('Undefined property: %s::%s', static::class, $name));

                return null;
        }
    }

    /**
     * Appends a node to the end of the list.
     *
     * @param DOMNode|DOMElement|DOMComment $node
     *
     * @return DOMNodeList This list, to allow chaining.
     */
    public function add($node)
    {
        $this->items[] = $node;
        $this->length++;

        return $this;
    }

    /**
     * Returns the node stored at the given position.
     *
     * Mirrors the native DOMNodeList::item(): an offset that is not in the list yields null
     * instead of raising an "undefined offset" notice/warning.
     *
     * @param int $offset
     *
     * @return DOMNode|DOMElement|DOMComment|null
     */
    public function item(int $offset)
    {
        return $this->items[$offset] ?? null;
    }

    /**
     * Number of nodes in the list (Countable implementation).
     *
     * @return int
     */
    public function count(): int
    {
        return $this->length;
    }

    /**
     * To make it compatible with the iterator_to_array() function and foreach.
     *
     * @return \ArrayIterator
     */
    public function getIterator(): \ArrayIterator
    {
        return new \ArrayIterator($this->items);
    }
}

View File

@ -181,11 +181,11 @@ trait NodeTrait
/** /**
* Override for native hasAttribute. * Override for native hasAttribute.
* *
* @see getAttribute
*
* @param $attributeName * @param $attributeName
* *
* @return bool * @return bool
*
* @see getAttribute
*/ */
public function hasAttribute($attributeName) public function hasAttribute($attributeName)
{ {
@ -317,10 +317,14 @@ trait NodeTrait
* *
* @param bool $filterEmptyDOMText Filter empty DOMText nodes? * @param bool $filterEmptyDOMText Filter empty DOMText nodes?
* *
* @deprecated Use NodeUtility::filterTextNodes, function will be removed in version 3.0
*
* @return array * @return array
*/ */
public function getChildren($filterEmptyDOMText = false) public function getChildren($filterEmptyDOMText = false)
{ {
@trigger_error('getChildren was replaced with NodeUtility::filterTextNodes and will be removed in version 3.0', E_USER_DEPRECATED);
$ret = iterator_to_array($this->childNodes); $ret = iterator_to_array($this->childNodes);
if ($filterEmptyDOMText) { if ($filterEmptyDOMText) {
// Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number // Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number
@ -418,12 +422,12 @@ trait NodeTrait
public function hasSingleTagInsideElement($tag) public function hasSingleTagInsideElement($tag)
{ {
// There should be exactly 1 element child with given tag // There should be exactly 1 element child with given tag
if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== $tag) { if (count($children = NodeUtility::filterTextNodes($this->childNodes)) !== 1 || $children->item(0)->nodeName !== $tag) {
return false; return false;
} }
// And there should be no text nodes with real content // And there should be no text nodes with real content
return array_reduce($children, function ($carry, $child) { return array_reduce(iterator_to_array($children), function ($carry, $child) {
if (!$carry === false) { if (!$carry === false) {
return false; return false;
} }
@ -443,7 +447,7 @@ trait NodeTrait
{ {
$result = false; $result = false;
if ($this->hasChildNodes()) { if ($this->hasChildNodes()) {
foreach ($this->getChildren() as $child) { foreach ($this->childNodes as $child) {
if (in_array($child->nodeName, $this->divToPElements)) { if (in_array($child->nodeName, $this->divToPElements)) {
$result = true; $result = true;
} else { } else {
@ -500,18 +504,22 @@ trait NodeTrait
); );
} }
/**
* In the original JS project they check if the node has the style display=none, which unfortunately
* in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none".
*
* Might be a good idea to check for classes or other attributes like 'aria-hidden'
*
* @return bool
*/
public function isProbablyVisible() public function isProbablyVisible()
{ {
/*
* In the original JS project they check if the node has the style display=none, which unfortunately
* in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none".
*
* Might be a good idea to check for classes or other attributes like 'aria-hidden'
*/
return !preg_match('/display:( )?none/', $this->getAttribute('style')) && !$this->hasAttribute('hidden'); return !preg_match('/display:( )?none/', $this->getAttribute('style')) && !$this->hasAttribute('hidden');
} }
/**
* @return bool
*/
public function isWhitespace() public function isWhitespace()
{ {
return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) || return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) ||
@ -557,4 +565,23 @@ trait NodeTrait
$count -= ($count - $nodes->length); $count -= ($count - $nodes->length);
} }
} }
/**
 * Equivalent of the JS firstElementChild property. PHP's firstChild may be any kind of DOMNode
 * (text, comment, ...), so this walks the children and hands back the first one that is a DOMElement.
 *
 * @return \DOMElement|null The first element child, or null when there are no element children.
 */
public function getFirstElementChild()
{
    $children = $this->childNodes;

    // Nothing to walk when childNodes is not iterable.
    if (!($children instanceof \Traversable)) {
        return null;
    }

    foreach ($children as $candidate) {
        if ($candidate instanceof \DOMElement) {
            return $candidate;
        }
    }

    return null;
}
} }

View File

@ -5,6 +5,7 @@ namespace andreskrey\Readability\Nodes;
use andreskrey\Readability\Nodes\DOM\DOMDocument; use andreskrey\Readability\Nodes\DOM\DOMDocument;
use andreskrey\Readability\Nodes\DOM\DOMElement; use andreskrey\Readability\Nodes\DOM\DOMElement;
use andreskrey\Readability\Nodes\DOM\DOMNode; use andreskrey\Readability\Nodes\DOM\DOMNode;
use andreskrey\Readability\Nodes\DOM\DOMNodeList;
/** /**
* Class NodeUtility. * Class NodeUtility.
@ -157,4 +158,23 @@ class NodeUtility
return ($originalNode) ? $originalNode->nextSibling : $originalNode; return ($originalNode) ? $originalNode->nextSibling : $originalNode;
} }
/**
 * Build a new list from a native DOMNodeList, leaving out text nodes that are empty or whitespace-only.
 *
 * Nodes of any other type are always kept, and the relative order of kept nodes is preserved.
 *
 * @param \DOMNodeList $list
 *
 * @return DOMNodeList
 */
public static function filterTextNodes(\DOMNodeList $list)
{
    $filtered = new DOMNodeList();

    foreach ($list as $candidate) {
        // Only text nodes are inspected; their value is trimmed before measuring.
        $isBlankText = $candidate->nodeType === XML_TEXT_NODE
            && !mb_strlen(trim($candidate->nodeValue));

        if ($isBlankText) {
            continue;
        }

        $filtered->add($candidate);
    }

    return $filtered;
}
} }

View File

@ -56,6 +56,13 @@ class Readability
*/ */
protected $author = null; protected $author = null;
/**
* Website name.
*
* @var string|null
*/
protected $siteName = null;
/** /**
* Direction of the text. * Direction of the text.
* *
@ -287,10 +294,10 @@ class Readability
$values = []; $values = [];
// property is a space-separated list of values // property is a space-separated list of values
$propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image)\s*/i'; $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i';
// name is a single value // name is a single value
$namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image)\s*$/i'; $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i';
// Find description tags. // Find description tags.
foreach ($this->dom->getElementsByTagName('meta') as $meta) { foreach ($this->dom->getElementsByTagName('meta') as $meta) {
@ -332,7 +339,6 @@ class Readability
* This could be easily replaced with an ugly set of isset($values['key']) or a bunch of ??s. * This could be easily replaced with an ugly set of isset($values['key']) or a bunch of ??s.
* Will probably replace it with ??s after dropping support of PHP5.6 * Will probably replace it with ??s after dropping support of PHP5.6
*/ */
$key = current(array_intersect([ $key = current(array_intersect([
'dc:title', 'dc:title',
'dcterm:title', 'dcterm:title',
@ -373,11 +379,18 @@ class Readability
// get main image // get main image
$key = current(array_intersect([ $key = current(array_intersect([
'image',
'og:image', 'og:image',
'twitter:image' 'twitter:image'
], array_keys($values))); ], array_keys($values)));
$this->setImage(isset($values[$key]) ? $values[$key] : null); $this->setImage(isset($values[$key]) ? $values[$key] : null);
$key = current(array_intersect([
'og:site_name'
], array_keys($values)));
$this->setSiteName(isset($values[$key]) ? $values[$key] : null);
} }
/** /**
@ -722,7 +735,7 @@ class Readability
*/ */
if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) { if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
$this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128))); $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
$pNode = $node->getChildren(true)[0]; $pNode = NodeUtility::filterTextNodes($node->childNodes)->item(0);
$node->parentNode->replaceChild($pNode, $node); $node->parentNode->replaceChild($pNode, $node);
$node = $pNode; $node = $pNode;
$elementsToScore[] = $node; $elementsToScore[] = $node;
@ -1082,7 +1095,7 @@ class Readability
// If the top candidate is the only child, use parent instead. This will help sibling // If the top candidate is the only child, use parent instead. This will help sibling
// joining logic when adjacent content is actually located in parent's sibling node. // joining logic when adjacent content is actually located in parent's sibling node.
$parentOfTopCandidate = $topCandidate->parentNode; $parentOfTopCandidate = $topCandidate->parentNode;
while ($parentOfTopCandidate->nodeName !== 'body' && count($parentOfTopCandidate->getChildren(true)) === 1) { while ($parentOfTopCandidate->nodeName !== 'body' && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1) {
$topCandidate = $parentOfTopCandidate; $topCandidate = $parentOfTopCandidate;
$parentOfTopCandidate = $topCandidate->parentNode; $parentOfTopCandidate = $topCandidate->parentNode;
} }
@ -1102,14 +1115,16 @@ class Readability
$siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2); $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
// Keep potential top candidate's parent node to try to get text direction of it later. // Keep potential top candidate's parent node to try to get text direction of it later.
$parentOfTopCandidate = $topCandidate->parentNode; $parentOfTopCandidate = $topCandidate->parentNode;
$siblings = $parentOfTopCandidate->getChildren(); $siblings = $parentOfTopCandidate->childNodes;
$hasContent = false; $hasContent = false;
$this->logger->info('[Rating] Adding top candidate siblings...'); $this->logger->info('[Rating] Adding top candidate siblings...');
/** @var DOMElement $sibling */ /* @var DOMElement $sibling */
foreach ($siblings as $sibling) { // Can't foreach here because down there we might change the tag name and that causes the foreach to skip items
for ($i = 0; $i < $siblings->length; $i++) {
$sibling = $siblings[$i];
$append = false; $append = false;
if ($sibling === $topCandidate) { if ($sibling === $topCandidate) {
@ -1147,7 +1162,6 @@ class Readability
* We have a node that isn't a common block level element, like a form or td tag. * We have a node that isn't a common block level element, like a form or td tag.
* Turn it into a div so it doesn't get filtered out later by accident. * Turn it into a div so it doesn't get filtered out later by accident.
*/ */
$sibling = NodeUtility::setNodeTag($sibling, 'div'); $sibling = NodeUtility::setNodeTag($sibling, 'div');
} }
@ -1266,11 +1280,11 @@ class Readability
// Remove single-cell tables // Remove single-cell tables
foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) { foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
/** @var DOMNode $table */ /** @var DOMNode $table */
$tbody = $table->hasSingleTagInsideElement('tbody') ? $table->childNodes[0] : $table; $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
if ($tbody->hasSingleTagInsideElement('tr')) { if ($tbody->hasSingleTagInsideElement('tr')) {
$row = $tbody->firstChild; $row = $tbody->getFirstElementChild();
if ($row->hasSingleTagInsideElement('td')) { if ($row->hasSingleTagInsideElement('td')) {
$cell = $row->firstChild; $cell = $row->getFirstElementChild();
$cell = NodeUtility::setNodeTag($cell, (array_reduce(iterator_to_array($cell->childNodes), function ($carry, $node) { $cell = NodeUtility::setNodeTag($cell, (array_reduce(iterator_to_array($cell->childNodes), function ($carry, $node) {
return $node->isPhrasingContent() && $carry; return $node->isPhrasingContent() && $carry;
}, true)) ? 'p' : 'div'); }, true)) ? 'p' : 'div');
@ -1597,7 +1611,7 @@ class Readability
$node->removeAttribute('class'); $node->removeAttribute('class');
} }
for ($node = $node->firstChild; $node !== null; $node = $node->nextSibling) { for ($node = $node->getFirstElementChild(); $node !== null; $node = $node->nextSibling) {
$this->_cleanClasses($node); $this->_cleanClasses($node);
} }
} }
@ -1756,6 +1770,22 @@ class Readability
$this->author = $author; $this->author = $author;
} }
/**
* Returns the website name stored on this instance.
*
* The value is whatever was last passed to setSiteName(); the metadata parsing code
* sets it from the 'og:site_name' meta value when present and to null otherwise.
*
* @return string|null
*/
public function getSiteName()
{
return $this->siteName;
}
/**
* Stores the website name on this instance.
*
* @param string|null $siteName Site name, or null when none was found (the metadata parsing
*                              code passes null when no 'og:site_name' value exists)
*/
protected function setSiteName($siteName)
{
$this->siteName = $siteName;
}
/** /**
* @return null|string * @return null|string
*/ */