af_readability: add missing file

This commit is contained in:
Andrew Dolgov 2019-08-16 15:29:24 +03:00
parent 865c54abcb
commit 3e4701116d
11 changed files with 202 additions and 66 deletions

View File

@ -88,7 +88,7 @@ class Backend extends Handler {
}
function help() {
$topic = basename(clean($_REQUEST["topic"])); // only one for now
$topic = clean_filename($_REQUEST["topic"]); // only one for now
if ($topic == "main") {
$info = get_hotkeys_info();

View File

@ -1203,30 +1203,30 @@ class Handler_Public extends Handler {
public function pluginhandler() {
$host = new PluginHost();
$plugin = basename(clean($_REQUEST["plugin"]));
$plugin_name = clean_filename($_REQUEST["plugin"]);
$method = clean($_REQUEST["pmethod"]);
$host->load($plugin, PluginHost::KIND_USER, 0);
$host->load($plugin_name, PluginHost::KIND_USER, 0);
$host->load_data();
$pclass = $host->get_plugin($plugin);
$plugin = $host->get_plugin($plugin_name);
if ($pclass) {
if (method_exists($pclass, $method)) {
if ($pclass->is_public_method($method)) {
$pclass->$method();
if ($plugin) {
if (method_exists($plugin, $method)) {
if ($plugin->is_public_method($method)) {
$plugin->$method();
} else {
user_error("pluginhandler: Requested private method '$method' of plugin '$plugin'.");
user_error("PluginHandler[PUBLIC]: Requested private method '$method' of plugin '$plugin_name'.", E_USER_WARNING);
header("Content-Type: text/json");
print error_json(6);
}
} else {
user_error("pluginhandler: Requested unknown method '$method' of plugin '$plugin'.");
user_error("PluginHandler[PUBLIC]: Requested unknown method '$method' of plugin '$plugin_name'.", E_USER_WARNING);
header("Content-Type: text/json");
print error_json(13);
}
} else {
user_error("pluginhandler: Requested method '$method' of unknown plugin '$plugin'.");
user_error("PluginHandler[PUBLIC]: Requested method '$method' of unknown plugin '$plugin_name'.", E_USER_WARNING);
header("Content-Type: text/json");
print error_json(14);
}

View File

@ -5,15 +5,18 @@ class PluginHandler extends Handler_Protected {
}
function catchall($method) {
$plugin = PluginHost::getInstance()->get_plugin(clean($_REQUEST["plugin"]));
$plugin_name = clean($_REQUEST["plugin"]);
$plugin = PluginHost::getInstance()->get_plugin($plugin_name);
if ($plugin) {
if (method_exists($plugin, $method)) {
$plugin->$method();
} else {
user_error("PluginHandler: Requested unknown method '$method' of plugin '$plugin_name'.", E_USER_WARNING);
print error_json(13);
}
} else {
user_error("PluginHandler: Requested method '$method' of unknown plugin '$plugin_name'.", E_USER_WARNING);
print error_json(14);
}
}

View File

@ -186,7 +186,7 @@ class PluginHost {
foreach ($plugins as $class) {
$class = trim($class);
$class_file = strtolower(basename($class));
$class_file = strtolower(clean_filename($class));
if (!is_dir(__DIR__."/../plugins/$class_file") &&
!is_dir(__DIR__."/../plugins.local/$class_file")) continue;

View File

@ -572,7 +572,7 @@ class RPC extends Handler_Protected {
function log() {
$msg = clean($_REQUEST['msg']);
$file = basename(clean($_REQUEST['file']));
$file = clean_filename($_REQUEST['file']);
$line = (int) clean($_REQUEST['line']);
$context = clean($_REQUEST['context']);

View File

@ -593,7 +593,7 @@
}
function clean_filename($filename) {
return basename(preg_replace("/\.\.|[\/\\\]/", "", $filename));
return basename(preg_replace("/\.\.|[\/\\\]/", "", clean($filename)));
}
function make_password($length = 12) {

View File

@ -166,32 +166,6 @@ class Configuration
return $this;
}
/**
* @deprecated Use getCharThreshold. Will be removed in version 2.0
*
* @return int
*/
public function getWordThreshold()
{
@trigger_error('getWordThreshold was replaced with getCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);
return $this->charThreshold;
}
/**
* @param int $charThreshold
*
* @return $this
*/
public function setWordThreshold($charThreshold)
{
@trigger_error('setWordThreshold was replaced with setCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);
$this->charThreshold = $charThreshold;
return $this;
}
/**
* @return bool
*/

View File

@ -0,0 +1,82 @@
<?php
namespace andreskrey\Readability\Nodes\DOM;
/**
 * Class DOMNodeList.
 *
 * A userland stand-in for the native \DOMNodeList that can be filled by hand. The native list is populated
 * by the DOM extension when it is created and offers no way to append nodes; this version exposes the same
 * read API (->length, item(), count(), iteration) and adds add() so new nodes can be inserted.
 *
 * It cannot extend the native \DOMNodeList class because the functionality behind the ->length property is
 * hidden from the user and cannot be extended, changed, or tweaked.
 */
class DOMNodeList implements \Countable, \IteratorAggregate
{
    /**
     * Nodes held by the list, in insertion order.
     *
     * @var array
     */
    protected $items = [];

    /**
     * Number of nodes in $items; exposed read-only as ->length through __get().
     *
     * @var int
     */
    protected $length = 0;

    /**
     * To allow access to length in the same way that DOMNodeList allows.
     *
     * Any other property name raises an "Undefined property" notice and evaluates to null.
     *
     * @param string $name
     *
     * @return int|null
     */
    public function __get($name)
    {
        switch ($name) {
            case 'length':
                return $this->length;
            default:
                trigger_error(sprintf('Undefined property: %s::%s', static::class, $name));

                return null;
        }
    }

    /**
     * Append a node to the end of the list.
     *
     * @param DOMNode|DOMElement|DOMComment $node
     *
     * @return DOMNodeList The list itself, so calls can be chained
     */
    public function add($node)
    {
        $this->items[] = $node;
        $this->length++;

        return $this;
    }

    /**
     * Fetch the node stored at the given position.
     *
     * @param int $offset Zero-based position in the list
     *
     * @return DOMNode|DOMElement|DOMComment|null The node, or null when $offset is not a valid index
     */
    public function item(int $offset)
    {
        // The native DOMNodeList::item() yields null for an invalid index instead of raising an
        // undefined-offset error, so do the same here.
        return $this->items[$offset] ?? null;
    }

    /**
     * @return int Number of nodes in the list
     */
    public function count(): int
    {
        return $this->length;
    }

    /**
     * To make it compatible with iterator_to_array() function.
     *
     * @return \ArrayIterator Iterator over the stored nodes, keyed by position
     */
    public function getIterator(): \ArrayIterator
    {
        return new \ArrayIterator($this->items);
    }
}

View File

@ -181,11 +181,11 @@ trait NodeTrait
/**
* Override for native hasAttribute.
*
* @see getAttribute
*
* @param $attributeName
*
* @return bool
*
* @see getAttribute
*/
public function hasAttribute($attributeName)
{
@ -317,10 +317,14 @@ trait NodeTrait
*
* @param bool $filterEmptyDOMText Filter empty DOMText nodes?
*
* @deprecated Use NodeUtility::filterTextNodes, function will be removed in version 3.0
*
* @return array
*/
public function getChildren($filterEmptyDOMText = false)
{
@trigger_error('getChildren was replaced with NodeUtility::filterTextNodes and will be removed in version 3.0', E_USER_DEPRECATED);
$ret = iterator_to_array($this->childNodes);
if ($filterEmptyDOMText) {
// Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number
@ -418,12 +422,12 @@ trait NodeTrait
public function hasSingleTagInsideElement($tag)
{
// There should be exactly 1 element child with given tag
if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== $tag) {
if (count($children = NodeUtility::filterTextNodes($this->childNodes)) !== 1 || $children->item(0)->nodeName !== $tag) {
return false;
}
// And there should be no text nodes with real content
return array_reduce($children, function ($carry, $child) {
return array_reduce(iterator_to_array($children), function ($carry, $child) {
if (!$carry === false) {
return false;
}
@ -443,7 +447,7 @@ trait NodeTrait
{
$result = false;
if ($this->hasChildNodes()) {
foreach ($this->getChildren() as $child) {
foreach ($this->childNodes as $child) {
if (in_array($child->nodeName, $this->divToPElements)) {
$result = true;
} else {
@ -500,18 +504,22 @@ trait NodeTrait
);
}
public function isProbablyVisible()
{
/*
/**
* In the original JS project they check if the node has the style display=none, which unfortunately
* in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none".
*
* Might be a good idea to check for classes or other attributes like 'aria-hidden'
*
* @return bool
*/
public function isProbablyVisible()
{
return !preg_match('/display:( )?none/', $this->getAttribute('style')) && !$this->hasAttribute('hidden');
}
/**
* @return bool
*/
public function isWhitespace()
{
return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) ||
@ -557,4 +565,23 @@ trait NodeTrait
$count -= ($count - $nodes->length);
}
}
/**
 * Equivalent of the firstElementChild property found in JS. PHP's firstChild may be any kind of DOMNode
 * (text, comment, ...), so this walks the children and hands back the first one that is a DOMElement.
 *
 * @return \DOMElement|null The first element child, or null when there is none
 */
public function getFirstElementChild()
{
    // Nothing to walk when childNodes is not iterable.
    if (!($this->childNodes instanceof \Traversable)) {
        return null;
    }

    foreach ($this->childNodes as $candidate) {
        if ($candidate instanceof \DOMElement) {
            return $candidate;
        }
    }

    return null;
}
}

View File

@ -5,6 +5,7 @@ namespace andreskrey\Readability\Nodes;
use andreskrey\Readability\Nodes\DOM\DOMDocument;
use andreskrey\Readability\Nodes\DOM\DOMElement;
use andreskrey\Readability\Nodes\DOM\DOMNode;
use andreskrey\Readability\Nodes\DOM\DOMNodeList;
/**
* Class NodeUtility.
@ -157,4 +158,23 @@ class NodeUtility
return ($originalNode) ? $originalNode->nextSibling : $originalNode;
}
/**
 * Build a new list from the given one, leaving out text nodes whose value is empty once trimmed.
 * Every other node is carried over in its original order.
 *
 * @param \DOMNodeList $list Native node list to filter
 *
 * @return DOMNodeList Fresh list holding the nodes that were kept
 */
public static function filterTextNodes(\DOMNodeList $list)
{
    $kept = new DOMNodeList();

    foreach ($list as $candidate) {
        // Only text nodes are ever dropped, and only when nothing is left after trimming.
        $isBlankText = $candidate->nodeType === XML_TEXT_NODE && !mb_strlen(trim($candidate->nodeValue));
        if ($isBlankText) {
            continue;
        }

        $kept->add($candidate);
    }

    return $kept;
}
}

View File

@ -56,6 +56,13 @@ class Readability
*/
protected $author = null;
/**
* Website name.
*
* @var string|null
*/
protected $siteName = null;
/**
* Direction of the text.
*
@ -287,10 +294,10 @@ class Readability
$values = [];
// property is a space-separated list of values
$propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image)\s*/i';
$propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i';
// name is a single value
$namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image)\s*$/i';
$namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i';
// Find description tags.
foreach ($this->dom->getElementsByTagName('meta') as $meta) {
@ -332,7 +339,6 @@ class Readability
* This could be easily replaced with an ugly set of isset($values['key']) or a bunch of ??s.
* Will probably replace it with ??s after dropping support of PHP5.6
*/
$key = current(array_intersect([
'dc:title',
'dcterm:title',
@ -373,11 +379,18 @@ class Readability
// get main image
$key = current(array_intersect([
'image',
'og:image',
'twitter:image'
], array_keys($values)));
$this->setImage(isset($values[$key]) ? $values[$key] : null);
$key = current(array_intersect([
'og:site_name'
], array_keys($values)));
$this->setSiteName(isset($values[$key]) ? $values[$key] : null);
}
/**
@ -722,7 +735,7 @@ class Readability
*/
if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
$this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
$pNode = $node->getChildren(true)[0];
$pNode = NodeUtility::filterTextNodes($node->childNodes)->item(0);
$node->parentNode->replaceChild($pNode, $node);
$node = $pNode;
$elementsToScore[] = $node;
@ -1082,7 +1095,7 @@ class Readability
// If the top candidate is the only child, use parent instead. This will help sibling
// joining logic when adjacent content is actually located in parent's sibling node.
$parentOfTopCandidate = $topCandidate->parentNode;
while ($parentOfTopCandidate->nodeName !== 'body' && count($parentOfTopCandidate->getChildren(true)) === 1) {
while ($parentOfTopCandidate->nodeName !== 'body' && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1) {
$topCandidate = $parentOfTopCandidate;
$parentOfTopCandidate = $topCandidate->parentNode;
}
@ -1102,14 +1115,16 @@ class Readability
$siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
// Keep potential top candidate's parent node to try to get text direction of it later.
$parentOfTopCandidate = $topCandidate->parentNode;
$siblings = $parentOfTopCandidate->getChildren();
$siblings = $parentOfTopCandidate->childNodes;
$hasContent = false;
$this->logger->info('[Rating] Adding top candidate siblings...');
/** @var DOMElement $sibling */
foreach ($siblings as $sibling) {
/* @var DOMElement $sibling */
// Can't foreach here because down there we might change the tag name and that causes the foreach to skip items
for ($i = 0; $i < $siblings->length; $i++) {
$sibling = $siblings[$i];
$append = false;
if ($sibling === $topCandidate) {
@ -1147,7 +1162,6 @@ class Readability
* We have a node that isn't a common block level element, like a form or td tag.
* Turn it into a div so it doesn't get filtered out later by accident.
*/
$sibling = NodeUtility::setNodeTag($sibling, 'div');
}
@ -1266,11 +1280,11 @@ class Readability
// Remove single-cell tables
foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
/** @var DOMNode $table */
$tbody = $table->hasSingleTagInsideElement('tbody') ? $table->childNodes[0] : $table;
$tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
if ($tbody->hasSingleTagInsideElement('tr')) {
$row = $tbody->firstChild;
$row = $tbody->getFirstElementChild();
if ($row->hasSingleTagInsideElement('td')) {
$cell = $row->firstChild;
$cell = $row->getFirstElementChild();
$cell = NodeUtility::setNodeTag($cell, (array_reduce(iterator_to_array($cell->childNodes), function ($carry, $node) {
return $node->isPhrasingContent() && $carry;
}, true)) ? 'p' : 'div');
@ -1597,7 +1611,7 @@ class Readability
$node->removeAttribute('class');
}
for ($node = $node->firstChild; $node !== null; $node = $node->nextSibling) {
for ($node = $node->getFirstElementChild(); $node !== null; $node = $node->nextSibling) {
$this->_cleanClasses($node);
}
}
@ -1756,6 +1770,22 @@ class Readability
$this->author = $author;
}
/**
* Get the website name stored on this instance.
*
* The value is whatever was last passed to setSiteName(); it is null when no name has been set.
*
* @return string|null
*/
public function getSiteName()
{
return $this->siteName;
}
/**
* Store the website name on this instance, replacing any previous value.
*
* @param string|null $siteName Website name; the metadata code in this class passes null when no
*                              og:site_name value was collected
*/
protected function setSiteName($siteName)
{
$this->siteName = $siteName;
}
/**
* @return null|string
*/