update readability library

This commit is contained in:
Andrew Dolgov 2019-08-16 12:53:25 +03:00
parent 0e3b71c535
commit 24f55d5b91
4 changed files with 102 additions and 51 deletions

View File

@ -166,32 +166,6 @@ class Configuration
return $this;
}
/**
* Legacy accessor that returns the value of the charThreshold property.
*
* @deprecated Use getCharThreshold. Will be removed in version 3.0
*
* @return int
*/
public function getWordThreshold()
{
// Raise a silenced E_USER_DEPRECATED notice pointing callers at the replacement method.
@trigger_error('getWordThreshold was replaced with getCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);
return $this->charThreshold;
}
/**
* Legacy setter that stores the given value in the charThreshold property.
*
* @deprecated Use setCharThreshold. Will be removed in version 3.0
*
* @param int $charThreshold
*
* @return $this
*/
public function setWordThreshold($charThreshold)
{
// Raise a silenced E_USER_DEPRECATED notice pointing callers at the replacement method.
@trigger_error('setWordThreshold was replaced with setCharThreshold and will be removed in version 3.0', E_USER_DEPRECATED);
$this->charThreshold = $charThreshold;
// Return the instance so calls can be chained.
return $this;
}
/**
* @return bool
*/

View File

@ -181,11 +181,11 @@ trait NodeTrait
/**
* Override for native hasAttribute.
*
* @see getAttribute
*
* @param $attributeName
*
* @return bool
*
* @see getAttribute
*/
public function hasAttribute($attributeName)
{
@ -317,10 +317,14 @@ trait NodeTrait
*
* @param bool $filterEmptyDOMText Filter empty DOMText nodes?
*
* @deprecated Use NodeUtility::filterTextNodes, function will be removed in version 3.0
*
* @return array
*/
public function getChildren($filterEmptyDOMText = false)
{
@trigger_error('getChildren was replaced with NodeUtility::filterTextNodes and will be removed in version 3.0', E_USER_DEPRECATED);
$ret = iterator_to_array($this->childNodes);
if ($filterEmptyDOMText) {
// Array values is used to discard the key order. Needs to be 0 to whatever without skipping any number
@ -418,12 +422,12 @@ trait NodeTrait
public function hasSingleTagInsideElement($tag)
{
// There should be exactly 1 element child with given tag
if (count($children = $this->getChildren(true)) !== 1 || $children[0]->nodeName !== $tag) {
if (count($children = NodeUtility::filterTextNodes($this->childNodes)) !== 1 || $children->item(0)->nodeName !== $tag) {
return false;
}
// And there should be no text nodes with real content
return array_reduce($children, function ($carry, $child) {
return array_reduce(iterator_to_array($children), function ($carry, $child) {
if (!$carry === false) {
return false;
}
@ -443,7 +447,7 @@ trait NodeTrait
{
$result = false;
if ($this->hasChildNodes()) {
foreach ($this->getChildren() as $child) {
foreach ($this->childNodes as $child) {
if (in_array($child->nodeName, $this->divToPElements)) {
$result = true;
} else {
@ -500,18 +504,22 @@ trait NodeTrait
);
}
/**
* In the original JS project they check if the node has the style display=none, which unfortunately
* in our case we have no way of knowing that. So we just check for the attribute hidden or "display: none".
*
* Might be a good idea to check for classes or other attributes like 'aria-hidden'
*
* @return bool
*/
public function isProbablyVisible()
{
    /*
     * In the original JS project they check if the node has the style display=none, which unfortunately
     * in our case we have no way of knowing that. So we just check for the attribute hidden or an inline
     * "display: none" declaration.
     *
     * Might be a good idea to check for classes or other attributes like 'aria-hidden'
     */
    if ($this->hasAttribute('hidden')) {
        return false;
    }

    // CSS property names and keywords are case-insensitive and allow arbitrary whitespace around the
    // colon, so accept "display:none", "display : none", "DISPLAY:\tNONE", etc. The string cast keeps
    // preg_match from receiving a non-string subject when the attribute is missing.
    $inlineStyle = (string) $this->getAttribute('style');

    return !preg_match('/display\s*:\s*none/i', $inlineStyle);
}
/**
* @return bool
*/
public function isWhitespace()
{
return ($this->nodeType === XML_TEXT_NODE && mb_strlen(trim($this->textContent)) === 0) ||
@ -557,4 +565,23 @@ trait NodeTrait
$count -= ($count - $nodes->length);
}
}
/**
 * Counterpart of the JS firstElementChild property: walks the child nodes in order and hands back
 * the first one that is a DOMElement, skipping any other kind of node.
 *
 * @return \DOMElement|null The first element child, or null when there is none
 */
public function getFirstElementChild()
{
    $children = $this->childNodes;

    // Nothing to iterate over: bail out early.
    if (!($children instanceof \Traversable)) {
        return null;
    }

    foreach ($children as $candidate) {
        if ($candidate instanceof \DOMElement) {
            return $candidate;
        }
    }

    return null;
}
}

View File

@ -5,6 +5,7 @@ namespace andreskrey\Readability\Nodes;
use andreskrey\Readability\Nodes\DOM\DOMDocument;
use andreskrey\Readability\Nodes\DOM\DOMElement;
use andreskrey\Readability\Nodes\DOM\DOMNode;
use andreskrey\Readability\Nodes\DOM\DOMNodeList;
/**
* Class NodeUtility.
@ -157,4 +158,23 @@ class NodeUtility
return ($originalNode) ? $originalNode->nextSibling : $originalNode;
}
/**
 * Build a new node list from the given one, leaving out text nodes whose value is empty
 * once trimmed. Every non-text node, and every text node with real content, is kept in order.
 *
 * @param \DOMNodeList $list Nodes to filter
 *
 * @return DOMNodeList Freshly created list holding the nodes that were kept
 */
public static function filterTextNodes(\DOMNodeList $list)
{
    $kept = new DOMNodeList();

    foreach ($list as $candidate) {
        // A "blank" node is a text node with nothing but whitespace in it.
        $isBlankText = $candidate->nodeType === XML_TEXT_NODE
            && !mb_strlen(trim($candidate->nodeValue));

        if ($isBlankText) {
            continue;
        }

        $kept->add($candidate);
    }

    return $kept;
}
}

View File

@ -56,6 +56,13 @@ class Readability
*/
protected $author = null;
/**
* Website name.
*
* @var string|null
*/
protected $siteName = null;
/**
* Direction of the text.
*
@ -287,10 +294,10 @@ class Readability
$values = [];
// property is a space-separated list of values
$propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image)\s*/i';
$propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i';
// name is a single value
$namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image)\s*$/i';
$namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)(?!:)\s*$/i';
// Find description tags.
foreach ($this->dom->getElementsByTagName('meta') as $meta) {
@ -332,7 +339,6 @@ class Readability
* This could be easily replaced with an ugly set of isset($values['key']) or a bunch of ??s.
* Will probably replace it with ??s after dropping support of PHP5.6
*/
$key = current(array_intersect([
'dc:title',
'dcterm:title',
@ -373,11 +379,18 @@ class Readability
// get main image
$key = current(array_intersect([
'image',
'og:image',
'twitter:image'
], array_keys($values)));
$this->setImage(isset($values[$key]) ? $values[$key] : null);
$key = current(array_intersect([
'og:site_name'
], array_keys($values)));
$this->setSiteName(isset($values[$key]) ? $values[$key] : null);
}
/**
@ -722,7 +735,7 @@ class Readability
*/
if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {
$this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));
$pNode = $node->getChildren(true)[0];
$pNode = NodeUtility::filterTextNodes($node->childNodes)->item(0);
$node->parentNode->replaceChild($pNode, $node);
$node = $pNode;
$elementsToScore[] = $node;
@ -1082,7 +1095,7 @@ class Readability
// If the top candidate is the only child, use parent instead. This will help sibling
// joining logic when adjacent content is actually located in parent's sibling node.
$parentOfTopCandidate = $topCandidate->parentNode;
while ($parentOfTopCandidate->nodeName !== 'body' && count($parentOfTopCandidate->getChildren(true)) === 1) {
while ($parentOfTopCandidate->nodeName !== 'body' && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1) {
$topCandidate = $parentOfTopCandidate;
$parentOfTopCandidate = $topCandidate->parentNode;
}
@ -1102,14 +1115,16 @@ class Readability
$siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
// Keep potential top candidate's parent node to try to get text direction of it later.
$parentOfTopCandidate = $topCandidate->parentNode;
$siblings = $parentOfTopCandidate->getChildren();
$siblings = $parentOfTopCandidate->childNodes;
$hasContent = false;
$this->logger->info('[Rating] Adding top candidate siblings...');
/** @var DOMElement $sibling */
foreach ($siblings as $sibling) {
/* @var DOMElement $sibling */
// Can't foreach here because down there we might change the tag name and that causes the foreach to skip items
for ($i = 0; $i < $siblings->length; $i++) {
$sibling = $siblings[$i];
$append = false;
if ($sibling === $topCandidate) {
@ -1147,7 +1162,6 @@ class Readability
* We have a node that isn't a common block level element, like a form or td tag.
* Turn it into a div so it doesn't get filtered out later by accident.
*/
$sibling = NodeUtility::setNodeTag($sibling, 'div');
}
@ -1266,11 +1280,11 @@ class Readability
// Remove single-cell tables
foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
/** @var DOMNode $table */
$tbody = $table->hasSingleTagInsideElement('tbody') ? $table->childNodes[0] : $table;
$tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
if ($tbody->hasSingleTagInsideElement('tr')) {
$row = $tbody->firstChild;
$row = $tbody->getFirstElementChild();
if ($row->hasSingleTagInsideElement('td')) {
$cell = $row->firstChild;
$cell = $row->getFirstElementChild();
$cell = NodeUtility::setNodeTag($cell, (array_reduce(iterator_to_array($cell->childNodes), function ($carry, $node) {
return $node->isPhrasingContent() && $carry;
}, true)) ? 'p' : 'div');
@ -1597,7 +1611,7 @@ class Readability
$node->removeAttribute('class');
}
for ($node = $node->firstChild; $node !== null; $node = $node->nextSibling) {
for ($node = $node->getFirstElementChild(); $node !== null; $node = $node->nextSibling) {
$this->_cleanClasses($node);
}
}
@ -1756,6 +1770,22 @@ class Readability
$this->author = $author;
}
/**
* Returns the stored website name, or null when none has been set.
*
* @return string|null
*/
public function getSiteName()
{
return $this->siteName;
}
/**
* Stores the website name. The metadata extraction passes null when no site name was found.
*
* @param string|null $siteName
*/
protected function setSiteName($siteName)
{
$this->siteName = $siteName;
}
/**
* @return null|string
*/