'/-ad-|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i',
        'okMaybeItsACandidate' => '/and|article|body|column|main|shadow/i',
        'extraneous' => '/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i',
        'byline' => '/byline|author|dateline|writtenby|p-author/i',
        // NOTE(review): PHP's PCRE has no "g" modifier, so preg_* functions reject this pattern
        // ("Unknown modifier 'g'"). Left untouched because its consumers are not visible from here; confirm
        // whether it is used and, if so, drop the "g".
        'replaceFonts' => '/<(\/?)font[^>]*>/gi',
        'normalize' => '/\s{2,}/',
        'videos' => '/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i',
        'nextLink' => '/(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i',
        'prevLink' => '/(prev|earl|old|new|<|«)/i',
        'whitespace' => '/^\s*$/',
        'hasContent' => '/\S$/',
        'positive' => '/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i',
        'negative' => '/hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i',
        // \x{00A0} is the unicode version of &nbsp;
        'onlyWhitespace' => '/\x{00A0}|\s+/u'
    ];

    /**
     * Imported from the Element class on league\html-to-markdown.
     *
     * Starting at $node itself, walks forward through the siblings while the current node is neither an element
     * nor reports anything other than whitespace, and returns the node on which the walk stops.
     *
     * @param DOMNode|null $node Node to start from. isWhitespace() is called on it; that method is not part of
     *                           PHP's native DOMNode, so this presumably relies on the project's node classes.
     *
     * @return DOMNode|null The first node that is an element or is not whitespace, or null if the siblings run out
     */
    public static function nextElement($node)
    {
        $next = $node;
        while ($next && $next->nodeType !== XML_ELEMENT_NODE && $next->isWhitespace()) {
            $next = $next->nextSibling;
        }

        return $next;
    }

    /**
     * Changes the node tag name. Since tagName on DOMElement is a read only value, this must be done creating a new
     * element with the new tag name and importing it to the main DOMDocument.
     *
     * The children of $node are deep-copied into the new element, so $node itself keeps its own children after it
     * has been swapped out of the tree.
     *
     * @param DOMNode $node             Node to be replaced
     * @param string  $value            New tag name
     * @param bool    $importAttributes Whether the attributes of $node are copied to the new element
     *
     * @return DOMNode The new element, owned by the document of $node
     */
    public static function setNodeTag($node, $value, $importAttributes = true)
    {
        // Build the replacement inside a scratch document.
        $new = new DOMDocument('1.0', 'utf-8');
        $replacement = $new->createElement($value);
        $new->appendChild($replacement);

        $children = $node->childNodes;
        for ($i = 0; $i < $children->length; $i++) {
            $replacement->appendChild($new->importNode($children->item($i), true));
        }

        // hasAttributes() is false for nodes without an attribute map, which avoids iterating over null.
        if ($importAttributes && $node->hasAttributes()) {
            // Import attributes from the original node.
            foreach ($node->attributes as $attribute) {
                $replacement->setAttribute($attribute->nodeName, $attribute->nodeValue);
            }
        }

        // The import must be done on the element and not on $new, since $new is a DOMDocument and not a DOMElement.
        $import = $node->ownerDocument->importNode($replacement, true);

        // A detached node has nothing to be replaced in; the new element is still returned to the caller.
        if ($node->parentNode) {
            $node->parentNode->replaceChild($import, $node);
        }

        return $import;
    }

    /**
     * Removes the current node and returns the next node to be parsed (sibling or parent's sibling; the children
     * are skipped because they go away together with the node).
     *
     * @param DOMNode $node
     *
     * @return DOMNode|null Null when there is no node left to visit
     */
    public static function removeAndGetNext($node)
    {
        // The successor has to be resolved before the node is detached from the tree.
        $nextNode = self::getNextNode($node, true);

        // Delegating keeps the "no parent" guard in a single place.
        self::removeNode($node);

        return $nextNode;
    }

    /**
     * Remove the selected node. Does nothing when the node has no parent.
     *
     * @param DOMElement $node
     *
     * @return void
     */
    public static function removeNode($node)
    {
        $parent = $node->parentNode;
        if ($parent) {
            $parent->removeChild($node);
        }
    }

    /**
     * Returns the next node. First checks for children (if the flag allows it), then for siblings, and finally
     * for parents.
     *
     * Traverse the DOM from node to node, starting at the node passed in. Pass true for the second parameter to
     * indicate this node itself (and its kids) are going away, and we want the next node over.
     *
     * Calling this in a loop will traverse the DOM depth-first.
     *
     * @param DOMNode $originalNode
     * @param bool    $ignoreSelfAndKids
     *
     * @return DOMNode|null Null once the traversal has run past the last node
     */
    public static function getNextNode($originalNode, $ignoreSelfAndKids = false)
    {
        // First check for kids if those aren't being ignored
        if (!$ignoreSelfAndKids && $originalNode->firstChild) {
            return $originalNode->firstChild;
        }

        // Then for siblings...
        if ($originalNode->nextSibling) {
            return $originalNode->nextSibling;
        }

        // And finally, move up the parent chain *and* find a sibling
        // (because this is depth-first traversal, we will have already
        // seen the parent nodes themselves).
        do {
            $originalNode = $originalNode->parentNode;
        } while ($originalNode && !$originalNode->nextSibling);

        return $originalNode ? $originalNode->nextSibling : null;
    }

    /**
     * Builds a new list without the text nodes that are empty or contain only whitespace. Every other node is
     * kept, in the original order.
     *
     * @param \DOMNodeList $list
     *
     * @return DOMNodeList New list instance; add() is called on it, which PHP's native \DOMNodeList does not offer,
     *                     so this presumably is the project's own DOMNodeList class
     */
    public static function filterTextNodes(\DOMNodeList $list)
    {
        $newList = new DOMNodeList();
        foreach ($list as $node) {
            if ($node->nodeType !== XML_TEXT_NODE || mb_strlen(trim($node->nodeValue)) > 0) {
                $newList->add($node);
            }
        }

        return $newList;
    }
}