From 00f622e9b7325973e07805b6f3c140e227d18588 Mon Sep 17 00:00:00 2001 From: Jeremy Benoist Date: Tue, 1 Mar 2016 14:30:27 +0100 Subject: [PATCH] Revert BC changes - avoid method signature update - revert moving logic out of the constructor --- src/Readability.php | 62 +++++++++++++++++++++++++-------------- tests/ReadabilityTest.php | 29 +++++++++++------- 2 files changed, 59 insertions(+), 32 deletions(-) diff --git a/src/Readability.php b/src/Readability.php index 2969c4b..6e09e66 100644 --- a/src/Readability.php +++ b/src/Readability.php @@ -174,14 +174,15 @@ class Readability implements LoggerAwareInterface * @param string (optional) Which parser to use for turning raw HTML into a DOMDocument * @param bool (optional) Use tidy */ - public function __construct($html, $url = null, $parser = 'libxml', $useTidy = true) + public function __construct($html, $url = null, $parser = 'libxml', $use_tidy = true) { $this->url = $url; $this->html = $html; $this->parser = $parser; - $this->useTidy = $useTidy && function_exists('tidy_parse_string'); + $this->useTidy = $use_tidy && function_exists('tidy_parse_string'); $this->logger = new NullLogger(); + $this->loadHtml(); } public function setLogger(LoggerInterface $logger) @@ -235,6 +236,8 @@ public function addPostFilter($filter, $replacer = '') * Load HTML in a DOMDocument. * Apply Pre filters * Cleanup HTML using Tidy (or not). + * + * @todo This should be called in init() instead of from __construct */ private function loadHtml() { @@ -266,7 +269,6 @@ private function loadHtml() * Use tidy (if it exists). * This fixes problems with some sites which would otherwise trouble DOMDocument's HTML parsing. * Although sometimes it makes matters worse, which is why there is an option to disable it. - * */ if ($this->useTidy) { $this->logger->debug('Tidying document'); @@ -314,8 +316,6 @@ private function loadHtml() */ public function init() { - $this->loadHtml(); - if (!isset($this->dom->documentElement)) { return false; } @@ -372,12 +372,31 @@ public function init() return $this->success; } + /** + * Debug. + * + * @deprecated use $this->logger->debug() instead + */ + protected function dbg($msg) + { + $this->logger->debug($msg); + } + + /** + * Dump debug info. + * + * @deprecated since Monolog gather log, we don't need it + */ + protected function dump_dbg() + { + } + /** * Run any post-process modifications to article content as necessary. * * @param \DOMElement $articleContent */ - public function postProcessContent(\DOMElement $articleContent) + public function postProcessContent($articleContent) { if ($this->convertLinksToFootnotes && !preg_match('/\bwiki/', $this->url)) { $this->addFootnotes($articleContent); @@ -462,7 +481,7 @@ protected function prepDocument() * * @param \DOMElement $articleContent */ - public function addFootnotes(\DOMElement $articleContent) + public function addFootnotes($articleContent) { $footnotesWrapper = $this->dom->createElement('footer'); $footnotesWrapper->setAttribute('class', 'readability-footnotes'); @@ -526,7 +545,7 @@ public function addFootnotes(\DOMElement $articleContent) * * @param \DOMElement $articleContent */ - public function prepArticle(\DOMElement $articleContent) + public function prepArticle($articleContent) { $this->logger->debug($this->lightClean ? 'Light clean enabled.' : 'Standard clean enabled.'); @@ -623,7 +642,7 @@ public function prepArticle(\DOMElement $articleContent) * * @param \DOMElement $node */ - protected function initializeNode(\DOMElement $node) + protected function initializeNode($node) { if (!isset($node->tagName)) { return; @@ -694,7 +713,7 @@ protected function initializeNode(\DOMElement $node) * * @return \DOMElement|bool */ - protected function grabArticle(\DOMElement $page = null) + protected function grabArticle($page = null) { if (!$page) { $page = $this->dom; @@ -743,8 +762,7 @@ protected function grabArticle(\DOMElement $page = null) continue; } - // XML_TEXT_NODE - if ($childNode->nodeType == 3) { + if ($childNode->nodeType === XML_TEXT_NODE) { $p = $this->dom->createElement('p'); $p->innerHTML = $childNode->nodeValue; $p->setAttribute('data-readability-styled', 'true'); @@ -770,7 +788,7 @@ protected function grabArticle(\DOMElement $page = null) continue; } - $grandParentNode = ($parentNode->parentNode instanceof \DOMElement) ? $parentNode->parentNode : null; + $grandParentNode = $parentNode->parentNode instanceof \DOMElement ? $parentNode->parentNode : null; $innerText = $this->getInnerText($nodesToScore[$pt]); // If this paragraph is less than MIN_PARAGRAPH_LENGTH (default:20) characters, don't even count it. @@ -1051,7 +1069,7 @@ protected function grabArticle(\DOMElement $page = null) * * @return string */ - public function getInnerText(\DOMElement $e = null, $normalizeSpaces = true, $flattenLines = false) + public function getInnerText($e, $normalizeSpaces = true, $flattenLines = false) { if (null === $e || !isset($e->textContent) || $e->textContent === '') { return ''; @@ -1073,7 +1091,7 @@ public function getInnerText(\DOMElement $e = null, $normalizeSpaces = true, $fl * * @param \DOMElement $e */ - public function cleanStyles(\DOMElement $e) + public function cleanStyles($e) { if (!is_object($e)) { return; @@ -1121,7 +1139,7 @@ public function getWordCount($text) * * @return int */ - public function getLinkDensity(\DOMElement $e, $excludeExternal = false) + public function getLinkDensity($e, $excludeExternal = false) { $links = $e->getElementsByTagName('a'); $textLength = mb_strlen($this->getInnerText($e, true, true)); @@ -1150,7 +1168,7 @@ public function getLinkDensity(\DOMElement $e, $excludeExternal = false) * * @return int */ - protected function weightAttribute(\DOMElement $element, $attribute) + protected function weightAttribute($element, $attribute) { if (!$element->hasAttribute($attribute)) { return 0; @@ -1185,7 +1203,7 @@ protected function weightAttribute(\DOMElement $element, $attribute) * * @return int */ - public function getWeight(\DOMElement $e) + public function getWeight($e) { if (!$this->flagIsActive(self::FLAG_WEIGHT_ATTRIBUTES)) { return 0; @@ -1205,7 +1223,7 @@ public function getWeight(\DOMElement $e) * * @param \DOMElement $node */ - public function killBreaks(\DOMElement $node) + public function killBreaks($node) { $html = $node->innerHTML; $html = preg_replace($this->regexps['killBreaks'], '
', $html); @@ -1221,7 +1239,7 @@ public function killBreaks(\DOMElement $node) * @param \DOMElement $e * @param string $tag */ - public function clean(\DOMElement $e, $tag) + public function clean($e, $tag) { $currentItem = null; $targetList = $e->getElementsByTagName($tag); @@ -1257,7 +1275,7 @@ public function clean(\DOMElement $e, $tag) * @param \DOMElement $e * @param string $tag */ - public function cleanConditionally(\DOMElement $e, $tag) + public function cleanConditionally($e, $tag) { if (!$this->flagIsActive(self::FLAG_CLEAN_CONDITIONALLY)) { return; @@ -1370,7 +1388,7 @@ public function cleanConditionally(\DOMElement $e, $tag) * * @param \DOMElement $e */ - public function cleanHeaders(\DOMElement $e) + public function cleanHeaders($e) { for ($headerIndex = 1; $headerIndex < 3; ++$headerIndex) { $headers = $e->getElementsByTagName('h'.$headerIndex); diff --git a/tests/ReadabilityTest.php b/tests/ReadabilityTest.php index d10f7e1..90932e7 100644 --- a/tests/ReadabilityTest.php +++ b/tests/ReadabilityTest.php @@ -22,40 +22,47 @@ private function getReadability($html, $url = null, $parser = 'libxml', $useTidy return $readability; } + /** + * @requires extension tidy + */ + public function testConstructDefault() + { + $readability = $this->getReadability(''); + + $this->assertNull($readability->url); + $this->assertInstanceOf('DomDocument', $readability->dom); + } + + /** + * @requires extension tidy + */ public function testConstructSimple() { $readability = $this->getReadability('', 'http://0.0.0.0'); - $readability->init(); $this->assertEquals('http://0.0.0.0', $readability->url); + $this->assertInstanceOf('DomDocument', $readability->dom); $this->assertEquals('', $readability->original_html); $this->assertTrue($readability->tidied); - - $this->assertTrue($this->logHandler->hasDebugThatContains('Parsing URL: http://0.0.0.0')); - $this->assertTrue($this->logHandler->hasDebugThatContains('Tidying document')); - $this->assertTrue($this->logHandler->hasDebugThatContains('Light clean enabled.')); } public function testConstructDefaultWithoutTidy() { $readability = $this->getReadability('', null, 'libxml', false); - $readability->init(); $this->assertNull($readability->url); $this->assertEquals('', $readability->original_html); $this->assertFalse($readability->tidied); - $this->assertTrue($this->logHandler->hasDebugThatContains('Parsing URL: ')); - $this->assertFalse($this->logHandler->hasDebugThatContains('Tidying document')); - $this->assertTrue($this->logHandler->hasDebugThatContains('Light clean enabled.')); + $this->assertInstanceOf('DomDocument', $readability->dom); } public function testConstructSimpleWithoutTidy() { $readability = $this->getReadability('', 'http://0.0.0.0', 'libxml', false); - $readability->init(); $this->assertEquals('http://0.0.0.0', $readability->url); + $this->assertInstanceOf('DomDocument', $readability->dom); $this->assertEquals('', $readability->original_html); $this->assertFalse($readability->tidied); } @@ -447,6 +454,8 @@ public function testPostFilters() public function testPreFilters() { + $this->markTestSkipped('Won\'t work until loadHtml() is moved in init() instead of __construct()'); + $readability = $this->getReadability('
'.str_repeat('

This is the awesome and WONDERFUL content :)

', 7).'
', 'http://0.0.0.0'); $readability->addPreFilter('!]*>(.*?)!is', '');