Home

Welcome to the metadata wiki!

Some old PHP code for HTML text extraction

HtmlContentExtractor

/**
 * Extracts translatable content from a DOMDocument.
 *
 * Text blocks, selected meta/input attribute values, `alt` attributes and
 * image sources are collected into maps keyed by XPath node path. A few
 * static helpers for white-space handling are exposed as well.
 */
class HtmlContentExtractor
{
    private $dom;
    protected $xpath;

    /**
     * Constructor
     *
     * Temporarily installs the project error handler while the XPath
     * evaluator is created and always restores the previous handler.
     *
     * @param \DOMDocument $dom DOMDocument
     *
     * @throws \Exception
     */
    public function __construct(\DOMDocument $dom)
    {
        set_error_handler('Yaraku\Html\ErrorHandlerFunction');
        libxml_use_internal_errors(true);

        try {
            // The type declaration already guarantees a DOMDocument instance,
            // so no separate validity check is needed here.
            $this->dom = $dom;
            $this->dom->encoding = 'UTF-8';
            $this->dom->formatOutput = false;
            $this->xpath = new \DOMXPath($this->dom);
        } catch (\Exception $e) {
            restore_error_handler();
            throw $e;
        }

        restore_error_handler();
    }

    /**
     * Get the map of node path and text
     *
     * Keys take one of these forms:
     * - "<text node path>" for text without a block-level ancestor,
     * - "<block path>|<opening tag>|<closing tag>" for text grouped under its
     *   outermost matching ancestor (value is the inner HTML of that block),
     * - "<element path>" for meta content and input values,
     * - "<element path>/@placeholder" and "<element path>/@alt" for
     *   attribute values.
     *
     * @return array
     * @throws \Exception
     */
    public function getNodePathAndTextMap()
    {
        $textArray = array();

        $blocks = array();
        $elements = $this->xpath->query(
            "//*[name() != 'script' and name() != 'style'"
            ." and name() != 'code'"
            ." and not(@translate='no')]/text()"
        );
        $elementArray = GetDepthSortedDomNodeArrayFromDomNodeList($elements);
        /** @var \DOMNode $textNode */
        foreach ($elementArray as $textNode) {
            // Skip text nodes that hold nothing but markup, white space and
            // digits.
            $temp = self::whiteSpaceNormalization(
                $this->dom->saveHTML($textNode)
            );
            $temp = preg_replace("/<[^>]+>/u", "", $temp);
            $temp = preg_replace("/[\s\d]+/u", "", $temp);
            if ($temp === '') {
                continue;
            }

            $nodePath = $textNode->getNodePath();
            $ancestorQuery = $this->buildAncestorQuery($nodePath);
            $ancestors = $this->xpath->query($ancestorQuery);
            if (!$ancestors) {
                throw new \Exception(
                    "$ancestorQuery is an incorrect XPath query."
                );
            }

            if (0 === $ancestors->length) {
                // No enclosing block: the text node is its own entry.
                $blocks[$nodePath] = $this->dom->saveHTML($textNode);
                continue;
            }

            // If any matching ancestor was already stored, this text is
            // already part of an extracted block.
            $isExtractedBlock = false;
            for ($i = $ancestors->length - 1; $i >= 0; $i--) {
                $candidate = $this->splitBlockNode($ancestors->item($i));
                if (array_key_exists($candidate[0], $blocks)) {
                    $isExtractedBlock = true;
                    break;
                }
            }
            if ($isExtractedBlock) {
                continue;
            }

            // Otherwise store the last ancestor of the result list.
            $block = $this->splitBlockNode(
                $ancestors->item($ancestors->length - 1)
            );
            $blocks[$block[0]] = $block[1];
        }
        $blocks = array_reverse($blocks);
        foreach ($blocks as $path => $html) {
            $textArray[$path] = str_replace('&#xD;', '', $html);
        }

        $metae = $this->xpath->query(
            "/html/head/meta"
            ."[string(@content)"
            ." and ("
            ."@name='Description' or @name='description'"
            ." or @name='Keywords' or @name='keywords'"
            .")]"
        );
        /** @var \DOMElement $meta */
        foreach ($metae as $meta) {
            if (1 != preg_match("/^[\s\d]+$/u", $meta->getAttribute("content"))) {
                $textArray[$meta->getNodePath()]
                    = $meta->getAttribute("content");
            }
        }

        $inputs = $this->xpath->query(
            "//input"
            ."[string(@value)"
            ." and ("
            ."@type='button' or @type='Button'"
            ." or @type='reset' or @type='Reset'"
            ." or @type='search' or @type='Search'"
            ." or @type='submit' or @type='Submit'"
            ." or @type='text' or @type='Text'"
            .")]"
        );
        /** @var \DOMElement $input */
        foreach ($inputs as $input) {
            if (1 != preg_match("/^[\s\d]+$/u", $input->getAttribute("value"))) {
                $textArray[$input->getNodePath()]
                    = $input->getAttribute("value");
            }
        }
        $inputsWithPlaceholder
            = $this->xpath->query("//input[string(@placeholder)]");
        foreach ($inputsWithPlaceholder as $input) {
            if (1 != preg_match("/^[\s\d]+$/u", $input->getAttribute("placeholder"))
            ) {
                $textArray[$input->getNodePath() .'/@placeholder']
                    = $input->getAttribute("placeholder");
            }
        }

        // Only `alt` is extracted. An earlier revision also listed abbr,
        // label, title, standby and summary, but that code was disabled.
        $attributeName = 'alt';
        $attributes = $this->xpath->query(
            "//*[string(@$attributeName)]/@$attributeName"
        );
        /** @var \DOMNode $a */
        foreach ($attributes as $a) {
            $textArray[$a->getNodePath()] = $a->nodeValue;
        }

        return $textArray;
    }

    /**
     * Build the XPath union selecting the block-level/inline ancestors of a
     * text node that are eligible to represent it as one text block.
     *
     * @param string $nodePath Node path of the text node
     *
     * @return string
     */
    private function buildAncestorQuery($nodePath)
    {
        return "$nodePath/ancestor::p[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::a[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::div[not(descendant::table)"
             ." and not(descendant::div)"
             ." and not(descendant::code)"
             ." and normalize-space(text())]"
             ." | $nodePath/ancestor::font[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::span[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::li[not(descendant::table)"
             ." and not(descendant::li)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::dt[not(descendant::table)]"
             ." | $nodePath/ancestor::dd[not(descendant::table)]"
             ." | $nodePath/ancestor::td[not(descendant::table)"
             ." and not(descendant::div) "
             ." and not(descendant::code)"
             ." and normalize-space(text())]"
             ." | $nodePath/ancestor::th[not(descendant::table)]"
             ." | $nodePath/ancestor::b[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::i[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::u[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::tt[not(descendant::table)]"
             ." | $nodePath/ancestor::blockquote[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::strike[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::em[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::strong[not(descendant::table)"
             ." and not(descendant::code)]"
             ." | $nodePath/ancestor::iframe[not(descendant::table)"
             ." and not(descendant::div) and normalize-space(text())]"
        ;
    }

    /**
     * Split the serialized HTML of a block node into its map key
     * ("path|opening tag|closing tag") and its inner HTML.
     *
     * @param \DOMNode $blockNode Block node
     *
     * @return array array(string $key, string $innerHtml)
     */
    private function splitBlockNode(\DOMNode $blockNode)
    {
        $outerHtml = $this->dom->saveHTML($blockNode);
        $headAndTheRest = preg_split(
            "/^(<$blockNode->nodeName[^>]*>)/iu",
            $outerHtml,
            2,
            PREG_SPLIT_NO_EMPTY|PREG_SPLIT_DELIM_CAPTURE
        );
        $head = $headAndTheRest[0];
        $tail = "</$blockNode->nodeName>";

        // Nothing may follow the opening tag (or the split may not have
        // matched); treat that as empty inner HTML instead of reading an
        // undefined offset.
        $rest = isset($headAndTheRest[1]) ? $headAndTheRest[1] : '';
        $end = strripos($rest, $tail);
        $inner = (false === $end) ? '' : substr($rest, 0, $end);

        return array($blockNode->getNodePath() . "|$head|$tail", $inner);
    }

    /**
     * Get text array
     *
     * @return array The values of getNodePathAndTextMap(), re-indexed
     */
    public function getTextArray()
    {
        return array_values($this->getNodePathAndTextMap());
    }

    /**
     * Get the extracted texts paired with the source line number of the node
     * each text was keyed by.
     *
     * @return array List of array(int $lineNumber, string $text)
     * @throws \Exception When a node path from the map cannot be resolved
     */
    public function getTextArrayWithLineNumber()
    {
        $textWithLineNumberList = [];
        $nodePathAndTextMap = $this->getNodePathAndTextMap();
        foreach ($nodePathAndTextMap as $nodePathWithHeadTail => $text) {
            // Block keys look like "path|head|tail"; only the path part is a
            // valid XPath expression.
            $parts = explode('|', $nodePathWithHeadTail, 2);
            $nodePath = $parts[0];
            $nodeList = $this->xpath->query($nodePath);
            $node = $nodeList ? $nodeList->item(0) : null;
            if (null === $node) {
                throw new \Exception("No node found for path $nodePath.");
            }
            $textWithLineNumberList[] = [$node->getLineNo(), $text];
        }

        return $textWithLineNumberList;
    }

    /**
     * Get the map of node path and image
     *
     * @return array Map of <img> node path to its non-empty src value
     */
    public function getNodePathAndImageMap()
    {
        $imageArray = array();

        $images = $this->xpath->query("//img[string(@src)]");
        /** @var \DOMNode $i */
        foreach ($images as $i) {
            $imageArray[$i->getNodePath()]
                = $i->attributes->getNamedItem('src')->nodeValue;
        }

        return $imageArray;
    }

    /**
     * Get image array
     *
     * @return array The values of getNodePathAndImageMap(), re-indexed
     */
    public function getImageArray()
    {
        return array_values($this->getNodePathAndImageMap());
    }

    /**
     * Convert HTML to a one line string that can be used as Json variable
     *
     * White space between tags, leading/trailing white space of every line
     * and all line breaks are removed. With $jsonFriendly only double quotes
     * are escaped; backslashes are left as they are.
     *
     * @param string $html         The html
     * @param bool   $jsonFriendly Prepare to use as Json variable
     *
     * @return string
     */
    public static function htmlToOneLineString($html, $jsonFriendly=true)
    {
        $html = preg_replace('~>\s+<~', '><', $html);
        $html = preg_replace('/^\s+|\n|\r|\s+$/um', '', $html);

        if ($jsonFriendly) {
            $html = str_replace('"', '\"', $html);
        }

        return $html;
    }

    /**
     * Encode every character of the string as a decimal numeric HTML entity
     *
     * @param String $str Text String (UTF-8)
     *
     * @return string
     */
    public static function encode($str)
    {
        if (null === $str || '' === $str) {
            return '';
        }

        $str = mb_convert_encoding($str, 'UTF-32', 'UTF-8');

        // One big-endian 32-bit integer per code point.
        $t = unpack("N*", $str);

        $t = array_map(
            function ($n) {
                return "&#$n;";
            }, $t
        );

        return implode("", $t);
    }

    /**
     * Normalize white space inside the text
     *
     * Runs of white-space-like characters are collapsed to a single ordinary
     * space and the result is trimmed.
     *
     * @param String $text raw text
     *
     * @return String $text
     * @throws \Exception When one of the regular expression passes fails
     */
    public static function whiteSpaceNormalization($text)
    {
        // encode the text in decimal format
        $text = self::encode($text);

        // Code points treated as white space in the encoded form. The
        // pattern is generated from this list so that every entry is a
        // separate alternative.
        $whiteSpaceCodePoints = array(
            5760, 6158,
            8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199,
            8200, 8201, 8202, 8204, 8205, 8206, 8207,
            8239, 8287, 12288,
            9, 10, 11, 12, 13, 32, 133, 160,
            8232, 8233,
        );

        // replace uncommon white space with ordinary white space
        $text = preg_replace(
            '/(?:&#(?:' . implode('|', $whiteSpaceCodePoints) . ');)+/u',
            " ",
            $text
        );

        // if $text is null, there is something wrong with the preg_replace function
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }

        // decode the text again into the normal string
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // replace all common white space in named HTML entities with ordinary white
        // space
        // references:
        // - http://www.w3schools.com/tags/ref_symbols.asp
        // - http://www.w3schools.com/tags/ref_entities.asp
        $text = preg_replace(
            '/(\s|\&nbsp\;|\&\#xA0\;|\&uml\;|\&\#xA8\;|\&shy\;|\&\#xAD\;|'
            .'\&macr\;|\&\#xAF\;|\&acute\;|\&\#xB4\;|\&cedil\;|\&\#xB8\;|'
            .'\&ensp\;|\&\#x2002\;|\&emsp\;|\&\#x2003\;|\&thinsp\;|\&\#x2009\;|'
            .'\&zwnj\;|\&\#x200C\;|\&zwj\;|\&\#x200D\;|\&lrm\;|\&\#x200E\;|'
            .'\&rlm\;|\&\#x200F\;|\&\#xA\;|\&#xD\;|\x{FEFF})+/u',
            " ",
            $text
        );

        // Without this check a failure would silently become an empty string
        // through trim(null).
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }

        return trim($text);
    }
}

HtmlContentExtractorTest

/**
 * Tests HtmlContentExtractor against stored HTML pages, each parsed with both
 * PhpDom and Html5Dom.
 */
class HtmlContentExtractorTest extends \PHPUnit_Framework_TestCase
{
    protected $dataFolderPath;
    protected $oInnPage;
    protected $solarePage;

    /** @var \DOMDocument $phpDom */
    protected $phpDom;

    /** @var \DOMDocument $html5Dom */
    protected $html5Dom;

    /** @var HtmlContentExtractor $extractor */
    protected $extractor;

    /**
     * Resolve the paths of the HTML fixtures stored in the _data folder
     *
     * @return void
     */
    public function setUp()
    {
        parent::setUp();
        $this->dataFolderPath
            = __DIR__ . DIRECTORY_SEPARATOR .'_data'. DIRECTORY_SEPARATOR;
        $this->oInnPage = $this->dataFolderPath .'www.o-inn.co.jp_index.html';
        $this->solarePage = $this->dataFolderPath .'www.solarehotels.com.html';
    }

    /**
     * Test get text array with line number on a PhpDom document
     *
     * @return void
     */
    public function testGetTextArrayWithLineNumber()
    {
        $this->extractor = new HtmlContentExtractor(
            PhpDom::make($this->solarePage)
        );
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);
    }

    /**
     * Test get text array with line number on an Html5Dom document
     *
     * Kept as its own test so that skipping it does not also mark the PhpDom
     * assertion above as skipped.
     *
     * @return void
     */
    public function testGetTextArrayWithLineNumberOnHtml5Dom()
    {
        $this->markTestSkipped('Masterminds\HTML5 has no support of it.');

        // Intended assertion once line numbers are available:
        $this->extractor = new HtmlContentExtractor(
            Html5Dom::make($this->solarePage)
        );
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);
    }

    /**
     * Test get node path and text map
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetNodePathAndTextMap()
    {
        $expectedMetaKeywordsNodePath = "/html/head/meta[5]";
        $expectedMetaKeywordsText
            = "Best Price Guarantee,Bottom Price,Lowest Price,Hotel,Stay,"
                ."Reservation,Booking,SOLARE HOTELS & RESORTS";
        $doms = [
            PhpDom::make($this->solarePage), Html5Dom::make($this->solarePage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey(
                $expectedMetaKeywordsNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedMetaKeywordsText,
                $nodePathAndTextMap[$expectedMetaKeywordsNodePath]
            );
        }

        $expectedMetaKeywordsNodePath = "/html/head/meta[3]";
        $expectedMetaKeywordsText
            = "お茶の水イン,御茶ノ水,お茶の水,後楽園,"
                ."ビジネスホテル,文京区,東京ドーム,出張,宿泊予約";
        $expectedInterpolatedCaseNodePath
            = "/html/body/div[4]/div/div[2]/div[6]/p|<p>|</p>";
        $expectedInterpolatedCaseText
            = '掲載されている'
                .'<a href="http://www.tripadvisor.jp/'
                    .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
                    .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
                .' target="_blank">'
                . PHP_EOL //< Masterminds\HTML5 seems not using source's EOL.
                .'            ホテルお茶の水イン'
                .'</a>'
                .'のクチコミはTripAdvisorより提供を受けています'
            ;
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey(
                $expectedMetaKeywordsNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedMetaKeywordsText,
                $nodePathAndTextMap[$expectedMetaKeywordsNodePath]
            );
            $this->assertArrayHasKey(
                $expectedInterpolatedCaseNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedInterpolatedCaseText,
                $nodePathAndTextMap[$expectedInterpolatedCaseNodePath]
            );
        }
    }

    /**
     * Test get text array
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetTextArray()
    {
        $expectedCommonCase
            = '　　* 1...Only applicable to rates compared on the same date'
            .' as the date of reservation made via the SORALRE HOTELS &amp;'
            .' RESORTS official website.<br>'."\n"
            .'　　* 2...Limited to claims submitted via email within 24 hours'
            .' of booking.';
        $doms = [
            PhpDom::make($this->solarePage), Html5Dom::make($this->solarePage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCommonCase, $textArray);
        }

        $expectedCommonCase
            = '<strong>お茶の水イン</strong><br>〒113-0034<br>'
                .'東京都文京区湯島1-3-7<br>TEL：03-3813-8211<br>'
                .'FAX：03-3813-9730<br>'
                .'<a href="/transportation/">お茶の水インまでの地図</a>'
            ;
        $expectedInterpolatedCase
            = '掲載されている'
                .'<a href="http://www.tripadvisor.jp/'
                .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
                .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
                .' target="_blank">'
                ."\n            ホテルお茶の水イン"
                .'</a>'
                .'のクチコミはTripAdvisorより提供を受けています'
            ;
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCommonCase, $textArray);
            $this->assertContains($expectedInterpolatedCase, $textArray);
        }
    }

    /**
     * Test get text array on ill formed html
     *
     * The fixture contains a stray closing </a>; the extracted text is
     * expected without it.
     *
     * @return void
     *
     * @ticket #109
     */
    public function testGetTextArrayOnIllFormedHtml()
    {
        $unpreparedHtml = file_get_contents($this->oInnPage);
        $illformedCase
            = '掲載されている'
            .'<a href="http://www.tripadvisor.jp/'
            .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
            .'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
            ."\n            ホテルお茶の水イン</a>"
            .'のクチコミはTripAdvisorより提供を受けています'
            .'</a>'
        ;
        // Strict comparison: a match at offset 0 must not be taken for the
        // "not found" result false.
        $this->assertNotSame(false, strpos($unpreparedHtml, $illformedCase));

        $expectedCase
            = '掲載されている'
                . '<a href="http://www.tripadvisor.jp/'
                . 'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
                . 'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
                . "\n            ホテルお茶の水イン</a>"
                . 'のクチコミはTripAdvisorより提供を受けています';
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCase, $textArray);
        }
    }

    /**
     * Testing for placeholder attribute extraction from input tag
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnPlaceholderAttributeOfInputTag()
    {
        $expectedPath = '/html/body/input/@placeholder';
        $expectedText = 'お名前';
        $file = $this->dataFolderPath .'inputPlaceholderTest.html';
        $doms = [PhpDom::make($file), Html5Dom::make($file)];
        foreach ($doms as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($expectedPath, $nodePathAndTextMap);
            $this->assertEquals($expectedText, $nodePathAndTextMap[$expectedPath]);
        }
    }

    /**
     * Testing for alt attribute extraction from image tag
     *
     * @return void
     *
     * @ticket LOC-2162
     */
    public function testGetNodePathAndTextMapOnAltAttributeOfImageTag()
    {
        $expectedPath = '/html/body/img/@alt';
        $expectedText = '画像です';
        $file = $this->dataFolderPath .'imageAltTest.html';
        $doms = [PhpDom::make($file), Html5Dom::make($file)];
        foreach ($doms as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($expectedPath, $nodePathAndTextMap);
            $this->assertEquals($expectedText, $nodePathAndTextMap[$expectedPath]);
        }
    }

    /**
     * Test BOM removal
     *
     * @ticket ZEN-2579
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnBOM()
    {
        $unexpectedPath = '/html/body/div[1]/text()[3]';
        $file = $this->dataFolderPath
            .'www.yokohamabay-sheraton.co.jp__other_facilities.html';
        $doms = [PhpDom::make($file), Html5Dom::make($file)];
        foreach ($doms as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayNotHasKey($unexpectedPath, $nodePathAndTextMap);
        }
    }

//    /**
//     * Test sorting the node-path-to-text map by the line numbers of the HTML
//     *
//     * @return void
//     */
//    public function testSortNodePathAndTextMapByLineNum()
//    {
//        $file = $this->dataFolderPath .'replaceNodeXpath.html';
//        $doms = [PhpDom::make($file), Html5Dom::make($file)];
//        foreach ($doms as $dom) {
//            $extractor = new HtmlContentExtractor($dom);
//            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
//            $this->assertEquals(
//                'HTML Test', array_values($nodePathAndTextMap)[0]
//            );
//            $this->assertEquals(
//                '選べるリージョンとゾーン',
//                array_values($nodePathAndTextMap)[4]
//            );
//        }
//    }

}

PartialHtmlWrapper

/**
 * Surrounds an HTML fragment with a fixed HTML 4.0 Transitional document
 * prologue/epilogue and strips those fixed parts again.
 */
class PartialHtmlWrapper
{
    /** Document prologue placed in front of the fragment by wrap(). */
    const HEADER = <<<'HTML_HEADER'
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"></head><body>
HTML_HEADER;

    /**
     * Variant of the prologue with an explicitly closed <meta> element; only
     * used by unwrap().
     *
     * NOTE(review): the literal contains an apostrophe right before </meta>.
     * That looks unintended - confirm before relying on this constant.
     */
    const HEADER_C14N = <<<'HTML_HEADER_C14N'
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type">'</meta></head><body>
HTML_HEADER_C14N;

    /** Document epilogue appended by wrap(); ends with a line break. */
    const FOOTER = <<<'HTML_FOOTER'
</body></html>

HTML_FOOTER;

    /**
     * Wrap the partial HTML
     *
     * @param string $partialHtml Partial HTML
     *
     * @return string HEADER, the fragment and FOOTER concatenated
     */
    public static function wrap($partialHtml)
    {
        $pieces = array(self::HEADER, $partialHtml, self::FOOTER);

        return implode('', $pieces);
    }

    /**
     * Unwrap wrapped partial HTML
     *
     * Every occurrence of HEADER_C14N, HEADER and FOOTER (searched in that
     * order) is removed from the given string.
     *
     * @param string $wrappedPartialHtml Wrapped partial HTML
     *
     * @return string
     */
    public static function unwrap($wrappedPartialHtml)
    {
        $wrapperParts = array(self::HEADER_C14N, self::HEADER, self::FOOTER);
        $unwrapped = str_replace($wrapperParts, '', $wrappedPartialHtml);

        return $unwrapped;
    }
}

English HTML sentence segmenter

```php
/**
 * Sentence segmenter for English HTML text.
 *
 * Before splitting, quoted spans, personal titles and ellipses are replaced
 * by placeholder keys (<qN>, <tN>, <eN>) so that the full stops inside them
 * do not end a sentence; the placeholders are restored afterwards.
 *
 * NOTE(review): relies on members of AbstractSentenceSegmenter that are not
 * visible here ($_preserveSpaces, $quoteKeys, $quotePairs, $fullStop,
 * addHtmlTagAsQuote()) - their exact contracts should be confirmed there.
 */
class EnglishSentenceSegmenter extends AbstractSentenceSegmenter
{
    /** @var array Map of placeholder key to the original text it stands for */
    private $_replacements = array();

    protected $titles
        = array(
            "Mr.", "Ms.", "Mrs.", "Dr.", "Prof.",
            "M.", "Ph.D.", "D.Phil.", "M.D.", "D.O.",
            "Capt.", "Cpl.", "Sgt.", "Maj.", "Gen.", "Messrs."
        );
    protected $ellipsis = array("...", ". . .");

    /**
     * Constructor
     *
     * Also switches the mbstring internal encoding to UTF-8.
     *
     * @param bool $preserveSpaces Passed through to the parent constructor
     */
    public function __construct($preserveSpaces=false)
    {
        parent::__construct($preserveSpaces);
        mb_internal_encoding("UTF-8");
    }

    /**
     * Pre-process
     *
     * Normalizes white space and replaces quoted spans, titles and ellipses
     * by placeholder keys that postprocess() can restore.
     *
     * @param string $rawHtml Raw HTML
     *
     * @return string
     */
    public function preprocess($rawHtml)
    {
        $cookedText = $rawHtml;

        if (!$this->_preserveSpaces) {
            $cookedText = preg_replace("/[\r\n\s]+/u", " ", $cookedText);
            $cookedText = trim($cookedText);
        } else {
            //Preserve white spaces at the beginning of the sentence
            $cookedText = rtrim($cookedText);
        }

        $this->_replacements = array();

        $quoteLikeTags = array(
            "span", "font", "a", "li",
            "h1", "h2", "h3", "h4", "h5", "h6",
            "p",
        );
        foreach ($quoteLikeTags as $tagName) {
            $this->addHtmlTagAsQuote(
                $tagName, $cookedText, $this->quoteKeys, $this->quotePairs
            );
        }

        // Pass 1: locate every quoted span. Keys are the character offset of
        // the opening quote, values the character offset just past the
        // closing quote.
        $spans = array();
        foreach ($this->quoteKeys as $openQuote) {
            $openLength = mb_strlen($openQuote);
            if (0 === $openLength) {
                continue;
            }
            $isPlainQuote = ("\"" === $openQuote || "'" === $openQuote);
            $offset = 0;
            $begin = mb_strpos($cookedText, $openQuote, $offset);
            while (false !== $begin) {
                $offset = $begin + $openLength;
                // A plain quote character only opens a quotation at the very
                // start of the text or after a space; otherwise it is taken
                // for an apostrophe or a closing quote. The preceding
                // character is read with mb_substr because $begin is a
                // character offset, not a byte offset.
                $isOpening = !$isPlainQuote
                    || 0 === $begin
                    || " " === mb_substr($cookedText, $begin - 1, 1);
                if ($isOpening) {
                    $closeQuote = $this->quotePairs[$openQuote];
                    $end = mb_strpos($cookedText, $closeQuote, $offset);
                    if (false !== $end) {
                        $offset = $end + mb_strlen($closeQuote);
                        $spans[$begin] = $offset;
                    }
                }
                $begin = mb_strpos($cookedText, $openQuote, $offset);
            }
        }

        // Pass 2: walk the spans from left to right, drop those starting
        // inside an already accepted span, and rebuild the text from the
        // recorded offsets. Using the offsets (instead of searching for the
        // quote again) keeps rejected apostrophes from being picked up.
        ksort($spans);
        $rebuilt = '';
        $cursor = 0;
        $count = 0;
        foreach ($spans as $begin => $closeEnd) {
            if ($begin < $cursor) {
                continue;
            }
            $key = "<q". $count++ .">";
            $this->_replacements[$key]
                = mb_substr($cookedText, $begin, $closeEnd - $begin);
            $rebuilt .= mb_substr($cookedText, $cursor, $begin - $cursor) . $key;
            $cursor = $closeEnd;
        }
        $cookedText = $rebuilt . mb_substr($cookedText, $cursor);

        // Replace longer titles first so that e.g. "M.D." is not broken up
        // by the shorter "M.". The placeholder still carries the title's
        // index in $this->titles.
        $titles = $this->titles;
        $titleIndexes = array_keys($titles);
        usort(
            $titleIndexes,
            function ($a, $b) use ($titles) {
                return strlen($titles[$b]) - strlen($titles[$a]);
            }
        );
        foreach ($titleIndexes as $i) {
            $cookedText = str_replace($titles[$i], "<t$i>", $cookedText);
            $this->_replacements["<t$i>"] = $titles[$i];
        }

        foreach ($this->ellipsis as $i => $ellipsis) {
            $cookedText = str_replace($ellipsis, "<e$i>", $cookedText);
            $this->_replacements["<e$i>"] = $ellipsis;
        }

        return $cookedText;
    }

    /**
     * Post-process
     *
     * Restores every placeholder key created by the last preprocess() call.
     * All occurrences are restored, because a title or ellipsis placeholder
     * can appear several times in one sentence.
     *
     * @param string $cookedText Cooked text
     *
     * @return string
     */
    public function postprocess($cookedText)
    {
        foreach ($this->_replacements as $key => $value) {
            $cookedText = str_replace($key, $value, $cookedText);
        }

        return $cookedText;
    }

    /**
     * Get sentences
     *
     * @param string $rawHtml Raw HTML
     *
     * @return array List of sentences with placeholders restored
     */
    public function getSentences($rawHtml)
    {
        $sentences = array();

        $rawHtmlWithoutCrLf = $this->preprocess($rawHtml);
        // preg_match() reports byte offsets, so this method works on bytes
        // throughout.
        $length = strlen($rawHtmlWithoutCrLf);

        $begin = 0;
        $offset = 0;
        $matches = array();
        $isMatched = 1;

        while ($isMatched) {
            $isMatched
                = preg_match(
                    $this->fullStop,
                    $rawHtmlWithoutCrLf,
                    $matches,
                    PREG_OFFSET_CAPTURE,
                    $offset
                );
            if (!$isMatched) {
                break;
            }

            $matchedFullStop = $matches[0][0];
            $matchedPos = $matches[0][1];
            $next = $matchedPos + strlen($matchedFullStop);

            // A plain "." only ends a sentence when it closes the text or is
            // followed by a space and further text; any other matched
            // terminator always does.
            $isValid = false;
            if ("." != $matchedFullStop) {
                $isValid = true;
            } else if ($next >= $length) {
                $isValid = true;
            } else if ($matchedPos + 2 < $length
                && " " == $rawHtmlWithoutCrLf[$matchedPos + 1]
            ) {
                $isValid = true;
            }

            if ($isValid) {
                $sentences[] = $this->trimSentence(
                    substr($rawHtmlWithoutCrLf, $begin, $next - $begin)
                );
                $begin = $next;
            }
            $offset = $next;
        }

        // Whatever follows the last sentence end is a sentence of its own.
        if ($begin < $length) {
            $sentences[] = $this->trimSentence(
                substr($rawHtmlWithoutCrLf, $begin)
            );
        }

        foreach ($sentences as &$sentence) {
            $sentence = $this->postprocess($sentence);
        }
        unset($sentence);

        return $sentences;
    }

    /**
     * Trim an extracted sentence; leading white space is kept when spaces
     * are to be preserved.
     *
     * @param string $sentence Extracted sentence
     *
     * @return string
     */
    private function trimSentence($sentence)
    {
        return $this->_preserveSpaces ? rtrim($sentence) : trim($sentence);
    }
}
```

</details>

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Home

Clone this wiki locally