smart_trim-8.x-1.3/src/TruncateHTML.php
src/TruncateHTML.php
<?php
namespace Drupal\smart_trim;
use Drupal\Component\Utility\Html;
use Drupal\Component\Utility\Unicode;
/**
* @file
* Contains trim functionality.
*
* Adapted from the approach described at
* http://www.pjgalbraith.com/2011/11/truncating-text-html-with-php/, with
* some modifications to adhere to the Drupal Coding Standards.
*/
/**
 * Truncates an HTML fragment to a character or word limit.
 *
 * The fragment is parsed into a DOM, its text nodes are walked in document
 * order until the limit is reached, everything after the breakpoint is
 * removed, and an ellipsis is added before the DOM is serialized again.
 */
class TruncateHTML {

  /**
   * Number of characters counted so far while walking the DOM.
   *
   * @var int
   */
  protected int $charCount = 0;

  /**
   * Number of words counted so far while walking the DOM.
   *
   * @var int
   */
  protected int $wordCount = 0;

  /**
   * Character / word limit for the current truncation.
   *
   * @var int
   */
  protected int $limit;

  /**
   * Element the walk starts on (the <body> element of the loaded document).
   *
   * @var \DOMElement
   */
  protected \DOMElement $startNode;

  /**
   * Ellipsis string added at the breakpoint.
   *
   * @var string
   */
  protected string $ellipsis;

  /**
   * Whether the breakpoint has been found during the current walk.
   *
   * @var bool
   */
  protected bool $foundBreakpoint = FALSE;

  /**
   * Sets up the object for a new truncation run.
   *
   * Loads the markup into a DOM, resets all counters and strips HTML
   * comments from the document.
   *
   * @param string $html
   *   Text to be prepared.
   * @param int $limit
   *   Amount of text to return.
   * @param string $ellipsis
   *   Characters to use at the end of the text.
   *
   * @return \DOMNode
   *   The document that was loaded from $html.
   */
  protected function init(string $html, int $limit, string $ellipsis): \DOMNode {
    $dom = Html::load($html);

    // The body tag node, our html fragment is automatically wrapped in
    // a <html><body> etc.
    $this->startNode = $dom->getElementsByTagName("body")->item(0);

    $this->limit = $limit;
    $this->ellipsis = $ellipsis;
    $this->charCount = 0;
    $this->wordCount = 0;
    $this->foundBreakpoint = FALSE;

    $this->removeHtmlComments($dom);

    return $dom;
  }

  /**
   * Truncates HTML text by characters.
   *
   * The markup is returned untouched when the limit is not positive or when
   * the tag-stripped text is not longer than the limit.
   *
   * @param string $html
   *   Text to be updated.
   * @param int $limit
   *   Amount of text to allow.
   * @param string $ellipsis
   *   Characters to use at the end of the text.
   *
   * @return string
   *   Resulting text.
   */
  public function truncateChars(string $html, int $limit, string $ellipsis = '...'): string {
    if ($limit <= 0 || $limit >= mb_strlen(strip_tags($html))) {
      return $html;
    }

    $dom = $this->init($html, $limit, $ellipsis);

    // Pass the body node on to be processed.
    $this->domNodeTruncateChars($this->startNode);

    return Html::serialize($dom);
  }

  /**
   * Truncates HTML text by words.
   *
   * The markup is returned untouched when the limit is not positive or when
   * the tag-stripped text does not contain more words than the limit.
   *
   * @param string $html
   *   Text to be updated.
   * @param int $limit
   *   Amount of text to allow.
   * @param string $ellipsis
   *   Characters to use at the end of the text.
   *
   * @return string
   *   Resulting text.
   */
  public function truncateWords(string $html, int $limit, string $ellipsis = '...'): string {
    if ($limit <= 0 || $limit >= $this->countWords(strip_tags($html))) {
      return $html;
    }

    $dom = $this->init($html, $limit, $ellipsis);

    // Pass the body node on to be processed.
    $this->domNodeTruncateWords($this->startNode);

    return Html::serialize($dom);
  }

  /**
   * Truncates a DOMNode by character count.
   *
   * Walks the children depth-first. Childless nodes contribute their text
   * length to the running total; the node that reaches the limit is cut,
   * everything after it is removed and the ellipsis is inserted.
   *
   * @param \DOMNode $domNode
   *   Object to be truncated.
   */
  protected function domNodeTruncateChars(\DOMNode $domNode): void {
    foreach ($domNode->childNodes as $node) {
      if ($this->foundBreakpoint) {
        return;
      }

      if ($node->hasChildNodes()) {
        $this->domNodeTruncateChars($node);
        continue;
      }

      // nodeValue is nullable, so coalesce before the string functions.
      $text = html_entity_decode($node->nodeValue ?? '', ENT_QUOTES, 'UTF-8');
      $length = mb_strlen($text);

      if (($this->charCount + $length) < $this->limit) {
        $this->charCount += $length;
        continue;
      }

      // We have found our end point. The third argument asks
      // Unicode::truncate() for word-safe truncation.
      $node->nodeValue = Unicode::truncate($text, $this->limit - $this->charCount, TRUE);
      $this->removeTrailingPunctuation($node);
      $this->removeProceedingNodes($node);
      $this->insertEllipsis($node);
      $this->foundBreakpoint = TRUE;
      return;
    }
  }

  /**
   * Truncates a DOMNode by words.
   *
   * Walks the children depth-first. Childless nodes contribute their word
   * count to the running total; the node that reaches the limit is cut after
   * its last allowed word, everything after it is removed and the ellipsis is
   * inserted.
   *
   * @param \DOMNode $domNode
   *   Object to be truncated.
   */
  protected function domNodeTruncateWords(\DOMNode $domNode): void {
    foreach ($domNode->childNodes as $node) {
      if ($this->foundBreakpoint) {
        return;
      }

      if ($node->hasChildNodes()) {
        $this->domNodeTruncateWords($node);
        continue;
      }

      // nodeValue is nullable, so coalesce before the string functions.
      $value = $node->nodeValue ?? '';
      $cur_count = $this->countWords($value);

      if (($this->wordCount + $cur_count) < $this->limit) {
        $this->wordCount += $cur_count;
        continue;
      }

      // We have found our end point.
      $remaining = $this->limit - $this->wordCount;
      if ($cur_count > 1 && $remaining < $cur_count) {
        // Note that PREG_SPLIT_OFFSET_CAPTURE and UTF-8 is interesting.
        // preg_split() works on the string as an array of bytes therefore
        // in order to use its results we need to use non unicode aware
        // functions.
        // @see https://bugs.php.net/bug.php?id=67487
        $words = preg_split("/[\n\r\t ]+/", $value, $remaining + 1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_OFFSET_CAPTURE);
        // The split yields the allowed words followed by one piece holding
        // the rest of the text, so the last allowed word is the
        // second-to-last entry.
        $last_word = $words[$remaining - 1];
        $node->nodeValue = substr($value, 0, $last_word[1] + strlen($last_word[0]));
      }

      $this->removeTrailingPunctuation($node);
      $this->removeProceedingNodes($node);
      $this->insertEllipsis($node);
      $this->foundBreakpoint = TRUE;
      return;
    }
  }

  /**
   * Removes certain punctuation from the end of the node value.
   *
   * Characters are stripped one at a time for as long as the value ends in
   * one of: . , : ; ? ! or the ellipsis character.
   *
   * @param \DOMNode $domNode
   *   Node to be altered.
   */
  protected function removeTrailingPunctuation(\DOMNode $domNode): void {
    while (preg_match('/[\.,:;\?!…]$/u', $domNode->nodeValue)) {
      $domNode->nodeValue = mb_substr($domNode->nodeValue, 0, -1);
    }
  }

  /**
   * Removes every node that follows the given node in document order.
   *
   * Following siblings are removed first; the method then climbs towards the
   * start node and removes the following siblings of the ancestors as well.
   *
   * @param \DOMNode $domNode
   *   Node after which the document is cut off.
   */
  protected function removeProceedingNodes(\DOMNode $domNode): void {
    $nextNode = $domNode->nextSibling;

    if ($nextNode !== NULL) {
      // Run in a while loop to prevent hitting the maximum recursion limit
      // when processing DOM elements with many children at the same level.
      while ($nextNode->nextSibling !== NULL) {
        $node = $nextNode;
        $nextNode = $nextNode->nextSibling;
        $node->parentNode->removeChild($node);
      }
      // The recursive call on the last sibling performs the upward scan
      // below, so the ancestors' following siblings are removed too.
      $this->removeProceedingNodes($nextNode);
      $domNode->parentNode->removeChild($nextNode);
      return;
    }

    // Scan upwards till we find a sibling. Stop on a missing parent as well
    // so a node outside the start node's subtree cannot be dereferenced.
    $currentNode = $domNode->parentNode;
    while ($currentNode !== NULL && $currentNode !== $this->startNode) {
      if ($currentNode->nextSibling !== NULL) {
        $currentNode = $currentNode->nextSibling;
        $this->removeProceedingNodes($currentNode);
        $currentNode->parentNode->removeChild($currentNode);
        break;
      }
      $currentNode = $currentNode->parentNode;
    }
  }

  /**
   * Inserts the ellipsis at the given node.
   *
   * When the node's parent is one of the "avoid" elements (links, emphasis,
   * headings) the ellipsis is placed as a new text node directly after that
   * parent element; otherwise it is appended to the node's own value.
   *
   * @param \DOMNode $domNode
   *   Node to be altered.
   */
  protected function insertEllipsis(\DOMNode $domNode): void {
    // HTML tags to avoid appending the ellipsis to.
    $avoid = ['a', 'strong', 'em', 'h1', 'h2', 'h3', 'h4', 'h5'];

    $parent = $domNode->parentNode;
    $grandparent = $parent !== NULL ? $parent->parentNode : NULL;

    if ($parent !== NULL && $grandparent !== NULL && in_array($parent->nodeName, $avoid, TRUE)) {
      // Place the ellipsis right after the parent element. The reference
      // node passed to insertBefore() must be a child of the node it is
      // called on; a NULL reference appends.
      // NOTE(review): the ellipsis is inserted verbatim here, while the
      // branch below decodes \uXXXX escapes first - confirm this is intended.
      $grandparent->insertBefore(new \DOMText($this->ellipsis), $parent->nextSibling);
      return;
    }

    // This allows unicode characters like \u2026 for ellipsis. json_decode()
    // returns NULL when the ellipsis is not valid inside a JSON string (for
    // example a raw double quote); keep the configured value in that case.
    $decoded = json_decode('"' . $this->ellipsis . '"');
    $this->ellipsis = Html::escape(is_string($decoded) ? $decoded : $this->ellipsis);

    // Append to current node.
    $domNode->nodeValue = rtrim($domNode->nodeValue ?? '') . $this->ellipsis;
  }

  /**
   * Gets number of words in text.
   *
   * Words are runs of characters separated by spaces, tabs, carriage returns
   * or newlines.
   *
   * @param string $text
   *   Text to be counted.
   *
   * @return int
   *   Number of words found.
   */
  protected function countWords(string $text): int {
    $words = preg_split("/[\n\r\t ]+/", $text, -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns FALSE on failure; count that as zero words.
    return $words === FALSE ? 0 : count($words);
  }

  /**
   * Removes all comment nodes below the given node.
   *
   * @param \DOMNode $domNode
   *   Node to be altered.
   */
  protected function removeHtmlComments(&$domNode): void {
    // Iterate over a snapshot: childNodes is a live list and shrinks while
    // nodes are being removed.
    foreach (iterator_to_array($domNode->childNodes) as $node) {
      if ($node->nodeType === XML_COMMENT_NODE) {
        $domNode->removeChild($node);
        continue;
      }

      if ($node->hasChildNodes()) {
        $this->removeHtmlComments($node);
      }
    }
  }

}
