Spaces:
No application file
No application file
namespace Mautic\EmailBundle\Helper; | |
class PlainTextHelper | |
{ | |
public const ENCODING = 'UTF-8'; | |
/** | |
* Contains the HTML content to convert. | |
*/ | |
protected string $html = ''; | |
/** | |
* Contains the converted, formatted text. | |
* | |
* @var string | |
*/ | |
protected $text; | |
/** | |
* Maximum width of the formatted text, in columns. | |
* | |
* Set this value to 0 (or less) to ignore word wrapping | |
* and not constrain text to a fixed-width column. | |
* | |
* @var int | |
*/ | |
protected $width = 70; | |
/** | |
* List of preg* regular expression patterns to search for, | |
* used in conjunction with $replace. | |
* | |
* @var array | |
* | |
* @see $replace | |
*/ | |
protected $search = [ | |
"/\r/", // Non-legal carriage return | |
"/[\n\t]+/", // Newlines and tabs | |
'/<head[^>]*>.*?<\/head>/i', // <head> | |
'/<script[^>]*>.*?<\/script>/i', // <script>s -- which strip_tags supposedly has problems with | |
'/<style[^>]*>.*?<\/style>/i', // <style>s -- which strip_tags supposedly has problems with | |
'/<p[^>]*>/i', // <P> | |
'/<br[^>]*>/i', // <br> | |
'/<i[^>]*>(.*?)<\/i>/i', // <i> | |
'/<em[^>]*>(.*?)<\/em>/i', // <em> | |
'/(<ul[^>]*>|<\/ul>)/i', // <ul> and </ul> | |
'/(<ol[^>]*>|<\/ol>)/i', // <ol> and </ol> | |
'/(<dl[^>]*>|<\/dl>)/i', // <dl> and </dl> | |
'/<li[^>]*>(.*?)<\/li>/i', // <li> and </li> | |
'/<dd[^>]*>(.*?)<\/dd>/i', // <dd> and </dd> | |
'/<dt[^>]*>(.*?)<\/dt>/i', // <dt> and </dt> | |
'/<li[^>]*>/i', // <li> | |
'/<hr[^>]*>/i', // <hr> | |
'/<div[^>]*>/i', // <div> | |
'/(<table[^>]*>|<\/table>)/i', // <table> and </table> | |
'/(<tr[^>]*>|<\/tr>)/i', // <tr> and </tr> | |
'/<td[^>]*>(.*?)<\/td>/i', // <td> and </td> | |
'/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span> | |
]; | |
/** | |
* List of pattern replacements corresponding to patterns searched. | |
* | |
* @var array | |
* | |
* @see $search | |
*/ | |
protected $replace = [ | |
'', // Non-legal carriage return | |
' ', // Newlines and tabs | |
'', // <head> | |
'', // <script>s -- which strip_tags supposedly has problems with | |
'', // <style>s -- which strip_tags supposedly has problems with | |
"\n\n", // <P> | |
"\n", // <br> | |
'_\\1_', // <i> | |
'_\\1_', // <em> | |
"\n\n", // <ul> and </ul> | |
"\n\n", // <ol> and </ol> | |
"\n\n", // <dl> and </dl> | |
"\t* \\1\n", // <li> and </li> | |
" \\1\n", // <dd> and </dd> | |
"\t* \\1", // <dt> and </dt> | |
"\n\t* ", // <li> | |
"\n-------------------------\n", // <hr> | |
"<div>\n", // <div> | |
"\n\n", // <table> and </table> | |
"\n", // <tr> and </tr> | |
"\t\t\\1\n", // <td> and </td> | |
'', // <span class="_html2text_ignore">...</span> | |
]; | |
/** | |
* List of preg* regular expression patterns to search for, | |
* used in conjunction with $entReplace. | |
* | |
* @var array | |
* | |
* @see $entReplace | |
*/ | |
protected $entSearch = [ | |
'/™/i', // TM symbol in win-1252 | |
'/—/i', // m-dash in win-1252 | |
'/&(amp|#38);/i', // Ampersand: see converter() | |
'/[ ]{2,}/', // Runs of spaces, post-handling | |
]; | |
/** | |
* List of pattern replacements corresponding to patterns searched. | |
* | |
* @var array | |
* | |
* @see $entSearch | |
*/ | |
protected $entReplace = [ | |
'™', // TM symbol | |
'—', // m-dash | |
'|+|amp|+|', // Ampersand: see converter() | |
' ', // Runs of spaces, post-handling | |
]; | |
/** | |
* List of preg* regular expression patterns to search for | |
* and replace using callback function. | |
* | |
* @var array | |
*/ | |
protected $callbackSearch = [ | |
'/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i', // h1 - h6 | |
'/<(b)( [^>]*)?>(.*?)<\/b>/i', // <b> | |
'/<(strong)( [^>]*)?>(.*?)<\/strong>/i', // <strong> | |
'/<(th)( [^>]*)?>(.*?)<\/th>/i', // <th> and </th> | |
'/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i', // <a href=""> | |
]; | |
/** | |
* List of preg* regular expression patterns to search for in PRE body, | |
* used in conjunction with $preReplace. | |
* | |
* @var array | |
* | |
* @see $preReplace | |
*/ | |
protected $preSearch = [ | |
"/\n/", | |
"/\t/", | |
'/ /', | |
'/<pre[^>]*>/', | |
'/<\/pre>/', | |
]; | |
/** | |
* List of pattern replacements corresponding to patterns searched for PRE body. | |
* | |
* @var array | |
* | |
* @see $preSearch | |
*/ | |
protected $preReplace = [ | |
'<br>', | |
' ', | |
' ', | |
'', | |
'', | |
]; | |
/** | |
* Temporary workspace used during PRE processing. | |
* | |
* @var string | |
*/ | |
protected $preContent = ''; | |
/** | |
* Indicates whether content in the $html variable has been converted yet. | |
* | |
* @var bool | |
* | |
* @see $html, $text | |
*/ | |
protected $converted = false; | |
/** | |
* Contains URL addresses from links to be rendered in plain text. | |
* | |
* @var array | |
* | |
* @see buildlinkList() | |
*/ | |
protected $linkList = []; | |
/** | |
* Various configuration options (able to be set in the constructor). | |
* | |
* @var array<string, mixed> | |
*/ | |
protected array $options = [ | |
'do_links' => 'inline', // 'none' | |
// 'inline' (show links inline) | |
// 'nextline' (show links on the next line) | |
// 'table' (if a table of link URLs should be listed after the text. | |
'width' => 70, // Maximum width of the formatted text, in columns. | |
// Set this value to 0 (or less) to ignore word wrapping | |
// and not constrain text to a fixed-width column. | |
'base_url' => '', | |
'preview_length' => 119, // Maximum length of the preview text | |
]; | |
/** | |
* @param array<string, mixed> $options Set configuration options | |
*/ | |
public function __construct(array $options = []) | |
{ | |
$this->options = array_merge($this->options, $options); | |
} | |
/** | |
* Set the source HTML. | |
* | |
* @param string $html HTML source content | |
* | |
* @return PlainTextHelper | |
*/ | |
public function setHtml($html) | |
{ | |
$this->html = $html; | |
$this->converted = false; | |
return $this; | |
} | |
/** | |
* Returns the text, converted from HTML. | |
*/ | |
public function getText(): string | |
{ | |
if (!$this->converted) { | |
$this->convert(); | |
} | |
return trim($this->text); | |
} | |
public function getPreview(): string | |
{ | |
$textContent = $this->getText(); | |
$preview = trim(substr($textContent, 0, $this->options['preview_length'])); | |
// If the text is longer than the preview length, append an ellipsis | |
if (strlen($textContent) > $this->options['preview_length']) { | |
$preview .= '...'; | |
} | |
return $preview; | |
} | |
protected function convert() | |
{ | |
$this->linkList = []; | |
$text = trim(stripslashes($this->html)); | |
$this->converter($text); | |
if ($this->linkList) { | |
$text .= "\n\nLinks:\n------\n"; | |
foreach ($this->linkList as $i => $url) { | |
$text .= '['.($i + 1).'] '.$url."\n"; | |
} | |
} | |
$this->text = $text; | |
$this->converted = true; | |
} | |
protected function converter(&$text) | |
{ | |
$this->convertBlockquotes($text); | |
$this->convertPre($text); | |
$text = preg_replace($this->search, $this->replace, $text); | |
$text = preg_replace_callback($this->callbackSearch, [$this, 'pregCallback'], $text); | |
$text = strip_tags($text); | |
$text = preg_replace($this->entSearch, $this->entReplace, $text); | |
$text = html_entity_decode($text, ENT_QUOTES, self::ENCODING); | |
// Remove unknown/unhandled entities (this cannot be done in search-and-replace block) | |
$text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text); | |
// Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities | |
// This properly handles situation of "&quot;" in input string | |
$text = str_replace('|+|amp|+|', '&', $text); | |
// Normalise empty lines | |
$text = preg_replace("/\n\s+\n/", "\n\n", $text); | |
$text = preg_replace("/[\n]{3,}/", "\n\n", $text); | |
// remove leading empty lines (can be produced by eg. P tag on the beginning) | |
$text = ltrim($text, "\n"); | |
if ($this->options['width'] > 0) { | |
$text = $this->linewrap($text, $this->options['width']); | |
} | |
} | |
/** | |
* Helper function called by preg_replace() on link replacement. | |
* | |
* Maintains an internal list of links to be displayed at the end of the | |
* text, with numeric indices to the original point in the text they | |
* appeared. Also makes an effort at identifying and handling absolute | |
* and relative links. | |
* | |
* @param string $link URL of the link | |
* @param string $display Part of the text to associate number with | |
* | |
* @return string | |
*/ | |
protected function buildlinkList($link, $display, ?string $linkOverride = null) | |
{ | |
$linkMethod = $linkOverride ?: $this->options['do_links']; | |
if ('none' == $linkMethod) { | |
return $display; | |
} | |
// Ignored link types | |
if (preg_match('!^(javascript:|mailto:|#)!i', $link)) { | |
return $display; | |
} | |
if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link) || preg_match('!({|%7B)(.*?)(}|%7D)!', $link)) { | |
$url = $link; | |
} else { | |
$url = $this->options['base_url']; | |
if (!str_starts_with($link, '/')) { | |
$url .= '/'; | |
} | |
$url .= $link; | |
} | |
if ('table' == $linkMethod) { | |
if (false === ($index = array_search($url, $this->linkList))) { | |
$index = count($this->linkList); | |
$this->linkList[] = $url; | |
} | |
return $display.' ['.($index + 1).']'; | |
} elseif ('nextline' == $linkMethod) { | |
return $display."\n[".$url.']'; | |
} else { // link_method defaults to inline | |
return $display.' ['.$url.']'; | |
} | |
} | |
protected function convertPre(&$text) | |
{ | |
// get the content of PRE element | |
while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) { | |
$this->preContent = $matches[1]; | |
// Run our defined tags search-and-replace with callback | |
$this->preContent = preg_replace_callback( | |
$this->callbackSearch, | |
[$this, 'pregCallback'], | |
$this->preContent | |
); | |
// convert the content | |
$this->preContent = sprintf( | |
'<div><br>%s<br></div>', | |
preg_replace($this->preSearch, $this->preReplace, $this->preContent) | |
); | |
// replace the content (use callback because content can contain $0 variable) | |
$text = preg_replace_callback( | |
'/<pre[^>]*>.*<\/pre>/ismU', | |
[$this, 'pregPreCallback'], | |
$text, | |
1 | |
); | |
// free memory | |
$this->preContent = ''; | |
} | |
} | |
/** | |
* Helper function for BLOCKQUOTE body conversion. | |
* | |
* @param string $text HTML content | |
*/ | |
protected function convertBlockquotes(&$text) | |
{ | |
if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) { | |
$start = 0; | |
$taglen = 0; | |
$level = 0; | |
$diff = 0; | |
foreach ($matches[0] as $m) { | |
if ('<' == $m[0][0] && '/' == $m[0][1]) { | |
--$level; | |
if ($level < 0) { | |
$level = 0; // malformed HTML: go to next blockquote | |
} elseif ($level > 0) { | |
// skip inner blockquote | |
} else { | |
$end = $m[1]; | |
$len = $end - $taglen - $start; | |
// Get blockquote content | |
$body = substr($text, $start + $taglen - $diff, $len); | |
// Set text width | |
$pWidth = $this->options['width']; | |
if ($this->options['width'] > 0) { | |
$this->options['width'] -= 2; | |
} | |
// Convert blockquote content | |
$body = trim($body); | |
$this->converter($body); | |
// Add citation markers and create PRE block | |
$body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body)); | |
$body = '<pre>'.htmlspecialchars($body).'</pre>'; | |
// Re-set text width | |
$this->options['width'] = $pWidth; | |
// Replace content | |
$text = substr($text, 0, $start - $diff) | |
.$body.substr($text, $end + strlen($m[0]) - $diff); | |
$diff = $len + $taglen + strlen($m[0]) - strlen($body); | |
unset($body); | |
} | |
} else { | |
if (0 == $level) { | |
$start = $m[1]; | |
$taglen = strlen($m[0]); | |
} | |
++$level; | |
} | |
} | |
} | |
} | |
/** | |
* Callback function for preg_replace_callback use. | |
* | |
* @param array $matches PREG matches | |
* | |
* @return string | |
*/ | |
protected function pregCallback($matches) | |
{ | |
switch (strtolower($matches[1])) { | |
case 'b': | |
case 'strong': | |
return $matches[3]; | |
case 'th': | |
return $this->toupper("\t\t".$matches[3]."\n"); | |
case 'h': | |
return $this->toupper("\n\n".$matches[3]."\n\n"); | |
case 'a': | |
// override the link method | |
$linkOverride = null; | |
if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) { | |
$linkOverride = $linkOverrideMatch[1]; | |
} | |
// Remove spaces in URL (#1487805) | |
$url = str_replace(' ', '', $matches[3]); | |
return $this->buildlinkList($url, $matches[5], $linkOverride); | |
} | |
return ''; | |
} | |
/** | |
* Callback function for preg_replace_callback use in PRE content handler. | |
* | |
* @param array $matches PREG matches | |
* | |
* @return string | |
*/ | |
protected function pregPreCallback(/* @noinspection PhpUnusedParameterInspection */ $matches) | |
{ | |
return $this->preContent; | |
} | |
/** | |
* Strtoupper function with HTML tags and entities handling. | |
* | |
* @param string $str Text to convert | |
* | |
* @return string Converted text | |
*/ | |
private function toupper($str): string | |
{ | |
// string can contain HTML tags | |
$chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE); | |
// convert toupper only the text between HTML tags | |
foreach ($chunks as $i => $chunk) { | |
if ('<' != $chunk[0]) { | |
$chunks[$i] = $this->strtoupper($chunk); | |
} | |
} | |
return implode('', $chunks); | |
} | |
/** | |
* Strtoupper multibyte wrapper function with HTML entities handling. | |
* | |
* @param string $str Text to convert | |
* | |
* @return string Converted text | |
*/ | |
private function strtoupper($str): string | |
{ | |
$str = html_entity_decode($str, ENT_COMPAT, self::ENCODING); | |
if (function_exists('mb_strtoupper')) { | |
$str = mb_strtoupper($str, self::ENCODING); | |
} else { | |
$str = strtoupper($str); | |
} | |
return htmlspecialchars($str, ENT_COMPAT, self::ENCODING); | |
} | |
/** | |
* @param string $breakline | |
* @param bool|false $cut | |
*/ | |
private function linewrap($text, $width, $breakline = "\n", $cut = false): string | |
{ | |
$lines = explode("\n", $text); | |
$text = ''; | |
foreach ($lines as $line) { | |
$text .= trim(wordwrap(trim($line), $width, $breakline, $cut)); | |
$text .= "\n"; | |
} | |
return $text; | |
} | |
} | |