cloudwords-8.x-1.x-dev/lib/cloudwords_html_converter.inc
lib/cloudwords_html_converter.inc
<?php
/**
* @file
* Converts HTML tags into their corresponding XLIFF tags.
*/
/**
 * Converts an HTML snippet into XLIFF markup.
 *
 * The HTML is parsed into a DOMDocument inside a private wrapper <div>. Each
 * child of that wrapper is then mapped to <group>, <trans-unit>, <g> or <x>
 * elements, with the original tag recorded in a "restype" or "ctype"
 * attribute and the original attributes prefixed with "html:".
 */
class CloudwordsConverter {

  /**
   * HTML tag names that have a dedicated XLIFF restype/ctype value.
   *
   * Tags that are not listed here are emitted as "x-html-<tag>".
   *
   * @var array
   */
  protected $elementMap = [
    'b' => 'bold',
    'br' => 'lb',
    'caption' => 'caption',
    'fieldset' => 'groupbox',
    'footer' => 'footer',
    'form' => 'dialog',
    'frame' => 'frame',
    'head' => 'header',
    'i' => 'italic',
    'img' => 'image',
    'li' => 'listitem',
    'menu' => 'menu',
    'table' => 'table',
    'td' => 'cell',
    'tfoot' => 'footer',
    'tr' => 'row',
    'u' => 'underlined',
  ];

  /**
   * Tag names treated as inline; only the keys are used, as a lookup set.
   *
   * Every tag that is not listed here is considered a block element.
   *
   * @var array
   */
  protected $inlineTags = [
    'a' => TRUE,
    'abbr' => TRUE,
    'acronym' => TRUE,
    'address' => TRUE,
    'applet' => TRUE,
    'area' => TRUE,
    'audio' => TRUE,
    'b' => TRUE,
    'bdo' => TRUE,
    'big' => TRUE,
    'blink' => TRUE,
    'br' => TRUE,
    'button' => TRUE,
    'cite' => TRUE,
    'code' => TRUE,
    'command' => TRUE,
    'datalist' => TRUE,
    'del' => TRUE,
    'details' => TRUE,
    'dfn' => TRUE,
    'em' => TRUE,
    'embed' => TRUE,
    'face' => TRUE,
    // 'font' => TRUE,
    'i' => TRUE,
    'iframe' => TRUE,
    'img' => TRUE,
    'input' => TRUE,
    'ins' => TRUE,
    'kbd' => TRUE,
    'label' => TRUE,
    'legend' => TRUE,
    'link' => TRUE,
    'map' => TRUE,
    'mark' => TRUE,
    'meter' => TRUE,
    'nav' => TRUE,
    'nobr' => TRUE,
    'object' => TRUE,
    'optgroup' => TRUE,
    'option' => TRUE,
    'param' => TRUE,
    'q' => TRUE,
    'rb' => TRUE,
    'rbc' => TRUE,
    'rp' => TRUE,
    'rt' => TRUE,
    'rtc' => TRUE,
    'ruby' => TRUE,
    's' => TRUE,
    'samp' => TRUE,
    'select' => TRUE,
    'small' => TRUE,
    'source' => TRUE,
    'span' => TRUE,
    'spacer' => TRUE,
    'strike' => TRUE,
    'strong' => TRUE,
    'sub' => TRUE,
    'summary' => TRUE,
    'sup' => TRUE,
    'symbol' => TRUE,
    'textarea' => TRUE,
    'time' => TRUE,
    'tt' => TRUE,
    'u' => TRUE,
    'var' => TRUE,
    'wbr' => TRUE,
  ];

  /**
   * Tag names that are emitted as <x> when found inside a trans-unit.
   *
   * @var array
   */
  protected $selfClosingTags = [
    'area' => TRUE,
    'base' => TRUE,
    'basefont' => TRUE,
    'br' => TRUE,
    'col' => TRUE,
    'frame' => TRUE,
    'hr' => TRUE,
    'img' => TRUE,
    'input' => TRUE,
    'link' => TRUE,
    'meta' => TRUE,
    'param' => TRUE,
  ];

  /**
   * TRUE while the children of a <trans-unit> are being converted.
   *
   * @var bool
   */
  protected $inTransUnit = FALSE;

  /**
   * The document that holds the parsed input and the generated nodes.
   *
   * Declared explicitly because creating it dynamically is deprecated as of
   * PHP 8.2. It stays public because the dynamic property was public too.
   *
   * @var DOMDocument
   */
  public $doc;

  /**
   * Language code written to the xml:lang attribute of <target> elements.
   *
   * Public for the same backward-compatibility reason as $doc.
   *
   * @var string
   */
  public $langcode;

  /**
   * Parses the HTML that will be converted.
   *
   * @param string $html
   *   The HTML snippet to convert.
   * @param string $langcode
   *   The language code used for generated <target> elements.
   *
   * @throws Exception
   *   When DOMDocument::loadHTML() reports failure.
   */
  public function __construct($html, $langcode) {
    $this->doc = new DOMDocument();
    $this->doc->strictErrorChecking = FALSE;
    $this->langcode = $langcode;

    $error = $this->errorStart();
    // Setting meta below is a hack to get our DomDocument into utf-8. All other
    // methods tried didn't work.
    $success = $this->doc->loadHTML('<meta http-equiv="content-type" content="text/html; charset=utf-8"><div id="cloudwords-dont-ever-use-this-id">' . $html . '</div>');
    $this->errorStop($error);

    if (!$success) {
      throw new Exception('Invalid HTML');
    }
  }

  /**
   * Converts HTML to the corresponding XLIFF representation.
   *
   * @param bool $pretty_print
   *   Value assigned to DOMDocument::$formatOutput before serializing.
   *
   * @return string
   *   The source HTML converted to XLIFF.
   */
  public function toXLIFF($pretty_print = FALSE) {
    $this->doc->formatOutput = $pretty_print;

    // Do not use getElementById to comply with older versions of libxml.
    // getElementById doesn't work properly on libxml 2.7.6 (CentOS)
    $xpath = new DOMXPath($this->doc);
    $wrapper_div = $xpath->query("//*[@id='cloudwords-dont-ever-use-this-id']")->item(0);

    // Snapshot the wrapper's children into a plain array so that the list can
    // be regrouped without depending on the live DOMNodeList.
    $domNodeList = [];
    foreach ($wrapper_div->childNodes as $child) {
      $domNodeList[] = $child;
    }
    $this->sanitizeMixedDomNodeList($this->doc, $domNodeList);

    $out = $this->doc->createDocumentFragment();
    foreach ($domNodeList as $domNode) {
      if ($output = $this->convert($domNode)) {
        $out->appendChild($output);
      }
    }
    return $this->doc->saveXML($out);
  }

  /**
   * Groups runs of sibling text nodes and inline elements.
   *
   * When the list consists of text nodes only it is left untouched. Otherwise
   * every uninterrupted run of text nodes and inline elements is cloned into
   * a synthetic <text> element that replaces the run, while all other nodes
   * are kept as they are.
   *
   * @param DOMDocument $doc
   *   The document used to create the synthetic <text> elements.
   * @param array $list
   *   The sibling nodes to regroup. Modified in place.
   */
  protected function sanitizeMixedDomNodeList($doc, &$list) {
    $seen = [XML_ELEMENT_NODE => FALSE, XML_TEXT_NODE => FALSE];
    foreach ($list as $node) {
      $seen[$node->nodeType] = TRUE;
    }
    // Text only, so there is nothing to regroup.
    if ($seen[XML_TEXT_NODE] == TRUE && $seen[XML_ELEMENT_NODE] == FALSE) {
      return;
    }

    $wrapper = $doc->createElement('text');
    $newList = [];
    foreach ($list as $k => $node) {
      if (!$this->isInlineOrText($node)) {
        // A non-inline node ends the current run; start a fresh wrapper for
        // whatever inline content follows it.
        $newList[$k] = $node;
        $wrapper = $doc->createElement('text');
        continue;
      }

      $wrapper->appendChild($node->cloneNode(TRUE));
      // The wrapper is emitted at the position of the last node of the run,
      // i.e. when nothing follows or the next sibling is not inline/text.
      $next = $node->nextSibling;
      if ($next === NULL || !$this->isInlineOrText($next)) {
        $newList[$k] = $wrapper;
      }
    }
    $list = $newList;
  }

  /**
   * Tells whether a node is a text node or an element listed as inline.
   *
   * @param DOMNode $node
   *   The node to test.
   *
   * @return bool
   *   TRUE when the node name is "#text" or a key of $inlineTags.
   */
  protected function isInlineOrText(DOMNode $node) {
    return $node->nodeName === '#text' || isset($this->inlineTags[$node->nodeName]);
  }

  /**
   * Converts a single DOM node.
   *
   * @param DOMNode $node
   *   The node to convert.
   *
   * @return DOMNode|false
   *   The converted node, or FALSE for node types other than element and
   *   text.
   */
  protected function convert(DOMNode $node) {
    switch ($node->nodeType) {
      case XML_ELEMENT_NODE:
        return $this->convertElement($node);

      case XML_TEXT_NODE:
        //if (!trim($node->nodeValue)) {
        //  break;
        //}
        return $this->addText($node);

      // case XML_CDATA_SECTION_NODE:
      //   $out->appendChild($this->addText($node));
      //   return $out;
    }
    return FALSE;
  }

  /**
   * Converts an element and, recursively, its children.
   *
   * @param DOMElement $element
   *   The HTML element to convert.
   *
   * @return DOMElement
   *   The resulting <group>, <trans-unit>, <g> or <x> element.
   */
  protected function convertElement(DOMElement $element) {
    $translated_element = $this->convertElementTag($element);
    foreach ($this->xliffAttrs($element) as $attr) {
      $translated_element->setAttributeNode($attr);
    }

    // Correction for inline tags created as trans-units: wrap the inline
    // element in a synthetic <text> element and convert that instead, so the
    // inline element ends up inside the trans-unit's source/target.
    if (isset($this->inlineTags[$element->nodeName]) && $translated_element->tagName == 'trans-unit') {
      $inline = $element;
      $element = $this->doc->createElement('text');
      $translated_element = $this->convertElementTag($element);
      foreach ($this->xliffAttrs($element) as $attr) {
        $translated_element->setAttributeNode($attr);
      }
      $element->appendChild(clone $inline);
    }

    $target = NULL;
    if ($translated_element->tagName == 'trans-unit') {
      $out = $this->createSource();
      $translated_element->appendChild($out);
      $target = $this->createTarget();
      $translated_element->appendChild($target);
      $this->inTransUnit = TRUE;
    }
    else {
      $out = $translated_element;
    }

    if ($element->hasChildNodes()) {
      foreach ($element->childNodes as $child) {
        if ($converted = $this->convert(clone $child)) {
          $out->appendChild($converted);
          // The target starts out as a copy of the source content.
          if ($out->tagName == 'source') {
            $target->appendChild(clone $converted);
          }
        }
      }
    }

    if ($translated_element->tagName == 'trans-unit') {
      $this->inTransUnit = FALSE;
    }
    return $translated_element;
  }

  /**
   * Converts a text node.
   *
   * @param DOMText $text
   *   The text node to convert.
   *
   * @return DOMNode
   *   The text node itself while inside a trans-unit; otherwise a new
   *   <trans-unit> whose <source> holds the text node and whose <target>
   *   holds a copy of it.
   */
  protected function addText(DOMText $text) {
    // $text->nodeValue = htmlentities($text->nodeValue);
    if ($this->inTransUnit) {
      return $text;
    }

    $trans = $this->doc->createElement('trans-unit');
    $trans->setAttribute('id', uniqid('text-'));
    $source = $this->createSource();
    $target = $this->createTarget();
    $trans->appendChild($source);
    $trans->appendChild($target);
    $source->appendChild($text);
    $target->appendChild(clone $text);
    return $trans;
  }

  /**
   * Creates the empty XLIFF element that represents an HTML element.
   *
   * @param DOMElement $element
   *   The HTML element.
   *
   * @return DOMElement
   *   A <group> for block elements that contain other block elements.
   *   Otherwise an <x> (self-closing tag) or <g> (inline tag) while inside a
   *   trans-unit, or a <trans-unit> in every other case. All but <group> get
   *   a generated "id" attribute.
   */
  protected function convertElementTag(DOMElement $element) {
    if ($this->isBlockElement($element) && $this->hasBlockChild($element)) {
      return $this->doc->createElement('group');
    }

    $tag = $element->tagName;
    if (isset($this->selfClosingTags[$tag]) && $this->inTransUnit) {
      $out = $this->doc->createElement('x');
    }
    elseif (isset($this->inlineTags[$tag]) && $this->inTransUnit) {
      $out = $this->doc->createElement('g');
    }
    else {
      $out = $this->doc->createElement('trans-unit');
    }
    $out->setAttribute('id', uniqid($tag . '-'));
    return $out;
  }

  /**
   * Tells whether an element is a block element.
   *
   * @param DOMElement $element
   *   The element to test.
   *
   * @return bool
   *   TRUE when the tag name is not a key of $inlineTags.
   */
  protected function isBlockElement(DOMElement $element) {
    return !isset($this->inlineTags[$element->tagName]);
  }

  /**
   * Tells whether any descendant element is a block element.
   *
   * @param DOMElement $element
   *   The element whose descendants are inspected.
   *
   * @return bool
   *   TRUE when at least one descendant element is a block element.
   */
  protected function hasBlockChild(DOMElement $element) {
    if (!$element->hasChildNodes()) {
      return FALSE;
    }

    $filter = new CloudwordsDOMElementFilter(new CloudwordsRecursiveDOMIterator($element));
    $recursive = new RecursiveIteratorIterator($filter, RecursiveIteratorIterator::SELF_FIRST);
    foreach ($recursive as $descendant) {
      if ($this->isBlockElement($descendant)) {
        return TRUE;
      }
    }
    return FALSE;
  }

  /**
   * Builds the XLIFF attributes that describe an HTML element.
   *
   * @param DOMElement $element
   *   The HTML element.
   *
   * @return DOMAttr[]
   *   A "ctype" attribute for inline tags, or a "restype" attribute for any
   *   other tag except the synthetic "text" element, followed by one
   *   attribute per original attribute: "style" becomes "css-style", every
   *   other name gets an "html:" prefix.
   */
  protected function xliffAttrs(DOMElement $element) {
    $attrs = [];
    if (isset($this->inlineTags[$element->tagName])) {
      $attrs[] = new DOMAttr('ctype', $this->mapHTMLTagToXLIFF($element));
    }
    elseif ($element->tagName != 'text') {
      $attrs[] = new DOMAttr('restype', $this->mapHTMLTagToXLIFF($element));
    }

    foreach ($element->attributes as $attr) {
      $name = $attr->name == 'style' ? 'css-style' : 'html:' . $attr->name;
      $attrs[] = new DOMAttr($name, _cloudwords_filter_xml_control_characters($attr->value));
    }
    return $attrs;
  }

  /**
   * Creates an empty <source> element with xml:lang set to "en".
   *
   * @return DOMElement
   *   The new element.
   */
  protected function createSource() {
    $element = $this->doc->createElement('source');
    $element->setAttribute('xml:lang', 'en');
    return $element;
  }

  /**
   * Creates an empty <target> element with xml:lang set to $this->langcode.
   *
   * @return DOMElement
   *   The new element.
   */
  protected function createTarget() {
    $element = $this->doc->createElement('target');
    $element->setAttribute('xml:lang', $this->langcode);
    return $element;
  }

  /**
   * Maps an HTML tag name to its XLIFF restype/ctype value.
   *
   * @param DOMElement $element
   *   The HTML element.
   *
   * @return string
   *   The value from $elementMap, or "x-html-<tag>" for unmapped tags.
   */
  protected function mapHTMLTagToXLIFF(DOMElement $element) {
    if (isset($this->elementMap[$element->tagName])) {
      return $this->elementMap[$element->tagName];
    }
    return 'x-html-' . $element->tagName;
  }

  /**
   * Start custom error handling.
   *
   * @return bool
   *   The previous value of use_errors.
   */
  protected function errorStart() {
    return libxml_use_internal_errors(TRUE);
  }

  /**
   * Stop custom error handling.
   *
   * The libxml error buffer is cleared and the previous use_errors value is
   * restored in every case, including when an exception is thrown.
   *
   * @param bool $use
   *   The previous value of use_errors.
   * @param bool $print
   *   (Optional) Whether to print errors to the screen. Defaults to FALSE.
   *
   * @throws Exception
   *   When $print is TRUE and libxml recorded a fatal error.
   */
  protected function errorStop($use, $print = FALSE) {
    if ($print) {
      foreach (libxml_get_errors() as $error) {
        // Invalid tag. Skip this as DOMDocument does not support HTML5.
        if ($error->code == 801) {
          continue;
        }
        if ($error->level == LIBXML_ERR_FATAL) {
          // Restore the libxml state before bailing out so the caller's
          // error handling mode is not left switched on.
          libxml_clear_errors();
          libxml_use_internal_errors($use);
          throw new Exception('Fatal error');
        }
        $message = sprintf('%s on line %d. Error code: %d', trim($error->message), $error->line, $error->code);
        // drupal_set_message($message, $type, FALSE);
        print $message . "\n";
      }
    }
    libxml_clear_errors();
    libxml_use_internal_errors($use);
  }

}
/**
 * Converts XLIFF markup produced by CloudwordsConverter back into HTML.
 *
 * The HTML is built from the <target> content of every trans-unit. The tag
 * names are recovered from the "restype" and "ctype" attributes.
 */
class CloudwordsConverterToHTML extends CloudwordsConverter {

  /**
   * The parsed XLIFF document.
   *
   * Declared explicitly because creating it dynamically is deprecated as of
   * PHP 8.2. It stays public because the dynamic property was public too.
   *
   * @var DOMDocument
   */
  public $doc;

  /**
   * XPath helper bound to $doc with the "html" and "xliff" prefixes set up.
   *
   * @var DOMXPath
   */
  public $xpath;

  /**
   * The HTML document being built by toHTML().
   *
   * @var DOMDocument
   */
  public $out;

  /**
   * Parses the XLIFF fragment that will be converted.
   *
   * @param string $xml
   *   The XLIFF fragment. It is wrapped in an <xliff> root element that
   *   declares the XLIFF 1.2 default namespace and the "html" prefix.
   *
   * @throws Exception
   *   When DOMDocument::loadXML() reports failure.
   */
  public function __construct($xml) {
    $this->doc = new DOMDocument('1.0', 'UTF-8');
    $this->doc->strictErrorChecking = TRUE;

    $error = $this->errorStart();
    // The wrapper declares the namespaces used by the fragment so that the
    // "html:" prefixed attributes resolve when the fragment is parsed.
    $success = $this->doc->loadXML('<xliff version="1.2" xmlns:html="http://www.w3.org/1999/xhtml" xmlns="urn:oasis:names:tc:xliff:document:1.2" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="urn:oasis:names:tc:xliff:document:1.2 xliff-core-1.2-strict.xsd">' . $xml . '</xliff>');
    $this->errorStop($error);

    // Reverse the map so that it goes from XLIFF value to HTML tag name.
    // NOTE(review): two HTML tags ('footer' and 'tfoot') share the XLIFF
    // value 'footer', so after flipping only the later one ('tfoot') remains.
    $this->elementMap = array_flip($this->elementMap);

    if (!$success) {
      throw new Exception('Invalid XML');
    }

    $this->xpath = new DOMXPath($this->doc);
    $this->xpath->registerNamespace('html', 'http://www.w3.org/1999/xhtml');
    $this->xpath->registerNamespace('xliff', 'urn:oasis:names:tc:xliff:document:1.2');
  }

  /**
   * Converts XML to the corresponding HTML representation.
   *
   * Only the children of the first <group> element are converted. Converted
   * nodes without children are left out of the result.
   *
   * @param bool $pretty_print
   *   Value assigned to DOMDocument::$formatOutput of the output document.
   *
   * @return string
   *   The source XML converted to HTML.
   */
  public function toHTML($pretty_print = TRUE) {
    $this->out = new DOMDocument('1.0', 'UTF-8');
    $this->out->formatOutput = $pretty_print;

    $field = $this->doc->getElementsByTagName('group')->item(0);
    // Without a <group> there is nothing to convert; fall through and
    // serialize the empty document instead of iterating over NULL.
    if ($field !== NULL) {
      foreach ($field->childNodes as $child) {
        $output = $this->convert($child);
        // hasChildNodes() is available on every node type, including the
        // text nodes convert() may return.
        if ($output instanceof DOMNode && $output->hasChildNodes()) {
          $this->out->appendChild($output);
        }
      }
    }
    return html_entity_decode($this->out->saveHTML(), ENT_QUOTES, 'UTF-8');
  }

  /**
   * Converts a single XLIFF node.
   *
   * @param DOMNode $node
   *   The XLIFF node to convert.
   *
   * @return DOMNode|false
   *   The HTML node for <group>, <trans-unit>, <g> and <x> elements, a text
   *   node for text that is not whitespace-only, or FALSE for anything else.
   */
  protected function convert(DOMNode $node) {
    if ($node->nodeType == XML_ELEMENT_NODE) {
      switch ($node->tagName) {
        case 'group':
          return $this->convertGroup($node);

        case 'trans-unit':
          return $this->convertTransUnit($node);

        case 'g':
        case 'x':
          return $this->convertXG($node);
      }
    }
    elseif ($node->nodeType == XML_TEXT_NODE) {
      // Compare against '' so that a text of "0" is not mistaken for empty,
      // and return a node (not a string) because callers append the result.
      if (trim($node->nodeValue) !== '') {
        return $this->out->createTextNode($node->nodeValue);
      }
    }
    return FALSE;
  }

  /**
   * Converts the children of an XLIFF node and appends them to an HTML node.
   *
   * @param DOMNode $node
   *   The XLIFF node whose children are converted.
   * @param DOMNode $elem
   *   The HTML node that receives the converted children.
   *
   * @return DOMNode
   *   The node passed as $elem.
   */
  protected function addChildren(DOMNode $node, DOMNode $elem) {
    foreach ($node->childNodes as $child) {
      if ($new_child = $this->convert($child)) {
        $elem->appendChild($new_child);
      }
    }
    return $elem;
  }

  /**
   * Creates the HTML node that corresponds to an XLIFF element.
   *
   * @param DOMElement $element
   *   The XLIFF element. "restype" is read for <group> and <trans-unit>,
   *   "ctype" for <g> and <x>.
   *
   * @return DOMNode
   *   An empty document fragment when the element carries no type attribute
   *   (or is of another kind). Otherwise an HTML element named after the
   *   mapped value, or after the type with its first seven characters
   *   (the "x-html-" prefix) removed, carrying the original attributes.
   */
  protected function htmlTag(DOMElement $element) {
    $attr = '';
    switch ($element->tagName) {
      case 'group':
      case 'trans-unit':
        $attr = $element->getAttribute('restype');
        break;

      case 'g':
      case 'x':
        $attr = $element->getAttribute('ctype');
        break;
    }

    if (!$attr) {
      return $this->out->createDocumentFragment();
    }

    if (isset($this->elementMap[$attr])) {
      $html_element = $this->elementMap[$attr];
    }
    else {
      $html_element = substr($attr, 7);
    }
    $out = $this->out->createElement($html_element);
    $this->addAttrs($out, $element);
    return $out;
  }

  /**
   * Converts a <group> element together with its children.
   *
   * @param DOMElement $node
   *   The <group> element.
   *
   * @return DOMNode
   *   The HTML node holding the converted children.
   */
  protected function convertGroup(DOMElement $node) {
    return $this->addChildren($node, $this->htmlTag($node));
  }

  /**
   * Converts a <trans-unit> element using the content of its first <target>.
   *
   * @param DOMElement $node
   *   The <trans-unit> element.
   *
   * @return DOMNode
   *   The HTML node holding the converted target content. It is empty when
   *   the trans-unit has no <target>.
   */
  protected function convertTransUnit(DOMElement $node) {
    $elem = $this->htmlTag($node);
    $target = $node->getElementsByTagName('target')->item(0);
    if ($target === NULL) {
      return $elem;
    }

    foreach ($target->childNodes as $child) {
      $converted = $this->convertTarget($child);
      if ($converted !== NULL) {
        $elem->appendChild($converted);
      }
    }
    return $elem;
  }

  /**
   * Converts a node found inside a <target> element.
   *
   * @param DOMNode $node
   *   The node to convert.
   *
   * @return DOMNode|null
   *   The HTML node for elements (converted recursively), a text node for
   *   text and CDATA sections, or NULL for every other node type.
   */
  protected function convertTarget(DOMNode $node) {
    switch ($node->nodeType) {
      case XML_ELEMENT_NODE:
        $tag = $this->htmlTag($node);
        foreach ($node->childNodes as $child) {
          $converted = $this->convertTarget($child);
          // Comments and other unsupported nodes yield NULL and are skipped.
          if ($converted !== NULL) {
            $tag->appendChild($converted);
          }
        }
        return $tag;

      case XML_TEXT_NODE:
      case XML_CDATA_SECTION_NODE:
        return $this->out->createTextNode($node->nodeValue);
    }
    return NULL;
  }

  /**
   * Converts a <g> or <x> element together with its children.
   *
   * The tag name is resolved by htmlTag(), which reads the "ctype" attribute
   * as a plain string and honours the element map.
   *
   * @param DOMElement $elem
   *   The <g> or <x> element.
   *
   * @return DOMNode
   *   The HTML node holding the converted children.
   */
  protected function convertXG(DOMElement $elem) {
    return $this->addChildren($elem, $this->htmlTag($elem));
  }

  /**
   * Copies the HTML related attributes of an XLIFF element.
   *
   * Attributes with the "html" prefix are copied under the key the attribute
   * map yields for them; "css-style" is copied as "style". Every other
   * attribute is ignored.
   *
   * @param DOMElement $out
   *   The HTML element that receives the attributes.
   * @param DOMElement $elem
   *   The XLIFF element the attributes are read from.
   */
  protected function addAttrs($out, $elem) {
    foreach ($elem->attributes as $key => $attr) {
      if ($attr->prefix == 'html') {
        $out->setAttribute($key, $attr->nodeValue);
      }
      elseif ($key == 'css-style') {
        $out->setAttribute('style', $attr->nodeValue);
      }
    }
  }

}
/**
 * Iterates recursively over a DOMNodeList.
 *
 * The iterator walks the child nodes of the node it was created with and
 * hands out a new iterator of the same class for every child that has
 * children of its own.
 *
 * The methods carry #[\ReturnTypeWillChange] so that PHP 8.1 and later do not
 * report the missing return types as deprecated. Before PHP 8 the attribute
 * lines are parsed as comments.
 */
class CloudwordsRecursiveDOMIterator implements RecursiveIterator {

  /**
   * Current Position in DOMNodeList.
   *
   * @var int
   */
  protected $position = 0;

  /**
   * The DOMNodeList with all children to iterate over.
   *
   * @var DOMNodeList
   */
  protected $nodeList;

  /**
   * Constructor.
   *
   * @param DOMNode $node
   *   A DOMNode whose child nodes are iterated over.
   */
  public function __construct(DOMNode $node) {
    $this->nodeList = $node->childNodes;
  }

  /**
   * Returns the current DOMNode.
   *
   * @return DOMNode
   *   The node at the current position.
   */
  #[\ReturnTypeWillChange]
  public function current() {
    return $this->nodeList->item($this->position);
  }

  /**
   * Returns an iterator for the current iterator entry.
   *
   * @return CloudwordsRecursiveDOMIterator
   *   An iterator of the same class as this one, over the current node.
   */
  #[\ReturnTypeWillChange]
  public function getChildren() {
    // Late static binding keeps subclasses iterating with their own class.
    return new static($this->current());
  }

  /**
   * Returns if an iterator can be created for the current entry.
   *
   * @return bool
   *   TRUE when the current node has child nodes.
   */
  #[\ReturnTypeWillChange]
  public function hasChildren() {
    return $this->current()->hasChildNodes();
  }

  /**
   * Returns the current position.
   *
   * @return int
   *   The zero-based index into the node list.
   */
  #[\ReturnTypeWillChange]
  public function key() {
    return $this->position;
  }

  /**
   * Moves the current position to the next element.
   */
  #[\ReturnTypeWillChange]
  public function next() {
    $this->position++;
  }

  /**
   * Rewinds the Iterator to the first element.
   */
  #[\ReturnTypeWillChange]
  public function rewind() {
    $this->position = 0;
  }

  /**
   * Checks if current position is valid.
   *
   * @return bool
   *   TRUE while the position is below the length of the node list.
   */
  #[\ReturnTypeWillChange]
  public function valid() {
    return $this->position < $this->nodeList->length;
  }

}
/**
 * Filters a recursive iterator of DOM nodes down to element nodes.
 */
class CloudwordsDOMElementFilter extends RecursiveFilterIterator {

  /**
   * Accepts the current entry only when it is an element node.
   *
   * The attribute keeps PHP 8.1 and later from reporting the missing return
   * type as deprecated. Before PHP 8 the line is parsed as a comment.
   *
   * @return bool
   *   TRUE when the current node's type is XML_ELEMENT_NODE.
   */
  #[\ReturnTypeWillChange]
  public function accept() {
    return $this->current()->nodeType === XML_ELEMENT_NODE;
  }

}
