lionbridge_translation_provider-8.x-2.4/tmgmt_contentapi/src/Plugin/tmgmt_contentapi/Format/Xliff.php
tmgmt_contentapi/src/Plugin/tmgmt_contentapi/Format/Xliff.php
<?php
namespace Drupal\tmgmt_contentapi\Plugin\tmgmt_contentapi\Format;
use Drupal\tmgmt\Entity\Job;
use Drupal\tmgmt\Entity\JobItem;
use Drupal\tmgmt\JobInterface;
use Drupal\tmgmt\JobItemInterface;
use Drupal\tmgmt_contentapi\Format\FormatInterface;
use Drupal\tmgmt_contentapi\RecursiveDOMIterator;
/**
* Export to XLIFF format.
*
* The XLIFF processor follows this specification:
* @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html.
*
* The purpose of this class is to mask or process HTML elements in the source
* and target elements so that translation tools are able to understand which
* content needs to be translated and ignored.
*
* On the other hand we need to properly unmask the XLIFF markup back to HTML on
* the translation import. So the process is bidirectional, and prior to running
* the unmasking process we try to validate the integrity in the
* validateImport() method. Currently the integrity check involves only a
* counter of the XLIFF elements that were created during source processing,
* which has to match the number of XLIFF elements being imported with the
* translation.
*
* To process the content a DOMDocument object is used due to its ability to
* read broken HTML. This also implies that if broken HTML is in the source
* content, the translated content will be fixed to the extent of DOMDocument's
* abilities.
*
* Following is implemented:
* - All pair tags get escaped using <bpt><ept> markup.
* - <br> tags are marked with <x ctype="lb">.
* - <img> tags are marked with <ph ctype="image"> tags. The title and alt
* attributes should have been extracted into <sub> elements, however are not
* as Trados studio triggers a fatal error in case there are two <sub>
* elements at the same level.
*
* Not implemented:
* - Attributes of <img> element are written only as attributes of <ph> element
* instead of using x-html: prefix. This results in conflict with own <ph>
* element's attributes such as "id". The reason why x-html prefix has not
* been used is that Trados studio triggered fatal error on xml validation.
* - Translatable attributes like title and alt.
* @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#elem_img
* - Forms - this is big part
* @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#HTMLForms
* - <pre> elements
* @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#Elem_preformatted
*
* @FormatPlugin(
* id = "contentapi_xlf",
* label = @Translation("XLIFF")
* )
*/
class Xliff extends \XMLWriter implements FormatInterface {
/**
* Contains a reference to the currently being exported job.
*
* @var \Drupal\tmgmt\Entity\Job
*/
protected $job;
/**
* Contains the parsed SimpleXMLElement of the imported file.
*
* @var \SimpleXMLElement|false|null
*/
protected $importedXML;
/**
* Contains the imported translation units, keyed by trans-unit id.
*
* @var array|null
*/
protected $importedTransUnits;
/**
 * Writes one job item as a <group> element into the XLIFF body.
 *
 * The group carries the job item id, a <note> with the source label and one
 * <trans-unit> per translatable data element. Existing translations of the
 * item are merged in so they can be written as <target> content.
 *
 * @param \Drupal\tmgmt\JobItemInterface $item
 *   The job item entity.
 */
protected function addItem(JobItemInterface $item) {
  $this->startElement('group');
  $this->writeAttribute('id', $item->id());
  // The source label is exported as a note on the group.
  $this->writeElement('note', $item->getSourceLabel());

  // @todo: Write in nested groups instead of flattening it.
  $existing_translation = $this->getTranslation($item);
  $translatable = \Drupal::service('tmgmt.data')->filterTranslatable($item->getData());
  $units = $this->merTranslation($translatable, $existing_translation);

  $module_handler = \Drupal::moduleHandler();
  foreach ($units as $data_key => $unit) {
    // Give other modules the chance to adjust the element before it is
    // written.
    $module_handler->alter('tmgmt_contentapi_transunit_element', $unit, $data_key, $item);
    $unit_id = $item->id() . '][' . $data_key;
    $this->addTransUnit($unit_id, $unit, $this->job);
  }

  // Close the <group>.
  $this->endElement();
}
/**
 * Copies existing translation texts onto the matching source elements.
 *
 * @param array $sourcedata
 *   Flattened source data, keyed by data item key.
 * @param array $targetdata
 *   Flattened data of the existing translation, keyed the same way.
 *
 * @return array
 *   The source data; every element that also exists in $targetdata gets the
 *   target text stored under ['#translation']['#text'].
 */
private function merTranslation($sourcedata, $targetdata) {
  foreach ($sourcedata as $data_key => $source_item) {
    if (!array_key_exists($data_key, $targetdata)) {
      // No translation for this element, leave it untouched.
      continue;
    }
    $source_item['#translation']['#text'] = $targetdata[$data_key]['#text'];
    $sourcedata[$data_key] = $source_item;
  }
  return $sourcedata;
}
/**
 * Writes an <Item-Path> element describing a job item.
 *
 * For node items the internal path of the node is written. For every other
 * item type, or when the node path cannot be determined, the item type itself
 * is written instead.
 *
 * @param \Drupal\tmgmt\JobItemInterface $job_item
 *   The job item to describe.
 */
protected function addNote(JobItemInterface $job_item) {
  $item_type = $job_item->getItemType();
  // Fallback value, used for non-node items and whenever the lookup fails.
  $item_path = $item_type;

  if ($item_type == 'node') {
    try {
      // The entity is only needed for nodes, so it is only loaded here. This
      // keeps item types without an entity storage from failing the export.
      $entity = \Drupal::entityTypeManager()
        ->getStorage($item_type)
        ->load($job_item->getItemId());
      if ($entity !== NULL) {
        $item_path = $entity->toUrl()->getInternalPath();
      }
    }
    catch (\Throwable $ex) {
      // Deliberate best effort: \Throwable also covers \Error, so a broken
      // entity cannot abort the whole export. The fallback value is kept.
    }
  }

  $this->writeElement('Item-Path', $item_path);
}
/**
 * Returns the existing target-language translation data of a job item.
 *
 * Loads the entity the job item points to and, if it already has a
 * translation in the job's target language, extracts and filters its
 * translatable data. Any failure is logged and results in an empty array.
 *
 * @param \Drupal\tmgmt\JobItemInterface $item
 *   The job item entity.
 *
 * @return array
 *   The filtered translatable data of the existing translation, or an empty
 *   array when there is none or it could not be extracted.
 */
private function getTranslation(JobItemInterface $item) {
  try {
    $entity = \Drupal::entityTypeManager()
      ->getStorage($item->getItemType())
      ->load($item->getItemId());
    $source_plugin = $item->getSourcePlugin();
    if ($entity !== NULL) {
      $target_langcode = $item->getJob()->getTargetLangcode();
      if ($entity->hasTranslation($target_langcode)) {
        $translated_entity = $entity->getTranslation($target_langcode);
        $translated_data = $source_plugin->extractTranslatableData($translated_entity);
        return \Drupal::service('tmgmt.data')->filterTranslatable($translated_data);
      }
    }
  }
  catch (\Throwable $ex) {
    // The message is passed untranslated: the placeholders are handed to the
    // logger as context instead of being resolved by t() at log time.
    \Drupal::logger('TMGMT_CONTENTAPI')->error('Error while exporting translation for job %job and item %item: %message', [
      '%job' => $item->getJobId(),
      '%item' => $item->getItemId(),
      '%message' => $ex->getMessage(),
    ]);
    return [];
  }
  return [];
}
/**
 * Writes a <trans-unit> with its <source> and <target> for a data element.
 *
 * The source is written as CDATA, as processed XLIFF markup or as plain
 * escaped text, depending on the job settings. The target is only filled
 * when the element already carries a translation.
 *
 * @param string $key
 *   The unique identifier for this data element.
 * @param array $element
 *   Array with the properties #text and optionally #label.
 * @param \Drupal\tmgmt\JobInterface $job
 *   Translation job.
 */
protected function addTransUnit($key, $element, JobInterface $job) {
  $key_array = \Drupal::service('tmgmt.data')->ensureArrayKey($key);

  $this->startElement('trans-unit');
  foreach (['id', 'resname'] as $attribute_name) {
    $this->writeAttribute($attribute_name, $key);
  }

  // Source segment.
  $this->startElement('source');
  $this->writeAttribute('xml:lang', $this->job->getRemoteSourceLanguage());
  $source_text = $element['#text'];
  if ($job->getSetting('xliff_cdata')) {
    $this->writeCdata(trim($source_text));
  }
  elseif ($job->getSetting('xliff_processing')) {
    $this->writeRaw($this->processForExport($source_text, $key_array));
  }
  else {
    $this->text($source_text);
  }
  $this->endElement();

  // Target segment, left empty unless a translation is already known.
  $this->startElement('target');
  $this->writeAttribute('xml:lang', $this->job->getRemoteTargetLanguage());
  $has_translation = !empty($element['#translation']['#text']);
  if ($has_translation && $job->getSetting('xliff_processing')) {
    $this->writeRaw($this->processForExport($element['#translation']['#text'], $key_array));
  }
  elseif ($has_translation) {
    $this->text($element['#translation']['#text']);
  }
  $this->endElement();

  // Close the <trans-unit>.
  $this->endElement();
}
/**
 * {@inheritdoc}
 */
#[\ReturnTypeWillChange]
public function export(JobInterface $job, $conditions = []) {
  $this->job = $job;
  // The items are needed twice (header notes and body), so load them once.
  $items = $job->getItems($conditions);

  $this->openMemory();
  $this->setIndent(TRUE);
  $this->setIndentString(' ');
  $this->startDocument('1.0', 'UTF-8');

  // Root element with schema definition.
  $this->startElement('xliff');
  $this->writeAttribute('version', '1.2');
  $this->writeAttribute('xmlns', 'urn:oasis:names:tc:xliff:document:1.2');
  $this->writeAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance');
  $this->writeAttribute('xsi:schemaLocation', 'urn:oasis:names:tc:xliff:document:1.2 xliff-core-1.2-strict.xsd');

  // File element.
  $this->startElement('file');
  $this->writeAttribute('original', 'xliff-core-1.2-strict.xsd');
  $this->writeAttribute('source-language', $job->getRemoteSourceLanguage());
  $this->writeAttribute('target-language', $job->getRemoteTargetLanguage());
  $this->writeAttribute('datatype', 'plaintext');
  // Date needs to be in ISO-8601 UTC. gmdate() yields UTC, and the format
  // uses H (24-hour), i (minutes) and s (seconds); the previous format string
  // wrote the 12-hour hour, the month and the minutes in local time.
  $this->writeAttribute('date', gmdate('Y-m-d\TH:i:s\Z'));

  // Header: job UUID and item paths, phase information and tool.
  $this->startElement('header');
  $this->startElement('note');
  // The UUID is read from the passed job; reloading the job was redundant.
  $this->writeElement('Item-ID', $job->uuid());
  foreach ($items as $item) {
    $this->addNote($item);
  }
  $this->endElement();

  $this->startElement('phase-group');
  $this->startElement('phase');
  $this->writeAttribute('tool-id', 'tmgmt');
  $this->writeAttribute('phase-name', 'extraction');
  $this->writeAttribute('process-name', 'extraction');
  // The job id is read back from this attribute on import.
  $this->writeAttribute('job-id', $job->id());
  $this->endElement();
  $this->endElement();

  $this->startElement('tool');
  $this->writeAttribute('tool-id', 'tmgmt');
  $this->writeAttribute('tool-name', 'Drupal Translation Management Tools');
  $this->endElement();
  // End the header.
  $this->endElement();

  $this->startElement('body');
  foreach ($items as $item) {
    $this->addItem($item);
  }

  // End the body, file and xliff tags.
  $this->endElement();
  $this->endElement();
  $this->endElement();
  $this->endDocument();
  return $this->outputMemory();
}
/**
 * {@inheritdoc}
 */
#[\ReturnTypeWillChange]
public function exportItem(JobItemInterface $jobItem, $conditions = []) {
  // Resolve the job once and keep it for the helper methods.
  $job = $jobItem->getJob();
  $this->job = $job;

  $this->openMemory();
  $this->setIndent(TRUE);
  $this->setIndentString(' ');
  $this->startDocument('1.0', 'UTF-8');

  // Root element with schema definition.
  $this->startElement('xliff');
  $this->writeAttribute('version', '1.2');
  $this->writeAttribute('xmlns', 'urn:oasis:names:tc:xliff:document:1.2');
  $this->writeAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance');
  $this->writeAttribute('xsi:schemaLocation', 'urn:oasis:names:tc:xliff:document:1.2 xliff-core-1.2-strict.xsd');

  // File element.
  $this->startElement('file');
  $this->writeAttribute('original', 'xliff-core-1.2-strict.xsd');
  $this->writeAttribute('source-language', $job->getRemoteSourceLanguage());
  $this->writeAttribute('target-language', $job->getRemoteTargetLanguage());
  $this->writeAttribute('datatype', 'plaintext');
  // Date needs to be in ISO-8601 UTC. gmdate() yields UTC, and the format
  // uses H (24-hour), i (minutes) and s (seconds); the previous format string
  // wrote the 12-hour hour, the month and the minutes in local time.
  $this->writeAttribute('date', gmdate('Y-m-d\TH:i:s\Z'));

  // Header: phase information and tool.
  $this->startElement('header');
  $this->startElement('phase-group');
  $this->startElement('phase');
  $this->writeAttribute('tool-id', 'tmgmt');
  $this->writeAttribute('phase-name', 'extraction');
  $this->writeAttribute('process-name', 'extraction');
  // The job id is read back from this attribute on import.
  $this->writeAttribute('job-id', $job->id());
  $this->endElement();
  $this->endElement();

  $this->startElement('tool');
  $this->writeAttribute('tool-id', 'tmgmt');
  $this->writeAttribute('tool-name', 'Drupal Translation Management Tools');
  $this->endElement();
  // End the header.
  $this->endElement();

  $this->startElement('body');
  $this->addItem($jobItem);

  // End the body, file and xliff tags.
  $this->endElement();
  $this->endElement();
  $this->endElement();
  $this->endDocument();
  return $this->outputMemory();
}
/**
 * {@inheritdoc}
 */
#[\ReturnTypeWillChange]
public function import($imported_file, $is_file = TRUE) {
  if ($this->getImportedXML($imported_file, $is_file) === FALSE) {
    return FALSE;
  }

  // The job is identified by the job-id of the extraction phase. A file that
  // lacks it cannot be imported, so fail the same way as for invalid XML
  // instead of calling reset() on an empty result.
  $phases = $this->importedXML->xpath("//xliff:phase[@phase-name='extraction']");
  if (empty($phases)) {
    return FALSE;
  }
  $phase = reset($phases);

  $job = Job::load((string) $phase['job-id']);
  if (!$job instanceof JobInterface) {
    // getImportedTargets() requires a job; an unknown id cannot be imported.
    return FALSE;
  }

  $targets = $this->getImportedTargets($job);
  // getImportedTargets() yields a non-array when nothing was collected; hand
  // an empty list to the data service in that case.
  return \Drupal::service('tmgmt.data')->unflatten(is_array($targets) ? $targets : []);
}
/**
 * {@inheritdoc}
 */
#[\ReturnTypeWillChange]
public function validateImport($imported_file, $is_file = TRUE) {
  // Validates imported XLIFF file.
  // Checks:
  // - Job ID
  // - Target and source languages
  // - Content integrity.
  $xml = $this->getImportedXML($imported_file, $is_file);
  if ($xml === FALSE) {
    \Drupal::messenger()->addMessage(t('The imported file is not a valid XML.'), 'error');
    return FALSE;
  }

  // Check if our phase information is there.
  $phases = $xml->xpath("//xliff:phase[@phase-name='extraction']");
  if (!$phases) {
    \Drupal::messenger()->addMessage(t('The imported file is missing required XLIFF phase information.'), 'error');
    return FALSE;
  }
  $phase = reset($phases);

  // Check if the job has a valid job reference.
  if (!isset($phase['job-id'])) {
    \Drupal::messenger()->addMessage(t('The imported file does not contain a job reference.'), 'error');
    return FALSE;
  }
  // Work with a plain string from here on instead of a SimpleXMLElement.
  $file_job_id = (string) $phase['job-id'];

  // Attempt to load the job if none passed.
  $job = Job::load((int) $file_job_id);
  if (empty($job)) {
    \Drupal::messenger()->addMessage(t('The imported file job id @file_tjid is not available.', [
      '@file_tjid' => $file_job_id,
    ]), 'error');
    return FALSE;
  }

  // Compare source language. The attribute is cast to a string so that the
  // message variables are scalars, and the job side of the message reports
  // the same remote language code that is compared.
  $file_source_language = (string) $xml->file['source-language'];
  if (!isset($xml->file['source-language']) || $job->getRemoteSourceLanguage() != $file_source_language) {
    $job->addMessage('The imported file source language @file_language does not match the job source language @job_language.', [
      '@file_language' => empty($file_source_language) ? t('none') : $file_source_language,
      '@job_language' => $job->getRemoteSourceLanguage(),
    ], 'error');
    return FALSE;
  }

  // Compare target language.
  $file_target_language = (string) $xml->file['target-language'];
  if (!isset($xml->file['target-language']) || $job->getRemoteTargetLanguage() != $file_target_language) {
    $job->addMessage('The imported file target language @file_language does not match the job target language @job_language.', [
      '@file_language' => empty($file_target_language) ? t('none') : $file_target_language,
      '@job_language' => $job->getRemoteTargetLanguage(),
    ], 'error');
    return FALSE;
  }

  $targets = $this->getImportedTargets($job);
  if (empty($targets)) {
    // The second argument of addMessage() is the variables array, the type
    // comes third (see the calls above).
    $job->addMessage('The imported file seems to be missing translation.', [], 'error');
    return FALSE;
  }

  // In case we do not do xliff processing we cannot do the elements
  // count validation.
  if (!$job->getSetting('xliff_processing')) {
    return $job;
  }

  $reader = new \XMLReader();
  $xliff_validation = $job->getSetting('xliff_validation');
  foreach ($targets as $id => $target) {
    // Count every node of the rebuilt translation except the wrapper and
    // plain text nodes.
    $count = 0;
    $reader->XML('<translation>' . $target['#text'] . '</translation>');
    while ($reader->read()) {
      if (in_array($reader->name, ['translation', '#text'])) {
        continue;
      }
      $count++;
    }
    if (!isset($xliff_validation[$id]) || $xliff_validation[$id] != $count) {
      // NOTE(review): a mismatch between the exported and the imported
      // element count is detected here but nothing is done about it. Confirm
      // whether it should be reported; until then the import is not blocked.
    }
  }

  // Validation successful.
  return $job;
}
/**
 * Returns the parsed XML of the imported file, parsing it on first use.
 *
 * The parsed document is kept on the object, so later calls return the same
 * SimpleXMLElement without parsing again.
 *
 * @param string $imported_file
 *   Path to a file or an XML string to import.
 * @param bool $is_file
 *   (optional) Whether $imported_file is the path to a file or not.
 *
 * @return bool|\SimpleXMLElement
 *   The parsed SimpleXMLElement object. FALSE in case of failed parsing.
 */
protected function getImportedXML($imported_file, $is_file = TRUE) {
  if (empty($this->importedXML)) {
    // It is not possible to load the file directly with simplexml as it gets
    // url encoded due to the temporary://. This is a PHP bug, see
    // https://bugs.php.net/bug.php?id=61469
    if ($is_file) {
      $imported_file = file_get_contents($imported_file);
    }

    if ($imported_file === FALSE || $imported_file === '') {
      // Unreadable or empty file: nothing to parse.
      $this->importedXML = FALSE;
    }
    else {
      // Invalid XML is an expected input here and is reported through the
      // messenger below, so keep libxml from emitting PHP warnings for it.
      $previous_setting = libxml_use_internal_errors(TRUE);
      $this->importedXML = simplexml_load_string($imported_file);
      libxml_clear_errors();
      libxml_use_internal_errors($previous_setting);
    }

    if ($this->importedXML === FALSE) {
      \Drupal::messenger()->addMessage(t('The imported file is not a valid XML.'), 'error');
      return FALSE;
    }
    // Register the XLIFF namespace, required for xpath.
    $this->importedXML->registerXPathNamespace('xliff', 'urn:oasis:names:tc:xliff:document:1.2');
  }
  return $this->importedXML;
}
/**
 * Collects the target texts of all trans-units of the imported XML.
 *
 * The result is kept on the object and reused by later calls. With XLIFF
 * processing enabled, the inner markup of each <target> is converted back to
 * HTML via processForImport(); otherwise the target text is taken as is.
 *
 * @param \Drupal\tmgmt\JobInterface $job
 *   Translation job; its 'xliff_processing' setting selects the mode.
 *
 * @return array|false|null
 *   Array keyed by trans-unit id, each entry holding the text under '#text'.
 *   FALSE if no XML has been loaded yet, NULL if no unit was found.
 */
protected function getImportedTargets(JobInterface $job) {
  if (empty($this->importedXML)) {
    return FALSE;
  }
  if (empty($this->importedTransUnits)) {
    // The setting does not change while iterating, so read it once.
    $use_processing = (bool) $job->getSetting('xliff_processing');
    $reader = new \XMLReader();
    $units = $this->importedXML->xpath('//xliff:trans-unit') ?: [];
    foreach ($units as $unit) {
      $unit_id = (string) $unit['id'];
      // A unit without a <target> offers no XML to hand to the reader, so it
      // is treated as an empty translation, exactly like the unprocessed
      // mode does.
      if (!$use_processing || !isset($unit->target)) {
        $this->importedTransUnits[$unit_id]['#text'] = (string) $unit->target;
        continue;
      }
      $reader->XML($unit->target->asXML());
      // Move onto the <target> element to be able to read its inner XML.
      $reader->read();
      $this->importedTransUnits[$unit_id]['#text'] =
        $this->processForImport($reader->readInnerXML(), $job);
    }
  }
  return $this->importedTransUnits;
}
/**
 * Processes trans-unit/target content to rebuild the HTML.
 *
 * Text and CDATA nodes are appended as they are (this includes the tag
 * markup carried inside <bpt>/<ept>), <x ctype="lb"> becomes <br /> and
 * <ph ctype="image"> becomes an <img /> tag.
 *
 * @param string $translation
 *   Inner XML of the imported <target> element.
 * @param \Drupal\tmgmt\JobInterface $job
 *   Translation job.
 *
 * @return string
 *   The rebuilt HTML, or $translation unchanged when XLIFF processing is
 *   disabled for the job.
 */
protected function processForImport($translation, JobInterface $job) {
  // In case we do not want to do xliff processing return the translation as
  // is.
  if (!$job->getSetting('xliff_processing')) {
    return $translation;
  }
  $reader = new \XMLReader();
  $reader->XML('<translation>' . $translation . '</translation>');
  $text = '';
  while ($reader->read()) {
    // If the current element is text append it to the result text.
    if ($reader->name == '#text' || $reader->name == '#cdata-section') {
      $text .= $reader->value;
    }
    elseif ($reader->name == 'x') {
      if ($reader->getAttribute('ctype') == 'lb') {
        $text .= '<br />';
      }
    }
    elseif ($reader->name == 'ph') {
      if ($reader->getAttribute('ctype') == 'image') {
        $text .= '<img';
        while ($reader->moveToNextAttribute()) {
          // @todo - we have to use x-html: prefixes for attributes.
          if ($reader->name != 'ctype' && $reader->name != 'id') {
            // The reader hands out the decoded attribute value, so it has
            // to be escaped again before it is placed between double quotes.
            // Otherwise a quote or ampersand in the value breaks the tag.
            $escaped_value = htmlspecialchars($reader->value, ENT_COMPAT | ENT_SUBSTITUTE, 'UTF-8');
            $text .= " {$reader->name}=\"{$escaped_value}\"";
          }
        }
        $text .= ' />';
      }
    }
  }
  return $text;
}
/**
 * Converts the HTML of a source text into XLIFF inline markup.
 *
 * Pair tags are written as <bpt>/<ept>, <br> as <x> and <img> as <ph>; text
 * nodes are written as CDATA. The number of written inline elements is
 * accumulated per data key in the job's 'xliff_validation' setting, and the
 * job is saved.
 *
 * @param string $source
 *   The HTML text to process.
 * @param array $key_array
 *   The source item data key; its first entry is used as the job item id.
 *
 * @return string
 *   The XLIFF markup to be written into a <source> or <target> element.
 */
protected function processForExport($source, array $key_array) {
  $tjiid = $key_array[0];
  $key_string = \Drupal::service('tmgmt.data')->ensureStringKey($key_array);
  // The reason why we use DOMDocument object here and not just XMLReader
  // is the DOMDocument's ability to deal with broken HTML.
  $dom = new \DOMDocument();
  // Broken or unknown markup is expected input, so collect the parser
  // complaints internally instead of letting them surface as PHP warnings.
  $previous_libxml_setting = libxml_use_internal_errors(TRUE);
  // We need to append the head with encoding so that special characters
  // are read correctly.
  $dom->loadHTML("<html><head><meta http-equiv='Content-type' content='text/html; charset=UTF-8' /></head><body>" . $source . '</body></html>');
  libxml_clear_errors();
  libxml_use_internal_errors($previous_libxml_setting);
  $iterator = new \RecursiveIteratorIterator(
    new RecursiveDOMIterator($dom),
    \RecursiveIteratorIterator::SELF_FIRST);
  $writer = new \XMLWriter();
  $writer->openMemory();
  $writer->startDocument('1.0', 'UTF-8');
  $writer->startElement('wrapper');
  // Elements that have been opened but not closed yet, keyed by element id.
  $tray = [];
  $non_pair_tags = ['br', 'img'];
  $xliff_validation = $this->job->getSetting('xliff_validation');
  /** @var \DOMElement $node */
  foreach ($iterator as $node) {
    // Skip the scaffolding added around the source above.
    if (in_array($node->nodeName, ['html', 'body', 'head', 'meta'])) {
      continue;
    }
    if ($node->nodeType === XML_ELEMENT_NODE) {
      // Increment the elements count and compose element id.
      if (!isset($xliff_validation[$key_string])) {
        $xliff_validation[$key_string] = 0;
      }
      $xliff_validation[$key_string]++;
      $id = 'tjiid' . $tjiid . '-' . $xliff_validation[$key_string];
      $is_pair_tag = !in_array($node->nodeName, $non_pair_tags);
      if ($is_pair_tag) {
        $this->writeBPT($writer, $node, $id);
      }
      elseif ($node->nodeName == 'img') {
        $this->writeIMG($writer, $node, $id);
      }
      elseif ($node->nodeName == 'br') {
        $this->writeBR($writer, $node, $id);
      }
      // Add to tray new element info.
      $tray[$id] = [
        'name' => $node->nodeName,
        'id' => $id,
        'value' => $node->nodeValue,
        'built_text' => '',
        'is_pair_tag' => $is_pair_tag,
      ];
    }
    // The current node is a text.
    elseif ($node->nodeName == '#text') {
      // Add the node value to the text output.
      $writer->writeCdata($this->toEntities($node->nodeValue));
      foreach ($tray as &$info) {
        $info['built_text'] .= $node->nodeValue;
      }
      // Drop the reference so later code cannot write into the tray through
      // it by accident.
      unset($info);
    }
    // Reverse so that pair tags are closed in the expected order.
    $reversed_tray = array_reverse($tray);
    foreach ($reversed_tray as $_info) {
      // An element is complete once the text collected for it equals its
      // node value. Pair tags then get their end pair tag markup.
      if ($_info['value'] == $_info['built_text'] && $_info['is_pair_tag']) {
        // Count also for the closing elements.
        $xliff_validation[$key_string]++;
        $this->writeEPT($writer, $_info['name'], $_info['id']);
        // When the end pair tag has been written unset the element info
        // from the tray.
        unset($tray[$_info['id']]);
      }
    }
  }
  // Set the xliff_validation data and save the job.
  $this->job->settings->xliff_validation = $xliff_validation;
  $this->job->save();
  $writer->endElement();
  // Load the output with XMLReader so that we can easily get the inner xml.
  $reader = new \XMLReader();
  $reader->XML($writer->outputMemory());
  $reader->read();
  return $reader->readInnerXML();
}
/**
 * Writes a line break as an empty <x ctype="lb"> element.
 *
 * @param \XMLWriter $writer
 *   Writer that writes the output.
 * @param \DOMElement $node
 *   Current node. Not read; kept for a signature parallel to the other
 *   element writers.
 * @param string $id
 *   Current node id.
 */
protected function writeBR(\XMLWriter $writer, \DOMElement $node, $id) {
  $writer->startElement('x');
  foreach (['id' => $id, 'ctype' => 'lb'] as $attribute_name => $attribute_value) {
    $writer->writeAttribute($attribute_name, $attribute_value);
  }
  $writer->endElement();
}
/**
 * Writes the opening tag of a pair element as <bpt> content.
 *
 * The opening tag, including its attributes, is rebuilt as a string and
 * written as the text of a <bpt> element.
 *
 * @param \XMLWriter $writer
 *   Writer that writes the output.
 * @param \DOMElement $node
 *   Current node.
 * @param string $id
 *   Current node id.
 */
protected function writeBPT(\XMLWriter $writer, \DOMElement $node, $id) {
  $beginning_tag = '<' . $node->nodeName;
  if ($node->hasAttributes()) {
    $attributes = [];
    /** @var \DOMAttr $attribute */
    foreach ($node->attributes as $attribute) {
      // DOM hands out decoded attribute values. Escape them again, so that a
      // quote or ampersand in a value does not corrupt the rebuilt tag.
      $escaped_value = htmlspecialchars($attribute->value, ENT_COMPAT | ENT_SUBSTITUTE, 'UTF-8');
      $attributes[] = $attribute->name . '="' . $escaped_value . '"';
    }
    $beginning_tag .= ' ' . implode(' ', $attributes);
  }
  $beginning_tag .= '>';
  $writer->startElement('bpt');
  $writer->writeAttribute('id', $id);
  // text() takes care of the XML escaping of the tag string itself.
  $writer->text($beginning_tag);
  $writer->endElement();
}
/**
 * Writes the closing tag of a pair element as <ept> content.
 *
 * @param \XMLWriter $writer
 *   Writer that writes the output.
 * @param string $name
 *   Ending tag name.
 * @param string $id
 *   Current node id; same id as the matching <bpt>.
 */
protected function writeEPT(\XMLWriter $writer, $name, $id) {
  $closing_tag = '</' . $name . '>';

  $writer->startElement('ept');
  $writer->writeAttribute('id', $id);
  // The writer escapes the tag, so it ends up as text inside <ept>.
  $writer->text($closing_tag);
  $writer->endElement();
}
/**
 * Writes an img tag as a <ph ctype="image"> element.
 *
 * Note that alt and title attributes are not written as sub elements as
 * Trados studio is not able to deal with two sub elements at one level.
 * The attributes of the image are copied onto the <ph> element instead.
 *
 * @param \XMLWriter $writer
 *   Writer that writes the output.
 * @param \DOMElement $node
 *   Current node.
 * @param string $id
 *   Current node id.
 */
protected function writeIMG(\XMLWriter $writer, \DOMElement $node, $id) {
  $writer->startElement('ph');
  $writer->writeAttribute('id', $id);
  $writer->writeAttribute('ctype', 'image');
  foreach ($node->attributes as $attribute) {
    // 'id' and 'ctype' are already taken by the <ph> element itself. Writing
    // the image's own values as well would put the same attribute on the
    // element twice, which is not well-formed XML. The import ignores both
    // names anyway, so nothing that could be restored is lost.
    // @todo - also skip 'title' and 'alt' once they can be written as sub
    // elements (blocked by the Trados/sub elements issue).
    if (in_array($attribute->name, ['id', 'ctype'], TRUE)) {
      continue;
    }
    $writer->writeAttribute($attribute->name, $attribute->value);
  }
  $writer->endElement();
}
/**
 * Convert critical characters to HTML entities.
 *
 * DOMDocument will convert HTML entities to its actual characters. This can
 * lead into situation when not allowed characters will appear in the content.
 * Therefore the ampersand and the angle brackets are turned back into their
 * entities; quotes are left alone.
 *
 * @param string $string
 *   String to escape.
 *
 * @return string
 *   Escaped string.
 */
protected function toEntities($string) {
  // The previous implementation replaced each character with itself and so
  // escaped nothing. ENT_NOQUOTES limits the conversion to the ampersand and
  // the two angle brackets.
  return htmlspecialchars((string) $string, ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8');
}
}
