lionbridge_content_api_test-8.x-4.0/tmgmt_contentapi/src/Plugin/tmgmt_contentapi/Format/Xliff.php

tmgmt_contentapi/src/Plugin/tmgmt_contentapi/Format/Xliff.php
<?php

namespace Drupal\tmgmt_contentapi\Plugin\tmgmt_contentapi\Format;

use Drupal\tmgmt\Entity\Job;
use Drupal\tmgmt\Entity\JobItem;
use Drupal\tmgmt\JobInterface;
use Drupal\tmgmt\JobItemInterface;
use Drupal\tmgmt_contentapi\Format\FormatInterface;
use Drupal\tmgmt_contentapi\RecursiveDOMIterator;
use Exception;

/**
 * Export to XLIFF format.
 *
 * The XLIFF processor follows this specification:
 * @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html
 *
 * The purpose of this class is to mask or process HTML elements in the source
 * and target elements so that translation tools are able to understand which
 * content needs to be translated and ignored.
 *
 * On the other hand we need to properly unmask the XLIFF markup back to HTML on
 * the translation import. So the process is bidirectional and prior to running
 * the unmasking process we try to validate the integrity in the
 * validateImport() method. Currently the integrity check involves only a
 * counter of XLIFF elements that have been created during source processing
 * and that has to match the number of XLIFF elements being imported with the
 * translation.
 *
 * To process the content a DOMDocument object is used due to its ability to
 * read broken HTML. This also implies that if broken HTML is in the source
 * content, the translation content will be fixed to the extent of
 * DOMDocument's abilities.
 *
 * Following is implemented:
 * - All pair tags get escaped using <bpt><ept> markup.
 * - <br> tags are marked with <x ctype="lb">.
 * - <img> tags are marked with <ph ctype="image"> tags. The title and alt
 *   attributes should have been extracted into <sub> elements, however are not
 *   as Trados studio triggers a fatal error in case there are two <sub>
 *   elements at the same level.
 *
 * Not implemented:
 * - Attributes of <img> element are written only as attributes of <ph> element
 *   instead of using x-html: prefix. This results in conflict with own <ph>
 *   element's attributes such as "id". The reason why x-html prefix has not
 *   been used is that Trados studio triggered fatal error on xml validation.
 * - Translatable attributes like title and alt.
 *   @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#elem_img
 * - Forms - this is big part
 *   @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#HTMLForms
 * - <pre> elements
 *   @link http://docs.oasis-open.org/xliff/v1.2/xliff-profile-html/xliff-profile-html-1.2-cd02.html#Elem_preformatted
 *
 * @FormatPlugin(
 *   id = "contentapi_xlf",
 *   label = @Translation("XLIFF")
 * )
 */
class Xliff extends \XMLWriter implements FormatInterface {

  /**
   * Contains a reference to the currently being exported job.
   *
   * @var Job
   */
  protected $job;

  protected $importedXML;
  protected $importedTransUnits;

  /**
   * Writes one job item into the export as an XLIFF <group>.
   *
   * The group carries the job item ID as its id attribute, a <note> holding
   * the source label and one <trans-unit> per translatable data element. Data
   * returned by getTranslation() is merged into the source data before the
   * units are written.
   *
   * @param \Drupal\tmgmt\JobItemInterface $item
   *   The job item entity.
   */
  protected function addItem(JobItemInterface $item) {
    $item_id = $item->id();

    $this->startElement('group');
    $this->writeAttribute('id', $item_id);
    // The source label becomes a note on the group.
    $this->writeElement('note', $item->getSourceLabel());

    // @todo: Write in nested groups instead of flattening it.
    $existing_translation = $this->getTranslation($item);
    $translatable = \Drupal::service('tmgmt.data')->filterTranslatable($item->getData());
    $units = $this->merTranslation($translatable, $existing_translation);

    // Trans-unit ids are "<job item id>][<data key>".
    foreach ($units as $data_key => $unit) {
      $this->addTransUnit($item_id . '][' . $data_key, $unit, $this->job);
    }

    // Close the <group>.
    $this->endElement();
  }

  /**
   * Merges existing translation texts into the flattened source data.
   *
   * For every source key that also exists in the target data and carries a
   * '#text' value, that text is stored under ['#translation']['#text'] of the
   * source element. All other source elements are returned untouched.
   *
   * @param array $sourcedata
   *   Flattened source data, keyed by data item key.
   * @param array|null $targetdata
   *   Flattened data of the existing translation, keyed like $sourcedata.
   *
   * @return array
   *   The source data with the available translation texts merged in.
   */
  private function merTranslation($sourcedata, $targetdata) {
    // Nothing to merge: avoids array access on a non-array target.
    if (!is_array($sourcedata) || !is_array($targetdata)) {
      return $sourcedata;
    }

    foreach ($sourcedata as $key => $element) {
      // Only merge when the translation really provides a text; a target
      // entry without '#text' must not trigger an undefined index access.
      if (isset($targetdata[$key]['#text'])) {
        $element['#translation']['#text'] = $targetdata[$key]['#text'];
        $sourcedata[$key] = $element;
      }
    }

    return $sourcedata;
  }

  /**
   * Extracts the already existing translation of a job item's source entity.
   *
   * Loads the entity referenced by the job item and, if it has a translation
   * in the job's target language, extracts and filters the translatable data
   * of that translation through the item's source plugin.
   *
   * Any error raised while doing so is logged and treated as "no translation
   * available", so that the export itself is never interrupted.
   *
   * @param \Drupal\tmgmt\JobItemInterface $item
   *   The job item entity.
   *
   * @return array
   *   The filtered translatable data of the existing translation, or an empty
   *   array if there is none or it could not be loaded.
   */
  private function getTranslation(JobItemInterface $item) {
    try {
      $storage = \Drupal::entityTypeManager()->getStorage($item->getItemType());
      $entity = $storage->load($item->getItemId());
      $target_langcode = $item->getJob()->getTargetLangcode();

      if (!empty($entity) && $entity->hasTranslation($target_langcode)) {
        $translated_entity = $entity->getTranslation($target_langcode);
        $translated_data = $item->getSourcePlugin()->extractTranslatableData($translated_entity);
        return \Drupal::service('tmgmt.data')->filterTranslatable($translated_data);
      }
    }
    catch (\Throwable $ex) {
      // Log messages are passed untranslated: the logger substitutes the
      // placeholders from the context array itself, so wrapping the message
      // in t() is not needed here.
      \Drupal::logger('TMGMT_CONTENTAPI')->error('Error while exporting translation for job %job and item %item: %message', [
        '%job' => $item->getJobId(),
        '%item' => $item->getItemId(),
        '%message' => $ex->getMessage(),
      ]);
    }

    return [];
  }

  /**
   * Writes one <trans-unit> with its <source>, <target> and optional <note>.
   *
   * Depending on the job settings the source text is written as a CDATA
   * section ('xliff_cdata'), as processed XLIFF inline markup
   * ('xliff_processing') or as plain escaped text. If the element carries a
   * non-empty translation it is written into <target>; otherwise <target> is
   * written empty.
   *
   * @param $key
   *   The unique identifier for this data element.
   * @param $element
   *   Array with the properties #text and optionally #label and
   *   #translation.
   * @param \Drupal\tmgmt\JobInterface $job
   *   Translation job; its settings select the output mode.
   */
  protected function addTransUnit($key, $element, JobInterface $job) {
    $data_key = \Drupal::service('tmgmt.data')->ensureArrayKey($key);

    $this->startElement('trans-unit');
    $this->writeAttribute('id', $key);
    $this->writeAttribute('resname', $key);

    // The <source> element.
    $this->startElement('source');
    $this->writeAttribute('xml:lang', $this->job->getRemoteSourceLanguage());
    $source_text = $element['#text'];
    if ($job->getSetting('xliff_cdata')) {
      $this->writeCdata(trim($source_text));
    }
    else {
      if ($job->getSetting('xliff_processing')) {
        $this->writeRaw($this->processForExport($source_text, $data_key));
      }
      else {
        $this->text($source_text);
      }
    }
    $this->endElement();

    // The <target> element, pre-filled when a translation is present.
    $this->startElement('target');
    $this->writeAttribute('xml:lang', $this->job->getRemoteTargetLanguage());
    $target_text = isset($element['#translation']['#text']) ? $element['#translation']['#text'] : NULL;
    if (!empty($target_text)) {
      if ($job->getSetting('xliff_processing')) {
        $this->writeRaw($this->processForExport($target_text, $data_key));
      }
      else {
        $this->text($target_text);
      }
    }
    $this->endElement();

    // The optional label is attached as a note.
    if (isset($element['#label'])) {
      $this->writeElement('note', $element['#label']);
    }

    // Close the <trans-unit>.
    $this->endElement();
  }

  /**
   * {@inheritdoc}
   */
  public function export(JobInterface $job, $conditions = array()) {
    $this->openXliffDocument($job);

    foreach ($job->getItems($conditions) as $item) {
      $this->addItem($item);
    }

    return $this->closeXliffDocument();
  }

  /**
   * Exports a single job item as a complete XLIFF document.
   *
   * The document has the same envelope as the one produced by export(), but
   * its body contains only the given job item.
   *
   * @param \Drupal\tmgmt\JobItemInterface $jobItem
   *   The job item to export.
   * @param array $conditions
   *   Not used by this method.
   *
   * @return string
   *   The generated XLIFF document.
   */
  public function exportItem(JobItemInterface $jobItem, $conditions = array()) {
    $this->openXliffDocument($jobItem->getJob());
    $this->addItem($jobItem);
    return $this->closeXliffDocument();
  }

  /**
   * Starts the XLIFF document and writes everything up to the open <body>.
   *
   * Also remembers the job as the one currently being exported.
   *
   * @param \Drupal\tmgmt\JobInterface $job
   *   The job the document is generated for.
   */
  private function openXliffDocument(JobInterface $job) {
    $this->job = $job;

    $this->openMemory();
    $this->setIndent(TRUE);
    $this->setIndentString(' ');
    $this->startDocument('1.0', 'UTF-8');

    // Root element with schema definition.
    $this->startElement('xliff');
    $this->writeAttribute('version', '1.2');
    $this->writeAttribute('xmlns', 'urn:oasis:names:tc:xliff:document:1.2');
    $this->writeAttribute('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance');
    $this->writeAttribute('xsi:schemaLocation', 'urn:oasis:names:tc:xliff:document:1.2 xliff-core-1.2-strict.xsd');

    // File element.
    $this->startElement('file');
    $this->writeAttribute('original', 'xliff-core-1.2-strict.xsd');
    $this->writeAttribute('source-language', $job->getRemoteSourceLanguage());
    $this->writeAttribute('target-language', $job->getRemoteTargetLanguage());
    $this->writeAttribute('datatype', 'plaintext');
    // Date needs to be in ISO-8601 UTC: gmdate() yields UTC, and the format
    // uses H (24-hour), i (minutes) and s (seconds).
    $this->writeAttribute('date', gmdate('Y-m-d\TH:i:s\Z'));

    // Header: the extraction phase carries the job id used on import.
    $this->startElement('header');
    $this->startElement('phase-group');
    $this->startElement('phase');
    $this->writeAttribute('tool-id', 'tmgmt');
    $this->writeAttribute('phase-name', 'extraction');
    $this->writeAttribute('process-name', 'extraction');
    $this->writeAttribute('job-id', $job->id());
    // End the phase and phase-group elements.
    $this->endElement();
    $this->endElement();

    $this->startElement('tool');
    $this->writeAttribute('tool-id', 'tmgmt');
    $this->writeAttribute('tool-name', 'Drupal Translation Management Tools');
    // End the tool and header elements.
    $this->endElement();
    $this->endElement();

    $this->startElement('body');
  }

  /**
   * Closes the open body, file and xliff elements and returns the document.
   *
   * @return string
   *   The XML written to the memory buffer.
   */
  private function closeXliffDocument() {
    // End the body, file and xliff tags.
    $this->endElement();
    $this->endElement();
    $this->endElement();
    $this->endDocument();
    return $this->outputMemory();
  }

  /**
   * {@inheritdoc}
   */
  public function import($imported_file, $is_file = TRUE) {
    if ($this->getImportedXML($imported_file, $is_file) === FALSE) {
      return FALSE;
    }

    // The job id is stored on the extraction phase written during export.
    $phases = $this->importedXML->xpath("//xliff:phase[@phase-name='extraction']");
    $phase = $phases ? reset($phases) : FALSE;
    if ($phase === FALSE || !isset($phase['job-id'])) {
      // Without the phase information the job cannot be determined.
      return FALSE;
    }

    $job = Job::load((string) $phase['job-id']);
    if (empty($job)) {
      // getImportedTargets() requires a job object; an unknown job id would
      // otherwise end in a fatal type error.
      return FALSE;
    }

    $targets = $this->getImportedTargets($job);
    return \Drupal::service('tmgmt.data')->unflatten($targets ?: []);
  }

  /**
   * {@inheritdoc}
   */
  public function validateImport($imported_file, $is_file = TRUE) {
    // Validates imported XLIFF file.
    // Checks:
    // - Job ID
    // - Target and source languages
    // - Presence of translated content.

    $xml = $this->getImportedXML($imported_file, $is_file);
    if ($xml === FALSE) {
      drupal_set_message(t('The imported file is not a valid XML.'), 'error');
      return FALSE;
    }

    // Check if our phase information is there.
    $phase = $xml->xpath("//xliff:phase[@phase-name='extraction']");
    if ($phase) {
      $phase = reset($phase);
    }
    else {
      drupal_set_message(t('The imported file is missing required XLIFF phase information.'), 'error');
      return FALSE;
    }

    // Check if the job has a valid job reference.
    if (!isset($phase['job-id'])) {
      drupal_set_message(t('The imported file does not contain a job reference.'), 'error');
      return FALSE;
    }

    // Attempt to load the job referenced by the file.
    $job = Job::load((int) $phase['job-id']);
    if (empty($job)) {
      drupal_set_message(t('The imported file job id @file_tjid is not available.', array(
        // Cast: placeholders should be plain strings, not SimpleXMLElement.
        '@file_tjid' => (string) $phase['job-id'],
      )), 'error');
      return FALSE;
    }

    // Compare source language. The attribute is cast to a string once so
    // that no SimpleXMLElement ends up in the stored message variables.
    $file_source_language = isset($xml->file['source-language']) ? (string) $xml->file['source-language'] : NULL;
    if ($file_source_language === NULL || $job->getRemoteSourceLanguage() != $file_source_language) {
      $job->addMessage('The imported file source language @file_language does not match the job source language @job_language.', array(
        '@file_language' => empty($file_source_language) ? (string) t('none') : $file_source_language,
        // Report the value that was actually compared.
        '@job_language' => $job->getRemoteSourceLanguage(),
      ), 'error');
      return FALSE;
    }

    // Compare target language.
    $file_target_language = isset($xml->file['target-language']) ? (string) $xml->file['target-language'] : NULL;
    if ($file_target_language === NULL || $job->getRemoteTargetLanguage() != $file_target_language) {
      $job->addMessage('The imported file target language @file_language does not match the job target language @job_language.', array(
        '@file_language' => empty($file_target_language) ? (string) t('none') : $file_target_language,
        '@job_language' => $job->getRemoteTargetLanguage(),
      ), 'error');
      return FALSE;
    }

    $targets = $this->getImportedTargets($job);

    if (empty($targets)) {
      // The message type is the third argument; the second one is the
      // placeholder array.
      $job->addMessage('The imported file seems to be missing translation.', array(), 'error');
      return FALSE;
    }

    // @todo The per-element count check against the 'xliff_validation' job
    //   setting is not enforced. The previous implementation counted the
    //   nodes of every target but never acted on a mismatch, so that dead
    //   computation was dropped. Before re-enabling it, verify that the
    //   counters stored by processForExport() are reset per export; they
    //   appear to accumulate over the source and target of the same unit.

    // Validation successful.
    return $job;
  }

  /**
   * Returns the simple XMLElement object.
   *
   * The document is parsed once and kept on the object; later calls return
   * the already parsed document regardless of their arguments.
   *
   * @param string $imported_file
   *   Path to a file or an XML string to import.
   * @param bool $is_file
   *   (optional) Whether $imported_file is the path to a file or not.
   *
   * @return bool|\SimpleXMLElement
   *   The parsed SimpleXMLElement object. FALSE in case the file could not be
   *   read or failed parsing.
   */
  protected function getImportedXML($imported_file, $is_file = TRUE) {
    // An explicit type check is used instead of empty(): the truthiness of a
    // SimpleXMLElement depends on its content.
    if (!($this->importedXML instanceof \SimpleXMLElement)) {
      // It is not possible to load the file directly with simplexml as it gets
      // url encoded due to the temporary://. This is a PHP bug, see
      // https://bugs.php.net/bug.php?id=61469
      if ($is_file) {
        $imported_file = file_get_contents($imported_file);
      }

      // An unreadable file (FALSE) or an empty document cannot be parsed.
      if (!is_string($imported_file) || $imported_file === '') {
        $this->importedXML = FALSE;
      }
      else {
        $this->importedXML = simplexml_load_string($imported_file);
      }

      if ($this->importedXML === FALSE) {
        drupal_set_message(t('The imported file is not a valid XML.'), 'error');
        return FALSE;
      }
      // Register the XLIFF namespace, required for xpath.
      $this->importedXML->registerXPathNamespace('xliff', 'urn:oasis:names:tc:xliff:document:1.2');
    }

    return $this->importedXML;
  }

  /**
   * Collects the translated texts of all trans-units of the imported file.
   *
   * The result is built once and kept on the object. With the
   * 'xliff_processing' job setting enabled the inner XML of each <target> is
   * converted back to HTML by processForImport(); otherwise the plain string
   * value of <target> is used.
   *
   * @param \Drupal\tmgmt\JobInterface $job
   *   Translation job whose settings control the processing.
   *
   * @return array|bool|null
   *   Array of ['#text' => string] keyed by trans-unit id, FALSE if no XML has
   *   been loaded yet, or NULL if the file contains no trans-units.
   */
  protected function getImportedTargets(JobInterface $job) {
    if (empty($this->importedXML)) {
      return FALSE;
    }

    if (empty($this->importedTransUnits)) {
      // The setting does not change between units.
      $xliff_processing = (bool) $job->getSetting('xliff_processing');
      $reader = new \XMLReader();
      $units = $this->importedXML->xpath('//xliff:trans-unit') ?: [];

      foreach ($units as $unit) {
        $unit_id = (string) $unit['id'];

        // A unit without a <target> has nothing to unmask; asXML() on the
        // missing child would not return a document XMLReader can load.
        if (!$xliff_processing || !isset($unit->target)) {
          $this->importedTransUnits[$unit_id]['#text'] = (string) $unit->target;
          continue;
        }

        // Position the reader on <target> to get at its inner XML.
        $reader->XML($unit->target->asXML());
        $reader->read();
        $this->importedTransUnits[$unit_id]['#text'] =
          $this->processForImport($reader->readInnerXML(), $job);
      }
    }

    return $this->importedTransUnits;
  }

  /**
   * Processes trans-unit/target to rebuild back the HTML.
   *
   * Text and CDATA nodes are appended as they are (this includes the tag
   * markup stored inside <bpt>/<ept>), <x ctype="lb"> becomes <br /> and
   * <ph ctype="image"> becomes an <img /> carrying all attributes except
   * "id" and "ctype".
   *
   * @param string $translation
   *   Inner XML of a <target> element.
   * @param \Drupal\tmgmt\JobInterface $job
   *   Translation job.
   *
   * @return string
   *   The rebuilt HTML, or $translation unchanged when the job does not use
   *   XLIFF processing.
   */
  protected function processForImport($translation, JobInterface $job) {
    // In case we do not want to do xliff processing return the translation as
    // is.
    if (!$job->getSetting('xliff_processing')) {
      return $translation;
    }

    $reader = new \XMLReader();
    $reader->XML('<translation>' . $translation . '</translation>');
    $text = '';

    while ($reader->read()) {
      // If the current element is text append it to the result text.
      if ($reader->name == '#text' || $reader->name == '#cdata-section') {
        $text .= $reader->value;
      }
      elseif ($reader->name == 'x') {
        if ($reader->getAttribute('ctype') == 'lb') {
          $text .= '<br />';
        }
      }
      elseif ($reader->name == 'ph') {
        if ($reader->getAttribute('ctype') == 'image') {
          $text .= '<img';
          while ($reader->moveToNextAttribute()) {
            // @todo - we have to use x-html: prefixes for attributes.
            if ($reader->name != 'ctype' && $reader->name != 'id') {
              // The reader hands out the decoded attribute value, so it has
              // to be escaped again before being placed between quotes.
              $value = htmlspecialchars($reader->value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
              $text .= " {$reader->name}=\"{$value}\"";
            }
          }
          $text .= ' />';
        }
      }
    }
    return $text;
  }

  /**
   * Helper function to process the source text.
   *
   * Converts an HTML fragment into XLIFF inline markup: pair tags become
   * <bpt>/<ept>, <br> becomes <x ctype="lb">, <img> becomes
   * <ph ctype="image"> and text is written as CDATA. The number of written
   * inline elements per data key is stored in the 'xliff_validation' job
   * setting and the job is saved.
   *
   * @param string $source
   *   HTML text to process.
   * @param array $key_array
   *   The source item data key; its first element is the job item id.
   *
   * @return string
   *   The XLIFF inline markup for $source.
   */
  protected function processForExport($source, array $key_array) {
    $tjiid = $key_array[0];
    $key_string = \Drupal::service('tmgmt.data')->ensureStringKey($key_array);
    // The reason why we use DOMDocument object here and not just XMLReader
    // is the DOMDocument's ability to deal with broken HTML.
    $dom = new \DOMDocument();
    // Broken or unknown markup is expected here, so keep libxml from raising
    // a PHP warning for every parser complaint.
    $previous_libxml_setting = libxml_use_internal_errors(TRUE);
    // We need to append the head with encoding so that special characters
    // are read correctly.
    $dom->loadHTML("<html><head><meta http-equiv='Content-type' content='text/html; charset=UTF-8' /></head><body>" . $source . '</body></html>');
    if (!$previous_libxml_setting) {
      // Only discard the collected errors if nobody else is collecting them.
      libxml_clear_errors();
    }
    libxml_use_internal_errors($previous_libxml_setting);

    $iterator = new \RecursiveIteratorIterator(
      new RecursiveDOMIterator($dom),
      \RecursiveIteratorIterator::SELF_FIRST);

    $writer = new \XMLWriter();
    $writer->openMemory();
    $writer->startDocument('1.0', 'UTF-8');
    $writer->startElement('wrapper');

    // Elements that have been opened and still wait for their closing tag.
    $tray = array();
    $non_pair_tags = array('br', 'img');

    $xliff_validation = $this->job->getSetting('xliff_validation');

    /** @var \DOMElement $node */
    foreach ($iterator as $node) {

      // Skip the scaffolding added around the source above.
      if (in_array($node->nodeName, array('html', 'body', 'head', 'meta'))) {
        continue;
      }

      if ($node->nodeType === XML_ELEMENT_NODE) {
        // Increment the elements count and compose element id.
        if (!isset($xliff_validation[$key_string])) {
          $xliff_validation[$key_string] = 0;
        }
        $xliff_validation[$key_string]++;
        $id = 'tjiid' . $tjiid . '-' . $xliff_validation[$key_string];

        $is_pair_tag = !in_array($node->nodeName, $non_pair_tags);

        if ($is_pair_tag) {
          $this->writeBPT($writer, $node, $id);
        }
        elseif ($node->nodeName == 'img') {
          $this->writeIMG($writer, $node, $id);
        }
        elseif ($node->nodeName == 'br') {
          $this->writeBR($writer, $node, $id);
        }

        // Add to tray new element info.
        $tray[$id] = array(
          'name' => $node->nodeName,
          'id' => $id,
          'value' => $node->nodeValue,
          'built_text' => '',
          'is_pair_tag' => $is_pair_tag,
        );

      }
      // The current node is a text.
      elseif ($node->nodeName == '#text') {
        // Add the node value to the text output.
        $writer->writeCdata($this->toEntities($node->nodeValue));
        foreach ($tray as &$info) {
          $info['built_text'] .= $node->nodeValue;
        }
        // Break the reference so later code cannot modify the tray through it.
        unset($info);
      }

      // Reverse so that pair tags are closed in the expected order.
      $reversed_tray = array_reverse($tray);
      foreach ($reversed_tray as $_info) {
        // If the built_text equals the node value and it is a pair tag, add
        // the end pair tag markup. The comparison is strict on purpose: a
        // loose comparison treats numeric strings such as "1.0" and "1" as
        // equal and would close the tag before all of its text was written.
        if ($_info['value'] === $_info['built_text'] && $_info['is_pair_tag']) {
          // Count also for the closing elements.
          $xliff_validation[$key_string]++;
          $this->writeEPT($writer, $_info['name'], $_info['id']);
          // When the end pair tag has been written unset the element info
          // from the tray.
          unset($tray[$_info['id']]);
        }
      }
    }

    // Set the xliff_validation data and save the job.
    $this->job->settings->xliff_validation = $xliff_validation;
    $this->job->save();

    $writer->endElement();
    // Load the output with XMLReader so that we can easily get the inner xml.
    $reader = new \XMLReader();
    $reader->XML($writer->outputMemory());
    $reader->read();
    return $reader->readInnerXML();
  }

  /**
   * Writes a line break as an empty <x ctype="lb"> element.
   *
   * @param \XMLWriter $writer
   *   Writer that writes the output.
   * @param \DOMElement $node
   *   Current node. Not read by this method.
   * @param $id
   *   Current node id, written as the id attribute.
   */
  protected function writeBR(\XMLWriter $writer, \DOMElement $node, $id) {
    $attributes = [
      'id' => $id,
      'ctype' => 'lb',
    ];

    $writer->startElement('x');
    foreach ($attributes as $attribute_name => $attribute_value) {
      $writer->writeAttribute($attribute_name, $attribute_value);
    }
    $writer->endElement();
  }

  /**
   * Writes beginning pair tag.
   *
   * The opening HTML tag, including its attributes, is rebuilt as a string
   * and written as the text content of a <bpt> element.
   *
   * @param \XMLWriter $writer
   *   Writer that writes the output.
   * @param \DOMElement $node
   *   Current node.
   * @param $id
   *   Current node id.
   */
  protected function writeBPT(\XMLWriter $writer, \DOMElement $node, $id) {
    $beginning_tag = '<' . $node->nodeName;
    if ($node->hasAttributes()) {
      $attributes = array();
      /** @var \DOMAttr $attribute */
      foreach ($node->attributes as $attribute) {
        // DOM hands out the decoded value; escape it again so that quotes
        // and ampersands cannot break the rebuilt tag.
        $value = htmlspecialchars($attribute->value, ENT_COMPAT | ENT_SUBSTITUTE, 'UTF-8');
        $attributes[] = $attribute->name . '="' . $value . '"';
      }

      $beginning_tag .= ' ' . implode(' ', $attributes);
    }
    $beginning_tag .= '>';

    $writer->startElement('bpt');
    $writer->writeAttribute('id', $id);
    // text() escapes the markup so it is carried as plain text inside <bpt>.
    $writer->text($beginning_tag);
    $writer->endElement();
  }

  /**
   * Writes the closing tag of a pair as the text of an <ept> element.
   *
   * @param \XMLWriter $writer
   *   Writer that writes the output.
   * @param string $name
   *   Ending tag name.
   * @param $id
   *   Id of the element being closed, written as the id attribute.
   */
  protected function writeEPT(\XMLWriter $writer, $name, $id) {
    $closing_tag = '</' . $name . '>';

    $writer->startElement('ept');
    $writer->writeAttribute('id', $id);
    // The writer escapes the markup, so it travels as plain text.
    $writer->text($closing_tag);
    $writer->endElement();
  }

  /**
   * Writes img tag.
   *
   * The image is written as an empty <ph ctype="image"> element that carries
   * the image's attributes as its own attributes.
   *
   * Note that alt and title attributes are not written as sub elements as
   * Trados studio is not able to deal with two sub elements at one level.
   *
   * @param \XMLWriter $writer
   *   Writer that writes the output.
   * @param \DOMElement $node
   *   Current node.
   * @param $id
   *   Current node id.
   */
  protected function writeIMG(\XMLWriter $writer, \DOMElement $node, $id) {
    // Attribute names already used by <ph> itself. Writing an image attribute
    // with one of these names would put the same attribute on the element
    // twice, which is not well-formed XML. The import side skips "id" and
    // "ctype" when rebuilding the <img>, so nothing is lost by leaving them
    // out here.
    $reserved = array('id', 'ctype');

    $writer->startElement('ph');
    $writer->writeAttribute('id', $id);
    $writer->writeAttribute('ctype', 'image');
    foreach ($node->attributes as $attribute) {
      if (in_array($attribute->name, $reserved, TRUE)) {
        continue;
      }
      // @todo - once the issue with Trados/sub elements is fixed, write
      //   "title" and "alt" as <sub> elements (ctype x-img-title/x-img-alt)
      //   instead of plain attributes.
      $writer->writeAttribute($attribute->name, $attribute->value);
    }
    $writer->endElement();
  }

  /**
   * Escapes the characters &, > and < as HTML entities.
   *
   * DOMDocument decodes HTML entities into the actual characters, which can
   * leave characters in the content that are not allowed there. The ampersand
   * is replaced first so that the entities produced for > and < are not
   * escaped a second time.
   *
   * @param string $string
   *   String to escape.
   *
   * @return string
   *   Escaped string.
   */
  protected function toEntities($string) {
    $escaped = str_replace('&', '&amp;', $string);
    $escaped = str_replace('>', '&gt;', $escaped);
    return str_replace('<', '&lt;', $escaped);
  }

}

Главная | Обратная связь

drupal hosting | друпал хостинг | it patrol .inc