g2-8.x-1.x-dev/src/Matcher.php

src/Matcher.php
<?php

declare(strict_types=1);

namespace Drupal\g2;

use AhoCorasick\MultiStringMatcher;
use Drupal\Component\Utility\Html;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\KeyValueStore\KeyValueFactoryInterface;
use Drupal\node\NodeInterface;

/**
 * The matcher class implements the g2.matcher service.
 *
 * The mission of this service is to keep an up-to-date Aho-Corasick MSM of all
 * the deduplicated G2 entry titles and of the associated G2 entries,
 * and to provide access to both.
 *
 * This is especially useful to ensure linear performance of the G2 Automatic
 * filter plugin.
 *
 * Because the stop list may change from one format to the next, we keep the
 * complete mapping and drop entries in the stop list during content handling.
 *
 * We keep the titles and MSM separately because an MSM does not account for
 * duplicates, while in G2 multiple entries may share an identical title.
 *
 * See https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm
 */
class Matcher {

  /**
   * The name of the KeyValue collection used to store the FSM.
   */
  const COLLECTION = G2::NAME;

  /**
   * The maximum number of nodes to load at once when rebuilding.
   *
   * This prevents extra-long cache inserts which may fail, especially on
   * MariaDB or MySQL development instances not configured for heavy loads.
   */
  const LOAD_CHUNK = 100;

  /**
   * The keyvalue service.
   *
   * The Matcher service uses KV instead of Cache - although the data can be
   * regenerated when lost - to avoid data size issues like those with the
   * Memcache modules handling of data larger than 1MB.
   *
   * @var \Drupal\Core\KeyValueStore\KeyValueFactoryInterface
   */
  protected KeyValueFactoryInterface $kv;

  /**
   * A map of G2 entry IDs keys by title.
   *
   * @var array<string,array<int,int>>
   */
  protected array $map;

  /**
   * The MSM instance holding the titles.
   *
   * @var \AhoCorasick\MultiStringMatcher
   */
  protected MultiStringMatcher $msm;

  /**
   * The core entity_type.manager service.
   *
   * @var \Drupal\Core\Entity\EntityTypeManagerInterface
   */
  protected EntityTypeManagerInterface $etm;

  /**
   * Constructor.
   *
   * @param \Drupal\Core\KeyValueStore\KeyValueStoreInterface $kv
   *   The keyvalue service.
   */

  /**
   * Service constructor.
   *
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $etm
   *   The core entity_type.manager service.
   * @param \Drupal\Core\KeyValueStore\KeyValueFactoryInterface $kv
   *   The core keyvalue service.
   */
  public function __construct(
    EntityTypeManagerInterface $etm,
    KeyValueFactoryInterface $kv,
  ) {
    $this->etm = $etm;
    $this->kv = $kv;
  }

  /**
   * Transform a single text node to insert DFN elements as needed.
   *
   * Also applies HTML normalization, e.g. closes elements that expect it.
   *
   * @param \DOMDocument $dom
   *   The document built from the text.
   * @param \DOMNode $node
   *   The text node being modified.
   * @param \AhoCorasick\MultiStringMatcher $msm
   *   The configured matcher.
   * @param string[] $stopList
   *   The titles in the stop list.
   *
   * @throws \DOMException
   *
   * @internal This method is only public to support unit tests.
   */
  public static function handleTextNode(
    \DOMDocument $dom,
    \DOMNode $node,
    MultiStringMatcher $msm,
    array $stopList,
  ): void {
    $input = $node->nodeValue ?: '';
    if (empty($input)) {
      return;
    }
    $parent = $node->parentNode;
    // Since this is a text node, it has to have a parent.
    assert($parent instanceof \DOMNode);

    $matches = $msm->searchIn($input);
    if (empty($matches)) {
      return;
    }
    $allPatterns = array_unique(array_map(fn($match) => $match[1], $matches));
    $patterns = array_diff($allPatterns, $stopList);
    unset($allPatterns, $matches, $msm, $stopList);
    if (empty($patterns)) {
      return;
    }

    $groups = [];
    foreach ($patterns as $pattern) {
      $groups[] = preg_quote($pattern, "/");
    }
    $rx = '/\b(' . implode('|', $groups) . ')\b/u';
    // Do not add PREG_SPLIT_NO_EMPTY to avoid messing with the even/odd logic.
    $res = preg_split($rx, $input, -1, PREG_SPLIT_DELIM_CAPTURE);
    // $rx is generated, assumed never to be an invalid regexp string.
    assert(is_array($res));

    for ($i = 0; $i < count($res); $i++) {
      $fragment = $res[$i];
      // Even: pattern, odd: text.
      $child = ($i % 2)
        ? $dom->createElement('dfn', $fragment)
        : $dom->createTextNode($fragment);
      $parent->insertBefore($child, $node);
    }
    $parent->removeChild($node);
  }

  /**
   * Perform the actual processing after checks.
   *
   * @param string $input
   *   The input text, assumed to be already checked for UTF-8 validity.
   * @param \AhoCorasick\MultiStringMatcher $msm
   *   The MSM.
   * @param string[] $stopList
   *   The titles of the node not to automatically wrap in a DFN.
   *
   * @return string
   *   The processed source.
   *
   * @throws \DOMException
   *
   * @internal This method is only public to support unit tests.
   */
  public static function handleSource(
    string $input,
    MultiStringMatcher $msm,
    array $stopList,
  ): string {
    if (empty($input)) {
      return "";
    }
    $dom = Html::load($input);
    $xp = new \DOMXPath($dom);
    // We do not want to parse "script" or "style" content, and we also want to
    // avoid existing "dfn" and "a" content because in the end the "dfn"
    // elements will be converted to "a href" and those cannot be nested.
    $texts = $xp->query('//text()[not(ancestor::script) and not(ancestor::style) and not(ancestor::dfn) and not(ancestor::a)]');
    // Our XPath is constant, so we should never get anything but a node list.
    assert($texts instanceof \DOMNodeList);
    /** @var \DOMNode $text */
    foreach ($texts as $text) {
      self::handleTextNode($dom, $text, $msm, $stopList);
    }
    $bodyTags = $dom->getElementsByTagName('body');
    assert(count($bodyTags) === 1);
    $body = $bodyTags->item(0);
    assert($body instanceof \DOMNode);
    // There may be multiple elements at the top if there is no top wrapper.
    assert($body->childNodes->count() >= 1);
    $output = '';
    foreach ($body->childNodes as $node) {
      $output .= $dom->saveHTML($node);
    }
    return $output;
  }

  /**
   * Rebuild the map and MSM from the available published G2 entries.
   *
   * Store them both in KV.
   *
   * This is a somehow costly operation, do not perform it frequently.
   * For example on a M1 Max macBook Pro, a 6000 definitions rebuild takes
   * about four seconds on a flushed cache and 0.6 sec on a primed cache.
   *
   * In most cases, you should be retrieving the map/MSM using their respective
   * get method.
   */
  public function rebuild(): void {
    $storage = $this->etm
      ->getStorage(G2::TYPE);
    $nids = $storage->getQuery()
      ->condition('type', G2::BUNDLE)
      ->condition('status', NodeInterface::PUBLISHED)
      ->accessCheck()
      ->execute();
    $chunks = array_chunk($nids, 100);
    $map = [];
    foreach ($chunks as $chunk) {
      $nodes = $storage->loadMultiple($chunk);
      foreach ($nodes as $node) {
        $label = (string) $node->label();
        $map[$label][] = (int) $node->id();
      }
    }

    // Refresh map.
    $coll = $this->kv
      ->get(Matcher::COLLECTION);
    $coll->set(G2::KV_TITLES, $map);
    $this->map = $map;

    // Refresh MSM.
    $msm = new MultiStringMatcher(array_keys($map));
    $coll->set(G2::KV_MSM, $msm);
    $this->msm = $msm;
  }

  /**
   * Get the MSM.
   *
   * @return \AhoCorasick\MultiStringMatcher
   *   A ready to use matcher, found on instance or regenerated as needed.
   */
  public function getMultiStringMatcher(): MultiStringMatcher {
    // Fast path: already ready.
    if (!empty($this->msm)) {
      return $this->msm;
    }

    // Medium path: get from KV.
    $msm = $this->kv
      ->get(self::COLLECTION)
      ->get(G2::KV_MSM);
    assert(empty($msm) || $msm instanceof MultiStringMatcher);
    if (!empty($msm)) {
      $this->msm = $msm;
      return $msm;
    }

    // Slow path: get from entity storage and refresh KV.
    $this->rebuild();
    return $this->msm;
  }

  /**
   * Get the map of G2 entry IDs by title.
   *
   * @return array<string,array<int,int>>
   *   An up-to-date map, found on instance or regenerated as needed.
   */
  public function getTitleMap(): array {
    // Fast path: already ready.
    if (!empty($this->map)) {
      return $this->map;
    }

    // Medium path: get from KV.
    $map = $this->kv
      ->get(self::COLLECTION)
      ->get(G2::KV_TITLES);
    assert(is_array($map));
    if (!empty($map)) {
      $this->map = $map;
      return $map;
    }

    // Slow path: get from entity storage and refresh KV.
    $this->rebuild();
    return $this->map;
  }

}

Главная | Обратная связь

drupal hosting | друпал хостинг | it patrol .inc