deepseek-1.x-dev/src/EmbeddingHandler.php

src/EmbeddingHandler.php
<?php

declare(strict_types=1);

namespace Drupal\deepseek;

use Drupal\Core\Config\ConfigFactoryInterface;
use Drupal\Core\Database\Connection;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Extension\ModuleHandlerInterface;
use Drupal\Core\Logger\LoggerChannelFactoryInterface;
use League\HTMLToMarkdown\Converter\TableConverter;
use League\HTMLToMarkdown\HtmlConverter;

/**
 * Embedding handler for RAG.
 */
class EmbeddingHandler implements EmbeddingHandlerInterface {

  /**
   * AI provider plugin instance, created on first use by getEmbedding().
   *
   * @var object|null
   */
  private object|null $client = NULL;

  /**
   * Constructs an EmbeddingHandler object.
   *
   * All dependencies are stored as promoted protected properties; the
   * constructor itself performs no work.
   *
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entityTypeManager
   *   Entity type manager service.
   * @param \Drupal\Core\Config\ConfigFactoryInterface $configFactory
   *   Configuration factory service, used to read 'deepseek.settings'.
   * @param \Drupal\Core\Database\Connection $connection
   *   Database service.
   * @param \Drupal\Core\Logger\LoggerChannelFactoryInterface $logger
   *   Logger channel factory service.
   * @param \Drupal\deepseek\AiProvidersPluginManager $providers
   *   Plugin manager used to instantiate the configured AI provider.
   * @param \Drupal\deepseek\AiDbVectorsPluginManager $vectors
   *   Plugin manager used to instantiate the configured vector database.
   * @param \Drupal\Core\Extension\ModuleHandlerInterface $moduleHandler
   *   The module handler service.
   */
  public function __construct(
    protected EntityTypeManagerInterface $entityTypeManager,
    protected ConfigFactoryInterface $configFactory,
    protected Connection $connection,
    protected LoggerChannelFactoryInterface $logger,
    protected AiProvidersPluginManager $providers,
    protected AiDbVectorsPluginManager $vectors,
    protected ModuleHandlerInterface $moduleHandler,
  ) {}

  /**
   * Gets the embedding vector for a text from the configured AI provider.
   *
   * The provider plugin is instantiated once, on first use, from the
   * 'provider' key of 'deepseek.settings' and then reused.
   *
   * @param string $text
   *   The text to embed.
   *
   * @return array
   *   The embedding values, or an empty array when no provider is configured,
   *   the provider fails, or the provider returns something that is not an
   *   array.
   */
  public function getEmbedding(string $text): array {
    try {
      if ($this->client === NULL) {
        $provider = $this->configFactory->get('deepseek.settings')->get('provider');
        if (empty($provider)) {
          // Without this guard an empty plugin ID would be handed to the
          // plugin manager.
          $this->logger->get('embedding')->error('Error getting embedding: @error', ['@error' => 'No AI provider is configured.']);
          return [];
        }
        $this->client = $this->providers->createInstance($provider);
      }
      $embedding = $this->client->embeddings($text);
      // A non-array result would raise a TypeError on return, which the
      // \Exception handler below does not catch.
      return is_array($embedding) ? $embedding : [];
    }
    catch (\Exception $e) {
      $this->logger->get('embedding')->error('Error getting embedding: @error', ['@error' => $e->getMessage()]);
      return [];
    }
  }

  /**
   * Embeds an entity's content and stores it in the configured vector database.
   *
   * The content is enriched with text extracted from referenced files, media
   * tags are stripped, the HTML is converted to Markdown, the Markdown is
   * split into chunks and every chunk is embedded. Existing vectors of the
   * entity are only replaced once at least one chunk has been embedded, so a
   * provider outage does not wipe previously stored data.
   *
   * @param string $entityType
   *   The entity type.
   * @param int $entityId
   *   The entity ID.
   * @param string $content
   *   The content to embed.
   * @param bool $skip
   *   Skip embedding if vectors already exist for the entity.
   *
   * @return bool
   *   TRUE if at least one chunk was stored, FALSE otherwise (no vector
   *   database configured, entity skipped, empty content, or no embedding
   *   could be generated).
   */
  public function saveEmbedding(string $entityType, int $entityId, string $content, bool $skip = FALSE): bool {
    $config = $this->configFactory->get('deepseek.settings');
    $db_vector = $config->get('db_vector');
    $definitions = $this->vectors->getDefinitions();
    if (empty($db_vector) || empty($definitions[$db_vector])) {
      return FALSE;
    }
    $vector = $this->vectors->createInstance($db_vector);
    if (!is_object($vector)) {
      return FALSE;
    }
    // Leave the entity untouched when it is already indexed.
    if ($skip && $vector->exist($entityType, $entityId)) {
      return FALSE;
    }

    // Append the text extracted from linked files and images. The separator
    // keeps the extracted text from being glued to the last word of the
    // content.
    $contentFile = $this->getFile($content);
    if (!empty($contentFile)) {
      $content .= ' ' . implode(' ', $contentFile);
    }

    $content = $this->cleanTagFromHtml($content);

    // Step 1: Convert HTML to Markdown.
    $converter = new HtmlConverter([
      'strip_tags' => TRUE,
      'hard_break' => TRUE,
    ]);
    $converter->getEnvironment()->addConverter(new TableConverter());
    $markdown = $converter->convert($content);
    if (trim((string) $markdown) === '') {
      return FALSE;
    }

    // Step 2: Split content into chunks, sized by the plugin's limit when it
    // exposes one.
    $chunks = $this->chunk($markdown, (int) ($vector->limit ?? 1000));

    // Step 3: Generate the embeddings before touching stored data.
    $rows = [];
    foreach ($chunks as $delta => $chunk) {
      $embedding = $this->getEmbedding($chunk);
      if (!empty($embedding)) {
        $rows[$delta] = [$chunk, $embedding];
      }
    }
    if ($rows === []) {
      return FALSE;
    }

    // Step 4: Replace the entity's previous vectors with the new ones.
    $vector->delete($entityType, $entityId);
    foreach ($rows as $delta => [$chunk, $embedding]) {
      $vector->insert($entityType, $entityId, $chunk, $embedding, $delta);
    }
    return TRUE;
  }

  /**
   * Looks up stored content that is similar to a query embedding.
   *
   * @param array $embedding_query
   *   The embedding of the query, as returned by the AI provider.
   * @param string $message
   *   The original message from the user.
   *
   * @return array
   *   Whatever the vector database plugin's search returns, or an empty array
   *   when no usable vector database is configured.
   */
  public function searchSimilar(array $embedding_query, string $message): array {
    $backendId = $this->configFactory->get('deepseek.settings')->get('db_vector');
    $available = $this->vectors->getDefinitions();

    $isUsable = !empty($backendId) && !empty($available[$backendId]);
    if (!$isUsable) {
      return [];
    }

    // The third argument caps the search at 2 (presumably the number of
    // results; confirm against the vector plugin).
    $backend = $this->vectors->createInstance($backendId);
    return $backend->search($embedding_query, $message, 2);
  }

  /**
   * Removes media, script and style elements from an HTML fragment.
   *
   * @param string $content
   *   The HTML fragment.
   *
   * @return string
   *   The serialized children of the parsed <body>, trimmed, with every
   *   img, video, iframe, figure, figcaption, svg, canvas, script and style
   *   element removed. An empty string when the fragment cannot be parsed or
   *   produces no body.
   */
  private function cleanTagFromHtml(string $content): string {
    $doc = new \DOMDocument();
    $tagsToRemove = ['img', 'video', 'iframe', 'figure', 'figcaption', 'svg', 'canvas', 'script', 'style'];

    // Silence parser warnings for this call only, then restore the previous
    // process-wide libxml setting.
    $previous = libxml_use_internal_errors(TRUE);
    try {
      // The XML declaration makes libxml treat the fragment as UTF-8.
      $loaded = $doc->loadHTML('<?xml encoding="utf-8" ?>' . $content);
    }
    finally {
      libxml_clear_errors();
      libxml_use_internal_errors($previous);
    }
    if ($loaded === FALSE) {
      return '';
    }

    foreach ($tagsToRemove as $tag) {
      // getElementsByTagName() returns a live list, so take a snapshot before
      // removing nodes.
      foreach (iterator_to_array($doc->getElementsByTagName($tag)) as $element) {
        $element->parentNode?->removeChild($element);
      }
    }

    // Empty or markup-less input yields a document without a <body>.
    $body = $doc->getElementsByTagName('body')->item(0);
    if ($body === NULL) {
      return '';
    }

    $html = '';
    foreach ($body->childNodes as $child) {
      $html .= $doc->saveHTML($child);
    }

    return trim($html);
  }

  /**
   * Splits text into chunks that fit an estimated token budget.
   *
   * The text is normalized, split into paragraphs on blank lines, and the
   * paragraphs are packed (joined by a single space) into chunks of at most
   * $chunkSize * $avgCharsPerToken characters. A paragraph that is too long
   * is split into sentences, and a sentence that is still too long is split
   * on word boundaries.
   *
   * @param string $text
   *   Text input.
   * @param int $chunkSize
   *   Maximum chunk size, in estimated tokens.
   * @param int $avgCharsPerToken
   *   Average characters per token (5 for Vietnamese, 4 for English).
   *
   * @return array
   *   List of chunk strings, in document order.
   */
  public function chunk($text, int $chunkSize = 768, int $avgCharsPerToken = 5) {
    $text = $this->normalizeText((string) $text);

    $paragraphs = array_filter(
      preg_split('/\n\s*\n/', $text) ?: [],
      static fn($p) => trim($p) !== ''
    );

    // Estimated maximum number of characters per chunk. Clamped so that a
    // non-positive size cannot produce a zero or negative budget.
    $maxChars = max(1, $chunkSize * $avgCharsPerToken);

    $chunks = [];
    $current = '';
    $currentLength = 0;

    // Emits the pending chunk (if any) and starts a new one.
    $flush = static function () use (&$chunks, &$current, &$currentLength): void {
      if ($current !== '') {
        $chunks[] = $current;
      }
      $current = '';
      $currentLength = 0;
    };

    foreach ($paragraphs as $para) {
      $para = trim($para);

      if (mb_strlen($para, 'UTF-8') > $maxChars) {
        // Emit what has been collected so far instead of discarding it, then
        // break the long paragraph into sentence groups. The sentence helper
        // groups by $chunkSize characters; the groups are packed up to
        // $maxChars below.
        $flush();
        $pieces = $this->splitIntoSentences($para, $chunkSize);
      }
      else {
        $pieces = [$para];
      }

      foreach ($pieces as $piece) {
        $piece = trim($piece);
        $pieceLength = mb_strlen($piece, 'UTF-8');
        if ($pieceLength === 0) {
          continue;
        }

        if ($pieceLength > $maxChars) {
          // Still too long: split on word boundaries. All parts except the
          // last are emitted right away; the last one stays pending so later
          // text can be appended to it, and it is emitted exactly once.
          $flush();
          $parts = array_values(array_filter(
            $this->splitLongByWords($piece, $maxChars),
            static fn($part) => $part !== ''
          ));
          $tail = array_pop($parts);
          $chunks = array_merge($chunks, $parts);
          $current = $tail ?? '';
          $currentLength = mb_strlen($current, 'UTF-8');
          continue;
        }

        // The separating space only counts when something is pending.
        $joinedLength = $currentLength + ($current === '' ? 0 : 1) + $pieceLength;
        if ($joinedLength > $maxChars) {
          $flush();
          $joinedLength = $pieceLength;
        }
        $current = $current === '' ? $piece : $current . ' ' . $piece;
        $currentLength = $joinedLength;
      }
    }

    $flush();
    return $chunks;
  }

  /**
   * Splits a long text into parts of at most $maxChars characters.
   *
   * Words are kept whole and joined by single spaces. A word that alone
   * exceeds the limit is cut into slices of $maxChars characters so that no
   * part is longer than the limit.
   *
   * @param string $text
   *   The text to split.
   * @param int $maxChars
   *   Maximum characters per part; values below 1 are treated as 1.
   *
   * @return array
   *   List of non-empty parts, each not exceeding the specified max length.
   */
  private function splitLongByWords(string $text, int $maxChars): array {
    $maxChars = max(1, $maxChars);
    $words = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];
    $parts = [];
    $current = '';

    foreach ($words as $word) {
      if (mb_strlen($word, 'UTF-8') > $maxChars) {
        // Oversized word: emit the pending part, cut the word into slices and
        // keep the last slice pending so following words can join it.
        if ($current !== '') {
          $parts[] = $current;
        }
        $slices = mb_str_split($word, $maxChars, 'UTF-8');
        $current = (string) array_pop($slices);
        $parts = array_merge($parts, $slices);
        continue;
      }

      $candidate = $current === '' ? $word : $current . ' ' . $word;
      if (mb_strlen($candidate, 'UTF-8') > $maxChars) {
        // $current cannot be empty here because the word alone fits.
        $parts[] = $current;
        $current = $word;
      }
      else {
        $current = $candidate;
      }
    }

    if ($current !== '') {
      $parts[] = $current;
    }

    return $parts;
  }

  /**
   * Normalizes text before chunking.
   *
   * Collapses runs of two or more spaces into one space, collapses runs of
   * three or more newlines into a blank line, trims the result and converts
   * it to lower case.
   *
   * @param string $text
   *   Text for embedding.
   *
   * @return string
   *   The normalized text.
   */
  protected function normalizeText($text) {
    // Patterns are applied in order, each on the result of the previous one.
    $collapsed = preg_replace(
      ['/[ ]{2,}/', '/(\n){3,}/'],
      [' ', "\n\n"],
      $text
    );
    $trimmed = trim($collapsed);
    return mb_strtolower($trimmed);
  }

  /**
   * Splits text into sentence groups of limited length.
   *
   * The text is cut at whitespace that follows '.', '!' or '?'. Consecutive
   * sentences are then joined with a space for as long as the joined string
   * stays within $chunkSize characters.
   *
   * @param string $text
   *   The input text.
   * @param int $chunkSize
   *   Maximum number of characters per group. A single sentence longer than
   *   this is returned as its own group.
   *
   * @return array
   *   List of sentence groups.
   */
  private function splitIntoSentences($text, $chunkSize) {
    $groups = [];
    $buffer = '';

    $pieces = preg_split('/(?<=[.!?])\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
    foreach ($pieces as $piece) {
      $separator = $buffer ? ' ' : '';
      $candidate = $buffer . $separator . $piece;

      if (mb_strlen($candidate, 'UTF-8') > $chunkSize) {
        // The group is full: emit it and start a new one with this sentence.
        if ($buffer) {
          $groups[] = $buffer;
        }
        $buffer = $piece;
        continue;
      }

      $buffer = $candidate;
    }

    if ($buffer) {
      $groups[] = $buffer;
    }
    return $groups;
  }

  /**
   * Extracts text from images and documents referenced in HTML content.
   *
   * Only runs when the 'ocr_image' module is enabled. Image sources
   * (jpg, jpeg, png, webp) are passed to the OCR service and linked files
   * (txt, pdf, docx, xlsx, xls, pptx) to the document parser service. Only
   * site-local paths that resolve to an existing file below DRUPAL_ROOT are
   * processed; remote URLs and paths containing '..' segments are skipped.
   *
   * @param string $content
   *   The HTML content to scan.
   *
   * @return array
   *   List of extracted text strings, images first, then files.
   */
  private function getFile(string $content): array {
    $arr_text = [];
    if (!$this->moduleHandler->moduleExists('ocr_image')) {
      return $arr_text;
    }

    // The services belong to an optional module, so they cannot be injected
    // through the constructor.
    // phpcs:disable
    // @phpstan-ignore-next-line
    $docParser = \Drupal::service('ocr_image.DocParser');
    // @phpstan-ignore-next-line
    $ocrImage = \Drupal::service('ocr_image.OcrImage');
    // phpcs:enable

    $language = $this->getLanguages();

    // Get all path image.
    preg_match_all('/<img[^>]+src=["\']([^"\']+\.(?:jpg|jpeg|png|webp))["\']/i', $content, $image_matches);
    // Get all path file.
    preg_match_all('/<a[^>]+href=["\']([^"\']+\.(?:txt|pdf|docx|xlsx|xls|pptx))["\']/i', $content, $file_matches);

    // Maps a URL path from the markup to an existing file below DRUPAL_ROOT,
    // or NULL when the reference is remote, escapes the root or is missing.
    $toLocalFile = static function (string $path): ?string {
      $path = rawurldecode($path);
      $isRemote = str_starts_with($path, '//') || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $path) === 1;
      $hasTraversal = preg_match('#(^|/)\.\.(/|$)#', $path) === 1;
      if ($isRemote || $hasTraversal) {
        return NULL;
      }
      $absolute = DRUPAL_ROOT . '/' . ltrim($path, '/');
      return is_file($absolute) ? $absolute : NULL;
    };

    foreach ($image_matches[1] as $image_path) {
      $file = $toLocalFile($image_path);
      if ($file === NULL) {
        continue;
      }
      $text_img = $ocrImage->getText($file, $language);
      if (is_array($text_img) && !empty($text_img['full_text'])) {
        $arr_text[] = str_replace(PHP_EOL, ' ', (string) $text_img['full_text']);
      }
    }

    foreach ($file_matches[1] as $file_path) {
      $file = $toLocalFile($file_path);
      if ($file === NULL) {
        continue;
      }
      $text_file = $docParser->getText($file, $language);
      if (!empty($text_file)) {
        $arr_text[] = is_array($text_file) ? implode(' ', $text_file) : (string) $text_file;
      }
    }

    return $arr_text;
  }

  /**
   * Maps the configured voice locale to an OCR language code.
   *
   * Reads the 'voice' key of 'deepseek.settings' (a locale such as 'en-US')
   * and translates it to the code passed to the OCR and document parser
   * services.
   *
   * @return string
   *   The mapped language code, or 'eng' when the locale is not in the map.
   */
  private function getLanguages(): string {
    $config = $this->configFactory->get('deepseek.settings');
    $lang = $config->get('voice');
    // NOTE(review): the four-letter values (grek, jpan, khmr, laoo, beng,
    // hebr, armn, geor, gujr, orya) look like script codes rather than
    // language codes. If the OCR engine expects Tesseract language names they
    // should probably be ell, jpn, khm, lao, ben, heb, hye, kat, guj and ori.
    // Confirm against the ocr_image module before changing them.
    $listLang = [
      'en-US' => 'eng',
      'fr-FR' => 'fra',
      'it-IT' => 'ita',
      // Spanish is 'spa'; 'epo' is Esperanto.
      'es-ES' => 'spa',
      'ca-ES' => 'cat',
      'gl-ES' => 'glg',
      'pt-PT' => 'por',
      'pt-BR' => 'por',
      'vi-VN' => 'vie',
      'af-ZA' => 'afr',
      'bs-BA' => 'bos',
      'id-ID' => 'ind',
      'jv-ID' => 'jav',
      'cy-GB' => 'cym',
      'da-DK' => 'dan',
      'de-DE' => 'deu',
      'et-EE' => 'est',
      'eu-ES' => 'eus',
      'fa-IR' => 'fas',
      'fil-PH' => 'fil',
      'ga-IE' => 'gle',
      'hr-HR' => 'hrv',
      'sw-KE' => 'swa',
      'kk-KZ' => 'kaz',
      'lt-LT' => 'lit',
      'lv-LV' => 'lav',
      'mt-MT' => 'mlt',
      'hu-HU' => 'hun',
      'nl-NL' => 'nld',
      'uz-UZ' => 'uzb',
      'pl-PL' => 'pol',
      'sq-AL' => 'sqi',
      'sv-SE' => 'swe',
      'fi-FI' => 'fin',
      'cs-CZ' => 'ces',
      'is-IS' => 'isl',
      'ro-RO' => 'ron',
      'tr-TR' => 'tur',
      'sk-SK' => 'slk',
      'el-GR' => 'grek',
      'az-AZ' => 'aze',
      'bg-BG' => 'bul',
      'sr-RS' => 'srp',
      'mk-MK' => 'mkd',
      'mn-MN' => 'mon',
      'ru-RU' => 'rus',
      'uk-UA' => 'ukr',
      'ja-JP' => 'jpan',
      'ko-KR' => 'kor',
      'th-TH' => 'tha',
      'km-KH' => 'khmr',
      'lo-LA' => 'laoo',
      'ar-SA' => 'ara',
      'ms-MY' => 'msa',
      'ps-AF' => 'pus',
      'am-ET' => 'amh',
      'bn-IN' => 'beng',
      'he-IL' => 'hebr',
      'hy-AM' => 'armn',
      'ka-GE' => 'geor',
      'my-MM' => 'mya',
      'si-LK' => 'sin',
      'gu-IN' => 'gujr',
      'hi-IN' => 'hin',
      'kn-IN' => 'kan',
      'ml-IN' => 'mal',
      'mr-IN' => 'mar',
      'or-IN' => 'orya',
      'pa-IN' => 'pan',
      'ta-IN' => 'tam',
      'te-IN' => 'tel',
      'ur-IN' => 'urd',
      'ne-NP' => 'nep',
      'zh-CN' => 'chi_sim',
      // Hong Kong uses traditional characters; the previous value was a
      // display label, not a language code.
      'zh-HK' => 'chi_tra',
      'zh-TW' => 'chi_tra',
    ];

    return $listLang[$lang] ?? 'eng';
  }

}

Главная | Обратная связь

drupal hosting | друпал хостинг | it patrol .inc