deepseek-1.x-dev/src/EmbeddingHandler.php
src/EmbeddingHandler.php
<?php
declare(strict_types=1);
namespace Drupal\deepseek;
use Drupal\Core\Config\ConfigFactoryInterface;
use Drupal\Core\Database\Connection;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Extension\ModuleHandlerInterface;
use Drupal\Core\Logger\LoggerChannelFactoryInterface;
use League\HTMLToMarkdown\Converter\TableConverter;
use League\HTMLToMarkdown\HtmlConverter;
/**
* Embedding handler for RAG.
*/
class EmbeddingHandler implements EmbeddingHandlerInterface {
/**
 * AI provider plugin instance, created on first use by getEmbedding().
 *
 * Stays NULL until an embedding is requested for the first time.
 *
 * @var object|null
 */
private object|null $client = NULL;
/**
 * Constructs an EmbeddingHandler object.
 *
 * All dependencies are stored as promoted protected properties.
 *
 * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entityTypeManager
 *   Entity type manager service.
 * @param \Drupal\Core\Config\ConfigFactoryInterface $configFactory
 *   Configuration factory service; used to read 'deepseek.settings'.
 * @param \Drupal\Core\Database\Connection $connection
 *   Database connection service.
 * @param \Drupal\Core\Logger\LoggerChannelFactoryInterface $logger
 *   Logger channel factory; errors are written to the 'embedding' channel.
 * @param \Drupal\deepseek\AiProvidersPluginManager $providers
 *   Plugin manager used to instantiate the configured AI provider.
 * @param \Drupal\deepseek\AiDbVectorsPluginManager $vectors
 *   Plugin manager used to instantiate the configured vector database.
 * @param \Drupal\Core\Extension\ModuleHandlerInterface $moduleHandler
 *   The module handler service; used to detect the optional 'ocr_image'
 *   module.
 */
public function __construct(
protected EntityTypeManagerInterface $entityTypeManager,
protected ConfigFactoryInterface $configFactory,
protected Connection $connection,
protected LoggerChannelFactoryInterface $logger,
protected AiProvidersPluginManager $providers,
protected AiDbVectorsPluginManager $vectors,
protected ModuleHandlerInterface $moduleHandler,
) {}
/**
 * Gets the embedding vector for a text from the configured AI provider.
 *
 * The provider plugin named by the 'provider' key of 'deepseek.settings' is
 * instantiated on first use and reused for subsequent calls.
 *
 * @param string $text
 *   The text to embed.
 *
 * @return array
 *   The embedding returned by the provider, or an empty array when the text
 *   is blank, no provider is configured, the provider throws, or the provider
 *   returns something that is not an array.
 */
public function getEmbedding(string $text): array {
  // Nothing meaningful can be embedded; avoid a pointless remote call.
  if (trim($text) === '') {
    return [];
  }
  try {
    if ($this->client === NULL) {
      $provider = $this->configFactory->get('deepseek.settings')->get('provider');
      if (empty($provider)) {
        $this->logger->get('embedding')->error('Error getting embedding: @error', ['@error' => 'No AI provider is configured.']);
        return [];
      }
      $this->client = $this->providers->createInstance($provider);
    }
    $embedding = $this->client->embeddings($text);
    // A provider returning NULL/FALSE on failure would otherwise violate the
    // declared array return type and raise an uncaught TypeError.
    if (!is_array($embedding)) {
      $this->logger->get('embedding')->error('Error getting embedding: @error', ['@error' => 'The provider did not return an array.']);
      return [];
    }
    return $embedding;
  }
  catch (\Exception $e) {
    $this->logger->get('embedding')->error('Error getting embedding: @error', ['@error' => $e->getMessage()]);
    return [];
  }
}
/**
 * Embeds an entity's content and stores it in the configured vector database.
 *
 * The content (plus any text extracted from linked files and images) is
 * stripped of media tags, converted to Markdown, split into chunks and each
 * chunk is embedded. Previously stored vectors of the entity are replaced
 * only after at least one new embedding has been obtained, so a provider
 * outage does not wipe existing data.
 *
 * @param string $entityType
 *   The entity type.
 * @param int $entityId
 *   The entity ID.
 * @param string $content
 *   The content (HTML) to embed.
 * @param bool $skip
 *   Skip embedding if vectors already exist for the entity.
 *
 * @return bool
 *   TRUE if at least one chunk was stored, FALSE otherwise (no vector
 *   database configured, entity skipped, empty content, or no embedding
 *   could be generated).
 */
public function saveEmbedding(string $entityType, int $entityId, string $content, bool $skip = FALSE): bool {
  $db_vector = $this->configFactory->get('deepseek.settings')->get('db_vector');
  $definitions = $this->vectors->getDefinitions();
  if (empty($db_vector) || empty($definitions[$db_vector])) {
    return FALSE;
  }
  $vector = $this->vectors->createInstance($db_vector);
  if (!is_object($vector)) {
    return FALSE;
  }
  // Leave entities that already have vectors untouched when asked to.
  if ($skip && $vector->exist($entityType, $entityId)) {
    return FALSE;
  }

  // Linked files/images must be read before the media tags are stripped.
  $contentFile = $this->getFile($content);
  if (!empty($contentFile)) {
    // Separate the extracted text from the content so words do not fuse.
    $content .= ' ' . implode(' ', $contentFile);
  }
  $content = $this->cleanTagFromHtml($content);

  // Step 1: Convert HTML to Markdown.
  $converter = new HtmlConverter([
    'strip_tags' => TRUE,
    'hard_break' => TRUE,
  ]);
  $converter->getEnvironment()->addConverter(new TableConverter());
  $markdown = $converter->convert($content);
  if (trim($markdown) === '') {
    return FALSE;
  }

  // Step 2: Split content into chunks. The plugin may expose its own limit;
  // fall back to 1000 when it is missing or not a positive number.
  $limit = (int) ($vector->limit ?? 1000);
  $chunks = $this->chunk($markdown, $limit > 0 ? $limit : 1000);

  // Step 3: Generate the embeddings first, keeping each chunk's index.
  $rows = [];
  foreach ($chunks as $delta => $chunk) {
    $embedding = $this->getEmbedding($chunk);
    if (!empty($embedding)) {
      $rows[$delta] = [$chunk, $embedding];
    }
  }
  if ($rows === []) {
    // Nothing to store: keep whatever vectors the entity already has.
    return FALSE;
  }

  // Step 4: Replace the stored vectors of the entity with the new ones.
  $vector->delete($entityType, $entityId);
  foreach ($rows as $delta => [$chunk, $embedding]) {
    $vector->insert($entityType, $entityId, $chunk, $embedding, $delta);
  }
  return TRUE;
}
/**
 * Looks up stored content similar to a query in the vector database.
 *
 * The vector database plugin is selected by the 'db_vector' key of
 * 'deepseek.settings'; without a valid plugin nothing is searched.
 *
 * @param array $embedding_query
 *   The embedding of the query, as returned by the AI provider.
 * @param string $message
 *   The original message from the user.
 *
 * @return array
 *   Whatever the plugin's search() returns, or an empty array when no usable
 *   vector database is configured.
 */
public function searchSimilar(array $embedding_query, string $message): array {
  $pluginId = $this->configFactory->get('deepseek.settings')->get('db_vector');
  $available = $this->vectors->getDefinitions();
  if (!empty($pluginId) && !empty($available[$pluginId])) {
    $backend = $this->vectors->createInstance($pluginId);
    // The third argument (2) is passed through to the plugin as the limit.
    return $backend->search($embedding_query, $message, 2);
  }
  return [];
}
/**
 * Removes media, script and style elements from an HTML fragment.
 *
 * The fragment is parsed with DOMDocument, every img, video, iframe, figure,
 * figcaption, svg, canvas, script and style element is dropped together with
 * its children, and the remaining body content is serialized back to HTML.
 *
 * @param string $content
 *   The input HTML fragment.
 *
 * @return string
 *   The cleaned HTML, trimmed; an empty string when nothing is left or the
 *   input has no body content.
 */
private function cleanTagFromHtml(string $content): string {
  $tagsToRemove = ['img', 'video', 'iframe', 'figure', 'figcaption', 'svg', 'canvas', 'script', 'style'];
  $doc = new \DOMDocument();
  // Silence parser warnings for real-world markup, then restore the previous
  // libxml error mode so the rest of the request is not affected.
  $previous = libxml_use_internal_errors(TRUE);
  try {
    // The XML declaration makes libxml treat the fragment as UTF-8.
    $doc->loadHTML('<?xml encoding="utf-8" ?>' . $content);
  }
  finally {
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
  }
  foreach ($tagsToRemove as $tag) {
    // Snapshot the live node list: removing nodes while iterating over it
    // directly would skip elements.
    foreach (iterator_to_array($doc->getElementsByTagName($tag)) as $element) {
      $element->parentNode?->removeChild($element);
    }
  }
  $body = $doc->getElementsByTagName('body')->item(0);
  if ($body === NULL) {
    // Empty input or input that only produced head content.
    return '';
  }
  $html = '';
  foreach ($body->childNodes as $child) {
    $html .= $doc->saveHTML($child);
  }
  return trim($html);
}
/**
 * Splits text into chunks sized for embedding.
 *
 * The text is normalized and split into paragraphs on blank lines.
 * Paragraphs are packed into chunks of at most $chunkSize * $avgCharsPerToken
 * characters, joined by a single space. A paragraph that is too long on its
 * own starts a fresh chunk and is packed sentence by sentence; a sentence
 * that is still too long is split on word boundaries. No text is dropped and
 * no piece is emitted twice.
 *
 * @param string $text
 *   Text input.
 * @param int $chunkSize
 *   Max chunk size, in tokens.
 * @param int $avgCharsPerToken
 *   Average characters per token (5 for Vietnamese, 4 for English).
 *
 * @return array
 *   List of chunk strings, in document order.
 */
public function chunk($text, int $chunkSize = 768, int $avgCharsPerToken = 5) {
  $text = $this->normalizeText((string) $text);
  $paragraphs = preg_split('/\n\s*\n/', $text) ?: [];
  // Estimated maximum number of characters per chunk (never below 1).
  $maxChars = max(1, $chunkSize * $avgCharsPerToken);
  $chunks = [];
  $current = '';
  $currentLength = 0;

  // Emits the chunk being built, if any, and starts a new empty one.
  $flush = static function () use (&$chunks, &$current, &$currentLength): void {
    if ($current !== '') {
      $chunks[] = $current;
    }
    $current = '';
    $currentLength = 0;
  };

  // Adds a piece (no longer than $maxChars) to the chunk being built,
  // emitting that chunk first when the piece does not fit anymore.
  $append = static function (string $piece, int $pieceLength) use (&$current, &$currentLength, $maxChars, $flush): void {
    $separator = $current === '' ? 0 : 1;
    if ($currentLength + $separator + $pieceLength > $maxChars) {
      $flush();
      $separator = 0;
    }
    $current .= ($separator ? ' ' : '') . $piece;
    $currentLength += $separator + $pieceLength;
  };

  foreach ($paragraphs as $para) {
    $para = trim($para);
    if ($para === '') {
      continue;
    }
    $paraLength = mb_strlen($para, 'UTF-8');
    if ($paraLength <= $maxChars) {
      // Merge paragraph into current chunk.
      $append($para, $paraLength);
      continue;
    }

    // The paragraph is too long: keep what was collected so far as its own
    // chunk, then pack the paragraph sentence by sentence.
    $flush();
    foreach ($this->splitIntoSentences($para, $chunkSize) as $sentence) {
      $sentence = trim($sentence);
      if ($sentence === '') {
        continue;
      }
      $sentenceLength = mb_strlen($sentence, 'UTF-8');
      if ($sentenceLength <= $maxChars) {
        $append($sentence, $sentenceLength);
        continue;
      }

      // The sentence is too long: split it on word boundaries.
      $flush();
      $parts = array_values(array_filter(
        $this->splitLongByWords($sentence, $maxChars),
        static fn($part) => $part !== ''
      ));
      // Carry the last part over instead of emitting it right away, so the
      // following text can be merged into it and it is not emitted twice.
      $current = (string) array_pop($parts);
      $currentLength = mb_strlen($current, 'UTF-8');
      $chunks = array_merge($chunks, $parts);
    }
  }
  $flush();
  return $chunks;
}
/**
 * Splits a long text into parts of at most $maxChars characters.
 *
 * Parts are built from whole words joined by single spaces. A single word
 * longer than $maxChars is cut into pieces of $maxChars characters, so no
 * returned part ever exceeds the limit. Empty parts are never returned.
 *
 * @param string $text
 *   Chunk text.
 * @param int $maxChars
 *   Max characters per part; values below 1 are treated as 1.
 *
 * @return array
 *   List of non-empty text parts, each not exceeding the max length.
 */
private function splitLongByWords(string $text, int $maxChars): array {
  $maxChars = max(1, $maxChars);
  $words = preg_split('/\s+/', trim($text), -1, PREG_SPLIT_NO_EMPTY) ?: [];
  $parts = [];
  $current = '';
  $currentLength = 0;
  foreach ($words as $word) {
    $wordLength = mb_strlen($word, 'UTF-8');
    if ($wordLength > $maxChars) {
      // The word alone is too long: emit what we have and cut the word by
      // characters, keeping its tail open for the following words.
      if ($current !== '') {
        $parts[] = $current;
      }
      $pieces = mb_str_split($word, $maxChars, 'UTF-8');
      $current = (string) array_pop($pieces);
      $currentLength = mb_strlen($current, 'UTF-8');
      $parts = array_merge($parts, $pieces);
      continue;
    }
    // A separating space is only needed when the part already has content.
    $separator = $current === '' ? 0 : 1;
    if ($currentLength + $separator + $wordLength > $maxChars) {
      $parts[] = $current;
      $current = $word;
      $currentLength = $wordLength;
    }
    else {
      $current .= ($separator ? ' ' : '') . $word;
      $currentLength += $separator + $wordLength;
    }
  }
  if ($current !== '') {
    $parts[] = $current;
  }
  return $parts;
}
/**
 * Normalizes text before it is chunked.
 *
 * Runs of two or more spaces become one space, runs of three or more
 * newlines become one blank line, and the result is trimmed and lowercased.
 *
 * @param string $text
 *   Text for embedding.
 *
 * @return string
 *   The normalized text.
 */
protected function normalizeText($text) {
  // Both replacements are applied in order by a single preg_replace() call.
  $patterns = ['/[ ]{2,}/', '/(\n){3,}/'];
  $replacements = [' ', "\n\n"];
  $collapsed = preg_replace($patterns, $replacements, $text);
  $trimmed = trim($collapsed);
  return mb_strtolower($trimmed);
}
/**
 * Splits text into sentences and groups them up to a maximum length.
 *
 * The text is cut at whitespace that follows a period, exclamation mark or
 * question mark. Consecutive sentences are then joined with a single space
 * as long as the joined length stays within $chunkSize characters.
 *
 * @param string $text
 *   The input text.
 * @param int $chunkSize
 *   Maximum length, in characters, of a group of joined sentences. A single
 *   sentence longer than this is returned as its own group.
 *
 * @return array
 *   List of sentence groups.
 */
private function splitIntoSentences($text, $chunkSize) {
  $pieces = preg_split('/(?<=[.!?])\s+/', $text, -1, PREG_SPLIT_NO_EMPTY);
  $groups = [];
  $buffer = '';
  foreach ($pieces as $piece) {
    $glue = $buffer ? ' ' : '';
    $candidate = $buffer . $glue . $piece;
    if (mb_strlen($candidate, 'UTF-8') <= $chunkSize) {
      // Still within the limit: keep growing the current group.
      $buffer = $candidate;
      continue;
    }
    // The group is full: emit it and start a new one with this sentence.
    if ($buffer) {
      $groups[] = $buffer;
    }
    $buffer = $piece;
  }
  if ($buffer) {
    $groups[] = $buffer;
  }
  return $groups;
}
/**
 * Extracts text from images and documents referenced by the content.
 *
 * Only runs when the optional 'ocr_image' module is enabled. Images
 * (jpg, jpeg, png, webp) referenced by img tags are passed to the OCR
 * service and documents (txt, pdf, docx, xlsx, xls, pptx) referenced by
 * links are passed to the document parser service. Only local files below
 * the Drupal root are read: external URLs, paths containing '..' segments
 * and files that do not exist are skipped.
 *
 * @param string $content
 *   The HTML content to scan.
 *
 * @return array
 *   List of extracted text strings; empty when nothing was extracted.
 */
private function getFile($content) {
  $arr_text = [];
  if (!is_string($content) || $content === '' || !$this->moduleHandler->moduleExists('ocr_image')) {
    return $arr_text;
  }
  // phpcs:disable
  // @phpstan-ignore-next-line
  $docParser = \Drupal::service('ocr_image.DocParser');
  // @phpstan-ignore-next-line
  $ocrImage = \Drupal::service('ocr_image.OcrImage');
  // phpcs:enable
  $language = $this->getLanguages();

  // Maps a src/href value taken from (untrusted) content to a readable file
  // below the Drupal root, or NULL when it must not or cannot be read.
  $resolve = static function (string $src): ?string {
    $path = rawurldecode($src);
    // Skip absolute and protocol-relative URLs: they are not local files.
    if (preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $path)) {
      return NULL;
    }
    // Refuse directory traversal out of the Drupal root.
    if (preg_match('#(?:^|[\\\\/])\.\.(?:[\\\\/]|$)#', $path)) {
      return NULL;
    }
    $file = DRUPAL_ROOT . '/' . ltrim($path, '/');
    return is_file($file) && is_readable($file) ? $file : NULL;
  };

  // Get all path image.
  preg_match_all('/<img[^>]+src=["\']([^"\']+\.(?:jpg|jpeg|png|webp))["\']/i', $content, $image_matches);
  // Get all path file.
  preg_match_all('/<a[^>]+href=["\']([^"\']+\.(?:txt|pdf|docx|xlsx|xls|pptx))["\']/i', $content, $file_matches);

  foreach ($image_matches[1] as $image_path) {
    $file = $resolve($image_path);
    if ($file === NULL) {
      continue;
    }
    $text_img = $ocrImage->getText($file, $language);
    // The OCR result is expected to carry the text under 'full_text'.
    if (is_array($text_img) && !empty($text_img['full_text']) && is_string($text_img['full_text'])) {
      $arr_text[] = str_replace(PHP_EOL, ' ', $text_img['full_text']);
    }
  }
  foreach ($file_matches[1] as $file_path) {
    $file = $resolve($file_path);
    if ($file === NULL) {
      continue;
    }
    $text_file = $docParser->getText($file, $language);
    if (empty($text_file)) {
      continue;
    }
    if (is_array($text_file)) {
      $arr_text[] = implode(' ', $text_file);
    }
    elseif (is_string($text_file)) {
      $arr_text[] = $text_file;
    }
  }
  return $arr_text;
}
/**
 * Gets the OCR language code matching the configured voice locale.
 *
 * Reads the 'voice' key of 'deepseek.settings' (a locale such as 'en-US')
 * and maps it to the language code handed to the OCR services.
 *
 * @return string
 *   The language code for the configured locale, or 'eng' when the locale is
 *   not set or not in the map.
 */
private function getLanguages(): string {
  $config = $this->configFactory->get('deepseek.settings');
  $lang = $config->get('voice');
  // NOTE(review): most values are three-letter language codes, but a few
  // ('grek', 'jpan', 'khmr', 'laoo', 'beng', 'hebr', 'armn', 'geor', 'gujr',
  // 'orya') look like script codes - confirm them against the codes the
  // ocr_image module actually accepts.
  $listLang = [
    'en-US' => 'eng',
    'fr-FR' => 'fra',
    'it-IT' => 'ita',
    // Spanish; 'epo' is Esperanto.
    'es-ES' => 'spa',
    'ca-ES' => 'cat',
    'gl-ES' => 'glg',
    'pt-PT' => 'por',
    'pt-BR' => 'por',
    'vi-VN' => 'vie',
    'af-ZA' => 'afr',
    'bs-BA' => 'bos',
    'id-ID' => 'ind',
    'jv-ID' => 'jav',
    'cy-GB' => 'cym',
    'da-DK' => 'dan',
    'de-DE' => 'deu',
    'et-EE' => 'est',
    'eu-ES' => 'eus',
    'fa-IR' => 'fas',
    'fil-PH' => 'fil',
    'ga-IE' => 'gle',
    'hr-HR' => 'hrv',
    'sw-KE' => 'swa',
    'kk-KZ' => 'kaz',
    'lt-LT' => 'lit',
    'lv-LV' => 'lav',
    'mt-MT' => 'mlt',
    'hu-HU' => 'hun',
    'nl-NL' => 'nld',
    'uz-UZ' => 'uzb',
    'pl-PL' => 'pol',
    'sq-AL' => 'sqi',
    'sv-SE' => 'swe',
    'fi-FI' => 'fin',
    'cs-CZ' => 'ces',
    'is-IS' => 'isl',
    'ro-RO' => 'ron',
    'tr-TR' => 'tur',
    'sk-SK' => 'slk',
    'el-GR' => 'grek',
    'az-AZ' => 'aze',
    'bg-BG' => 'bul',
    'sr-RS' => 'srp',
    'mk-MK' => 'mkd',
    'mn-MN' => 'mon',
    'ru-RU' => 'rus',
    'uk-UA' => 'ukr',
    'ja-JP' => 'jpan',
    'ko-KR' => 'kor',
    'th-TH' => 'tha',
    'km-KH' => 'khmr',
    'lo-LA' => 'laoo',
    'ar-SA' => 'ara',
    'ms-MY' => 'msa',
    'ps-AF' => 'pus',
    'am-ET' => 'amh',
    'bn-IN' => 'beng',
    'he-IL' => 'hebr',
    'hy-AM' => 'armn',
    'ka-GE' => 'geor',
    'my-MM' => 'mya',
    'si-LK' => 'sin',
    'gu-IN' => 'gujr',
    'hi-IN' => 'hin',
    'kn-IN' => 'kan',
    'ml-IN' => 'mal',
    'mr-IN' => 'mar',
    'or-IN' => 'orya',
    'pa-IN' => 'pan',
    'ta-IN' => 'tam',
    'te-IN' => 'tel',
    'ur-IN' => 'urd',
    'ne-NP' => 'nep',
    'zh-CN' => 'chi_sim',
    // Hong Kong uses traditional Chinese; the value must be a language code,
    // not a display label.
    'zh-HK' => 'chi_tra',
    'zh-TW' => 'chi_tra',
  ];
  // A missing or non-string setting cannot be used as an array key.
  if (!is_string($lang)) {
    return 'eng';
  }
  return $listLang[$lang] ?? 'eng';
}
}
