entity_to_text-1.0.x-dev/modules/entity_to_text_tika/src/Extractor/FileToText.php
modules/entity_to_text_tika/src/Extractor/FileToText.php
<?php namespace Drupal\entity_to_text_tika\Extractor; use Drupal\Core\File\FileSystemInterface; use Drupal\Core\Logger\LoggerChannelFactoryInterface; use Drupal\Core\Site\Settings; use Drupal\entity_to_text_tika\Event\EntityToTextTikaEvents; use Drupal\entity_to_text_tika\Event\PreProcessFileEvent; use Drupal\file\Entity\File; use Symfony\Contracts\EventDispatcher\EventDispatcherInterface; use Vaites\ApacheTika\Client; /** * Provide Capabilities to transform a File content to plain-text via Tika. */ class FileToText { /** * The site settings. * * @var \Drupal\Core\Site\Settings */ protected $settings; /** * The file system service. * * @var \Drupal\Core\File\FileSystemInterface */ protected $fileSystem; /** * The logger service. * * @var \Drupal\Core\Logger\LoggerChannelInterface */ protected $logger; /** * The Apache Tika client. * * @var \Vaites\ApacheTika\Client|null */ protected $client; /** * The event dispatcher. * * @var \Symfony\Contracts\EventDispatcher\EventDispatcherInterface */ private $eventDispatcher; /** * Construct a new FileToText object. */ public function __construct(Settings $settings, FileSystemInterface $file_system, LoggerChannelFactoryInterface $logger_factory, EventDispatcherInterface $event_dispatcher) { $this->settings = $settings; $this->fileSystem = $file_system; $this->logger = $logger_factory->get('entity_to_text'); $this->client = NULL; $this->eventDispatcher = $event_dispatcher; } /** * Transform a File into plain text value. * * @param \Drupal\file\Entity\File $file * The document. * @param string $langcode * The OCR langcode to be used. * * @return string * The transformed file into a plain text value by Apache Tika. */ public function fromFileToText(File $file, string $langcode = 'eng'): string { $content = ''; $settings_tika_connection = $this->settings->get('entity_to_text_tika.connection'); // Don't attempts to query Tika when not configured. if (!isset($settings_tika_connection['host'], $settings_tika_connection['port'])) { return ''; } /** @var \Vaites\ApacheTika\Clients\WebClient $web_client */ $web_client = $this->getClient($settings_tika_connection['host'], $settings_tika_connection['port']); $web_client->setOCRLanguage($langcode); $event = new PreProcessFileEvent($web_client, $file); /** @var \Drupal\entity_to_text_tika\Event\PreProcessFileEvent $event */ $event = $this->eventDispatcher->dispatch($event, EntityToTextTikaEvents::PRE_PROCESS_FILE); // Use the Event altered Client and File. $web_client = $event->getClient(); $file = $event->getFile(); $absolute_path = $this->fileSystem->realpath($file->getFileUri()); try { $content = (string) $web_client->getText($absolute_path); } catch (\Exception $e) { $this->logger->notice("Document '@fid' on '@path' can't be processed by Tika. Got: @message.", [ '@fid' => $file->id(), '@path' => $absolute_path, '@message' => $e->getMessage(), ]); } return $content; } /** * Get a class instance throwing an exception if check fails. * * @param string|null $param1 * Path or host. * @param string|int|null $param2 * Java binary path or port for web client. * @param array $options * Options for cURL request. * @param bool $check * Check JAR file or server connection. * * @return \Vaites\ApacheTika\Clients\CLIClient|\Vaites\ApacheTika\Clients\WebClient * The Apache Tika Client. * * @throws \Exception */ public function getClient(string $param1 = NULL, $param2 = NULL, array $options = [], bool $check = TRUE): Client { if (!$this->client) { $this->client = Client::make($param1, $param2, $options, $check); $this->client->setTimeout(60); } return $this->client; } /** * Set the Apache Tika client to be used. * * @param \Vaites\ApacheTika\Client $client * The Apache client to be injected and used for later calls to Tika. */ public function setClient(Client $client): void { $this->client = $client; } }