media_duplicate_check-1.0.0/src/Service/MediaDuplicateChecker.php
src/Service/MediaDuplicateChecker.php
<?php
declare(strict_types=1);
namespace Drupal\media_duplicate_check\Service;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\File\FileSystemInterface;
use Drupal\Core\Database\Connection;
use Drupal\media\MediaInterface;
use Drupal\Core\StringTranslation\ByteSizeMarkup;
/**
 * Service for checking duplicate media files.
 *
 * Duplicates are found by looking up managed files whose names share the
 * candidate file's base name and extension (optionally confirmed by an MD5
 * content hash) and then loading the media entities that reference them.
 */
class MediaDuplicateChecker {

  /**
   * Maps media bundles to the source field that is searched for each of them.
   *
   * The values double as the list of media fields queried for references to
   * matching files; fields without a storage definition are skipped at query
   * time.
   *
   * @var array<string, string>
   */
  protected const BUNDLE_FILE_FIELDS = [
    'image' => 'field_media_image',
    'document' => 'field_media_document',
    'svg_image' => 'field_media_svg_image',
    'remote_video' => 'field_media_oembed_video',
  ];

  /**
   * The entity type manager.
   *
   * @var \Drupal\Core\Entity\EntityTypeManagerInterface
   */
  protected $entityTypeManager;

  /**
   * The file system service.
   *
   * @var \Drupal\Core\File\FileSystemInterface
   */
  protected $fileSystem;

  /**
   * The database connection.
   *
   * @var \Drupal\Core\Database\Connection
   */
  protected $database;

  /**
   * Constructs a MediaDuplicateChecker object.
   *
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entity_type_manager
   *   The entity type manager.
   * @param \Drupal\Core\File\FileSystemInterface $file_system
   *   The file system service.
   * @param \Drupal\Core\Database\Connection $database
   *   The database connection.
   */
  public function __construct(
    EntityTypeManagerInterface $entity_type_manager,
    FileSystemInterface $file_system,
    Connection $database
  ) {
    $this->entityTypeManager = $entity_type_manager;
    $this->fileSystem = $file_system;
    $this->database = $database;
  }

  /**
   * Check for duplicates using the original filename before upload.
   *
   * @param string $original_filename
   *   The original filename before any Drupal processing.
   * @param string|null $media_type
   *   (optional) The media bundle to restrict the search to. NULL searches
   *   all bundles.
   *
   * @return \Drupal\media\MediaInterface[]
   *   Media entities, keyed by ID, that reference a file with a similar
   *   filename. Empty when the filename has no usable base name.
   */
  public function findDuplicatesByOriginalFilename(string $original_filename, ?string $media_type = NULL): array {
    $base_filename = $this->getBaseFilename($original_filename);
    // An empty base name would turn into the pattern "%", which matches every
    // managed file and is not a meaningful duplicate.
    if ($base_filename === '') {
      return [];
    }
    $extension = pathinfo($original_filename, PATHINFO_EXTENSION);

    $file_results = $this->findSimilarFiles($base_filename, $extension);

    return $this->loadMediaReferencingFiles(array_column($file_results, 'fid'), $media_type);
  }

  /**
   * Find existing media entities with the same filename or file hash.
   *
   * When a readable file URI is given and the "check_by_hash" setting is
   * enabled, only files with a similar name AND an identical MD5 hash count
   * as duplicates. Otherwise files are matched by name alone, honouring the
   * "case_sensitive" setting.
   *
   * @param string $filename
   *   The filename to check.
   * @param string|null $media_type
   *   (optional) The media bundle to restrict the search to. NULL searches
   *   all bundles.
   * @param string|null $file_uri
   *   (optional) URI of the candidate file, used for the hash comparison.
   *
   * @return \Drupal\media\MediaInterface[]
   *   Media entities, keyed by ID, with the same filename or hash. Empty when
   *   the filename has no usable base name.
   */
  public function findDuplicatesByFilename(string $filename, ?string $media_type = NULL, ?string $file_uri = NULL): array {
    // Strip Drupal's automatic suffixes (_0, _1, _13, ...) before comparing.
    $base_filename = $this->getBaseFilename($filename);
    // An empty base name would match every managed file; treat it as "no
    // duplicates" instead of scanning (and possibly hashing) everything.
    if ($base_filename === '') {
      return [];
    }
    $extension = pathinfo($filename, PATHINFO_EXTENSION);

    $config = \Drupal::config('media_duplicate_check.settings');
    $case_sensitive = (bool) ($config->get('case_sensitive') ?? FALSE);
    $check_by_hash = (bool) ($config->get('check_by_hash') ?? TRUE);

    // md5_file() returns FALSE on failure. In that case the name-based search
    // below is used, so a failed hash can never "equal" another failed hash.
    $file_hash = FALSE;
    if ($file_uri && $check_by_hash && file_exists($file_uri)) {
      $file_hash = md5_file($file_uri);
    }

    if ($file_hash !== FALSE) {
      // Hash only the files with a similar name to keep the disk work small.
      $file_results = array_filter(
        $this->findSimilarFiles($base_filename, $extension),
        static fn ($file_record): bool => file_exists($file_record->uri)
          && md5_file($file_record->uri) === $file_hash
      );
    }
    else {
      $file_results = $this->findSimilarFiles($base_filename, $extension, $case_sensitive);
    }

    return $this->loadMediaReferencingFiles(array_column($file_results, 'fid'), $media_type);
  }

  /**
   * Extract the base filename without Drupal's automatic numbering.
   *
   * Only the last "_<digits>" group directly before the extension is removed,
   * so a name that genuinely ends in such a group is shortened as well.
   *
   * Examples:
   * - profile-image-3_0_13.png -> profile-image-3_0
   * - document_5.pdf -> document
   * - image.jpg -> image
   *
   * @param string $filename
   *   The filename to process.
   *
   * @return string
   *   The base filename without extension and automatic suffix.
   */
  public function getBaseFilename(string $filename): string {
    $name_without_ext = pathinfo($filename, PATHINFO_FILENAME);

    // preg_replace() returns NULL on a PCRE error; fall back to the unstripped
    // name rather than violating the string return type.
    return preg_replace('/_\d+$/', '', $name_without_ext) ?? $name_without_ext;
  }

  /**
   * Get media preview data for AJAX response.
   *
   * @param \Drupal\media\MediaInterface $media
   *   The media entity.
   *
   * @return array
   *   Array containing media preview data: id, name, created,
   *   created_formatted, type, thumbnail (URL string or NULL) and url, plus
   *   file_url and file_size when the bundle's file field holds a file.
   */
  public function getMediaPreviewData(MediaInterface $media): array {
    $created = $media->getCreatedTime();
    $file_url_generator = \Drupal::service('file_url_generator');

    $data = [
      'id' => $media->id(),
      'name' => $media->getName(),
      'created' => $created,
      'created_formatted' => \Drupal::service('date.formatter')->format($created, 'medium'),
      'type' => $media->bundle(),
      'thumbnail' => NULL,
      'url' => $media->toUrl()->toString(),
    ];

    // Try to get thumbnail.
    if ($media->hasField('thumbnail') && !$media->get('thumbnail')->isEmpty()) {
      $thumbnail = $media->get('thumbnail')->entity;
      if ($thumbnail) {
        $data['thumbnail'] = $file_url_generator->generateString($thumbnail->getFileUri());
      }
    }

    // Get the actual file URL.
    $file_field_name = $this->getFileFieldName($media);
    if ($file_field_name && $media->hasField($file_field_name) && !$media->get($file_field_name)->isEmpty()) {
      $file = $media->get($file_field_name)->entity;
      if ($file) {
        $data['file_url'] = $file_url_generator->generateString($file->getFileUri());
        // The size may be NULL (or a numeric string); ByteSizeMarkup needs a
        // number and this file runs with strict types.
        $data['file_size'] = ByteSizeMarkup::create((int) $file->getSize());
      }
    }

    return $data;
  }

  /**
   * Get the file field name for a media entity.
   *
   * @param \Drupal\media\MediaInterface $media
   *   The media entity.
   *
   * @return string|null
   *   The field name or NULL if the bundle is not in the bundle/field map.
   */
  protected function getFileFieldName(MediaInterface $media): ?string {
    return static::BUNDLE_FILE_FIELDS[$media->bundle()] ?? NULL;
  }

  /**
   * Finds managed files whose name starts with a base name.
   *
   * The base name and extension are escaped, so "_" and "%" in a filename are
   * matched literally instead of acting as LIKE wildcards.
   *
   * @param string $base_filename
   *   The filename prefix to look for.
   * @param string $extension
   *   The extension (without dot) the filename must end with. An empty string
   *   disables the extension check.
   * @param bool $case_sensitive
   *   (optional) TRUE to compare the prefix case-sensitively. Defaults to
   *   FALSE.
   *
   * @return object[]
   *   Rows of file_managed with the properties fid, filename, uri and created.
   */
  protected function findSimilarFiles(string $base_filename, string $extension, bool $case_sensitive = FALSE): array {
    $query = $this->database->select('file_managed', 'f')
      ->fields('f', ['fid', 'filename', 'uri', 'created']);

    // Drupal's database layer treats LIKE as case-insensitive; LIKE BINARY is
    // its case-sensitive counterpart.
    $query->condition(
      'f.filename',
      $this->database->escapeLike($base_filename) . '%',
      $case_sensitive ? 'LIKE BINARY' : 'LIKE'
    );

    // Compare against '' explicitly so an extension of "0" is not skipped.
    if ($extension !== '') {
      $query->condition('f.filename', '%.' . $this->database->escapeLike($extension), 'LIKE');
    }

    return $query->execute()->fetchAll();
  }

  /**
   * Loads the media entities that reference any of the given files.
   *
   * @param array $file_ids
   *   The file IDs to look for.
   * @param string|null $media_type
   *   (optional) The media bundle to restrict the search to. NULL or an empty
   *   string searches all bundles.
   *
   * @return \Drupal\media\MediaInterface[]
   *   The matching media entities the current user may access, keyed by ID.
   */
  protected function loadMediaReferencingFiles(array $file_ids, ?string $media_type = NULL): array {
    if (empty($file_ids)) {
      return [];
    }

    $media_storage = $this->entityTypeManager->getStorage('media');
    $query = $media_storage->getQuery()->accessCheck(TRUE);
    if ($media_type !== NULL && $media_type !== '') {
      $query->condition('bundle', $media_type);
    }

    // Only query fields that actually exist on this site; a condition on a
    // missing field would make the entity query fail.
    $field_definitions = \Drupal::service('entity_field.manager')->getFieldStorageDefinitions('media');
    $field_conditions = $query->orConditionGroup();
    $has_conditions = FALSE;
    foreach (static::BUNDLE_FILE_FIELDS as $field_name) {
      if (isset($field_definitions[$field_name])) {
        $field_conditions->condition($field_name, $file_ids, 'IN');
        $has_conditions = TRUE;
      }
    }

    if (!$has_conditions) {
      return [];
    }

    $query->condition($field_conditions);
    $media_ids = $query->execute();

    return empty($media_ids) ? [] : $media_storage->loadMultiple($media_ids);
  }

}
