image_to_media_swapper-2.x-dev/src/ContentVerificationService.php
src/ContentVerificationService.php
<?php
declare(strict_types=1);
namespace Drupal\image_to_media_swapper;
use Drupal\Core\File\FileSystemInterface;
use Drupal\Core\Logger\LoggerChannelFactoryInterface;
use Drupal\Core\Logger\LoggerChannelInterface;
use Symfony\Component\Mime\MimeTypeGuesserInterface;
/**
* Service for verifying file content matches its declared type.
*
* This service provides enhanced file validation by examining the actual
* content of files to verify they match their declared MIME types.
*/
final class ContentVerificationService {
/**
 * The logger channel used by this service.
 *
 * Resolved once in the constructor from the logger factory and never
 * reassigned, hence readonly like the other injected dependencies.
 */
protected readonly LoggerChannelInterface $logger;

/**
 * Constructs a ContentVerificationService object.
 *
 * @param \Symfony\Component\Mime\MimeTypeGuesserInterface $mimeTypeGuesser
 *   The MIME type guesser service.
 * @param \Drupal\Core\File\FileSystemInterface $fileSystem
 *   The file system service.
 * @param \Drupal\Core\Logger\LoggerChannelFactoryInterface $loggerFactory
 *   The logger factory service; only used to obtain the
 *   'image_to_media_swapper' channel and not stored itself.
 * @param \Drupal\image_to_media_swapper\SecurityValidationService $securityValidationService
 *   The security validation service.
 */
public function __construct(
  protected readonly MimeTypeGuesserInterface $mimeTypeGuesser,
  protected readonly FileSystemInterface $fileSystem,
  LoggerChannelFactoryInterface $loggerFactory,
  protected readonly SecurityValidationService $securityValidationService,
) {
  $this->logger = $loggerFactory->get('image_to_media_swapper');
}
/**
 * Performs deep content verification on a file to validate its type.
 *
 * Three independent checks are run (fileinfo detection, binary header
 * signatures, type-specific content patterns). The file is considered
 * verified when at least two of them pass, and is always rejected when the
 * malicious content scan reports a problem.
 *
 * @param string $filePath
 *   The path to the file to verify. Must point to a regular file.
 * @param string $declaredMimeType
 *   The declared MIME type of the file, if known. When empty, the type is
 *   guessed with the injected MIME type guesser.
 *
 * @return array
 *   An array with verification results, containing:
 *   - verified: TRUE if the file content matches the declared type.
 *   - actual_type: The detected MIME type from content analysis.
 *   - errors: Array of error messages collected from the failed checks.
 */
public function verifyFileContent(string $filePath, string $declaredMimeType = ''): array {
  // is_file() also rejects directories, which file_exists() would accept
  // even though none of the readers below can open them.
  if (!is_file($filePath)) {
    return [
      'verified' => FALSE,
      'actual_type' => '',
      'errors' => ['File does not exist'],
    ];
  }

  // The guesser may return NULL; normalise to a string because every helper
  // below declares a string parameter and this file uses strict types.
  if ($declaredMimeType === '') {
    $declaredMimeType = $this->mimeTypeGuesser->guessMimeType($filePath) ?? '';
  }

  $fileInfoCheck = $this->verifyWithFileinfo($filePath);
  $headerCheck = $this->verifyWithFileHeaders($filePath, $declaredMimeType);
  $contentPatternCheck = $this->verifyWithContentPatterns($filePath, $declaredMimeType);

  $actualType = (string) ($fileInfoCheck['mime_type'] ?? '');

  // Detecting *a* type proves nothing on its own: the fileinfo check only
  // counts as passed when the detected type agrees with the declared one.
  // Without this comparison the check always passes and the "two out of
  // three" rule degrades to "one out of two".
  if ($fileInfoCheck['verified'] && !$this->mimeTypesAreConsistent($actualType, $declaredMimeType)) {
    $fileInfoCheck['verified'] = FALSE;
    $fileInfoCheck['errors'][] = 'Detected type ' . $actualType .
      ' is not consistent with declared type ' . $declaredMimeType;
  }

  // Count the passing checks and collect the errors of the failing ones.
  $errors = [];
  $passedChecks = 0;
  foreach ([$fileInfoCheck, $headerCheck, $contentPatternCheck] as $check) {
    if ($check['verified']) {
      $passedChecks++;
    }
    else {
      $errors = array_merge($errors, $check['errors']);
    }
  }

  // If at least two verification methods agree, consider the file verified.
  $verified = ($passedChecks >= 2);

  // A positive malicious-content finding overrides any successful
  // verification.
  $securityCheck = $this->checkForMaliciousContent($filePath, $declaredMimeType);
  if (!$securityCheck['safe']) {
    $verified = FALSE;
    $errors = array_merge($errors, $securityCheck['errors']);
  }

  return [
    'verified' => $verified,
    'actual_type' => $actualType,
    'errors' => $errors,
  ];
}

/**
 * Tells whether a detected MIME type is consistent with a declared one.
 *
 * The comparison is deliberately lenient to tolerate alias types: two types
 * agree when they are identical, when they share the same top-level category
 * (e.g. "image"), or when the detector reported some "text/*" type for a
 * declared type that names a text-based format (xml, json, javascript,
 * html).
 *
 * @param string $detectedMimeType
 *   The MIME type detected from the file content.
 * @param string $declaredMimeType
 *   The MIME type the file claims to have.
 *
 * @return bool
 *   TRUE if both types are non-empty and consistent, FALSE otherwise.
 */
private function mimeTypesAreConsistent(string $detectedMimeType, string $declaredMimeType): bool {
  $detected = strtolower(trim($detectedMimeType));
  $declared = strtolower(trim($declaredMimeType));
  if ($detected === '' || $declared === '') {
    return FALSE;
  }
  if ($detected === $declared) {
    return TRUE;
  }

  $detectedCategory = explode('/', $detected, 2)[0];
  $declaredCategory = explode('/', $declared, 2)[0];
  if ($detectedCategory === $declaredCategory) {
    return TRUE;
  }

  // Text-based formats are frequently reported as text/plain or text/xml.
  return $detectedCategory === 'text'
    && preg_match('/xml|json|javascript|html/', $declared) === 1;
}
/**
 * Detects a file's MIME type with PHP's fileinfo extension.
 *
 * Note that this method only detects a type; it does not compare the
 * detected type with any declared type. When the fileinfo extension is not
 * available the injected MIME type guesser is used instead.
 *
 * @param string $filePath
 *   The path to the file to inspect.
 *
 * @return array
 *   An array containing verification results:
 *   - verified: TRUE if a MIME type could be determined.
 *   - mime_type: The detected MIME type, or an empty string if unknown.
 *   - errors: Array of error or notice messages.
 */
private function verifyWithFileinfo(string $filePath): array {
  $result = [
    'verified' => FALSE,
    'mime_type' => '',
    'errors' => [],
  ];

  if (!function_exists('finfo_open')) {
    // Fall back to the MIME type guesser if fileinfo is not available.
    $result['errors'][] = 'PHP fileinfo extension not available, using fallback method';
    $guessedMimeType = $this->mimeTypeGuesser->guessMimeType($filePath);
    // The guesser may return NULL; an unknown type must not count as a pass.
    if (is_string($guessedMimeType) && $guessedMimeType !== '') {
      $result['mime_type'] = $guessedMimeType;
      $result['verified'] = TRUE;
    }
    else {
      $result['errors'][] = 'Could not determine file type using fallback method';
    }
    return $result;
  }

  $finfo = finfo_open(FILEINFO_MIME_TYPE);
  if ($finfo === FALSE) {
    $result['errors'][] = 'Failed to initialize fileinfo';
    return $result;
  }

  // No finfo_close() here: since PHP 8.1 the handle is an object that is
  // released when it goes out of scope, and the function is deprecated as of
  // PHP 8.5.
  $detectedMimeType = finfo_file($finfo, $filePath);
  if (is_string($detectedMimeType) && $detectedMimeType !== '') {
    $result['mime_type'] = $detectedMimeType;
    $result['verified'] = TRUE;
  }
  else {
    $result['errors'][] = 'Could not determine file type using fileinfo';
  }

  return $result;
}
/**
 * Verifies a file by examining its binary headers.
 *
 * Reads the first 12 bytes of the file and compares them, as lowercase hex,
 * against a table of known file format signatures. The first signature that
 * matches decides the result: the check passes when the matched type and
 * the declared type share the same top-level category (e.g. "image").
 *
 * @param string $filePath
 *   The path to the file to verify.
 * @param string $declaredMimeType
 *   The declared MIME type of the file. An empty value never verifies.
 *
 * @return array
 *   An array containing verification results:
 *   - verified: TRUE if the header check passed.
 *   - errors: Array of error messages if verification fails.
 */
private function verifyWithFileHeaders(string $filePath, string $declaredMimeType): array {
  $result = [
    'verified' => FALSE,
    'errors' => [],
  ];

  // 12 bytes cover every signature below, including markers at offset 8.
  $handle = fopen($filePath, 'rb');
  if (!$handle) {
    $result['errors'][] = 'Could not open file for header analysis';
    return $result;
  }
  $header = fread($handle, 12);
  fclose($handle);
  if ($header === FALSE) {
    $result['errors'][] = 'Could not read file header';
    return $result;
  }

  // All comparisons happen on the hex form only. Comparing hex signatures
  // against the raw bytes as well would let a text file that literally
  // starts with the characters "ffd8ff" pass as a JPEG.
  $hexHeader = bin2hex($header);

  // Each MIME type maps to a list of alternative signatures. A signature is
  // a map of byte offset => lowercase hex bytes, all of which must match.
  // Order matters: container formats (RIFF, ISO base media "ftyp") list the
  // specific brands before the generic fallback.
  // This is a simplified table - a full implementation would have many more
  // signatures.
  $fileSignatures = [
    'image/jpeg' => [[0 => 'ffd8ff']],
    'image/png' => [[0 => '89504e47']],
    'image/gif' => [[0 => '474946']],
    // "RIFF" at 0 plus the format marker at 8.
    'image/webp' => [[0 => '52494646', 8 => '57454250']],
    'audio/wav' => [[0 => '52494646', 8 => '57415645']],
    'video/x-msvideo' => [[0 => '52494646', 8 => '41564920']],
    'application/pdf' => [[0 => '25504446']],
    'image/svg+xml' => [[0 => '3c737667']],
    'image/tiff' => [[0 => '49492a00'], [0 => '4d4d002a']],
    'application/zip' => [[0 => '504b0304']],
    'application/x-compressed' => [[0 => '1f8b08']],
    'audio/mpeg' => [[0 => '494433'], [0 => 'fffb']],
    // "ftyp" sits at offset 4, after the 4-byte box size.
    'video/quicktime' => [[4 => '6674797071742020']],
    'image/avif' => [[4 => '6674797061766966']],
    'image/heic' => [[4 => '6674797068656963']],
    'audio/mp4' => [[4 => '667479704d344120']],
    'video/mp4' => [[4 => '66747970']],
    'application/msword' => [[0 => 'd0cf11e0']],
    'application/vnd.openxmlformats-officedocument' => [[0 => '504b0304']],
  ];

  $matchesSignature = static function (array $signatureParts) use ($hexHeader): bool {
    foreach ($signatureParts as $byteOffset => $hexBytes) {
      // Two hex characters per byte.
      if (substr($hexHeader, $byteOffset * 2, strlen($hexBytes)) !== $hexBytes) {
        return FALSE;
      }
    }
    return TRUE;
  };

  $declaredCategory = strtolower(explode('/', $declaredMimeType, 2)[0]);

  foreach ($fileSignatures as $mimeType => $signatures) {
    foreach ($signatures as $signatureParts) {
      if (!$matchesSignature($signatureParts)) {
        continue;
      }
      // Exact category comparison: a prefix test would accept an empty or
      // truncated declared type.
      $detectedCategory = explode('/', $mimeType, 2)[0];
      if ($declaredCategory !== '' && $declaredCategory === $detectedCategory) {
        $result['verified'] = TRUE;
      }
      else {
        $result['errors'][] = 'File header indicates ' . $mimeType .
          ' but declared as ' . $declaredMimeType;
      }
      return $result;
    }
  }

  // If we got here, no known signatures matched.
  $result['errors'][] = 'File header does not match any known format signatures';
  return $result;
}
/**
 * Verifies a file by examining its content for type-specific patterns.
 *
 * Dispatches to a type-specific verifier based on the declared MIME type,
 * which is compared case-insensitively:
 * - image/*: image parsing, except SVG which is verified as XML.
 * - application/*pdf*: PDF structure check.
 * - application/*xml* and application/*html*: XML parsing.
 * - text/*: binary content check.
 * - audio/* and video/*: minimal media check.
 * Other application types and unknown categories pass with a notice.
 *
 * @param string $filePath
 *   The path to the file to verify.
 * @param string $declaredMimeType
 *   The declared MIME type of the file.
 *
 * @return array
 *   An array containing verification results:
 *   - verified: TRUE if the content pattern check passed.
 *   - errors: Array of error or notice messages.
 */
private function verifyWithContentPatterns(string $filePath, string $declaredMimeType): array {
  $result = [
    'verified' => FALSE,
    'errors' => [],
  ];

  // MIME types are case-insensitive; without normalising, "IMAGE/JPEG"
  // would fall through to the lenient default branch.
  $normalizedType = strtolower($declaredMimeType);
  // Get the first part of the MIME type (e.g., "image" from "image/jpeg").
  $typeCategory = explode('/', $normalizedType, 2)[0];

  switch ($typeCategory) {
    case 'image':
      // SVG is XML markup, which getimagesize() cannot parse; sending it to
      // the raster image check would make it fail unconditionally.
      if (str_contains($normalizedType, 'svg')) {
        $result = $this->verifyXmlContent($filePath);
      }
      else {
        $result = $this->verifyImageContent($filePath, $declaredMimeType);
      }
      break;

    case 'application':
      if (str_contains($normalizedType, 'pdf')) {
        $result = $this->verifyPdfContent($filePath);
      }
      elseif (str_contains($normalizedType, 'xml') ||
        str_contains($normalizedType, 'html')) {
        $result = $this->verifyXmlContent($filePath);
      }
      else {
        // Default verification for application types.
        $result['verified'] = TRUE;
        $result['errors'][] = 'Limited verification for ' . $declaredMimeType;
      }
      break;

    case 'text':
      $result = $this->verifyTextContent($filePath, $declaredMimeType);
      break;

    case 'audio':
    case 'video':
      $result = $this->verifyMediaContent($filePath, $declaredMimeType);
      break;

    default:
      // For unknown types, be lenient but note the limitation.
      $result['verified'] = TRUE;
      $result['errors'][] = 'No content pattern verification available for ' . $declaredMimeType;
  }

  return $result;
}
/**
 * Verifies image file content.
 *
 * Asks getimagesize() to parse the file. The check passes when the file
 * parses and either the reported MIME type equals the declared one or both
 * types are in the "image/" family.
 *
 * @param string $filePath
 *   The path to the image file.
 * @param string $declaredMimeType
 *   The declared MIME type of the image.
 *
 * @return array
 *   Verification results array with the keys 'verified' and 'errors'.
 */
private function verifyImageContent(string $filePath, string $declaredMimeType): array {
  // Warnings are suppressed because unparsable input is an expected case
  // here and is reported through the result array instead.
  $info = @getimagesize($filePath);
  if ($info === FALSE) {
    return [
      'verified' => FALSE,
      'errors' => ['Not a valid image or unsupported image format'],
    ];
  }

  // getimagesize() reports the MIME type under the 'mime' key.
  $reportedType = $info['mime'] ?? '';
  $sameFamily = str_starts_with($reportedType, 'image/')
    && str_starts_with($declaredMimeType, 'image/');

  if ($sameFamily || $reportedType === $declaredMimeType) {
    return [
      'verified' => TRUE,
      'errors' => [],
    ];
  }

  return [
    'verified' => FALSE,
    'errors' => [
      'Image verification failed: detected ' . $reportedType . ' but declared as ' . $declaredMimeType,
    ],
  ];
}
/**
 * Verifies PDF file content.
 *
 * A file passes when it starts with the "%PDF-" header and carries an
 * "%%EOF" marker within its last 1024 bytes. Both markers are matched
 * case-sensitively.
 *
 * @param string $filePath
 *   The path to the PDF file.
 *
 * @return array
 *   Verification results array with the keys 'verified' and 'errors'.
 */
private function verifyPdfContent(string $filePath): array {
  $result = [
    'verified' => FALSE,
    'errors' => [],
  ];

  $handle = fopen($filePath, 'rb');
  if (!$handle) {
    $result['errors'][] = 'Could not open file for PDF verification';
    return $result;
  }

  // fread() may return FALSE, so compare strictly instead of passing the
  // result to a string function.
  $hasHeader = (fread($handle, 5) === '%PDF-');

  // The end-of-file marker lives at the end of the document, so look at the
  // tail rather than the head. Merely being larger than 1024 bytes says
  // nothing about the structure of the file.
  $hasEofMarker = FALSE;
  if ($hasHeader) {
    $stat = fstat($handle);
    $fileSize = is_array($stat) ? (int) $stat['size'] : 0;
    $tailLength = min($fileSize, 1024);
    if ($tailLength > 0 && fseek($handle, -$tailLength, SEEK_END) === 0) {
      $tail = fread($handle, $tailLength);
      $hasEofMarker = is_string($tail) && str_contains($tail, '%%EOF');
    }
  }
  fclose($handle);

  if ($hasHeader && $hasEofMarker) {
    $result['verified'] = TRUE;
  }
  else {
    $result['errors'][] = 'Not a valid PDF file or corrupted PDF structure';
  }

  return $result;
}
/**
 * Verifies XML file content.
 *
 * Parses the file with SimpleXML and passes only when parsing succeeds
 * without any libxml error or warning. The caller's libxml error-handling
 * mode is restored afterwards; the libxml error buffer is cleared.
 *
 * @param string $filePath
 *   The path to the XML file.
 *
 * @return array
 *   Verification results array with the keys 'verified' and 'errors'.
 */
private function verifyXmlContent(string $filePath): array {
  $result = [
    'verified' => FALSE,
    'errors' => [],
  ];

  // Collect parser errors internally instead of emitting PHP warnings.
  $previousValue = libxml_use_internal_errors(TRUE);
  // Drop errors left behind by earlier parsing elsewhere; otherwise they
  // would be attributed to this file and fail a perfectly valid document.
  libxml_clear_errors();

  // The file is untrusted input: LIBXML_NONET forbids network access while
  // loading the document.
  $xml = simplexml_load_file($filePath, \SimpleXMLElement::class, LIBXML_NONET);
  $errors = libxml_get_errors();

  libxml_clear_errors();
  libxml_use_internal_errors($previousValue);

  if ($xml !== FALSE && empty($errors)) {
    $result['verified'] = TRUE;
  }
  else {
    $result['errors'][] = 'Not a valid XML/HTML file or has parsing errors';
  }

  return $result;
}
/**
 * Verifies text file content.
 *
 * Samples the first 512 bytes of the file and rejects it as soon as a single
 * control byte is found that does not belong in text: anything below TAB
 * (0x09), or between 0x0E and 0x1F other than ESC (0x1B).
 *
 * @param string $filePath
 *   The path to the text file.
 * @param string $declaredMimeType
 *   The declared MIME type of the text file. Not used by this check.
 *
 * @return array
 *   Verification results array with the keys 'verified' and 'errors'.
 */
private function verifyTextContent(string $filePath, string $declaredMimeType): array {
  $stream = fopen($filePath, 'rb');
  if (!$stream) {
    return [
      'verified' => FALSE,
      'errors' => ['Could not open file for text verification'],
    ];
  }
  $sample = fread($stream, 512);
  fclose($stream);

  // Byte-wise match (no "u" modifier) for the forbidden control bytes:
  // 0x00-0x08, 0x0E-0x1A and 0x1C-0x1F. TAB through CR (0x09-0x0D) and ESC
  // (0x1B) are allowed, as is everything from 0x20 upwards.
  $hasBinaryByte = preg_match('/[\x00-\x08\x0E-\x1A\x1C-\x1F]/', $sample) === 1;

  if ($hasBinaryByte) {
    return [
      'verified' => FALSE,
      'errors' => ['File contains binary data, not a valid text file'],
    ];
  }

  return [
    'verified' => TRUE,
    'errors' => [],
  ];
}
/**
 * Verifies audio/video file content.
 *
 * Only a minimal plausibility check is done: files smaller than 100 bytes
 * are rejected, everything else passes with a notice that verification was
 * limited. No media parsing takes place here.
 *
 * @param string $filePath
 *   The path to the media file.
 * @param string $declaredMimeType
 *   The declared MIME type of the media file. Not used by this check.
 *
 * @return array
 *   Verification results array with the keys 'verified' and 'errors'.
 */
private function verifyMediaContent(string $filePath, string $declaredMimeType): array {
  // Media files are usually not tiny, so anything under 100 bytes fails.
  if (filesize($filePath) < 100) {
    return [
      'verified' => FALSE,
      'errors' => ['File too small to be a valid media file'],
    ];
  }

  // Deep media parsing would need additional libraries, so be lenient and
  // record that the verification was limited.
  return [
    'verified' => TRUE,
    'errors' => ['Limited verification for media files'],
  ];
}
/**
 * Checks a file for known malicious content patterns.
 *
 * Scans at most the first 1 MB of the file for patterns that should not
 * appear in a file of the declared type: PHP code, script tags in images,
 * shell-execution calls in text, JavaScript in PDFs, external entities in
 * XML and script or event-handler markup in SVG. The declared type is
 * compared case-insensitively. An empty file is considered safe.
 *
 * @param string $filePath
 *   The path to the file to check.
 * @param string $declaredMimeType
 *   The declared MIME type of the file.
 *
 * @return array
 *   An array containing security check results:
 *   - safe: TRUE if no malicious content was detected.
 *   - errors: Array of security issues found.
 */
private function checkForMaliciousContent(string $filePath, string $declaredMimeType): array {
  $result = [
    'safe' => TRUE,
    'errors' => [],
  ];

  $handle = fopen($filePath, 'rb');
  if (!$handle) {
    $result['errors'][] = 'Could not open file for security scanning';
    $result['safe'] = FALSE;
    return $result;
  }

  // Scan at most 1MB. stream_get_contents() is used with a fixed limit
  // instead of fread() with the file size, because fread() throws a
  // ValueError for a length of 0 (empty file) and filesize() may return
  // FALSE.
  $content = stream_get_contents($handle, 1048576);
  fclose($handle);
  if ($content === FALSE) {
    $result['errors'][] = 'Could not read file content for security scanning';
    $result['safe'] = FALSE;
    return $result;
  }

  // MIME types are case-insensitive; normalise so that "IMAGE/SVG+XML" gets
  // the same checks as "image/svg+xml".
  $mimeType = strtolower($declaredMimeType);

  // Check for PHP code in non-PHP files.
  if (!str_contains($mimeType, 'php') &&
    (preg_match('/<\?php/i', $content) || preg_match('/eval\s*\(/i', $content))) {
    $result['safe'] = FALSE;
    $result['errors'][] = 'File contains PHP code but is not declared as a PHP file';
  }

  // Check for script tags in image files.
  if (str_starts_with($mimeType, 'image/') &&
    (stripos($content, '<script') !== FALSE || stripos($content, 'javascript:') !== FALSE)) {
    $result['safe'] = FALSE;
    $result['errors'][] = 'Image file contains script tags or JavaScript code';
  }

  // Check for shell commands in text files.
  if (str_starts_with($mimeType, 'text/') &&
    preg_match('/system\s*\(|exec\s*\(|passthru\s*\(|shell_exec\s*\(/i', $content)) {
    $result['safe'] = FALSE;
    $result['errors'][] = 'Text file contains potentially dangerous shell commands';
  }

  // Check for suspicious metadata in PDF files.
  if (str_contains($mimeType, 'pdf') &&
    (stripos($content, '/JS') !== FALSE || stripos($content, '/JavaScript') !== FALSE)) {
    $result['safe'] = FALSE;
    $result['errors'][] = 'PDF file contains JavaScript code';
  }

  // Check for XML external entity injection in XML files.
  if (str_contains($mimeType, 'xml') &&
    (stripos($content, '<!ENTITY') !== FALSE && stripos($content, 'SYSTEM') !== FALSE)) {
    $result['safe'] = FALSE;
    $result['errors'][] = 'XML file contains external entity declarations (XXE risk)';
  }

  // Check for HTML in SVG files that could lead to XSS.
  if (str_contains($mimeType, 'svg') &&
    (stripos($content, '<script') !== FALSE ||
    preg_match('/on\w+\s*=/i', $content) ||
    stripos($content, 'javascript:') !== FALSE)) {
    $result['safe'] = FALSE;
    $result['errors'][] = 'SVG file contains potentially dangerous script elements or event handlers';
  }

  return $result;
}
}
