image_to_media_swapper-2.x-dev/src/ContentVerificationService.php

src/ContentVerificationService.php
<?php

declare(strict_types=1);

namespace Drupal\image_to_media_swapper;

use Drupal\Core\File\FileSystemInterface;
use Drupal\Core\Logger\LoggerChannelFactoryInterface;
use Drupal\Core\Logger\LoggerChannelInterface;
use Symfony\Component\Mime\MimeTypeGuesserInterface;

/**
 * Service for verifying file content matches its declared type.
 *
 * This service provides enhanced file validation by examining the actual
 * content of files to verify they match their declared MIME types.
 */
final class ContentVerificationService {

  /**
   * Maximum number of bytes read when scanning a file for malicious content.
   */
  private const MAX_SCAN_BYTES = 1048576;

  /**
   * Number of leading bytes read for file signature detection.
   */
  private const HEADER_BYTES = 12;

  /**
   * Known file signatures, keyed by MIME type.
   *
   * Each MIME type maps to a list of alternative signatures. A signature is
   * an array of "byte offset => lowercase hex string" parts; every part must
   * be present at its offset for the signature to match. This is a simplified
   * table - a full implementation would have many more signatures.
   */
  private const HEADER_SIGNATURES = [
    'image/jpeg' => [[0 => 'ffd8ff']],
    'image/png' => [[0 => '89504e47']],
    'image/gif' => [[0 => '474946']],
    // "RIFF" at offset 0 AND "WEBP" at offset 8; RIFF alone is also used by
    // WAV and AVI containers.
    'image/webp' => [[0 => '52494646', 8 => '57454250']],
    'application/pdf' => [[0 => '25504446']],
    'image/svg+xml' => [[0 => '3c737667']],
    'image/tiff' => [[0 => '49492a00'], [0 => '4d4d002a']],
    'application/zip' => [[0 => '504b0304']],
    'application/x-compressed' => [[0 => '1f8b08']],
    'audio/mpeg' => [[0 => '494433'], [0 => 'fffb']],
    // The "ftyp" box type follows a 4-byte box size, so brands are matched at
    // offset 4 ("ftypisom", "ftypmp4x").
    'video/mp4' => [
      [0 => '00000018'],
      [0 => '00000020'],
      [4 => '6674797069736f6d'],
      [4 => '667479706d7034'],
    ],
    // "ftypqt  " at offset 4.
    'video/quicktime' => [[4 => '6674797071742020']],
    'application/msword' => [[0 => 'd0cf11e0']],
    'application/vnd.openxmlformats-officedocument' => [[0 => '504b0304']],
  ];

  /**
   * The logger service.
   */
  protected LoggerChannelInterface $logger;

  /**
   * Constructs a ContentVerificationService object.
   *
   * @param \Symfony\Component\Mime\MimeTypeGuesserInterface $mimeTypeGuesser
   *   The MIME type guesser service.
   * @param \Drupal\Core\File\FileSystemInterface $fileSystem
   *   The file system service.
   * @param \Drupal\Core\Logger\LoggerChannelFactoryInterface $loggerFactory
   *   The logger factory service.
   * @param \Drupal\image_to_media_swapper\SecurityValidationService $securityValidationService
   *   The security validation service.
   */
  public function __construct(
    protected readonly MimeTypeGuesserInterface $mimeTypeGuesser,
    protected readonly FileSystemInterface $fileSystem,
    LoggerChannelFactoryInterface $loggerFactory,
    protected readonly SecurityValidationService $securityValidationService,
  ) {
    $this->logger = $loggerFactory->get('image_to_media_swapper');
  }

  /**
   * Performs deep content verification on a file to validate its type.
   *
   * Runs three independent checks (fileinfo detection, file header
   * signatures, type-specific content patterns) and then a scan for known
   * malicious patterns. The file is verified when at least two of the three
   * checks pass and the malicious content scan finds nothing.
   *
   * @param string $filePath
   *   The path to the file to verify.
   * @param string $declaredMimeType
   *   The declared MIME type of the file, if known. When empty, the type is
   *   guessed with the MIME type guesser service.
   *
   * @return array
   *   An array with verification results, containing:
   *   - verified: TRUE if the file content matches the declared type.
   *   - actual_type: The detected MIME type from content analysis.
   *   - errors: Array of error messages if verification fails.
   */
  public function verifyFileContent(string $filePath, string $declaredMimeType = ''): array {
    if (!file_exists($filePath)) {
      return [
        'verified' => FALSE,
        'actual_type' => '',
        'errors' => ['File does not exist'],
      ];
    }

    // Directories and other non-regular paths have no content to verify.
    if (!is_file($filePath)) {
      return [
        'verified' => FALSE,
        'actual_type' => '',
        'errors' => ['Path is not a regular file'],
      ];
    }

    // Get the declared MIME type if not provided. The guesser is allowed to
    // return NULL, which must not reach the string-typed helpers below.
    if ($declaredMimeType === '') {
      $declaredMimeType = $this->mimeTypeGuesser->guessMimeType($filePath) ?? '';
    }

    // Perform multiple verification checks.
    $fileInfoCheck = $this->verifyWithFileinfo($filePath);
    $headerCheck = $this->verifyWithFileHeaders($filePath, $declaredMimeType);
    $contentPatternCheck = $this->verifyWithContentPatterns($filePath, $declaredMimeType);

    $result = [
      'errors' => [],
      'actual_type' => $fileInfoCheck['mime_type'],
    ];

    // Count passed checks; only failing checks contribute error messages.
    $passedChecks = 0;
    foreach ([$fileInfoCheck, $headerCheck, $contentPatternCheck] as $check) {
      if ($check['verified']) {
        $passedChecks++;
      }
      else {
        $result['errors'] = array_merge($result['errors'], $check['errors']);
      }
    }

    // If at least two verification methods agree, consider the file verified.
    $result['verified'] = ($passedChecks >= 2);

    // A positive malicious content finding overrides everything else.
    $securityCheck = $this->checkForMaliciousContent($filePath, $declaredMimeType);
    if (!$securityCheck['safe']) {
      $result['verified'] = FALSE;
      $result['errors'] = array_merge($result['errors'], $securityCheck['errors']);
    }

    return $result;
  }

  /**
   * Detects a file's MIME type using PHP's fileinfo extension.
   *
   * Falls back to the MIME type guesser service when the fileinfo functions
   * are not available.
   *
   * NOTE(review): the detected type is only reported, it is not compared with
   * the declared type, so this check passes whenever any type is detected.
   * Confirm whether that is intended given the "two of three" rule used by
   * verifyFileContent().
   *
   * @param string $filePath
   *   The path to the file to verify.
   *
   * @return array
   *   An array containing verification results:
   *   - verified: TRUE if a MIME type could be detected.
   *   - mime_type: The detected MIME type, or '' if none was detected.
   *   - errors: Array of error messages or notes.
   */
  private function verifyWithFileinfo(string $filePath): array {
    $result = [
      'verified' => FALSE,
      'mime_type' => '',
      'errors' => [],
    ];

    if (!function_exists('finfo_open')) {
      // Fall back to Drupal's MIME type guesser if fileinfo is not available.
      $guessedMimeType = $this->mimeTypeGuesser->guessMimeType($filePath) ?? '';
      $result['mime_type'] = $guessedMimeType;
      $result['errors'][] = 'PHP fileinfo extension not available, using fallback method';
      if ($guessedMimeType !== '') {
        $result['verified'] = TRUE;
      }
      else {
        $result['errors'][] = 'Could not determine file type using fallback method';
      }
      return $result;
    }

    $finfo = finfo_open(FILEINFO_MIME_TYPE);
    if (!$finfo) {
      $result['errors'][] = 'Failed to initialize fileinfo';
      return $result;
    }

    $detectedMimeType = finfo_file($finfo, $filePath);
    finfo_close($finfo);

    if ($detectedMimeType) {
      $result['mime_type'] = $detectedMimeType;
      $result['verified'] = TRUE;
    }
    else {
      $result['errors'][] = 'Could not determine file type using fileinfo';
    }

    return $result;
  }

  /**
   * Verifies a file by examining its binary headers.
   *
   * Reads the first bytes of the file, looks for a known format signature and
   * checks that the signature's top-level type (e.g. "image") equals the
   * top-level type of the declared MIME type.
   *
   * @param string $filePath
   *   The path to the file to verify.
   * @param string $declaredMimeType
   *   The declared MIME type of the file.
   *
   * @return array
   *   An array containing verification results:
   *   - verified: TRUE if the header check passed.
   *   - errors: Array of error messages if verification fails.
   */
  private function verifyWithFileHeaders(string $filePath, string $declaredMimeType): array {
    $result = [
      'verified' => FALSE,
      'errors' => [],
    ];

    $handle = fopen($filePath, 'rb');
    if (!$handle) {
      $result['errors'][] = 'Could not open file for header analysis';
      return $result;
    }

    $header = fread($handle, self::HEADER_BYTES);
    fclose($handle);

    if ($header === FALSE) {
      $result['errors'][] = 'Could not read file header';
      return $result;
    }

    // Convert header to hex for comparison; bin2hex() emits lowercase hex,
    // matching the case used in the signature table.
    $hexHeader = bin2hex($header);

    // MIME types are case-insensitive. An empty category never matches, so a
    // missing declared type cannot be "confirmed" by an arbitrary signature.
    $declaredCategory = strtolower(explode('/', $declaredMimeType)[0]);

    foreach (self::HEADER_SIGNATURES as $mimeType => $signatures) {
      foreach ($signatures as $signature) {
        if (!$this->headerMatchesSignature($hexHeader, $signature)) {
          continue;
        }

        // The first matching signature decides the outcome.
        $signatureCategory = explode('/', $mimeType)[0];
        if ($declaredCategory !== '' && $declaredCategory === $signatureCategory) {
          $result['verified'] = TRUE;
        }
        else {
          $result['errors'][] = 'File header indicates ' . $mimeType .
                              ' but declared as ' . $declaredMimeType;
        }
        return $result;
      }
    }

    // If we got here, no known signatures matched.
    $result['errors'][] = 'File header does not match any known format signatures';
    return $result;
  }

  /**
   * Tests whether a hex-encoded header contains every part of a signature.
   *
   * @param string $hexHeader
   *   The file header encoded as lowercase hex (two characters per byte).
   * @param array $signature
   *   Signature parts as "byte offset => lowercase hex string".
   *
   * @return bool
   *   TRUE if every part is found at its byte offset.
   */
  private function headerMatchesSignature(string $hexHeader, array $signature): bool {
    foreach ($signature as $byteOffset => $hex) {
      // Two hex characters per byte. substr() yields a shorter (or empty)
      // string when the header is too short, which simply fails the match.
      if (substr($hexHeader, $byteOffset * 2, strlen($hex)) !== $hex) {
        return FALSE;
      }
    }
    return TRUE;
  }

  /**
   * Verifies a file by examining its content for type-specific patterns.
   *
   * Dispatches to a type-specific verifier based on the top-level part of the
   * declared MIME type. Types without a dedicated verifier are accepted with
   * a note in the errors array.
   *
   * @param string $filePath
   *   The path to the file to verify.
   * @param string $declaredMimeType
   *   The declared MIME type of the file.
   *
   * @return array
   *   An array containing verification results:
   *   - verified: TRUE if the content pattern check passed.
   *   - errors: Array of error messages or notes.
   */
  private function verifyWithContentPatterns(string $filePath, string $declaredMimeType): array {
    $result = [
      'verified' => FALSE,
      'errors' => [],
    ];

    // Get the first part of the MIME type (e.g., "image" from "image/jpeg").
    $typeCategory = explode('/', $declaredMimeType)[0];

    switch ($typeCategory) {
      case 'image':
        return $this->verifyImageContent($filePath, $declaredMimeType);

      case 'text':
        return $this->verifyTextContent($filePath, $declaredMimeType);

      case 'audio':
      case 'video':
        return $this->verifyMediaContent($filePath, $declaredMimeType);

      case 'application':
        if (str_contains($declaredMimeType, 'pdf')) {
          return $this->verifyPdfContent($filePath);
        }
        if (str_contains($declaredMimeType, 'xml') ||
          str_contains($declaredMimeType, 'html')) {
          return $this->verifyXmlContent($filePath);
        }
        // Default verification for application types.
        $result['verified'] = TRUE;
        $result['errors'][] = 'Limited verification for ' . $declaredMimeType;
        return $result;

      default:
        // For unknown types, be lenient but note the limitation.
        $result['verified'] = TRUE;
        $result['errors'][] = 'No content pattern verification available for ' . $declaredMimeType;
        return $result;
    }
  }

  /**
   * Verifies image file content.
   *
   * NOTE(review): every declared "image/*" type is routed here, including
   * image/svg+xml; confirm getimagesize() is expected to handle SVG input.
   *
   * @param string $filePath
   *   The path to the image file.
   * @param string $declaredMimeType
   *   The declared MIME type of the image.
   *
   * @return array
   *   Verification results array with 'verified' and 'errors' keys.
   */
  private function verifyImageContent(string $filePath, string $declaredMimeType): array {
    $result = [
      'verified' => FALSE,
      'errors' => [],
    ];

    // Try to get image dimensions - if successful, it's likely a valid image.
    // Errors are suppressed because a failed parse is an expected outcome.
    $imageInfo = @getimagesize($filePath);
    if ($imageInfo === FALSE) {
      $result['errors'][] = 'Not a valid image or unsupported image format';
      return $result;
    }

    // getimagesize() returns the MIME type in index 'mime'.
    $detectedType = $imageInfo['mime'] ?? '';

    // Accept an exact match, or any pair of types that are both images.
    $bothAreImages = str_starts_with($detectedType, 'image/')
      && str_starts_with($declaredMimeType, 'image/');
    if ($detectedType === $declaredMimeType || $bothAreImages) {
      $result['verified'] = TRUE;
    }
    else {
      $result['errors'][] = 'Image verification failed: detected ' .
                          $detectedType . ' but declared as ' . $declaredMimeType;
    }

    return $result;
  }

  /**
   * Verifies PDF file content.
   *
   * @param string $filePath
   *   The path to the PDF file.
   *
   * @return array
   *   Verification results array with 'verified' and 'errors' keys.
   */
  private function verifyPdfContent(string $filePath): array {
    $result = [
      'verified' => FALSE,
      'errors' => [],
    ];

    // Read the first 1024 bytes, which should contain the PDF header.
    $handle = fopen($filePath, 'rb');
    if (!$handle) {
      $result['errors'][] = 'Could not open file for PDF verification';
      return $result;
    }

    $content = fread($handle, 1024);
    fclose($handle);

    if ($content === FALSE) {
      $result['errors'][] = 'Could not read file for PDF verification';
      return $result;
    }

    // The file must start with the PDF marker. The end-of-file marker is only
    // required within the sample when the whole file fits into the sample.
    $hasPdfHeader = stripos($content, '%PDF-') === 0;
    $hasEofOrIsLarge = stripos($content, '%%EOF') !== FALSE || filesize($filePath) > 1024;
    if ($hasPdfHeader && $hasEofOrIsLarge) {
      $result['verified'] = TRUE;
    }
    else {
      $result['errors'][] = 'Not a valid PDF file or corrupted PDF structure';
    }

    return $result;
  }

  /**
   * Verifies XML/HTML file content.
   *
   * The file passes when it parses as XML without any libxml errors.
   *
   * @param string $filePath
   *   The path to the XML file.
   *
   * @return array
   *   Verification results array with 'verified' and 'errors' keys.
   */
  private function verifyXmlContent(string $filePath): array {
    $result = [
      'verified' => FALSE,
      'errors' => [],
    ];

    // Collect libxml errors internally instead of emitting PHP warnings.
    $previousValue = libxml_use_internal_errors(TRUE);
    // Drop errors buffered by earlier, unrelated parses so they are not
    // attributed to this file.
    libxml_clear_errors();
    // LIBXML_NONET keeps the parser from fetching anything over the network
    // while handling a potentially untrusted document.
    $xml = simplexml_load_file($filePath, \SimpleXMLElement::class, LIBXML_NONET);
    $errors = libxml_get_errors();
    libxml_clear_errors();
    libxml_use_internal_errors($previousValue);

    if ($xml !== FALSE && empty($errors)) {
      $result['verified'] = TRUE;
    }
    else {
      $result['errors'][] = 'Not a valid XML/HTML file or has parsing errors';
    }

    return $result;
  }

  /**
   * Verifies text file content.
   *
   * Samples the first 512 bytes and rejects the file when the sample contains
   * control bytes other than tab, line feed, vertical tab, form feed, carriage
   * return and escape.
   *
   * @param string $filePath
   *   The path to the text file.
   * @param string $declaredMimeType
   *   The declared MIME type of the text file (currently unused).
   *
   * @return array
   *   Verification results array with 'verified' and 'errors' keys.
   */
  private function verifyTextContent(string $filePath, string $declaredMimeType): array {
    $result = [
      'verified' => FALSE,
      'errors' => [],
    ];

    // Read a portion of the file to check for binary content.
    $handle = fopen($filePath, 'rb');
    if (!$handle) {
      $result['errors'][] = 'Could not open file for text verification';
      return $result;
    }

    $sample = fread($handle, 512);
    fclose($handle);

    if ($sample === FALSE) {
      $result['errors'][] = 'Could not read file for text verification';
      return $result;
    }

    // Bytes 0-8, 14-26 and 28-31 are treated as binary; 9-13 (whitespace
    // controls) and 27 (escape) are allowed.
    $containsBinary = preg_match('/[\x00-\x08\x0E-\x1A\x1C-\x1F]/', $sample) === 1;

    if ($containsBinary) {
      $result['errors'][] = 'File contains binary data, not a valid text file';
    }
    else {
      $result['verified'] = TRUE;
    }

    return $result;
  }

  /**
   * Verifies audio/video file content.
   *
   * Only a minimum file size is enforced here; header and fileinfo checks are
   * performed by other methods of this class.
   *
   * @param string $filePath
   *   The path to the media file.
   * @param string $declaredMimeType
   *   The declared MIME type of the media file (currently unused).
   *
   * @return array
   *   Verification results array with 'verified' and 'errors' keys.
   */
  private function verifyMediaContent(string $filePath, string $declaredMimeType): array {
    $result = [
      'verified' => FALSE,
      'errors' => [],
    ];

    // Check file size - media files are usually not tiny. An unreadable size
    // is treated the same as a too-small file.
    $fileSize = filesize($filePath);
    if ($fileSize === FALSE || $fileSize < 100) {
      $result['errors'][] = 'File too small to be a valid media file';
      return $result;
    }

    // Deep media parsing would require additional libraries, so be lenient
    // here and record the limitation.
    $result['verified'] = TRUE;
    $result['errors'][] = 'Limited verification for media files';

    return $result;
  }

  /**
   * Checks a file for known malicious content patterns.
   *
   * Only the first MAX_SCAN_BYTES bytes of the file are scanned; content
   * beyond that point is not inspected.
   *
   * @param string $filePath
   *   The path to the file to check.
   * @param string $declaredMimeType
   *   The declared MIME type of the file.
   *
   * @return array
   *   An array containing security check results:
   *   - safe: TRUE if no malicious content was detected.
   *   - errors: Array of security issues found.
   */
  private function checkForMaliciousContent(string $filePath, string $declaredMimeType): array {
    $result = [
      'safe' => TRUE,
      'errors' => [],
    ];

    $handle = fopen($filePath, 'rb');
    if (!$handle) {
      $result['errors'][] = 'Could not open file for security scanning';
      $result['safe'] = FALSE;
      return $result;
    }

    // Read at most MAX_SCAN_BYTES. Unlike fread() with a computed length,
    // this returns an empty string for an empty file instead of throwing.
    $content = stream_get_contents($handle, self::MAX_SCAN_BYTES);
    fclose($handle);

    if ($content === FALSE) {
      $result['errors'][] = 'Could not read file content for security scanning';
      $result['safe'] = FALSE;
      return $result;
    }

    // Nothing to scan in an empty file.
    if ($content === '') {
      return $result;
    }

    // Shared by the image and SVG rules below.
    $hasScriptMarkup = stripos($content, '<script') !== FALSE
      || stripos($content, 'javascript:') !== FALSE;

    // Check for PHP code in non-PHP files.
    if (!str_contains($declaredMimeType, 'php') &&
        (preg_match('/<\?php/i', $content) || preg_match('/eval\s*\(/i', $content))) {
      $result['safe'] = FALSE;
      $result['errors'][] = 'File contains PHP code but is not declared as a PHP file';
    }

    // Check for script tags in image files.
    if (str_starts_with($declaredMimeType, 'image/') && $hasScriptMarkup) {
      $result['safe'] = FALSE;
      $result['errors'][] = 'Image file contains script tags or JavaScript code';
    }

    // Check for shell commands in text files.
    if (str_starts_with($declaredMimeType, 'text/') &&
        preg_match('/system\s*\(|exec\s*\(|passthru\s*\(|shell_exec\s*\(/i', $content)) {
      $result['safe'] = FALSE;
      $result['errors'][] = 'Text file contains potentially dangerous shell commands';
    }

    // Check for suspicious metadata in PDF files.
    if (str_contains($declaredMimeType, 'pdf') &&
        (stripos($content, '/JS') !== FALSE || stripos($content, '/JavaScript') !== FALSE)) {
      $result['safe'] = FALSE;
      $result['errors'][] = 'PDF file contains JavaScript code';
    }

    // Check for XML external entity injection in XML files.
    if (str_contains($declaredMimeType, 'xml') &&
        (stripos($content, '<!ENTITY') !== FALSE && stripos($content, 'SYSTEM') !== FALSE)) {
      $result['safe'] = FALSE;
      $result['errors'][] = 'XML file contains external entity declarations (XXE risk)';
    }

    // Check for HTML in SVG files that could lead to XSS.
    if (str_contains($declaredMimeType, 'svg') &&
        ($hasScriptMarkup || preg_match('/on\w+\s*=/i', $content))) {
      $result['safe'] = FALSE;
      $result['errors'][] = 'SVG file contains potentially dangerous script elements or event handlers';
    }

    return $result;
  }

}

Главная | Обратная связь

drupal hosting | друпал хостинг | it patrol .inc