linkchecker-8.x-1.x-dev/src/LinkCheckerService.php
src/LinkCheckerService.php
<?php
namespace Drupal\linkchecker;
use Drupal\Component\Datetime\TimeInterface;
use Drupal\Core\Config\ConfigFactory;
use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\Core\Link;
use Drupal\Core\Logger\RfcLogLevel;
use Drupal\Core\Queue\QueueFactory;
use Drupal\Core\StringTranslation\StringTranslationTrait;
use Drupal\Core\Url;
use Drupal\linkchecker\Event\BuildHeader;
use Drupal\linkchecker\Event\LinkcheckerEvents;
use Drupal\linkchecker\Plugin\LinkStatusHandlerManager;
use GuzzleHttp\Client;
use GuzzleHttp\Exception\RequestException;
use Psr\Http\Message\ResponseInterface;
use Symfony\Component\EventDispatcher\EventDispatcherInterface;
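/**
 * Checks links over HTTP and stores the outcome on the link entities.
 *
 * Also fills the 'linkchecker_check' queue with batches of link IDs that are
 * due for a check.
 */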
class LinkCheckerService {

  use StringTranslationTrait;

  /**
   * The entity type manager.
   *
   * @var \Drupal\Core\Entity\EntityTypeManagerInterface
   */
  protected $entityTypeManager;

  /**
   * The Linkchecker settings ('linkchecker.settings' configuration).
   *
   * @var \Drupal\Core\Config\ImmutableConfig
   */
  protected $linkcheckerSetting;

  /**
   * The http client used to request the checked URLs.
   *
   * @var \GuzzleHttp\Client
   */
  protected $httpClient;

  /**
   * The report link, built on first use by getReportLink().
   *
   * @var \Drupal\Core\Link
   */
  protected $reportLink;

  /**
   * The time service.
   *
   * @var \Drupal\Component\Datetime\TimeInterface
   */
  protected $time;

  /**
   * The 'linkchecker_check' queue.
   *
   * @var \Drupal\Core\Queue\QueueInterface
   */
  protected $queue;

  /**
   * The status handler manager.
   *
   * @var \Drupal\linkchecker\Plugin\LinkStatusHandlerManager
   */
  protected $statusHandlerManager;

  /**
   * The event dispatcher.
   *
   * @var \Symfony\Component\EventDispatcher\EventDispatcherInterface
   */
  protected EventDispatcherInterface $eventDispatcher;

  /**
   * Constructs a new LinkCheckerService object.
   *
   * @param \Drupal\Core\Entity\EntityTypeManagerInterface $entityTypeManager
   *   The entity type manager.
   * @param \Drupal\Core\Config\ConfigFactory $config
   *   The config factory; 'linkchecker.settings' is read from it.
   * @param \GuzzleHttp\Client $httpClient
   *   The http client.
   * @param \Drupal\Component\Datetime\TimeInterface $time
   *   The time service.
   * @param \Drupal\Core\Queue\QueueFactory $queueFactory
   *   The queue factory; the 'linkchecker_check' queue is taken from it.
   * @param \Drupal\linkchecker\Plugin\LinkStatusHandlerManager $statusHandlerManager
   *   The status handler plugin manager.
   * @param \Symfony\Component\EventDispatcher\EventDispatcherInterface $eventDispatcher
   *   The event dispatcher.
   */
  public function __construct(EntityTypeManagerInterface $entityTypeManager, ConfigFactory $config, Client $httpClient, TimeInterface $time, QueueFactory $queueFactory, LinkStatusHandlerManager $statusHandlerManager, EventDispatcherInterface $eventDispatcher) {
    $this->entityTypeManager = $entityTypeManager;
    $this->linkcheckerSetting = $config->get('linkchecker.settings');
    $this->httpClient = $httpClient;
    $this->time = $time;
    $this->queue = $queueFactory->get('linkchecker_check');
    $this->statusHandlerManager = $statusHandlerManager;
    $this->eventDispatcher = $eventDispatcher;
  }

  /**
   * Queue all published links for checking.
   *
   * Nothing is queued while the queue still holds items. Otherwise one link ID
   * per distinct URL hash is selected among the links with status 1 that were
   * never checked or whose last check is at least 'check.interval' seconds
   * old. The IDs are queued in batches of 'check.connections_max' IDs.
   *
   * @param bool $rebuild
   *   Defines whether to rebuild queue or not.
   *
   * @return int
   *   Number of queued items.
   */
  public function queueLinks($rebuild = FALSE) {
    if ($rebuild) {
      $this->queue->deleteQueue();
    }

    // Items are still waiting to be processed: do not queue them twice.
    $pending = $this->queue->numberOfItems();
    if (!empty($pending)) {
      return $pending;
    }

    $checkInterval = (int) $this->linkcheckerSetting->get('check.interval');
    $query = $this->entityTypeManager->getStorage('linkcheckerlink')
      ->getAggregateQuery()
      ->accessCheck()
      ->condition('status', 1);
    $dueForCheck = $query->orConditionGroup()
      ->condition('last_check', $this->time->getRequestTime() - $checkInterval, '<=')
      ->condition('last_check', NULL, 'IS NULL');
    // The same URL may be stored several times; pick the lowest ID per hash.
    $query->groupBy('urlhash')
      ->aggregate('lid', 'MIN')
      ->condition($dueForCheck);
    $rows = $query->execute();

    $this->queue->createQueue();

    if (!empty($rows)) {
      // array_chunk() throws on a length below 1, so guard against an unset
      // or zero 'check.connections_max' setting.
      $batchSize = max(1, (int) $this->linkcheckerSetting->get('check.connections_max'));
      // Split ids by max connection amount to make it possible to send
      // concurrent requests.
      foreach (array_chunk(array_column($rows, 'lid_min'), $batchSize) as $ids) {
        $this->queue->createItem($ids);
      }
    }

    return $this->queue->numberOfItems();
  }

  /**
   * Check the link.
   *
   * Sends an asynchronous request for the link URL. Once the promise settles,
   * the result is stored on the link by statusHandling() (a response was
   * received) or as a failed request (the request was rejected).
   *
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link to check.
   *
   * @return \GuzzleHttp\Promise\PromiseInterface
   *   Promise of link checking request.
   */
  public function check(LinkCheckerLinkInterface $link) {
    $headers = [
      'User-Agent' => $this->linkcheckerSetting->get('check.useragent'),
    ];

    // parse_url() returns FALSE on seriously malformed URLs; normalise that to
    // an empty array so the component lookups below stay well defined.
    $uri = parse_url((string) $link->getUrl());
    if (!is_array($uri)) {
      $uri = [];
    }
    $hasFragment = !empty($uri['fragment']);

    if ($hasFragment && in_array($link->getRequestMethod(), ['HEAD', 'GET'], TRUE)) {
      // URL contains a fragment: we need the full content and not only the
      // HEAD.
      $link->setRequestMethod('GET');
      // Request text content only (like Firefox/Chrome).
      $headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
    }
    elseif ($link->getRequestMethod() === 'GET') {
      // Range: Only request the first 1024 bytes from remote server. This is
      // required to prevent timeouts on URLs that are large downloads.
      $headers['Range'] = 'bytes=0-1024';
    }

    // Allow other modules to alter the header.
    $context = [
      'method' => $link->getRequestMethod(),
      'url' => $link->getUrl(),
    ];
    $event = new BuildHeader($headers, $context);
    $this->eventDispatcher->dispatch($event, LinkcheckerEvents::BUILD_HEADER);

    $options = [
      'headers' => $event->getHeaders(),
      // NOTE(review): 'max_redirects' does not look like a documented Guzzle
      // request option ('allow_redirects' below is); kept as-is - confirm
      // before removing.
      'max_redirects' => 0,
      'http_errors' => FALSE,
      'allow_redirects' => FALSE,
      'synchronous' => FALSE,
    ];

    return $this->httpClient
      ->requestAsync($link->getRequestMethod(), $link->getUrl(), $options)
      ->then(
        function (ResponseInterface $response) use ($link, $uri) {
          if (!empty($uri['fragment'])) {
            // Hand the requested fragment over to statusHandling() through a
            // pseudo header on the response.
            $response = $response->withHeader('Fragment', $uri['fragment']);
          }
          $this->statusHandling($response, $link);
        },
        function (\Throwable $e) use ($link) {
          // Not every transport failure is a RequestException (connection
          // errors are not in Guzzle 7). A narrower type hint here would
          // raise a TypeError and the failure would never be recorded.
          if ($e instanceof RequestException) {
            $this->exceptionHandling($e, $link);
          }
          else {
            $this->recordRequestFailure($e->getMessage(), $link);
          }
        }
      );
  }

  /**
   * Status code handling.
   *
   * Stores status code, reason phrase, fail count and check time on the link,
   * logs the outcome, copies the result to links with the same URL hash and
   * finally runs every status handler plugin registered for the status code.
   *
   * @param \Psr\Http\Message\ResponseInterface $response
   *   An object containing the HTTP request headers, response code, headers,
   *   data and redirect status.
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link.
   */
  protected function statusHandling(ResponseInterface $response, LinkCheckerLinkInterface $link) {
    $error = $response->getReasonPhrase() ?? '';
    $statusCode = $response->getStatusCode();

    if ($this->isFragmentMissing($response)) {
      // Override status code 200 with status code 404 so it can be handled
      // with default status code 404 logic and custom error text.
      $statusCode = 404;
      $error = 'URL fragment identifier not found in content';
    }

    switch ($statusCode) {
      case 301:
        $this->saveCheckResult($link, $statusCode, $error);
        linkchecker_watchdog_log('linkchecker', 'Link %link has changed and needs to be updated.', [
          '%link' => $link->getUrl(),
        ], RfcLogLevel::NOTICE, $this->getReportLink());
        break;

      case 404:
        $this->saveCheckResult($link, $statusCode, $error);
        linkchecker_watchdog_log('linkchecker', 'Broken link %link has been found.', [
          '%link' => $link->getUrl(),
        ], RfcLogLevel::NOTICE, $this->getReportLink());
        break;

      case 405:
        // - 405: Special error handling if method is not allowed. Switch link
        //   checking to GET method and try again.
        $link->setRequestMethod('GET');
        $this->saveCheckResult($link, $statusCode, $error);
        linkchecker_watchdog_log('linkchecker', 'Method HEAD is not allowed for link %link. Method has been changed to GET.', [
          '%link' => $link->getUrl(),
        ], RfcLogLevel::NOTICE, $this->getReportLink());
        break;

      case 500:
        // - 500: Like WGET, try with GET on "500 Internal server error".
        // - If GET also fails with status code 500, then the link is broken.
        if ($link->getRequestMethod() == 'GET') {
          $this->saveCheckResult($link, $statusCode, $error);
          linkchecker_watchdog_log('linkchecker', 'Broken link %link has been found.', [
            '%link' => $link->getUrl(),
          ], RfcLogLevel::NOTICE, $this->getReportLink());
        }
        else {
          $link->setRequestMethod('GET');
          $this->saveCheckResult($link, $statusCode, $error);
          linkchecker_watchdog_log('linkchecker', 'Internal server error for link %link. Method has been changed to GET.', [
            '%link' => $link->getUrl(),
          ], RfcLogLevel::NOTICE, $this->getReportLink());
        }
        break;

      default:
        if ($this->isIgnoredStatusCode($statusCode)) {
          // Don't treat ignored response codes as errors: reset the fail
          // count instead of incrementing it.
          $this->saveCheckResult($link, $statusCode, $error, FALSE);
        }
        else {
          $this->saveCheckResult($link, $statusCode, $error);
          linkchecker_watchdog_log('linkchecker', 'Unhandled link error %link has been found.', [
            '%link' => $link->getUrl(),
          ], RfcLogLevel::ERROR, $this->getReportLink());
        }
    }

    $this->updateSameLinks($link);

    // Note that $statusCode may have been overridden to 404 above.
    foreach ($this->statusHandlerManager->getDefinitions() as $definition) {
      if (in_array($statusCode, $definition['status_codes'])) {
        /** @var \Drupal\linkchecker\Plugin\LinkStatusHandlerInterface $handler */
        $handler = $this->statusHandlerManager->createInstance($definition['id']);
        $handler->queueItems($link, $response);
      }
    }
  }

  /**
   * Tells whether a 200 response lacks the anchor named by the URL fragment.
   *
   * Destination anchors in HTML documents may be specified either by:
   * - the A element (naming it with the name attribute)
   * - or by any other element (naming with the id attribute)
   * - and must not contain a key/value pair as these type of hash fragments
   *   are typically used by AJAX applications to prevent additionally HTTP
   *   requests e.g. https://www.example.com/ajax.html#key1=value1&key2=value2
   * - and must not contain '/' or ',' as these are not normal anchors.
   * - and 'top' is a reserved fragment that need not exist in a page.
   * See https://www.w3.org/TR/html401/struct/links.html
   *
   * @param \Psr\Http\Message\ResponseInterface $response
   *   The response; the fragment is read from its 'Fragment' pseudo header.
   *
   * @return bool
   *   TRUE if the fragment should be reported as missing, FALSE otherwise.
   */
  private function isFragmentMissing(ResponseInterface $response) {
    if ($response->getStatusCode() != 200 || !$response->hasHeader('Fragment')) {
      return FALSE;
    }

    // The fragment comes from parse_url() and therefore has no leading '#'.
    $fragment = $response->getHeaderLine('Fragment');
    if (preg_match('/=|\/|,/', $fragment) || strcasecmp($fragment, 'top') === 0) {
      return FALSE;
    }

    // The header usually carries parameters ("text/html; charset=UTF-8"), so
    // only the bare media type is compared.
    $mediaType = strtolower(trim(explode(';', $response->getHeaderLine('Content-Type'), 2)[0]));
    $markupTypes = ['text/html', 'application/xhtml+xml', 'application/xml'];
    if (!in_array($mediaType, $markupTypes, TRUE)) {
      return FALSE;
    }

    // Read the body last: it is only needed when every cheaper test passed.
    $body = (string) $response->getBody();
    if ($body === '') {
      return FALSE;
    }

    $pattern = '/(\s[^>]*(name|id)(\s+)?=(\s+)?["\'])(' . preg_quote(urldecode($fragment), '/') . ')(["\'][^>]*>)/i';
    // Only a definite "no match" (0) counts; FALSE signals a regex error.
    return preg_match($pattern, $body) === 0;
  }

  /**
   * Tells whether a status code is listed in 'error.ignore_response_codes'.
   *
   * @param int $statusCode
   *   The HTTP status code.
   *
   * @return bool
   *   TRUE if the code is one of the configured codes (one per line).
   */
  private function isIgnoredStatusCode($statusCode) {
    $configured = (string) $this->linkcheckerSetting->get('error.ignore_response_codes');
    $codes = preg_split('/(\r\n?|\n)/', $configured) ?: [];
    return in_array((string) $statusCode, array_map('trim', $codes), TRUE);
  }

  /**
   * Stores the outcome of a check on the link and saves it.
   *
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link.
   * @param int|string $statusCode
   *   The status code to store.
   * @param string $error
   *   The error message to store.
   * @param bool $failed
   *   TRUE to increment the fail count, FALSE to reset it to 0.
   */
  private function saveCheckResult(LinkCheckerLinkInterface $link, $statusCode, $error, $failed = TRUE) {
    $link->setStatusCode($statusCode);
    $link->setErrorMessage($error);
    $link->setFailCount($failed ? $link->getFailCount() + 1 : 0);
    $link->setLastCheckTime($this->time->getCurrentTime());
    $link->save();
  }

  /**
   * Exception handling.
   *
   * @param \GuzzleHttp\Exception\RequestException $e
   *   An object containing the Exception.
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link.
   */
  protected function exceptionHandling(RequestException $e, LinkCheckerLinkInterface $link) {
    $this->recordRequestFailure($e->getMessage(), $link);
  }

  /**
   * Records a request that failed without yielding a response.
   *
   * The link is stored with status code '502' and the given message, the
   * failure is logged and links with the same URL hash are updated.
   *
   * @param string $message
   *   The failure message.
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link.
   */
  private function recordRequestFailure($message, LinkCheckerLinkInterface $link) {
    $this->saveCheckResult($link, '502', $message);
    linkchecker_watchdog_log('linkchecker', 'Unhandled link error %link has been found: %message.', [
      '%link' => $link->getUrl(),
      '%message' => $message,
    ], RfcLogLevel::ERROR, $this->getReportLink());
    $this->updateSameLinks($link);
  }

  /**
   * Helper function to create report link.
   *
   * @return \Drupal\Core\Link
   *   Link to '/admin/reports/linkchecker', built once and then reused.
   */
  protected function getReportLink() {
    if (!isset($this->reportLink)) {
      $this->reportLink = Link::fromTextAndUrl($this->t('Broken links'), Url::fromUserInput('/admin/reports/linkchecker'));
    }
    return $this->reportLink;
  }

  /**
   * Helper function to update same links that were found in other entities.
   *
   * Copies request method, status code, error message, fail count and last
   * check time to every other link entity with the same URL hash.
   *
   * @param \Drupal\linkchecker\LinkCheckerLinkInterface $link
   *   The link that has just been checked.
   */
  protected function updateSameLinks(LinkCheckerLinkInterface $link) {
    $hash = $link->getHash();
    // If there is no hash, return early.
    if (is_null($hash)) {
      return;
    }

    $storage = $this->entityTypeManager->getStorage($link->getEntityTypeId());
    $ids = $storage->getQuery()
      ->accessCheck()
      ->condition('urlhash', $hash)
      ->condition('lid', $link->id(), '!=')
      ->execute();
    if (empty($ids)) {
      return;
    }

    // loadMultiple() leaves out entities that no longer exist, whereas load()
    // would return NULL for them and break the setter calls below.
    /** @var \Drupal\linkchecker\LinkCheckerLinkInterface $linkToUpdate */
    foreach ($storage->loadMultiple($ids) as $linkToUpdate) {
      $linkToUpdate->setRequestMethod($link->getRequestMethod());
      $linkToUpdate->setStatusCode($link->getStatusCode());
      $linkToUpdate->setErrorMessage($link->getErrorMessage());
      $linkToUpdate->setFailCount($link->getFailCount());
      $linkToUpdate->setLastCheckTime($link->getLastCheckTime());
      $linkToUpdate->save();
    }
  }

}
