Skip to content

Commit b576fa8

Browse files
committed
Prevent loading relative reference URI
Trying to load a relative reference URI (one with no scheme and no host/authority, only a path) via the `HttpLoader` now immediately logs an error (or throws an exception when `loadOrFail()` is used) instead of attempting to actually load it.
1 parent 8bb18b6 commit b576fa8

File tree

6 files changed

+111
-15
lines changed

6 files changed

+111
-15
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
## [3.2.4] - 2025-02-25
10+
### Fixed
11+
* Trying to load a relative reference URI (one with no scheme and no host/authority, only a path) via the `HttpLoader` now immediately logs an error (or throws an exception when `loadOrFail()` is used) instead of attempting to actually load it.
12+
913
## [3.2.3] - 2025-01-28
1014
### Fixed
1115
* Fix deprecation warning triggered in the `DomQuery` class, when trying to get the value of an HTML/XML attribute that does not exist on the element.

src/Loader/Http/HttpLoader.php

+20-5
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,10 @@ public function load(mixed $subject): ?RespondedRequest
123123

124124
try {
125125
$request = $this->validateSubjectType($subject);
126-
} catch (InvalidArgumentException) {
127-
$this->logger->error('Invalid input URL: ' . var_export($subject, true));
126+
} catch (InvalidArgumentException|Exception $exception) {
127+
$url = $subject instanceof RequestInterface ? (string) $subject->getUri() : (string) $subject;
128+
129+
$this->logger->error('Invalid input URL: ' . $url . ' - ' . $exception->getMessage());
128130

129131
return null;
130132
}
@@ -164,7 +166,7 @@ public function load(mixed $subject): ?RespondedRequest
164166
}
165167

166168
/**
167-
* @throws LoadingException
169+
* @throws LoadingException|InvalidArgumentException|Exception
168170
*/
169171
public function loadOrFail(mixed $subject): RespondedRequest
170172
{
@@ -592,16 +594,29 @@ protected function shouldRequestBeServedFromCache(RequestInterface $request): bo
592594
}
593595

594596
/**
595-
* @throws InvalidArgumentException
597+
* @throws InvalidArgumentException|Exception
596598
*/
597599
protected function validateSubjectType(RequestInterface|string $requestOrUri): RequestInterface
598600
{
599601
if (is_string($requestOrUri)) {
600602
try {
601-
return new Request('GET', Url::parsePsr7($requestOrUri));
603+
$url = Url::parse($requestOrUri);
604+
605+
if ($url->isRelativeReference()) {
606+
throw new InvalidArgumentException(
607+
'The URI is a relative reference and therefore can\'t be loaded.',
608+
);
609+
}
610+
611+
return new Request('GET', $url->toPsr7());
602612
} catch (InvalidUrlException) {
603613
throw new InvalidArgumentException('Invalid URL.');
604614
}
615+
} elseif (
616+
empty(trim($requestOrUri->getUri()->getScheme())) &&
617+
Url::parse($requestOrUri->getUri())->isRelativeReference()
618+
) {
619+
throw new InvalidArgumentException('The URI is a relative reference and therefore can\'t be loaded.');
605620
}
606621

607622
return $requestOrUri;

tests/Loader/Http/HttpLoaderTest.php

+52-9
Original file line numberDiff line numberDiff line change
@@ -9,29 +9,26 @@
99
use Crwlr\Crawler\Loader\Http\Politeness\Throttler;
1010
use Crwlr\Crawler\Steps\Filters\Filter;
1111
use Crwlr\Crawler\UserAgents\BotUserAgent;
12-
use Crwlr\Crawler\UserAgents\UserAgent;
1312
use Exception;
1413
use GuzzleHttp\Client;
1514
use GuzzleHttp\Psr7\Request;
1615
use GuzzleHttp\Psr7\Response;
16+
use InvalidArgumentException;
1717
use Mockery;
1818
use PHPUnit\Framework\TestCase;
1919
use Psr\Http\Client\ClientInterface;
2020
use Psr\Http\Message\RequestInterface;
2121
use Psr\Http\Message\UriInterface;
2222
use Psr\SimpleCache\CacheInterface;
23+
use tests\_Stubs\DummyLogger;
2324
use tests\_Stubs\RespondedRequestChild;
2425
use Throwable;
2526

2627
use function tests\helper_cachedir;
2728
use function tests\helper_getFastLoader;
29+
use function tests\helper_nonBotUserAgent;
2830
use function tests\helper_resetCacheDir;
2931

30-
function helper_nonBotUserAgent(): UserAgent
31-
{
32-
return new UserAgent('Mozilla/5.0 (compatible; FooBot)');
33-
}
34-
3532
afterEach(function () {
3633
helper_resetCacheDir();
3734
});
@@ -50,6 +47,27 @@ function helper_nonBotUserAgent(): UserAgent
5047
$httpLoader->loadOrFail('https://www.crwlr.software');
5148
});
5249

50+
it('fails and logs an error when invoked with a relative reference URI', function () {
51+
$logger = new DummyLogger();
52+
53+
$httpLoader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger);
54+
55+
$httpLoader->load('/foo');
56+
57+
expect($logger->messages)->not->toBeEmpty()
58+
->and($logger->messages[0]['message'])->toBe(
59+
'Invalid input URL: /foo - The URI is a relative reference and therefore can\'t be loaded.',
60+
);
61+
});
62+
63+
it('fails and throws an exception when loadOrFail() is called with a relative reference URI', function () {
64+
$logger = new DummyLogger();
65+
66+
$httpLoader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger);
67+
68+
$httpLoader->loadOrFail('/foo');
69+
})->throws(InvalidArgumentException::class);
70+
5371
it('accepts RequestInterface as argument to load', function () {
5472
$httpClient = Mockery::mock(ClientInterface::class);
5573

@@ -62,6 +80,31 @@ function helper_nonBotUserAgent(): UserAgent
6280
$httpLoader->loadOrFail(new Request('GET', 'https://www.crwlr.software'));
6381
});
6482

83+
it('fails and logs an error when invoked with a RequestInterface object having a relative reference URI', function () {
84+
$logger = new DummyLogger();
85+
86+
$httpLoader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger);
87+
88+
$httpLoader->load(new Request('GET', '/foo'));
89+
90+
expect($logger->messages)->not->toBeEmpty()
91+
->and($logger->messages[0]['message'])->toBe(
92+
'Invalid input URL: /foo - The URI is a relative reference and therefore can\'t be loaded.',
93+
);
94+
});
95+
96+
it(
97+
'fails and throws an exception when loadOrFail() is called with a RequestInterface object having a relative ' .
98+
'reference URI',
99+
function () {
100+
$logger = new DummyLogger();
101+
102+
$httpLoader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger);
103+
104+
$httpLoader->loadOrFail(new Request('GET', '/foo'));
105+
},
106+
)->throws(InvalidArgumentException::class);
107+
65108
it(
66109
'calls the before and after load hooks regardless whether the response was successful or not',
67110
function ($responseStatusCode) {
@@ -217,15 +260,15 @@ function ($responseStatusCode) {
217260
$httpLoader = new class (new BotUserAgent('Foo'), $httpClient) extends HttpLoader {
218261
public function isAllowedToBeLoaded(UriInterface $uri, bool $throwsException = false): bool
219262
{
220-
return $uri->__toString() === '/foo';
263+
return $uri->__toString() === 'https://www.example.com/foo';
221264
}
222265
};
223266

224-
$response = $httpLoader->load('/foo');
267+
$response = $httpLoader->load('https://www.example.com/foo');
225268

226269
expect($response)->toBeInstanceOf(RespondedRequest::class);
227270

228-
$response = $httpLoader->load('/bar');
271+
$response = $httpLoader->load('https://www.example.com/bar');
229272

230273
expect($response)->toBeNull();
231274
});

tests/Loader/Http/Politeness/RobotsTxtHandlerTest.php

+1-1
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ function () {
124124

125125
$robotsTxt = new RobotsTxtHandler($loader, new CliLogger());
126126

127-
expect($robotsTxt->isAllowed('/anything'))->toBeTrue();
127+
expect($robotsTxt->isAllowed('https://www.example.com/anything'))->toBeTrue();
128128

129129
$logOutput = $this->getActualOutputForAssertion();
130130

tests/Pest.php

+5
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,11 @@ protected function loader(UserAgentInterface $userAgent, LoggerInterface $logger
291291
};
292292
}
293293

294+
function helper_nonBotUserAgent(): UserAgent
295+
{
296+
return new UserAgent('Mozilla/5.0 (compatible; FooBot)');
297+
}
298+
294299
function helper_getMinThrottler(): Throttler
295300
{
296301
return new Throttler(new MultipleOf(0.0001), new MultipleOf(0.0002), Microseconds::fromSeconds(0.0001));

tests/Steps/Loading/HttpTest.php

+29
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,15 @@
1010
use GuzzleHttp\Psr7\Request;
1111
use GuzzleHttp\Psr7\Response;
1212
use GuzzleHttp\Psr7\Utils;
13+
use InvalidArgumentException;
1314
use Mockery;
1415
use Psr\Http\Message\RequestInterface;
1516
use stdClass;
1617
use tests\_Stubs\DummyLogger;
1718

1819
use function tests\helper_getRespondedRequest;
1920
use function tests\helper_invokeStepWithInput;
21+
use function tests\helper_nonBotUserAgent;
2022
use function tests\helper_traverseIterable;
2123

2224
it('can be invoked with a string as input', function () {
@@ -55,6 +57,33 @@
5557
->and($logger->messages[0]['message'])->toEndWith('. The invalid input is of type object.');
5658
});
5759

60+
it('logs an error message when invoked with a relative reference URI', function () {
61+
$logger = new DummyLogger();
62+
63+
$loader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger);
64+
65+
$step = (new Http('GET'))->setLoader($loader)->addLogger($logger);
66+
67+
helper_invokeStepWithInput($step, '/foo/bar');
68+
69+
expect($logger->messages)->not->toBeEmpty()
70+
->and($logger->messages[0]['message'])->toBe(
71+
'Invalid input URL: /foo/bar - The URI is a relative reference and therefore can\'t be loaded.',
72+
);
73+
});
74+
75+
it('throws an exception when invoked with a relative reference URI and stopOnErrorResponse() was called', function () {
76+
$logger = new DummyLogger();
77+
78+
$loader = new HttpLoader(helper_nonBotUserAgent(), logger: $logger);
79+
80+
$step = (new Http('GET'))->setLoader($loader)->addLogger($logger);
81+
82+
$step->stopOnErrorResponse();
83+
84+
helper_invokeStepWithInput($step, '/foo/bar');
85+
})->throws(InvalidArgumentException::class);
86+
5887
test('You can set the request method via constructor', function (string $httpMethod) {
5988
$loader = Mockery::mock(HttpLoader::class);
6089

0 commit comments

Comments
 (0)