diff --git a/README.md b/README.md index 10adab3b..1fce9132 100644 --- a/README.md +++ b/README.md @@ -288,12 +288,17 @@ default headers could be sent as well). Coroutine functions (`async def`) are su This will be called at least once for each Scrapy request, but it could be called additional times if Playwright generates more requests (e.g. to retrieve assets like images or scripts). -The function must return a `dict` object, and receives the following positional arguments: +The function must return a `Dict[str, str]` object, and receives the following three **keyword** arguments: ```python -- browser_type: str +- browser_type_name: str - playwright_request: playwright.async_api.Request -- scrapy_headers: scrapy.http.headers.Headers +- scrapy_request_data: dict + * method: str + * url: str + * headers: scrapy.http.headers.Headers + * body: Optional[bytes] + * encoding: str ``` The default function (`scrapy_playwright.headers.use_scrapy_headers`) tries to @@ -308,6 +313,38 @@ set by Playwright will be sent. Keep in mind that in this case, headers passed via the `Request.headers` attribute or set by Scrapy components are ignored (including cookies set via the `Request.cookies` attribute). +Example: +```python +async def custom_headers( + *, + browser_type_name: str, + playwright_request: playwright.async_api.Request, + scrapy_request_data: dict, +) -> Dict[str, str]: + headers = await playwright_request.all_headers() + scrapy_headers = scrapy_request_data["headers"].to_unicode_dict() + headers["Cookie"] = scrapy_headers.get("Cookie") + return headers + +PLAYWRIGHT_PROCESS_REQUEST_HEADERS = custom_headers +``` + +#### Deprecated argument handling + +In version 0.0.40 and earlier, arguments were passed to the function positionally, +and only the Scrapy headers were passed instead of a dictionary with data about the +Scrapy request. +This is deprecated since version 0.0.41, and support for this way of handling arguments +will eventually be removed in accordance with the [Deprecation policy](#deprecation-policy). + +Passed arguments: +```python +- browser_type: str +- playwright_request: playwright.async_api.Request +- scrapy_headers: scrapy.http.headers.Headers +``` + +Example: ```python def custom_headers( browser_type: str, diff --git a/scrapy_playwright/handler.py b/scrapy_playwright/handler.py index 94350167..b475d615 100644 --- a/scrapy_playwright/handler.py +++ b/scrapy_playwright/handler.py @@ -1,6 +1,8 @@ import asyncio +import inspect import logging import platform +import warnings from contextlib import suppress from dataclasses import dataclass, field as dataclass_field from ipaddress import ip_address @@ -22,7 +24,7 @@ from scrapy import Spider, signals from scrapy.core.downloader.handlers.http import HTTPDownloadHandler from scrapy.crawler import Crawler -from scrapy.exceptions import NotSupported +from scrapy.exceptions import NotSupported, ScrapyDeprecationWarning from scrapy.http import Request, Response from scrapy.http.headers import Headers from scrapy.responsetypes import responsetypes @@ -698,10 +700,40 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest) if self.process_request_headers is None: final_headers = await playwright_request.all_headers() + elif (sig := inspect.signature(self.process_request_headers)) and ( + "browser_type_name" in sig.parameters + and "playwright_request" in sig.parameters + and "scrapy_request_data" in sig.parameters + ): + overrides["headers"] = final_headers = await _maybe_await( + self.process_request_headers( + browser_type_name=self.config.browser_type_name, + playwright_request=playwright_request, + scrapy_request_data={ + "method": method, + "url": url, + "headers": headers, + "body": body, + "encoding": encoding, + }, + ) + ) else: + warnings.warn( + "Accepting positional arguments in the function passed to the" + " PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting is deprecated. The function" + " should accept three (3) keyword arguments instead:" + " browser_type_name: str," + " playwright_request: playwright.async_api.Request," + " scrapy_request_data: dict", + category=ScrapyDeprecationWarning, + stacklevel=1, + ) overrides["headers"] = final_headers = await _maybe_await( self.process_request_headers( - self.config.browser_type_name, playwright_request, headers + self.config.browser_type_name, + playwright_request, + headers, ) ) diff --git a/scrapy_playwright/headers.py b/scrapy_playwright/headers.py index 39cb42e7..34532868 100644 --- a/scrapy_playwright/headers.py +++ b/scrapy_playwright/headers.py @@ -3,21 +3,22 @@ Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information. """ +from typing import Dict from urllib.parse import urlparse from playwright.async_api import Request as PlaywrightRequest -from scrapy.http.headers import Headers async def use_scrapy_headers( - browser_type: str, + *, + browser_type_name: str, playwright_request: PlaywrightRequest, - scrapy_headers: Headers, -) -> dict: + scrapy_request_data: dict, +) -> Dict[str, str]: """Scrapy headers take precedence over Playwright headers for navigation requests. For non-navigation requests, only User-Agent is taken from the Scrapy headers.""" - scrapy_headers_str = scrapy_headers.to_unicode_dict() + scrapy_headers_str = scrapy_request_data["headers"].to_unicode_dict() playwright_headers = await playwright_request.all_headers() # Scrapy's user agent has priority over Playwright's @@ -29,7 +30,7 @@ async def use_scrapy_headers( scrapy_headers_str.setdefault("referer", referer) # otherwise it fails with playwright.helper.Error: NS_ERROR_NET_RESET - if browser_type == "firefox": + if browser_type_name == "firefox": scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc return scrapy_headers_str diff --git a/tests/tests_asyncio/test_headers.py b/tests/tests_asyncio/test_headers.py index 9956bd7f..c26518b1 100644 --- a/tests/tests_asyncio/test_headers.py +++ b/tests/tests_asyncio/test_headers.py @@ -1,5 +1,7 @@ import json +import logging import platform +import warnings from unittest import IsolatedAsyncioTestCase import pytest @@ -10,6 +12,11 @@ class MixinProcessHeadersTestCase: + @pytest.fixture(autouse=True) + def inject_fixtures(self, caplog): + caplog.set_level(logging.DEBUG) + self._caplog = caplog + @allow_windows async def test_user_agent(self): settings_dict = { @@ -66,10 +73,14 @@ async def test_playwright_headers(self): assert b"asdf" not in req.headers @allow_windows - async def test_use_custom_headers(self): + async def test_use_custom_headers_ok(self): """Custom header processing function""" - async def important_headers(*_args, **_kwargs) -> dict: + async def important_headers( + browser_type_name, # pylint: disable=unused-argument + playwright_request, # pylint: disable=unused-argument + scrapy_request_data, # pylint: disable=unused-argument + ) -> dict: return {"foo": "bar"} settings_dict = { @@ -84,12 +95,51 @@ async def important_headers(*_args, **_kwargs) -> dict: meta={"playwright": True}, headers={"User-Agent": "foobar", "Asdf": "qwerty"}, ) - resp = await handler._download_request(req, Spider("foo")) + with warnings.catch_warnings(record=True) as warning_list: + resp = await handler._download_request(req, Spider("foo")) + assert not warning_list + headers = json.loads(resp.css("pre::text").get()) + headers = {key.lower(): value for key, value in headers.items()} + assert headers["foo"] == "bar" + assert headers.get("user-agent") not in (self.browser_type, "foobar") + assert "asdf" not in headers + + @allow_windows + async def test_use_custom_headers_deprecated_arg_handling(self): + """Custom header processing function that receives deprecated args""" + + async def deprecated_args( + browser_name, pw_req, headers # pylint: disable=unused-argument + ) -> dict: + return {"foo": "bar"} + + settings_dict = { + "PLAYWRIGHT_BROWSER_TYPE": self.browser_type, + "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}}, + "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": deprecated_args, + } + async with make_handler(settings_dict) as handler: + with MockServer() as server: + req = Request( + url=server.urljoin("/headers"), + meta={"playwright": True}, + headers={"User-Agent": "foobar", "Asdf": "qwerty"}, + ) + with warnings.catch_warnings(record=True) as warning_list: + resp = await handler._download_request(req, Spider("foo")) headers = json.loads(resp.css("pre::text").get()) headers = {key.lower(): value for key, value in headers.items()} assert headers["foo"] == "bar" assert headers.get("user-agent") not in (self.browser_type, "foobar") assert "asdf" not in headers + assert str(warning_list[0].message) == ( + "Accepting positional arguments in the function passed to the" + " PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting is deprecated. The function" + " should accept three (3) keyword arguments instead:" + " browser_type_name: str," + " playwright_request: playwright.async_api.Request," + " scrapy_request_data: dict" + ) class TestProcessHeadersChromium(IsolatedAsyncioTestCase, MixinProcessHeadersTestCase):