Skip to content

Commit

Permalink
Keyword arguments for PLAYWRIGHT_PROCESS_REQUEST_HEADERS, pass additional Request fields (#303)
Browse files Browse the repository at this point in the history

* Keyword arguments for PLAYWRIGHT_PROCESS_REQUEST_HEADERS

* Update docs for PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting

* Update docs for PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting

* Update tests for PLAYWRIGHT_PROCESS_REQUEST_HEADERS

* Add comma

* Update version in readme
  • Loading branch information
elacuesta authored Jul 18, 2024
1 parent 5b8cfd7 commit 84ba393
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 14 deletions.
43 changes: 40 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,12 +288,17 @@ default headers could be sent as well). Coroutine functions (`async def`) are supported.
This will be called at least once for each Scrapy request, but it could be called additional times
if Playwright generates more requests (e.g. to retrieve assets like images or scripts).

The function must return a `dict` object, and receives the following positional arguments:
The function must return a `Dict[str, str]` object, and receives the following three **keyword** arguments:

```python
- browser_type: str
- browser_type_name: str
- playwright_request: playwright.async_api.Request
- scrapy_headers: scrapy.http.headers.Headers
- scrapy_request_data: dict
* method: str
* url: str
* headers: scrapy.http.headers.Headers
* body: Optional[bytes]
* encoding: str
```

The default function (`scrapy_playwright.headers.use_scrapy_headers`) tries to
Expand All @@ -308,6 +313,38 @@ set by Playwright will be sent. Keep in mind that in this case, headers passed
via the `Request.headers` attribute or set by Scrapy components are ignored
(including cookies set via the `Request.cookies` attribute).

Example:
```python
async def custom_headers(
*,
browser_type_name: str,
playwright_request: playwright.async_api.Request,
scrapy_request_data: dict,
) -> Dict[str, str]:
headers = await playwright_request.all_headers()
scrapy_headers = scrapy_request_data["headers"].to_unicode_dict()
headers["Cookie"] = scrapy_headers.get("Cookie")
return headers

PLAYWRIGHT_PROCESS_REQUEST_HEADERS = custom_headers
```

#### Deprecated argument handling

In version 0.0.40 and earlier, arguments were passed to the function positionally,
and only the Scrapy headers were passed instead of a dictionary with data about the
Scrapy request.
This is deprecated since version 0.0.41, and support for this way of handling arguments
will eventually be removed in accordance with the [Deprecation policy](#deprecation-policy).

Passed arguments:
```python
- browser_type: str
- playwright_request: playwright.async_api.Request
- scrapy_headers: scrapy.http.headers.Headers
```

Example:
```python
def custom_headers(
browser_type: str,
Expand Down
36 changes: 34 additions & 2 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import asyncio
import inspect
import logging
import platform
import warnings
from contextlib import suppress
from dataclasses import dataclass, field as dataclass_field
from ipaddress import ip_address
Expand All @@ -22,7 +24,7 @@
from scrapy import Spider, signals
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.crawler import Crawler
from scrapy.exceptions import NotSupported
from scrapy.exceptions import NotSupported, ScrapyDeprecationWarning
from scrapy.http import Request, Response
from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes
Expand Down Expand Up @@ -698,10 +700,40 @@ async def _request_handler(route: Route, playwright_request: PlaywrightRequest)

if self.process_request_headers is None:
final_headers = await playwright_request.all_headers()
elif (sig := inspect.signature(self.process_request_headers)) and (
"browser_type_name" in sig.parameters
and "playwright_request" in sig.parameters
and "scrapy_request_data" in sig.parameters
):
overrides["headers"] = final_headers = await _maybe_await(
self.process_request_headers(
browser_type_name=self.config.browser_type_name,
playwright_request=playwright_request,
scrapy_request_data={
"method": method,
"url": url,
"headers": headers,
"body": body,
"encoding": encoding,
},
)
)
else:
warnings.warn(
"Accepting positional arguments in the function passed to the"
" PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting is deprecated. The function"
" should accept three (3) keyword arguments instead:"
" browser_type_name: str,"
" playwright_request: playwright.async_api.Request,"
" scrapy_request_data: dict",
category=ScrapyDeprecationWarning,
stacklevel=1,
)
overrides["headers"] = final_headers = await _maybe_await(
self.process_request_headers(
self.config.browser_type_name, playwright_request, headers
self.config.browser_type_name,
playwright_request,
headers,
)
)

Expand Down
13 changes: 7 additions & 6 deletions scrapy_playwright/headers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,22 @@
Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
"""

from typing import Dict
from urllib.parse import urlparse

from playwright.async_api import Request as PlaywrightRequest
from scrapy.http.headers import Headers


async def use_scrapy_headers(
browser_type: str,
*,
browser_type_name: str,
playwright_request: PlaywrightRequest,
scrapy_headers: Headers,
) -> dict:
scrapy_request_data: dict,
) -> Dict[str, str]:
"""Scrapy headers take precedence over Playwright headers for navigation requests.
For non-navigation requests, only User-Agent is taken from the Scrapy headers."""

scrapy_headers_str = scrapy_headers.to_unicode_dict()
scrapy_headers_str = scrapy_request_data["headers"].to_unicode_dict()
playwright_headers = await playwright_request.all_headers()

# Scrapy's user agent has priority over Playwright's
Expand All @@ -29,7 +30,7 @@ async def use_scrapy_headers(
scrapy_headers_str.setdefault("referer", referer)

# otherwise it fails with playwright.helper.Error: NS_ERROR_NET_RESET
if browser_type == "firefox":
if browser_type_name == "firefox":
scrapy_headers_str["host"] = urlparse(playwright_request.url).netloc

return scrapy_headers_str
Expand Down
56 changes: 53 additions & 3 deletions tests/tests_asyncio/test_headers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import logging
import platform
import warnings
from unittest import IsolatedAsyncioTestCase

import pytest
Expand All @@ -10,6 +12,11 @@


class MixinProcessHeadersTestCase:
@pytest.fixture(autouse=True)
def inject_fixtures(self, caplog):
caplog.set_level(logging.DEBUG)
self._caplog = caplog

@allow_windows
async def test_user_agent(self):
settings_dict = {
Expand Down Expand Up @@ -66,10 +73,14 @@ async def test_playwright_headers(self):
assert b"asdf" not in req.headers

@allow_windows
async def test_use_custom_headers(self):
async def test_use_custom_headers_ok(self):
"""Custom header processing function"""

async def important_headers(*_args, **_kwargs) -> dict:
async def important_headers(
browser_type_name, # pylint: disable=unused-argument
playwright_request, # pylint: disable=unused-argument
scrapy_request_data, # pylint: disable=unused-argument
) -> dict:
return {"foo": "bar"}

settings_dict = {
Expand All @@ -84,12 +95,51 @@ async def important_headers(*_args, **_kwargs) -> dict:
meta={"playwright": True},
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
)
resp = await handler._download_request(req, Spider("foo"))
with warnings.catch_warnings(record=True) as warning_list:
resp = await handler._download_request(req, Spider("foo"))
assert not warning_list
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["foo"] == "bar"
assert headers.get("user-agent") not in (self.browser_type, "foobar")
assert "asdf" not in headers

@allow_windows
async def test_use_custom_headers_deprecated_arg_handling(self):
"""Custom header processing function that receives deprecated args"""

async def deprecated_args(
browser_name, pw_req, headers # pylint: disable=unused-argument
) -> dict:
return {"foo": "bar"}

settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
"PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
"PLAYWRIGHT_PROCESS_REQUEST_HEADERS": deprecated_args,
}
async with make_handler(settings_dict) as handler:
with MockServer() as server:
req = Request(
url=server.urljoin("/headers"),
meta={"playwright": True},
headers={"User-Agent": "foobar", "Asdf": "qwerty"},
)
with warnings.catch_warnings(record=True) as warning_list:
resp = await handler._download_request(req, Spider("foo"))
headers = json.loads(resp.css("pre::text").get())
headers = {key.lower(): value for key, value in headers.items()}
assert headers["foo"] == "bar"
assert headers.get("user-agent") not in (self.browser_type, "foobar")
assert "asdf" not in headers
assert str(warning_list[0].message) == (
"Accepting positional arguments in the function passed to the"
" PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting is deprecated. The function"
" should accept three (3) keyword arguments instead:"
" browser_type_name: str,"
" playwright_request: playwright.async_api.Request,"
" scrapy_request_data: dict"
)


class TestProcessHeadersChromium(IsolatedAsyncioTestCase, MixinProcessHeadersTestCase):
Expand Down

0 comments on commit 84ba393

Please sign in to comment.