Merge remote-tracking branch 'origin/main' into windows
elacuesta committed Jun 14, 2024
2 parents a6b9f6e + 62ddc0c commit 6e178cc
Showing 17 changed files with 276 additions and 67 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.0.33
current_version = 0.0.35
commit = True
tag = True

27 changes: 18 additions & 9 deletions .github/workflows/tests.yml
@@ -8,8 +8,13 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.8", "3.9", "3.10", "3.11"]
os: [ubuntu-latest]
python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
include:
- os: macos-latest
python-version: "3.12"
- os: windows-latest
python-version: "3.12"

steps:
- uses: actions/checkout@v4
@@ -28,16 +33,20 @@ jobs:
- name: Run twisted tests
run: tox -e py-twisted

- name: Upload coverage report
if: runner.os != 'Windows'
- name: Upload coverage report (Linux)
if: runner.os == 'Linux'
run: |
curl -Os https://uploader.codecov.io/latest/linux/codecov
chmod +x codecov
./codecov
- name: Upload coverage report (macOS)
if: runner.os == 'macOS'
run: |
if [ "${{ runner.os }}" = "Linux" ]; then
curl -Os https://uploader.codecov.io/latest/linux/codecov
else
curl -Os https://uploader.codecov.io/latest/macos/codecov
fi
curl -Os https://uploader.codecov.io/latest/macos/codecov
chmod +x codecov
./codecov
- name: Upload coverage report
if: runner.os == 'Windows'
run: |
88 changes: 88 additions & 0 deletions README.md
@@ -826,6 +826,32 @@ for a list of the accepted events and the arguments passed to their handlers.
images, scripts, stylesheets, etc. are not seen by Scrapy.


## Memory usage extension

The default Scrapy memory usage extension
(`scrapy.extensions.memusage.MemoryUsage`) does not include the memory used by
Playwright, because the browser is launched as a separate process. The
scrapy-playwright package provides a replacement extension that also takes the
memory used by Playwright into account. This extension requires the
[`psutil`](https://pypi.org/project/psutil/) package.

Update the [EXTENSIONS](https://docs.scrapy.org/en/latest/topics/settings.html#std-setting-EXTENSIONS)
setting to disable the built-in Scrapy extension and replace it with the one
from the scrapy-playwright package:

```python
# settings.py
EXTENSIONS = {
"scrapy.extensions.memusage.MemoryUsage": None,
"scrapy_playwright.memusage.ScrapyPlaywrightMemoryUsageExtension": 0,
}
```

Refer to the
[upstream docs](https://docs.scrapy.org/en/latest/topics/extensions.html#module-scrapy.extensions.memusage)
for more information about supported settings.
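Since the replacement extension inherits from the built-in one, the standard `MEMUSAGE_*` settings from the upstream docs apply unchanged. A sketch of a possible configuration (the limit values below are purely illustrative, not recommendations):

```python
# settings.py (values are illustrative)
MEMUSAGE_ENABLED = True       # enabled by default; shown here for clarity
MEMUSAGE_WARNING_MB = 1536    # log a warning when total memory exceeds 1.5 GiB
MEMUSAGE_LIMIT_MB = 2048      # close the spider when total memory exceeds 2 GiB
```

With these settings, the memory compared against the thresholds is the sum reported by `get_virtual_size`, which now includes the Playwright browser processes.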


## Examples

**Click on a link, save the resulting page as PDF**
@@ -956,6 +982,68 @@ async def main():
asyncio.run(main())
```

### Software versions

Be sure to include which versions of Scrapy and scrapy-playwright you are using:

```
$ python -c "import scrapy_playwright; print(scrapy_playwright.__version__)"
0.0.34
```

```
$ scrapy version -v
Scrapy : 2.11.1
lxml : 5.1.0.0
libxml2 : 2.12.3
cssselect : 1.2.0
parsel : 1.8.1
w3lib : 2.1.2
Twisted : 23.10.0
Python : 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
pyOpenSSL : 24.0.0 (OpenSSL 3.2.1 30 Jan 2024)
cryptography : 42.0.5
Platform : Linux-6.5.0-35-generic-x86_64-with-glibc2.35
```

### Reproducible code example

When opening an issue please include a
[Minimal, Reproducible Example](https://stackoverflow.com/help/minimal-reproducible-example)
that shows the reported behavior. In addition, please make the code as self-contained as possible
so an active Scrapy project is not required and the spider can be executed directly from a file with
[`scrapy runspider`](https://docs.scrapy.org/en/latest/topics/commands.html#std-command-runspider).
This usually means including the relevant settings in the spider's
[`custom_settings`](https://docs.scrapy.org/en/latest/topics/settings.html#settings-per-spider)
attribute:

```python
import scrapy

class ExampleSpider(scrapy.Spider):
name = "example"
custom_settings = {
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"DOWNLOAD_HANDLERS": {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
}

def start_requests(self):
yield scrapy.Request(
url="https://example.org",
meta={"playwright": True},
)
```

### Logs and stats

Complete logs for spider jobs that exhibit the issue are extremely useful
for understanding possible bugs. Include the lines before and after the
problem, not just isolated tracebacks. The job stats displayed at the end
of the job are also important.


## Frequently Asked Questions

6 changes: 6 additions & 0 deletions docs/changelog.md
@@ -1,5 +1,11 @@
# scrapy-playwright changelog

### [v0.0.34](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.34) (2024-01-01)

* Update dev status classifier to 4 - beta
* Official Python 3.12 support (#254)
* Custom memusage extension (#257)


### [v0.0.33](https://github.com/scrapy-plugins/scrapy-playwright/releases/tag/v0.0.33) (2023-10-19)

2 changes: 1 addition & 1 deletion scrapy_playwright/__init__.py
@@ -1 +1 @@
__version__ = "0.0.33"
__version__ = "0.0.35"
2 changes: 1 addition & 1 deletion scrapy_playwright/_utils.py
@@ -64,7 +64,7 @@ async def _get_page_content(
try:
return await page.content()
except Error as err:
if err.message == _NAVIGATION_ERROR_MSG:
if _NAVIGATION_ERROR_MSG in err.message:
logger.debug(
"Retrying to get content from page '%s', error: '%s'",
page.url,
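The `_utils.py` hunk above relaxes an exact-equality check on the Playwright error message to a substring test, so it still matches when the driver wraps the message with extra context. A toy illustration (the message text and prefix below are hypothetical stand-ins, not the actual strings used by Playwright):

```python
# Hypothetical stand-ins for the real constant and error message:
NAVIGATION_ERROR_MSG = "Unable to retrieve content because the page is navigating"
raised = f"Page.content: {NAVIGATION_ERROR_MSG}"  # driver may prefix the API name

assert raised != NAVIGATION_ERROR_MSG   # old check: exact equality misses it
assert NAVIGATION_ERROR_MSG in raised   # new check: substring still matches
```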
10 changes: 8 additions & 2 deletions scrapy_playwright/handler.py
@@ -13,6 +13,7 @@
Download,
Error as PlaywrightError,
Page,
Playwright as AsyncPlaywright,
PlaywrightContextManager,
Request as PlaywrightRequest,
Response as PlaywrightResponse,
@@ -129,6 +130,9 @@ def from_settings(cls, settings: Settings) -> "Config":


class ScrapyPlaywrightDownloadHandler(HTTPDownloadHandler):
playwright_context_manager: Optional[PlaywrightContextManager] = None
playwright: Optional[AsyncPlaywright] = None

def __init__(self, crawler: Crawler) -> None:
super().__init__(settings=crawler.settings, crawler=crawler)
verify_installed_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
@@ -328,8 +332,10 @@ async def _close(self) -> None:
if hasattr(self, "browser"):
logger.info("Closing browser")
await self.browser.close()
await self.playwright_context_manager.__aexit__()
await self.playwright.stop()
if self.playwright_context_manager:
await self.playwright_context_manager.__aexit__()
if self.playwright:
await self.playwright.stop()

def download_request(self, request: Request, spider: Spider) -> Deferred:
if request.meta.get("playwright"):
1 change: 1 addition & 0 deletions scrapy_playwright/headers.py
@@ -2,6 +2,7 @@
This module includes functions to process request headers.
Refer to the PLAYWRIGHT_PROCESS_REQUEST_HEADERS setting for more information.
"""

from urllib.parse import urlparse

from playwright.async_api import Request as PlaywrightRequest
56 changes: 56 additions & 0 deletions scrapy_playwright/memusage.py
@@ -0,0 +1,56 @@
from contextlib import suppress
from importlib import import_module
from typing import List

from scrapy.exceptions import NotConfigured
from scrapy.extensions.memusage import MemoryUsage

from scrapy_playwright.handler import ScrapyPlaywrightDownloadHandler, logger


_MIB_FACTOR = 1024**2


class ScrapyPlaywrightMemoryUsageExtension(MemoryUsage):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
try:
self.psutil = import_module("psutil")
except ImportError as exc:
raise NotConfigured("The psutil module is not available") from exc

def _get_main_process_ids(self) -> List[int]:
try:
return [
handler.playwright_context_manager._connection._transport._proc.pid
for handler in self.crawler.engine.downloader.handlers._handlers.values()
if isinstance(handler, ScrapyPlaywrightDownloadHandler)
and handler.playwright_context_manager
]
except Exception:
return []

def _get_descendant_processes(self, process) -> list:
children = process.children()
result = children.copy()
for child in children:
result.extend(self._get_descendant_processes(child))
return result

def _get_total_playwright_process_memory(self) -> int:
process_list = [self.psutil.Process(pid) for pid in self._get_main_process_ids()]
for proc in process_list.copy():
process_list.extend(self._get_descendant_processes(proc))
total_process_size = 0
for proc in process_list:
with suppress(Exception): # might fail if the process exited in the meantime
total_process_size += proc.memory_info().rss
logger.debug(
"Total Playwright process memory: %i Bytes (%i MiB)",
total_process_size,
total_process_size / _MIB_FACTOR,
)
return total_process_size

def get_virtual_size(self) -> int:
return super().get_virtual_size() + self._get_total_playwright_process_memory()
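The recursive descendant walk used by `_get_total_playwright_process_memory` can be sketched on a toy process tree (the `FakeProcess` class, tree shape, and RSS numbers below are made up for illustration; the real extension uses `psutil.Process` objects):

```python
# Toy stand-in for psutil: each node knows its children and its RSS in bytes.
class FakeProcess:
    def __init__(self, rss, children=()):
        self.rss = rss
        self._children = list(children)

    def children(self):
        return self._children


def get_descendant_processes(process):
    # Same shape as the extension's helper: direct children plus their descendants.
    children = process.children()
    result = children.copy()
    for child in children:
        result.extend(get_descendant_processes(child))
    return result


# browser -> (renderer -> worker, gpu)
worker = FakeProcess(10)
renderer = FakeProcess(100, [worker])
gpu = FakeProcess(50)
browser = FakeProcess(200, [renderer, gpu])

procs = [browser] + get_descendant_processes(browser)
total = sum(p.rss for p in procs)
assert total == 360  # 200 + 100 + 50 + 10
```

Summing over the whole tree matters because Chromium-style browsers spawn one process per renderer, GPU, and utility role, so the main browser PID alone understates real usage.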
3 changes: 2 additions & 1 deletion setup.py
@@ -19,13 +19,14 @@
url="https://github.com/scrapy-plugins/scrapy-playwright",
packages=["scrapy_playwright"],
classifiers=[
"Development Status :: 3 - Alpha",
"Development Status :: 4 - Beta",
"License :: OSI Approved :: BSD License",
"Programming Language :: Python",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Framework :: Scrapy",
"Intended Audience :: Developers",
"Topic :: Internet :: WWW/HTTP",
7 changes: 0 additions & 7 deletions tests/tests_asyncio/test_browser_contexts.py
@@ -15,7 +15,6 @@


class MixinTestCaseMultipleContexts:
@pytest.mark.asyncio
async def test_context_kwargs(self):
settings_dict = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
@@ -38,7 +37,6 @@ async def test_context_kwargs(self):
with pytest.raises(PlaywrightTimeoutError):
await handler._download_request(req, Spider("foo"))

@pytest.mark.asyncio
async def test_contexts_max_pages(self):
settings = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
@@ -73,7 +71,6 @@ async def test_contexts_max_pages(self):

assert handler.stats.get_value("playwright/page_count/max_concurrent") == 4

@pytest.mark.asyncio
async def test_max_contexts(self):
def cb_close_context(task):
response = task.result()
@@ -108,7 +105,6 @@ def cb_close_context(task):

assert handler.stats.get_value("playwright/context_count/max_concurrent") == 4

@pytest.mark.asyncio
async def test_contexts_startup(self):
settings = {
"PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
@@ -147,7 +143,6 @@ async def test_contexts_startup(self):
assert cookie["value"] == "bar"
assert cookie["domain"] == "example.org"

@pytest.mark.asyncio
async def test_persistent_context(self):
temp_dir = f"{tempfile.gettempdir()}/{uuid4()}"
settings = {
@@ -166,7 +161,6 @@
assert handler.context_wrappers["persistent"].persistent
assert not hasattr(handler, "browser")

@pytest.mark.asyncio
async def test_mixed_persistent_contexts(self):
temp_dir = f"{tempfile.gettempdir()}/{uuid4()}"
settings = {
@@ -189,7 +183,6 @@ async def test_mixed_persistent_contexts(self):
assert not handler.context_wrappers["non-persistent"].persistent
assert isinstance(handler.browser, Browser)

@pytest.mark.asyncio
async def test_contexts_dynamic(self):
async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
assert len(handler.context_wrappers) == 0
