From 3e12067077755f2d953bc37914fd1395b4c2acdd Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sat, 10 Apr 2021 15:06:47 -0300 Subject: [PATCH 01/14] Retry: settings and docs --- README.md | 11 +++++++++++ crawlera_fetch/middleware.py | 26 +++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3af17cc..8460106 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,17 @@ Crawlera middleware won't be able to handle them. Default values to be sent to the Crawlera Fetch API. For instance, set to `{"device": "mobile"}` to render all requests with a mobile profile. +* `CRAWLERA_FETCH_SHOULD_RETRY` (type `Optional[Union[Callable, str]]`, default `None`) + A callable returning a boolean that determines whether a request should be retried by the middleware. + If the setting value is a `str`, an attribute by that name will be looked up on the spider + object doing the crawl. The callable should accept the following arguments: + `response: scrapy.http.response.Response, request: scrapy.http.request.Request, spider: scrapy.spiders.Spider`. + If the return value evaluates to `True`, the request will be retried by the middleware. + +* `CRAWLERA_FETCH_RETRY_TIMES` (type `Optional[int]`, default `None`) + The maximum number of times a request should be retried. + If `None`, the value is taken from the `RETRY_TIMES` setting. + ### Spider attributes * `crawlera_fetch_enabled` (type `bool`, default `False`). Whether or not the middleware will be enabled. diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index b2a2467..7b7219c 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -5,7 +5,7 @@ import os import time from enum import Enum -from typing import Optional, Type, TypeVar +from typing import Callable, Optional, Type, TypeVar import scrapy from scrapy.crawler import Crawler @@ -74,11 +74,31 @@ def _read_settings(self, spider: Spider) -> None: self.download_slot_policy = settings.get( "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain ) - self.raise_on_error = settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR", True) - self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) + self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY") + if isinstance(self.should_retry, str): + try: + self.should_retry = getattr(spider, self.should_retry) + except AttributeError: + logger.info( + "Could not find a callable named '%s' on the spider - retries are disabled", + self.should_retry, + ) + self.should_retry = None + elif not isinstance(self.should_retry, Callable): # type: ignore + logger.info( + "Invalid type for retry function: expected Callable" + " or str, got %s - retries are disabled", + type(self.should_retry), + ) + self.should_retry = None + if self.should_retry is not None: + self.retry_times = settings.getint("CRAWLERA_FETCH_RETRY_TIMES") + if not self.retry_times: + self.retry_times = settings.getint("RETRY_TIMES") + def spider_opened(self, spider): try: spider_attr = getattr(spider, "crawlera_fetch_enabled") From c3543a93801179f1a3f9958101bd4a2963a40656 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sat, 10 Apr 2021 15:36:57 -0300 Subject: [PATCH 02/14] Import get_retry_request, update settings format in readme --- README.md | 32 ++++++++++++++++++++++--------- crawlera_fetch/middleware.py | 37 ++++++++++++++++++++++-------------- 2 files changed, 46 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 8460106..9815c3b 100644 --- a/README.md +++
b/README.md @@ -43,32 +43,43 @@ Crawlera middleware won't be able to handle them. ### Settings -* `CRAWLERA_FETCH_ENABLED` (type `bool`, default `False`). Whether or not the middleware will be enabled, - i.e. requests should be downloaded using the Crawlera Fetch API. The `crawlera_fetch_enabled` spider - attribute takes precedence over this setting. +* `CRAWLERA_FETCH_ENABLED` (type `bool`, default `False`) -* `CRAWLERA_FETCH_APIKEY` (type `str`). API key to be used to authenticate against the Crawlera endpoint - (mandatory if enabled) + Whether or not the middleware will be enabled, + i.e. requests should be downloaded using the Crawlera Fetch API. The `crawlera_fetch_enabled` + spider attribute takes precedence over this setting. + +* `CRAWLERA_FETCH_APIKEY` (type `str`) + + API key to be used to authenticate against the Crawlera endpoint (mandatory if enabled) + +* `CRAWLERA_FETCH_URL` (type `str`, default `"http://fetch.crawlera.com:8010/fetch/v2/"`) -* `CRAWLERA_FETCH_URL` (type `str`, default `"http://fetch.crawlera.com:8010/fetch/v2/"`). The endpoint of a specific Crawlera instance -* `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`). Whether or not the middleware will +* `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`) + + Whether or not the middleware will raise an exception if an error occurs while downloading or decoding a response. If `False`, a warning will be logged and the raw upstream response will be returned upon encountering an error. * `CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY` (type `enum.Enum` - `crawlera_fetch.DownloadSlotPolicy`, - default `DownloadSlotPolicy.Domain`). + default `DownloadSlotPolicy.Domain`) + Possible values are `DownloadSlotPolicy.Domain`, `DownloadSlotPolicy.Single`, `DownloadSlotPolicy.Default` (Scrapy default). If set to `DownloadSlotPolicy.Domain`, please consider setting `SCHEDULER_PRIORITY_QUEUE="scrapy.pqueues.DownloaderAwarePriorityQueue"` to make better use of concurrency options and avoid delays. * `CRAWLERA_FETCH_DEFAULT_ARGS` (type `dict`, default `{}`) + Default values to be sent to the Crawlera Fetch API. For instance, set to `{"device": "mobile"}` to render all requests with a mobile profile. * `CRAWLERA_FETCH_SHOULD_RETRY` (type `Optional[Union[Callable, str]]`, default `None`) + + **_Requires Scrapy>=2.5_** + A callable returning a boolean that determines whether a request should be retried by the middleware. If the setting value is a `str`, an attribute by that name will be looked up on the spider object doing the crawl. The callable should accept the following arguments: @@ -76,12 +87,15 @@ Crawlera middleware won't be able to handle them. If the return value evaluates to `True`, the request will be retried by the middleware. * `CRAWLERA_FETCH_RETRY_TIMES` (type `Optional[int]`, default `None`) + The maximum number of times a request should be retried. If `None`, the value is taken from the `RETRY_TIMES` setting. ### Spider attributes -* `crawlera_fetch_enabled` (type `bool`, default `False`). Whether or not the middleware will be enabled. +* `crawlera_fetch_enabled` (type `bool`, default `False`) + + Whether or not the middleware will be enabled. Takes precedence over the `CRAWLERA_FETCH_ENABLED` setting.
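For reference, here is a minimal sketch of how the settings above fit together; the middleware priority, spider name, and retry heuristic are illustrative assumptions, not part of this patch series:

```python
# settings.py - hypothetical project configuration; everything except the
# documented CRAWLERA_FETCH_* settings is made up for this example
DOWNLOADER_MIDDLEWARES = {
    "crawlera_fetch.CrawleraFetchMiddleware": 585,  # assumed priority slot
}
CRAWLERA_FETCH_ENABLED = True
CRAWLERA_FETCH_APIKEY = "<your-apikey>"
CRAWLERA_FETCH_SHOULD_RETRY = "should_retry"  # looked up as a spider attribute
CRAWLERA_FETCH_RETRY_TIMES = 5  # if unset, the standard RETRY_TIMES is used


# myspider.py - the attribute named above is resolved on the spider object
import scrapy


class MySpider(scrapy.Spider):
    name = "example"

    def should_retry(self, response, request, spider):
        # made-up heuristic: retry bans that come back as ordinary 200 pages
        return b"Access denied" in response.body
```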
### Log formatter diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 7b7219c..58245d1 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -18,6 +18,11 @@ from scrapy.utils.reqser import request_from_dict, request_to_dict from w3lib.http import basic_auth_header +try: + from scrapy.downloadermiddlewares.retry import get_retry_request +except ImportError: + get_retry_request = None + logger = logging.getLogger("crawlera-fetch-middleware") @@ -78,22 +83,26 @@ def _read_settings(self, spider: Spider) -> None: self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY") - if isinstance(self.should_retry, str): - try: - self.should_retry = getattr(spider, self.should_retry) - except AttributeError: - logger.info( - "Could not find a callable named '%s' on the spider - retries are disabled", - self.should_retry, + if self.should_retry is not None: + if get_retry_request is None: + logger.warning("Retry feature disabled, Scrapy>=2.5 required") + self.should_retry = None + elif isinstance(self.should_retry, str): + try: + self.should_retry = getattr(spider, self.should_retry) + except AttributeError: + logger.warning( + "Could not find a '%s' callable on the spider - retries are disabled", + self.should_retry, + ) + self.should_retry = None + elif not isinstance(self.should_retry, Callable): # type: ignore + logger.warning( + "Invalid type for retry function: expected Callable" + " or str, got %s - retries are disabled", + type(self.should_retry), ) self.should_retry = None - elif not isinstance(self.should_retry, Callable): # type: ignore - logger.info( - "Invalid type for retry function: expected Callable" - " or str, got %s - retries are disabled", - type(self.should_retry), - ) - self.should_retry = None if self.should_retry is not None: self.retry_times = settings.getint("CRAWLERA_FETCH_RETRY_TIMES") if not self.retry_times: From dba07897517062cd0b9fafa444d48fa6ad4eaad4 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 12 Apr 2021 03:01:22 -0300 Subject: [PATCH 03/14] Deprecate CRAWLERA_FETCH_RAISE_ON_ERROR, add CRAWLERA_FETCH_ON_ERROR --- README.md | 11 +++++ crawlera_fetch/middleware.py | 81 +++++++++++++++++++++++++++++++----- 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 292f2dc..8bcccc1 100644 --- a/README.md +++ b/README.md @@ -57,8 +57,19 @@ Crawlera middleware won't be able to handle them. The endpoint of a specific Crawlera instance +* `CRAWLERA_FETCH_ON_ERROR` (type `enum.Enum` - `crawlera_fetch.OnError`, + default `OnError.Raise`) + + What to do if an error occurs while downloading or decoding a response. Possible values are: + * `OnError.Raise` (raise a `crawlera_fetch.CrawleraFetchException` exception) + * `OnError.Warn` (log a warning and return the raw upstream response) + * `OnError.Retry` (retry the failed request, up to `CRAWLERA_FETCH_RETRY_TIMES` times - + Requires Scrapy>=2.5) + * `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`) + **_Deprecated, please use `CRAWLERA_FETCH_ON_ERROR`_** + Whether or not the middleware will raise an exception if an error occurs while downloading or decoding a response. If `False`, a warning will be logged and the raw upstream response will be returned upon encountering an error. 
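For reference, a minimal sketch of opting into the new error-handling behaviour, assuming `OnError` is importable as `crawlera_fetch.OnError` as documented above:

```python
# settings.py - illustrative only
from crawlera_fetch import OnError

# supersedes the deprecated boolean CRAWLERA_FETCH_RAISE_ON_ERROR
CRAWLERA_FETCH_ON_ERROR = OnError.Retry
CRAWLERA_FETCH_RETRY_TIMES = 3  # per the docs, falls back to RETRY_TIMES if None
```

When both settings are present, `CRAWLERA_FETCH_ON_ERROR` takes precedence: `_read_settings` evaluates the deprecated boolean first and then overwrites the chosen action if the new setting is also set.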
diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 58245d1..37222c5 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -4,11 +4,13 @@ import logging import os import time +import warnings from enum import Enum -from typing import Callable, Optional, Type, TypeVar +from typing import Callable, Optional, Type, TypeVar, Union import scrapy from scrapy.crawler import Crawler +from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http.request import Request from scrapy.http.response import Response from scrapy.responsetypes import responsetypes @@ -39,6 +41,12 @@ class DownloadSlotPolicy(Enum): Default = "default" +class OnError(Enum): + Warn = "warn" + Raise = "raise" + Retry = "retry" + + class CrawleraFetchException(Exception): pass @@ -79,9 +87,30 @@ def _read_settings(self, spider: Spider) -> None: self.download_slot_policy = settings.get( "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain ) - self.raise_on_error = settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR", True) self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) + # what to do when an error happens? + self.on_error_action: Optional[OnError] = None + if "CRAWLERA_FETCH_RAISE_ON_ERROR" in settings: + warnings.warn( + "CRAWLERA_FETCH_RAISE_ON_ERROR is deprecated, " + "please use CRAWLERA_FETCH_ON_ERROR instead", + category=ScrapyDeprecationWarning, + stacklevel=2, + ) + if settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR"): + self.on_error_action = OnError.Raise + else: + self.on_error_action = OnError.Warn + if "CRAWLERA_FETCH_ON_ERROR" in settings: + self.on_error_action = settings.get("CRAWLERA_FETCH_ON_ERROR") + if self.on_error_action == OnError.Retry and get_retry_request is None: + logger.warning("Cannot retry on error, Scrapy>=2.5 required") + self.on_error_action = None + if self.on_error_action is None: + self.on_error_action = OnError.Raise + + # should retry?
self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY") if self.should_retry is not None: if get_retry_request is None: @@ -192,6 +221,18 @@ def process_request(self, request: Request, spider: Spider) -> Optional[Request] request.meta[META_KEY] = crawlera_meta return request.replace(url=self.url, method="POST", body=body_json) + def _get_retry_request( + self, request: Request, reason: Union[Exception, str], stats_base_key: str, + ) -> Optional[Request]: + return get_retry_request( + request=request, + reason=reason, + stats_base_key=stats_base_key, + spider=self.crawler.spider, + max_retry_times=self.retry_times, + logger=logger, + ) + def process_response(self, request: Request, response: Response, spider: Spider) -> Response: if not self.enabled: return response @@ -222,11 +263,17 @@ def process_response(self, request: Request, response: Response, spider: Spider) response.status, message, ) - if self.raise_on_error: + if self.on_error_action == OnError.Raise: raise CrawleraFetchException(log_msg) - else: + elif self.on_error_action == OnError.Warn: logger.warning(log_msg) return response + elif self.on_error_action == OnError.Retry: + return self._get_retry_request( + request=request, + reason=message, + stats_base_key="crawlera_fetch/retry/error", + ) try: json_response = json.loads(response.text) @@ -242,14 +289,22 @@ def process_response(self, request: Request, response: Response, spider: Spider) exc.lineno, exc.colno, ) - if self.raise_on_error: + if self.on_error_action == OnError.Raise: raise CrawleraFetchException(log_msg) from exc - else: + elif self.on_error_action == OnError.Warn: logger.warning(log_msg) return response + elif self.on_error_action == OnError.Retry: + return self._get_retry_request( + request=request, + reason=exc, + stats_base_key="crawlera_fetch/retry/error", + ) - server_error = json_response.get("crawlera_error") or json_response.get("error_code") original_status = json_response.get("original_status") + self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status)) + + server_error = json_response.get("crawlera_error") or json_response.get("error_code") request_id = json_response.get("id") or json_response.get("uncork_id") if server_error: message = json_response.get("body") or json_response.get("message") @@ -266,13 +321,17 @@ def process_response(self, request: Request, response: Response, spider: Spider) message, request_id or "unknown", ) - if self.raise_on_error: + if self.on_error_action == OnError.Raise: raise CrawleraFetchException(log_msg) - else: + elif self.on_error_action == OnError.Warn: logger.warning(log_msg) return response - - self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status)) + elif self.on_error_action == OnError.Retry: + return self._get_retry_request( + request=request, + reason=server_error, + stats_base_key="crawlera_fetch/retry/error", + ) crawlera_meta["upstream_response"] = { "status": response.status, From 6e56acf5dd456c15652abc8199a9ef342c519ac9 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 12 Apr 2021 03:07:32 -0300 Subject: [PATCH 04/14] Paint it black, fix type issue (py35) --- crawlera_fetch/middleware.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 37222c5..4912a2a 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -90,7 +90,7 @@ def _read_settings(self, spider: Spider) -> None: self.default_args = 
settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) # what to do when an error hapepns? - self.on_error_action: Optional[OnError] = None + self.on_error_action = None # type: Optional[OnError] if "CRAWLERA_FETCH_RAISE_ON_ERROR" in settings: warnings.warn( "CRAWLERA_FETCH_RAISE_ON_ERROR is deprecated, " @@ -222,7 +222,10 @@ def process_request(self, request: Request, spider: Spider) -> Optional[Request] return request.replace(url=self.url, method="POST", body=body_json) def _get_retry_request( - self, request: Request, reason: Union[Exception, str], stats_base_key: str, + self, + request: Request, + reason: Union[Exception, str], + stats_base_key: str, ) -> Optional[Request]: return get_retry_request( request=request, From 8e2b2d806439633973aa5fe1fd34287b9f9fc953 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 12 Apr 2021 03:18:45 -0300 Subject: [PATCH 05/14] CRAWLERA_FETCH_SHOULD_RETRY --- crawlera_fetch/middleware.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 4912a2a..2d606ae 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -351,7 +351,7 @@ def process_response(self, request: Request, response: Response, spider: Spider) url=json_response["url"], body=resp_body, ) - return response.replace( + response = response.replace( cls=respcls, request=original_request, headers=json_response["headers"], @@ -359,6 +359,14 @@ def process_response(self, request: Request, response: Response, spider: Spider) body=resp_body, status=original_status or 200, ) + if self.should_retry is not None: + if self.should_retry(response=response, request=request, spider=spider): + return self._get_retry_request( + request=request, + reason="should-retry", + stats_base_key="crawlera_fetch/retry/should-retry", + ) + return response def _set_download_slot(self, request: Request, spider: Spider) -> None: if self.download_slot_policy == DownloadSlotPolicy.Domain: From fd6b54d574afcfb129faa7b24e27b15617fd1667 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 12 Apr 2021 12:08:10 -0300 Subject: [PATCH 06/14] Fallback implementation of get_retry_request --- README.md | 5 +--- crawlera_fetch/middleware.py | 30 +++++++++---------- crawlera_fetch/utils.py | 56 ++++++++++++++++++++++++++++++++++++ tox.ini | 2 +- 4 files changed, 71 insertions(+), 22 deletions(-) create mode 100644 crawlera_fetch/utils.py diff --git a/README.md b/README.md index 8bcccc1..3fe5cdf 100644 --- a/README.md +++ b/README.md @@ -63,8 +63,7 @@ Crawlera middleware won't be able to handle them. What to do if an error occurs while downloading or decoding a response. Possible values are: * `OnError.Raise` (raise a `crawlera_fetch.CrawleraFetchException` exception) * `OnError.Warn` (log a warning and return the raw upstream response) - * `OnError.Retry` (retry the failed request, up to `CRAWLERA_FETCH_RETRY_TIMES` times - - Requires Scrapy>=2.5) + * `OnError.Retry` (retry the failed request, up to `CRAWLERA_FETCH_RETRY_TIMES` times) * `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`) @@ -89,8 +88,6 @@ Crawlera middleware won't be able to handle them. * `CRAWLERA_FETCH_SHOULD_RETRY` (type `Optional[Callable, str]`, default `None`) - **_Requires Scrapy>=2.5_** - A boolean callable that determines whether a request should be retried by the middleware. If the setting value is a `str`, an attribute by that name will be looked up on the spider object doing the crawl. 
The callable should accept the following arguments: diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 2d606ae..060d1ee 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -23,15 +23,18 @@ try: from scrapy.downloadermiddlewares.retry import get_retry_request except ImportError: - get_retry_request = None + from crawlera_fetch.utils import _get_retry_request as get_retry_request -logger = logging.getLogger("crawlera-fetch-middleware") - +__all__ = [ + "CrawleraFetchException", + "CrawleraFetchMiddleware", + "DownloadSlotPolicy", + "OnError", +] +logger = logging.getLogger("crawlera-fetch-middleware") MiddlewareTypeVar = TypeVar("MiddlewareTypeVar", bound="CrawleraFetchMiddleware") - - META_KEY = "crawlera_fetch" @@ -104,19 +107,13 @@ def _read_settings(self, spider: Spider) -> None: self.on_error_action = OnError.Warn if "CRAWLERA_FETCH_ON_ERROR" in settings: self.on_error_action = settings.get("CRAWLERA_FETCH_ON_ERROR") - if self.on_error_action == OnError.Retry and get_retry_request is None: - logger.warning("Cannot retry on error, Scrapy>=2.5 required") - self.on_error_action = None if self.on_error_action is None: self.on_error_action = OnError.Raise # should retry? self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY") if self.should_retry is not None: - if get_retry_request is None: - logger.warning("Retry feature disabled, Scrapy>=2.5 required") - self.should_retry = None - elif isinstance(self.should_retry, str): + if isinstance(self.should_retry, str): try: self.should_retry = getattr(spider, self.should_retry) except AttributeError: @@ -125,17 +122,16 @@ def _read_settings(self, spider: Spider) -> None: self.should_retry, ) self.should_retry = None - elif not isinstance(self.should_retry, Callable): # type: ignore + elif not isinstance(self.should_retry, Callable): # type: ignore[arg-type] logger.warning( "Invalid type for retry function: expected Callable" " or str, got %s - retries are disabled", type(self.should_retry), ) self.should_retry = None - if self.should_retry is not None: - self.retry_times = settings.getint("CRAWLERA_FETCH_RETRY_TIMES") - if not self.retry_times: - self.retry_times = settings.getint("RETRY_TIMES") + self.retry_times = settings.getint("CRAWLERA_FETCH_RETRY_TIMES") + if not self.retry_times: + self.retry_times = settings.getint("RETRY_TIMES") def spider_opened(self, spider): try: diff --git a/crawlera_fetch/utils.py b/crawlera_fetch/utils.py new file mode 100644 index 0000000..6b20a09 --- /dev/null +++ b/crawlera_fetch/utils.py @@ -0,0 +1,56 @@ +from logging import Logger +from typing import Optional, Union + +from scrapy import Request, Spider +from scrapy.utils.python import global_object_name + + +def _get_retry_request( + request: Request, + *, + spider: Spider, + reason: Union[str, Exception] = "unspecified", + max_retry_times: Optional[int] = None, + priority_adjust: Optional[int] = None, + logger: Logger, + stats_base_key: str, +): + """ + Fallback implementation, taken verbatim from https://github.com/scrapy/scrapy/pull/4902 + """ + settings = spider.crawler.settings + stats = spider.crawler.stats + retry_times = request.meta.get("retry_times", 0) + 1 + if max_retry_times is None: + max_retry_times = request.meta.get("max_retry_times") + if max_retry_times is None: + max_retry_times = settings.getint("RETRY_TIMES") + if retry_times <= max_retry_times: + logger.debug( + "Retrying %(request)s (failed %(retry_times)d times): %(reason)s", + {"request": request, "retry_times": 
retry_times, "reason": reason}, + extra={"spider": spider}, + ) + new_request = request.copy() + new_request.meta["retry_times"] = retry_times + new_request.dont_filter = True + if priority_adjust is None: + priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST") + new_request.priority = request.priority + priority_adjust + + if callable(reason): + reason = reason() + if isinstance(reason, Exception): + reason = global_object_name(reason.__class__) + + stats.inc_value(f"{stats_base_key}/count") + stats.inc_value(f"{stats_base_key}/reason_count/{reason}") + return new_request + else: + stats.inc_value(f"{stats_base_key}/max_reached") + logger.error( + "Gave up retrying %(request)s (failed %(retry_times)d times): " "%(reason)s", + {"request": request, "retry_times": retry_times, "reason": reason}, + extra={"spider": spider}, + ) + return None diff --git a/tox.ini b/tox.ini index 8ffe602..04a1ea2 100644 --- a/tox.ini +++ b/tox.ini @@ -21,7 +21,7 @@ commands = black --check crawlera_fetch tests [testenv:typing] deps = mypy==0.770 basepython = python3.8 -commands = mypy --ignore-missing-imports --follow-imports=skip crawlera_fetch tests +commands = mypy --show-error-codes --ignore-missing-imports --follow-imports=skip crawlera_fetch tests [testenv:py35-pinned] deps = From e5c5feb1e801c475fe8bd18e695573b24020f54e Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 21 Apr 2021 14:00:42 -0300 Subject: [PATCH 07/14] Exclude _utils from coverage report --- .coveragerc | 2 ++ crawlera_fetch/{utils.py => _utils.py} | 0 crawlera_fetch/middleware.py | 2 +- tox.ini | 9 ++++++++- 4 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 .coveragerc rename crawlera_fetch/{utils.py => _utils.py} (100%) diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..7307f76 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,2 @@ +[run] +omit = crawlera_fetch/_utils.py diff --git a/crawlera_fetch/utils.py b/crawlera_fetch/_utils.py similarity index 100% rename from crawlera_fetch/utils.py rename to crawlera_fetch/_utils.py diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 060d1ee..b1e5595 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -23,7 +23,7 @@ try: from scrapy.downloadermiddlewares.retry import get_retry_request except ImportError: - from crawlera_fetch.utils import _get_retry_request as get_retry_request + from crawlera_fetch._utils import _get_retry_request as get_retry_request __all__ = [ diff --git a/tox.ini b/tox.ini index 04a1ea2..4020719 100644 --- a/tox.ini +++ b/tox.ini @@ -6,7 +6,14 @@ envlist = flake8,black,typing,py35-pinned,py36,py37,py38 deps = -rtests/requirements.txt commands = - py.test --verbose --cov=crawlera_fetch --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: crawlera_fetch tests} + py.test \ + --verbose \ + --cov=crawlera_fetch \ + --cov-config=.coveragerc \ + --cov-report=term-missing \ + --cov-report=html \ + --cov-report=xml \ + {posargs:crawlera_fetch tests} [testenv:flake8] deps = flake8>=3.7.9 From 1e36a0332b6f4de129670638fde60b5282a2f463 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 21 Apr 2021 15:51:17 -0300 Subject: [PATCH 08/14] Retry tests --- tests/data/responses.py | 371 ++++++++++++++++++++++++---------------- tests/test_requests.py | 16 +- tests/test_responses.py | 145 +++++++--------- tests/test_retry.py | 49 ++++++ tests/utils.py | 20 ++- 5 files changed, 352 insertions(+), 249 deletions(-) create mode 100644 tests/test_retry.py diff --git 
a/tests/data/responses.py b/tests/data/responses.py index 3af9ba7..4e86c7a 100644 --- a/tests/data/responses.py +++ b/tests/data/responses.py @@ -10,105 +10,241 @@ from tests.utils import foo_spider, mocked_time -test_responses = [] +def get_test_responses(include_unprocessed=True): + test_responses = [] -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], - status=200, - headers={ - "Content-Type": "application/json", - "Content-Encoding": "gzip", - "Transfer-Encoding": "chunked", - "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - "Proxy-Connection": "close", - "Connection": "close", - }, - request=Request( + test_responses.append( + { + "original": HtmlResponse( url=SETTINGS["CRAWLERA_FETCH_URL"], - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://fake.host.com"), - spider=foo_spider, - ), + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Transfer-Encoding": "chunked", + "Date": "Fri, 24 Apr 2020 18:06:42 GMT", + "Proxy-Connection": "close", + "Connection": "close", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("https://fake.host.com"), + spider=foo_spider, + ), + } + }, + ), + body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""", # noqa: E501 + ), + "expected": TextResponse( + url="https://fake.host.com", + status=123, + headers={"Fake-Header": "true"}, + body=b"""foobar""", # noqa: E501 + ), + } + ) + + test_responses.append( + { + "original": HtmlResponse( + url=SETTINGS["CRAWLERA_FETCH_URL"], + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Transfer-Encoding": "chunked", + "Date": "Fri, 24 Apr 2020 18:06:42 GMT", + "Proxy-Connection": "close", + "Connection": "close", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("https://httpbin.org/get"), + spider=foo_spider, + ), + } + }, + ), + body=b"""{"url":"https://httpbin.org/get","original_status":200,"headers":{"X-Crawlera-Slave":"196.16.27.20:8800","X-Crawlera-Version":"1.43.0-","status":"200","date":"Fri, 24 Apr 2020 18:06:42 GMT","content-type":"application/json","content-length":"756","server":"gunicorn/19.9.0","access-control-allow-origin":"*","access-control-allow-credentials":"true"},"crawlera_status":"success","body_encoding":"plain","body":"
{\\n  \\"args\\": {}, \\n  \\"headers\\": {\\n    \\"Accept\\": \\"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\\", \\n    \\"Accept-Encoding\\": \\"gzip, deflate, br\\", \\n    \\"Accept-Language\\": \\"en-US,en;q=0.9\\", \\n    \\"Cache-Control\\": \\"no-cache\\", \\n    \\"Host\\": \\"httpbin.org\\", \\n    \\"Pragma\\": \\"no-cache\\", \\n    \\"Sec-Fetch-Mode\\": \\"navigate\\", \\n    \\"Sec-Fetch-Site\\": \\"none\\", \\n    \\"Sec-Fetch-User\\": \\"?1\\", \\n    \\"Upgrade-Insecure-Requests\\": \\"1\\", \\n    \\"User-Agent\\": \\"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36\\", \\n    \\"X-Amzn-Trace-Id\\": \\"Root=1-5ea32ab2-93f521ee8238c744c88e3fec\\"\\n  }, \\n  \\"origin\\": \\"173.0.152.100\\", \\n  \\"url\\": \\"https://httpbin.org/get\\"\\n}\\n
"}""", # noqa: E501 + ), + "expected": HtmlResponse( + url="https://httpbin.org/get", + status=200, + headers={ + "X-Crawlera-Slave": "196.16.27.20:8800", + "X-Crawlera-Version": "1.43.0-", + "status": "200", + "date": "Fri, 24 Apr 2020 18:06:42 GMT", + "content-type": "application/json", + "content-length": "756", + "server": "gunicorn/19.9.0", + "access-control-allow-origin": "*", + "access-control-allow-credentials": "true", + }, + body=b"""
{\n  "args": {}, \n  "headers": {\n    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", \n    "Accept-Encoding": "gzip, deflate, br", \n    "Accept-Language": "en-US,en;q=0.9", \n    "Cache-Control": "no-cache", \n    "Host": "httpbin.org", \n    "Pragma": "no-cache", \n    "Sec-Fetch-Mode": "navigate", \n    "Sec-Fetch-Site": "none", \n    "Sec-Fetch-User": "?1", \n    "Upgrade-Insecure-Requests": "1", \n    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36", \n    "X-Amzn-Trace-Id": "Root=1-5ea32ab2-93f521ee8238c744c88e3fec"\n  }, \n  "origin": "173.0.152.100", \n  "url": "https://httpbin.org/get"\n}\n
""", # noqa: E501 + ), + } + ) + + test_responses.append( + { + "original": HtmlResponse( + url=SETTINGS["CRAWLERA_FETCH_URL"], + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Transfer-Encoding": "chunked", + "Date": "Fri, 24 Apr 2020 18:22:10 GMT", + "Proxy-Connection": "close", + "Connection": "close", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("https://example.org"), + spider=foo_spider, + ), + } + }, + ), + body=b"""{"url":"https://example.org","original_status":200,"headers":{"X-Crawlera-Slave":"192.241.80.236:3128","X-Crawlera-Version":"1.43.0-","status":"200","content-encoding":"gzip","accept-ranges":"bytes","age":"108944","cache-control":"max-age=604800","content-type":"text/html; charset=UTF-8","date":"Fri, 24 Apr 2020 18:22:10 GMT","etag":"\\"3147526947\\"","expires":"Fri, 01 May 2020 18:22:10 GMT","last-modified":"Thu, 17 Oct 2019 07:18:26 GMT","server":"ECS (dab/4B85)","vary":"Accept-Encoding","content-length":"648"},"crawlera_status":"success","body_encoding":"plain","body":"\\n Example Domain\\n\\n \\n \\n \\n \\n\\n\\n\\n
\\n

Example Domain

\\n

This domain is for use in illustrative examples in documents. You may use this\\n domain in literature without prior coordination or asking for permission.

\\n

More information...

\\n
\\n\\n\\n"}""", # noqa: E501 + ), + "expected": HtmlResponse( + url="https://example.org", + status=200, + headers={ + "X-Crawlera-Slave": "192.241.80.236:3128", + "X-Crawlera-Version": "1.43.0-", + "status": "200", + "content-encoding": "gzip", + "accept-ranges": "bytes", + "age": "108944", + "cache-control": "max-age=604800", + "content-type": "text/html; charset=UTF-8", + "date": "Fri, 24 Apr 2020 18:22:10 GMT", + "etag": '"3147526947"', + "expires": "Fri, 01 May 2020 18:22:10 GMT", + "last-modified": "Thu, 17 Oct 2019 07:18:26 GMT", + "server": "ECS (dab/4B85)", + "vary": "Accept-Encoding", + "content-length": "648", + }, + body=b"""\n Example Domain\n\n \n \n \n \n\n\n\n
\n

Example Domain

\n

This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.

\n

More information...

\n
\n\n\n""", # noqa: E501 + ), + } + ) + + response_body_test = b"Hello middleware test!" + test_responses.append( + { + "original": HtmlResponse( + url=SETTINGS["CRAWLERA_FETCH_URL"], + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Date": "Fri, 24 Apr 2020 18:22:10 GMT", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("http://httpbin.org/ip"), + spider=foo_spider, + ), + } + }, + ), + body=json.dumps( + { + "id": "8498642e-1de3-40dd-b32f-f6eb6131e45e", + "body": base64.b64encode(response_body_test).decode(), + "original_status": 200, + "url": "http://httpbin.org/ip", + "original_url": "http://httpbin.org/ip", + "headers": { + "Content-Encoding": "gzip", + "Content-Type": "text/html", + "Date": "Fri, 24 Apr 2020 18:06:42 GMT", + }, } + ).encode(), + ), + "expected": HtmlResponse( + url="http://httpbin.org/ip", + status=200, + headers={ + "content-encoding": "gzip", + "content-type": "text/html", + "date": "Fri, 24 Apr 2020 18:06:42 GMT", }, + body=response_body_test, ), - body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""", # noqa: E501 - ), - "expected": TextResponse( - url="https://fake.host.com", - status=123, - headers={"Fake-Header": "true"}, - body=b"""foobar""", # noqa: E501 - ), - } -) + } + ) -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + if include_unprocessed: + unprocessed = HtmlResponse( + url="https://example.org", status=200, headers={ - "Content-Type": "application/json", + "Content-Type": "text/html", "Content-Encoding": "gzip", "Transfer-Encoding": "chunked", "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - "Proxy-Connection": "close", - "Connection": "close", }, + request=Request("https://example.org"), + body=b"""""", + ) + test_responses.append({"original": unprocessed, "expected": unprocessed}) + + return test_responses + + +def get_test_responses_error(): + return [ + TextResponse( + url="https://crawlera.com/fake/api/endpoint", request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url="https://crawlera.com/fake/api/endpoint", meta={ "crawlera_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( - Request("https://httpbin.org/get"), + Request("https://example.org"), spider=foo_spider, ), } }, ), - body=b"""{"url":"https://httpbin.org/get","original_status":200,"headers":{"X-Crawlera-Slave":"196.16.27.20:8800","X-Crawlera-Version":"1.43.0-","status":"200","date":"Fri, 24 Apr 2020 18:06:42 GMT","content-type":"application/json","content-length":"756","server":"gunicorn/19.9.0","access-control-allow-origin":"*","access-control-allow-credentials":"true"},"crawlera_status":"success","body_encoding":"plain","body":"
{\\n  \\"args\\": {}, \\n  \\"headers\\": {\\n    \\"Accept\\": \\"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\\", \\n    \\"Accept-Encoding\\": \\"gzip, deflate, br\\", \\n    \\"Accept-Language\\": \\"en-US,en;q=0.9\\", \\n    \\"Cache-Control\\": \\"no-cache\\", \\n    \\"Host\\": \\"httpbin.org\\", \\n    \\"Pragma\\": \\"no-cache\\", \\n    \\"Sec-Fetch-Mode\\": \\"navigate\\", \\n    \\"Sec-Fetch-Site\\": \\"none\\", \\n    \\"Sec-Fetch-User\\": \\"?1\\", \\n    \\"Upgrade-Insecure-Requests\\": \\"1\\", \\n    \\"User-Agent\\": \\"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36\\", \\n    \\"X-Amzn-Trace-Id\\": \\"Root=1-5ea32ab2-93f521ee8238c744c88e3fec\\"\\n  }, \\n  \\"origin\\": \\"173.0.152.100\\", \\n  \\"url\\": \\"https://httpbin.org/get\\"\\n}\\n
"}""", # noqa: E501 - ), - "expected": HtmlResponse( - url="https://httpbin.org/get", - status=200, - headers={ - "X-Crawlera-Slave": "196.16.27.20:8800", - "X-Crawlera-Version": "1.43.0-", - "status": "200", - "date": "Fri, 24 Apr 2020 18:06:42 GMT", - "content-type": "application/json", - "content-length": "756", - "server": "gunicorn/19.9.0", - "access-control-allow-origin": "*", - "access-control-allow-credentials": "true", - }, - body=b"""
{\n  "args": {}, \n  "headers": {\n    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", \n    "Accept-Encoding": "gzip, deflate, br", \n    "Accept-Language": "en-US,en;q=0.9", \n    "Cache-Control": "no-cache", \n    "Host": "httpbin.org", \n    "Pragma": "no-cache", \n    "Sec-Fetch-Mode": "navigate", \n    "Sec-Fetch-Site": "none", \n    "Sec-Fetch-User": "?1", \n    "Upgrade-Insecure-Requests": "1", \n    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36", \n    "X-Amzn-Trace-Id": "Root=1-5ea32ab2-93f521ee8238c744c88e3fec"\n  }, \n  "origin": "173.0.152.100", \n  "url": "https://httpbin.org/get"\n}\n
""", # noqa: E501 - ), - } -) - -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], - status=200, headers={ - "Content-Type": "application/json", - "Content-Encoding": "gzip", - "Transfer-Encoding": "chunked", - "Date": "Fri, 24 Apr 2020 18:22:10 GMT", + "X-Crawlera-Error": "bad_proxy_auth", + "Proxy-Authenticate": 'Basic realm="Crawlera"', + "Content-Length": "0", + "Date": "Mon, 04 May 2020 13:06:15 GMT", "Proxy-Connection": "close", "Connection": "close", }, + ), + TextResponse( + url="https://crawlera.com/fake/api/endpoint", request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url="https://crawlera.com/fake/api/endpoint", meta={ "crawlera_fetch": { "timing": {"start_ts": mocked_time()}, @@ -119,65 +255,17 @@ } }, ), - body=b"""{"url":"https://example.org","original_status":200,"headers":{"X-Crawlera-Slave":"192.241.80.236:3128","X-Crawlera-Version":"1.43.0-","status":"200","content-encoding":"gzip","accept-ranges":"bytes","age":"108944","cache-control":"max-age=604800","content-type":"text/html; charset=UTF-8","date":"Fri, 24 Apr 2020 18:22:10 GMT","etag":"\\"3147526947\\"","expires":"Fri, 01 May 2020 18:22:10 GMT","last-modified":"Thu, 17 Oct 2019 07:18:26 GMT","server":"ECS (dab/4B85)","vary":"Accept-Encoding","content-length":"648"},"crawlera_status":"success","body_encoding":"plain","body":"\\n Example Domain\\n\\n \\n \\n \\n \\n\\n\\n\\n
\\n

Example Domain

\\n

This domain is for use in illustrative examples in documents. You may use this\\n domain in literature without prior coordination or asking for permission.

\\n

More information...

\\n
\\n\\n\\n"}""", # noqa: E501 + body=b'{"Bad": "JSON', ), - "expected": HtmlResponse( - url="https://example.org", - status=200, - headers={ - "X-Crawlera-Slave": "192.241.80.236:3128", - "X-Crawlera-Version": "1.43.0-", - "status": "200", - "content-encoding": "gzip", - "accept-ranges": "bytes", - "age": "108944", - "cache-control": "max-age=604800", - "content-type": "text/html; charset=UTF-8", - "date": "Fri, 24 Apr 2020 18:22:10 GMT", - "etag": '"3147526947"', - "expires": "Fri, 01 May 2020 18:22:10 GMT", - "last-modified": "Thu, 17 Oct 2019 07:18:26 GMT", - "server": "ECS (dab/4B85)", - "vary": "Accept-Encoding", - "content-length": "648", - }, - body=b"""\n Example Domain\n\n \n \n \n \n\n\n\n
\n

Example Domain

\n

This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.

\n

More information...

\n
\n\n\n""", # noqa: E501 - ), - } -) - -unprocessed = HtmlResponse( - url="https://example.org", - status=200, - headers={ - "Content-Type": "text/html", - "Content-Encoding": "gzip", - "Transfer-Encoding": "chunked", - "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - }, - request=Request("https://example.org"), - body=b"""""", -) -test_responses.append({"original": unprocessed, "expected": unprocessed}) - -response_body_test = b"Hello middleware test!" -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], - status=200, - headers={ - "Content-Type": "application/json", - "Content-Encoding": "gzip", - "Date": "Fri, 24 Apr 2020 18:22:10 GMT", - }, + TextResponse( + url="https://crawlera.com/fake/api/endpoint", request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url="https://crawlera.com/fake/api/endpoint", meta={ "crawlera_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( - Request("http://httpbin.org/ip"), + Request("https://example.org"), spider=foo_spider, ), } @@ -185,28 +273,15 @@ ), body=json.dumps( { - "id": "8498642e-1de3-40dd-b32f-f6eb6131e45e", - "body": base64.b64encode(response_body_test).decode(), - "original_status": 200, - "url": "http://httpbin.org/ip", - "original_url": "http://httpbin.org/ip", - "headers": { - "Content-Encoding": "gzip", - "Content-Type": "text/html", - "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - }, + "url": "https://example.org", + "original_status": 503, + "headers": {}, + "crawlera_status": "fail", + "crawlera_error": "serverbusy", + "body_encoding": "plain", + "body": "Server busy: too many outstanding requests", } - ).encode(), - ), - "expected": HtmlResponse( - url="http://httpbin.org/ip", - status=200, - headers={ - "content-encoding": "gzip", - "content-type": "text/html", - "date": "Fri, 24 Apr 2020 18:06:42 GMT", - }, - body=response_body_test, + ), + encoding="utf8", ), - } -) + ] diff --git a/tests/test_requests.py b/tests/test_requests.py index b36423a..c7bbcc3 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1,6 +1,4 @@ import json -import os -from contextlib import contextmanager from unittest.mock import patch from scrapy import Request @@ -8,19 +6,7 @@ from crawlera_fetch import DownloadSlotPolicy from tests.data.requests import get_test_requests -from tests.utils import foo_spider, get_test_middleware, mocked_time - - -@contextmanager -def shub_jobkey_env_variable(): - SHUB_JOBKEY_OLD = os.environ.get("SHUB_JOBKEY") - os.environ["SHUB_JOBKEY"] = "1/2/3" - try: - yield - finally: - del os.environ["SHUB_JOBKEY"] - if SHUB_JOBKEY_OLD: - os.environ["SHUB_JOBKEY"] = SHUB_JOBKEY_OLD +from tests.utils import foo_spider, get_test_middleware, mocked_time, shub_jobkey_env_variable def test_process_request_disabled(): diff --git a/tests/test_responses.py b/tests/test_responses.py index 2ddf87b..53670f4 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -3,18 +3,17 @@ import pytest from scrapy import Request from scrapy.http.response.text import TextResponse -from scrapy.utils.reqser import request_to_dict from testfixtures import LogCapture -from crawlera_fetch.middleware import CrawleraFetchException +from crawlera_fetch.middleware import CrawleraFetchException, OnError -from tests.data.responses import test_responses -from tests.utils import foo_spider, get_test_middleware, mocked_time +from tests.data.responses import get_test_responses, get_test_responses_error +from tests.utils import foo_spider, get_test_middleware def 
test_process_response_disabled(): middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ENABLED": False}) - for case in test_responses: + for case in get_test_responses(): response = case["original"] assert middleware.process_response(response.request, response, foo_spider) is response @@ -22,7 +21,7 @@ def test_process_response_disabled(): def test_process_response(): middleware = get_test_middleware() - for case in test_responses: + for case in get_test_responses(): original = case["original"] expected = case["expected"] @@ -64,89 +63,65 @@ def test_process_response_skip(): def test_process_response_error(): - response_list = [ - TextResponse( - url="https://crawlera.com/fake/api/endpoint", - request=Request( - url="https://crawlera.com/fake/api/endpoint", - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://example.org"), - spider=foo_spider, - ), - } - }, - ), - headers={ - "X-Crawlera-Error": "bad_proxy_auth", - "Proxy-Authenticate": 'Basic realm="Crawlera"', - "Content-Length": "0", - "Date": "Mon, 04 May 2020 13:06:15 GMT", - "Proxy-Connection": "close", - "Connection": "close", - }, + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Raise}) + for response in get_test_responses_error(): + with pytest.raises(CrawleraFetchException): + middleware.process_response(response.request, response, foo_spider) + + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + + +def test_process_response_warn(): + with LogCapture() as logs: + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Warn}) + for response in get_test_responses_error(): + processed = middleware.process_response(response.request, response, foo_spider) + assert response is processed + + logs.check_present( + ( + "crawlera-fetch-middleware", + "WARNING", + "Error downloading (status: 200, X-Crawlera-Error header: bad_proxy_auth)", # noqa: E501 ), - TextResponse( - url="https://crawlera.com/fake/api/endpoint", - request=Request( - url="https://crawlera.com/fake/api/endpoint", - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://example.org"), - spider=foo_spider, - ), - } - }, - ), - body=b'{"Bad": "JSON', + ( + "crawlera-fetch-middleware", + "WARNING", + "Error decoding (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)", # noqa: E501 ), - TextResponse( - url="https://crawlera.com/fake/api/endpoint", - request=Request( - url="https://crawlera.com/fake/api/endpoint", - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://example.org"), - spider=foo_spider, - ), - } - }, - ), - body=json.dumps( - { - "url": "https://example.org", - "original_status": 503, - "headers": {}, - "crawlera_status": "fail", - "crawlera_error": "serverbusy", - "body_encoding": "plain", - "body": "Server busy: too many outstanding requests", - } - ), - encoding="utf8", + ( + "crawlera-fetch-middleware", + "WARNING", + "Error downloading (Original status: 503, Fetch API error message: Server busy: too many outstanding requests, Request ID: unknown)", # noqa: E501 ), - ] 
+ ) - middleware_raise = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True}) - for response in response_list: + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + + +def test_process_response_error_deprecated(): + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True}) + for response in get_test_responses_error(): with pytest.raises(CrawleraFetchException): - middleware_raise.process_response(response.request, response, foo_spider) + middleware.process_response(response.request, response, foo_spider) + + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error") == 3 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 - middleware_log = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False}) +def test_process_response_warn_deprecated(): with LogCapture() as logs: - for response in response_list: - processed = middleware_log.process_response(response.request, response, foo_spider) + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False}) + for response in get_test_responses_error(): + processed = middleware.process_response(response.request, response, foo_spider) assert response is processed logs.check_present( @@ -167,7 +142,7 @@ def test_process_response_error(): ), ) - assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 3 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 diff --git a/tests/test_retry.py b/tests/test_retry.py new file mode 100644 index 0000000..97687a9 --- /dev/null +++ b/tests/test_retry.py @@ -0,0 +1,49 @@ +from scrapy import Request + +from crawlera_fetch.middleware import OnError + +from tests.data.responses import get_test_responses, get_test_responses_error +from tests.utils import foo_spider, get_test_middleware + + +def test_process_response_should_retry_function(): + def retry_function(response, request, spider): + return True + + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": retry_function}) + for case in get_test_responses(include_unprocessed=False): + response = case["original"] + result = 
middleware.process_response(response.request, response, foo_spider) + assert isinstance(result, Request) + assert result.url == response.request.url + + base_key = "crawlera_fetch/retry/should-retry" + assert middleware.stats.get_value(f"{base_key}/count") == 4 + assert middleware.stats.get_value(f"{base_key}/reason_count/should-retry") == 4 + + +def test_process_response_should_retry_spider_method(): + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": "should_retry"}) + for case in get_test_responses(include_unprocessed=False): + response = case["original"] + result = middleware.process_response(response.request, response, foo_spider) + assert isinstance(result, Request) + assert result.url == response.request.url + + base_key = "crawlera_fetch/retry/should-retry" + assert middleware.stats.get_value(f"{base_key}/count") == 4 + assert middleware.stats.get_value(f"{base_key}/reason_count/should-retry") == 4 + + +def test_process_response_error(): + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Retry}) + for response in get_test_responses_error(): + result = middleware.process_response(response.request, response, foo_spider) + assert isinstance(result, Request) + assert result.url == response.request.url + + base_key = "crawlera_fetch/retry/error" + assert middleware.stats.get_value(f"{base_key}/count") == 3 + assert middleware.stats.get_value(f"{base_key}/reason_count/bad_proxy_auth") == 1 + assert middleware.stats.get_value(f"{base_key}/reason_count/json.decoder.JSONDecodeError") == 1 + assert middleware.stats.get_value(f"{base_key}/reason_count/serverbusy") == 1 diff --git a/tests/utils.py b/tests/utils.py index 6b010c9..959f023 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,5 +1,7 @@ -from urllib.parse import urlparse +import os +from contextlib import contextmanager from unittest.mock import Mock +from urllib.parse import urlparse from scrapy import Spider from scrapy.utils.test import get_crawler @@ -27,6 +29,9 @@ class FooSpider(Spider): def foo_callback(self, response): pass + def should_retry(self, response, request, spider): + return True + foo_spider = FooSpider() @@ -37,6 +42,7 @@ def get_test_middleware(settings=None): foo_spider = FooSpider() foo_spider.crawler = get_crawler(FooSpider, settings_dict=settings_dict) + foo_spider.crawler.spider = foo_spider foo_spider.crawler.engine = MockEngine() middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) @@ -46,3 +52,15 @@ def get_test_middleware(settings=None): mocked_time = Mock(return_value=1234567890.123) + + +@contextmanager +def shub_jobkey_env_variable(): + SHUB_JOBKEY_OLD = os.environ.get("SHUB_JOBKEY") + os.environ["SHUB_JOBKEY"] = "1/2/3" + try: + yield + finally: + del os.environ["SHUB_JOBKEY"] + if SHUB_JOBKEY_OLD: + os.environ["SHUB_JOBKEY"] = SHUB_JOBKEY_OLD From c25600a2e76cdb5ed4651edb642e25c219b21f35 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 21 Apr 2021 19:11:04 -0300 Subject: [PATCH 09/14] Add missing type hint --- crawlera_fetch/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawlera_fetch/_utils.py b/crawlera_fetch/_utils.py index 6b20a09..a85d042 100644 --- a/crawlera_fetch/_utils.py +++ b/crawlera_fetch/_utils.py @@ -14,7 +14,7 @@ def _get_retry_request( priority_adjust: Optional[int] = None, logger: Logger, stats_base_key: str, -): +) -> Optional[Request]: """ Fallback implementation, taken verbatim from https://github.com/scrapy/scrapy/pull/4902 """ From 
022f1fc4ba6e6a17c3798a36f71aa2358b002adc Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 21 Apr 2021 19:12:32 -0300 Subject: [PATCH 10/14] Add another missing type hint --- crawlera_fetch/middleware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index b1e5595..672819f 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -133,7 +133,7 @@ def _read_settings(self, spider: Spider) -> None: if not self.retry_times: self.retry_times = settings.getint("RETRY_TIMES") - def spider_opened(self, spider): + def spider_opened(self, spider: Spider) -> None: try: spider_attr = getattr(spider, "crawlera_fetch_enabled") except AttributeError: From dcd4efa0e7a39b100699aed716348270c6e4e597 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 21 Apr 2021 19:24:55 -0300 Subject: [PATCH 11/14] py35 compliance --- crawlera_fetch/_utils.py | 11 +++++++---- tests/test_retry.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/crawlera_fetch/_utils.py b/crawlera_fetch/_utils.py index a85d042..f33cf6e 100644 --- a/crawlera_fetch/_utils.py +++ b/crawlera_fetch/_utils.py @@ -5,6 +5,8 @@ from scrapy.utils.python import global_object_name +# disable black formatting to avoid syntax error on py35 +# fmt: off def _get_retry_request( request: Request, *, @@ -13,8 +15,9 @@ def _get_retry_request( max_retry_times: Optional[int] = None, priority_adjust: Optional[int] = None, logger: Logger, - stats_base_key: str, + stats_base_key: str # black wants to put a comma at the end, but py35 doesn't like it ) -> Optional[Request]: + # fmt: on """ Fallback implementation, taken verbatim from https://github.com/scrapy/scrapy/pull/4902 """ @@ -43,11 +46,11 @@ def _get_retry_request( if isinstance(reason, Exception): reason = global_object_name(reason.__class__) - stats.inc_value(f"{stats_base_key}/count") - stats.inc_value(f"{stats_base_key}/reason_count/{reason}") + stats.inc_value("{}/count".format(stats_base_key)) + stats.inc_value("{}/reason_count/{}".format(stats_base_key, reason)) return new_request else: - stats.inc_value(f"{stats_base_key}/max_reached") + stats.inc_value("{}/max_reached".format(stats_base_key)) logger.error( "Gave up retrying %(request)s (failed %(retry_times)d times): " "%(reason)s", {"request": request, "retry_times": retry_times, "reason": reason}, diff --git a/tests/test_retry.py b/tests/test_retry.py index 97687a9..d227dd7 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -18,8 +18,8 @@ def retry_function(response, request, spider): assert result.url == response.request.url base_key = "crawlera_fetch/retry/should-retry" - assert middleware.stats.get_value(f"{base_key}/count") == 4 - assert middleware.stats.get_value(f"{base_key}/reason_count/should-retry") == 4 + assert middleware.stats.get_value(base_key + "/count") == 4 + assert middleware.stats.get_value(base_key + "/reason_count/should-retry") == 4 def test_process_response_should_retry_spider_method(): @@ -31,8 +31,8 @@ def test_process_response_should_retry_spider_method(): assert result.url == response.request.url base_key = "crawlera_fetch/retry/should-retry" - assert middleware.stats.get_value(f"{base_key}/count") == 4 - assert middleware.stats.get_value(f"{base_key}/reason_count/should-retry") == 4 + assert middleware.stats.get_value(base_key + "/count") == 4 + assert middleware.stats.get_value(base_key + "/reason_count/should-retry") == 4 def test_process_response_error(): @@ 
From 872dcda3b77d6ccc06f15e9de92785ca4046de50 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta
Date: Wed, 21 Apr 2021 19:39:06 -0300
Subject: [PATCH 12/14] Full diff coverage

---
 crawlera_fetch/middleware.py |  4 +--
 tests/test_retry.py          | 61 +++++++++++++++++++++++++++++++++++-
 2 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py
index 672819f..a2707a7 100644
--- a/crawlera_fetch/middleware.py
+++ b/crawlera_fetch/middleware.py
@@ -118,14 +118,14 @@ def _read_settings(self, spider: Spider) -> None:
                     self.should_retry = getattr(spider, self.should_retry)
                 except AttributeError:
                     logger.warning(
-                        "Could not find a '%s' callable on the spider - retries are disabled",
+                        "Could not find a '%s' callable on the spider - user retries are disabled",
                         self.should_retry,
                     )
                     self.should_retry = None
             elif not isinstance(self.should_retry, Callable):  # type: ignore[arg-type]
                 logger.warning(
                     "Invalid type for retry function: expected Callable"
-                    " or str, got %s - retries are disabled",
+                    " or str, got %s - user retries are disabled",
                     type(self.should_retry),
                 )
                 self.should_retry = None

diff --git a/tests/test_retry.py b/tests/test_retry.py
index d227dd7..b255020 100644
--- a/tests/test_retry.py
+++ b/tests/test_retry.py
@@ -1,4 +1,6 @@
-from scrapy import Request
+from scrapy.http.request import Request
+from scrapy.http.response import Response
+from testfixtures import LogCapture

 from crawlera_fetch.middleware import OnError

@@ -11,6 +13,7 @@ def retry_function(response, request, spider):
         return True

     middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": retry_function})
+    assert middleware.should_retry is not None
     for case in get_test_responses(include_unprocessed=False):
         response = case["original"]
         result = middleware.process_response(response.request, response, foo_spider)
@@ -24,6 +27,7 @@ def retry_function(response, request, spider):

 def test_process_response_should_retry_spider_method():
     middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": "should_retry"})
+    assert middleware.should_retry is not None
     for case in get_test_responses(include_unprocessed=False):
         response = case["original"]
         result = middleware.process_response(response.request, response, foo_spider)
@@ -35,6 +39,61 @@ def test_process_response_should_retry_spider_method():
     assert middleware.stats.get_value(base_key + "/reason_count/should-retry") == 4


+def test_process_response_should_retry_spider_method_non_existent():
+    with LogCapture() as logs:
+        middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": "not_found"})
+
+    assert middleware.should_retry is None
+    logs.check_present(
+        (
+            "crawlera-fetch-middleware",
+            "WARNING",
+            "Could not find a 'not_found' callable on the spider - user retries are disabled",
+        )
+    )
+
+    for case in get_test_responses(include_unprocessed=False):
+        response = case["original"]
+        expected = case["expected"]
+        result = middleware.process_response(response.request, response, foo_spider)
+        assert isinstance(result, Response)
+        assert type(result) is type(expected)
+        assert result.url == expected.url
+        assert result.status == expected.status
+
+    base_key = "crawlera_fetch/retry/should-retry"
+    assert middleware.stats.get_value(base_key + "/count") is None
+    assert middleware.stats.get_value(base_key + "/reason_count/should-retry") is None
+
+
+def test_process_response_should_retry_invalid_type():
+    with LogCapture() as logs:
+        middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": 1})
+
+    assert middleware.should_retry is None
+    logs.check_present(
+        (
+            "crawlera-fetch-middleware",
+            "WARNING",
+            "Invalid type for retry function: expected Callable"
+            " or str, got <class 'int'> - user retries are disabled",
+        )
+    )
+
+    for case in get_test_responses(include_unprocessed=False):
+        response = case["original"]
+        expected = case["expected"]
+        result = middleware.process_response(response.request, response, foo_spider)
+        assert isinstance(result, Response)
+        assert type(result) is type(expected)
+        assert result.url == expected.url
+        assert result.status == expected.status
+
+    base_key = "crawlera_fetch/retry/should-retry"
+    assert middleware.stats.get_value(base_key + "/count") is None
+    assert middleware.stats.get_value(base_key + "/reason_count/should-retry") is None
+
+
 def test_process_response_error():
     middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Retry})
     for response in get_test_responses_error():
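The three code paths pinned down by these tests map directly onto the documented forms of `CRAWLERA_FETCH_SHOULD_RETRY`: a callable, a string naming a spider attribute, or anything else (which disables user retries with a warning). As a usage sketch, assuming a hypothetical spider (its name, predicate logic, and status codes are illustrative, not part of the patch), the string form resolves via `getattr` on the running spider:

```python
from scrapy import Spider
from scrapy.http.request import Request
from scrapy.http.response import Response


class QuotesSpider(Spider):  # hypothetical spider, for illustration only
    name = "quotes"
    custom_settings = {
        "CRAWLERA_FETCH_ENABLED": True,
        # A str is resolved via getattr(spider, "should_retry") when the
        # middleware reads its settings; passing the callable itself works
        # the same way. Any other type triggers the "Invalid type for retry
        # function" warning asserted in the tests above.
        "CRAWLERA_FETCH_SHOULD_RETRY": "should_retry",
        "CRAWLERA_FETCH_RETRY_TIMES": 3,
    }

    def should_retry(self, response: Response, request: Request, spider: Spider) -> bool:
        # Retry when the upstream site answered with a server error.
        return response.status in {500, 503}
```

Resolving the predicate once at settings-read time, rather than per response, is what lets the two invalid cases above fail fast with a single warning instead of erroring on every response.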
From 9b9560ed729a764693753a7928710b704cda38e3 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta
Date: Fri, 5 Nov 2021 11:59:03 -0300
Subject: [PATCH 13/14] Comment about excluding upstream code from test coverage reports

---
 .coveragerc                  | 2 +-
 crawlera_fetch/middleware.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index 7307f76..8dd2a30 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,2 +1,2 @@
 [run]
-omit = crawlera_fetch/_utils.py
+omit = crawlera_fetch/_utils.py  # already tested in upstream Scrapy

diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py
index a2707a7..f805c16 100644
--- a/crawlera_fetch/middleware.py
+++ b/crawlera_fetch/middleware.py
@@ -21,7 +21,7 @@
 from w3lib.http import basic_auth_header

 try:
-    from scrapy.downloadermiddlewares.retry import get_retry_request
+    from scrapy.downloadermiddlewares.retry import get_retry_request  # available on Scrapy >= 2.5
 except ImportError:
     from crawlera_fetch._utils import _get_retry_request as get_retry_request
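The comment added here documents a standard guarded-import shim. A small self-contained sketch of the same idea, for reference (the `IMPL` marker is illustrative and not part of the middleware):

```python
try:
    # Scrapy >= 2.5 ships the helper; prefer the upstream implementation
    from scrapy.downloadermiddlewares.retry import get_retry_request
    IMPL = "scrapy"
except ImportError:
    # Older Scrapy: fall back to the vendored copy, re-bound under the
    # upstream name so call sites never need a version check
    from crawlera_fetch._utils import _get_retry_request as get_retry_request
    IMPL = "vendored"

print("using the {} implementation of get_retry_request".format(IMPL))
```

Re-binding the vendored copy under the upstream name keeps the rest of the middleware free of version checks, which is also why `.coveragerc` can exclude `crawlera_fetch/_utils.py` wholesale: that code is already exercised by upstream Scrapy's own test suite.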
From c77983ee4a5716548eddc03ae70c3718226d6e02 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta
Date: Fri, 5 Nov 2021 12:21:50 -0300
Subject: [PATCH 14/14] Validate CRAWLERA_FETCH_ON_ERROR setting

---
 crawlera_fetch/__init__.py   |  2 +-
 crawlera_fetch/middleware.py | 19 ++++++++++++++++---
 tests/test_responses.py      | 21 +++++++++++++++++++++
 3 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/crawlera_fetch/__init__.py b/crawlera_fetch/__init__.py
index f04b93b..e87e23b 100644
--- a/crawlera_fetch/__init__.py
+++ b/crawlera_fetch/__init__.py
@@ -1,2 +1,2 @@
 from .logformatter import CrawleraFetchLogFormatter  # noqa: F401
-from .middleware import CrawleraFetchMiddleware, DownloadSlotPolicy  # noqa: F401
+from .middleware import CrawleraFetchMiddleware, DownloadSlotPolicy, OnError  # noqa: F401

diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py
index f805c16..b0e871f 100644
--- a/crawlera_fetch/middleware.py
+++ b/crawlera_fetch/middleware.py
@@ -92,7 +92,7 @@ def _read_settings(self, spider: Spider) -> None:
         )
         self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {})

-        # what to do when an error hapepns?
+        # what to do when errors happen?
        self.on_error_action = None  # type: Optional[OnError]
         if "CRAWLERA_FETCH_RAISE_ON_ERROR" in settings:
             warnings.warn(
@@ -106,11 +106,18 @@ def _read_settings(self, spider: Spider) -> None:
             else:
                 self.on_error_action = OnError.Warn
         if "CRAWLERA_FETCH_ON_ERROR" in settings:
-            self.on_error_action = settings.get("CRAWLERA_FETCH_ON_ERROR")
+            if isinstance(settings["CRAWLERA_FETCH_ON_ERROR"], OnError):
+                self.on_error_action = settings["CRAWLERA_FETCH_ON_ERROR"]
+            else:
+                logger.warning(
+                    "Invalid type for CRAWLERA_FETCH_ON_ERROR setting:"
+                    " expected crawlera_fetch.OnError, got %s",
+                    type(settings["CRAWLERA_FETCH_ON_ERROR"]),
+                )
         if self.on_error_action is None:
             self.on_error_action = OnError.Raise

-        # should retry?
+        # should we retry?
         self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY")
         if self.should_retry is not None:
             if isinstance(self.should_retry, str):
@@ -273,6 +280,8 @@ def process_response(self, request: Request, response: Response, spider: Spider)
                    reason=message,
                    stats_base_key="crawlera_fetch/retry/error",
                )
+            else:
+                raise Exception("Invalid CRAWLERA_FETCH_ON_ERROR setting")

         try:
             json_response = json.loads(response.text)
@@ -299,6 +308,8 @@ def process_response(self, request: Request, response: Response, spider: Spider)
                    reason=exc,
                    stats_base_key="crawlera_fetch/retry/error",
                )
+            else:
+                raise Exception("Invalid CRAWLERA_FETCH_ON_ERROR setting")

         original_status = json_response.get("original_status")
         self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status))
@@ -331,6 +342,8 @@ def process_response(self, request: Request, response: Response, spider: Spider)
                    reason=server_error,
                    stats_base_key="crawlera_fetch/retry/error",
                )
+            else:
+                raise Exception("Invalid CRAWLERA_FETCH_ON_ERROR setting")

         crawlera_meta["upstream_response"] = {
             "status": response.status,

diff --git a/tests/test_responses.py b/tests/test_responses.py
index 53670f4..0aea814 100644
--- a/tests/test_responses.py
+++ b/tests/test_responses.py
@@ -74,6 +74,27 @@ def test_process_response_error():
     assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1


+def test_process_response_error_default_raise():
+    with LogCapture() as logs:
+        middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": "invalid"})
+    logs.check_present(
+        (
+            "crawlera-fetch-middleware",
+            "WARNING",
+            "Invalid type for CRAWLERA_FETCH_ON_ERROR setting: expected crawlera_fetch.OnError, got <class 'str'>",  # noqa: E501
+        ),
+    )
+
+    for response in get_test_responses_error():
+        with pytest.raises(CrawleraFetchException):
+            middleware.process_response(response.request, response, foo_spider)
+
+    assert middleware.stats.get_value("crawlera_fetch/response_error") == 3
+    assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1
+    assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1
+    assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1
+
+
 def test_process_response_warn():
     with LogCapture() as logs:
         middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Warn})