From 3e12067077755f2d953bc37914fd1395b4c2acdd Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sat, 10 Apr 2021 15:06:47 -0300 Subject: [PATCH 01/14] Retry: settings and docs --- README.md | 11 +++++++++++ crawlera_fetch/middleware.py | 26 +++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3af17cc..8460106 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,17 @@ Crawlera middleware won't be able to handle them. Default values to be sent to the Crawlera Fetch API. For instance, set to `{"device": "mobile"}` to render all requests with a mobile profile. +* `CRAWLERA_FETCH_SHOULD_RETRY` (type `Optional[Union[Callable, str]]`, default `None`) + A callable returning a boolean that determines whether a request should be retried by the middleware. + If the setting value is a `str`, an attribute by that name will be looked up on the spider + object doing the crawl. The callable should accept the following arguments: + `response: scrapy.http.response.Response, request: scrapy.http.request.Request, spider: scrapy.spiders.Spider`. + If the return value evaluates to `True`, the request will be retried by the middleware. + +* `CRAWLERA_FETCH_RETRY_TIMES` (type `Optional[int]`, default `None`) + The maximum number of times a request should be retried. + If `None`, the value is taken from the `RETRY_TIMES` setting. + ### Spider attributes * `crawlera_fetch_enabled` (type `bool`, default `False`). Whether or not the middleware will be enabled. diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index b2a2467..7b7219c 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -5,7 +5,7 @@ import os import time from enum import Enum -from typing import Optional, Type, TypeVar +from typing import Callable, Optional, Type, TypeVar import scrapy from scrapy.crawler import Crawler @@ -74,11 +74,31 @@ def _read_settings(self, spider: Spider) -> None: self.download_slot_policy = settings.get( "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain ) - self.raise_on_error = settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR", True) - self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) + self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY") + if isinstance(self.should_retry, str): + try: + self.should_retry = getattr(spider, self.should_retry) + except AttributeError: + logger.info( + "Could not find a callable named '%s' on the spider - retries are disabled", + self.should_retry, + ) + self.should_retry = None + elif not isinstance(self.should_retry, Callable): # type: ignore + logger.info( + "Invalid type for retry function: expected Callable" + " or str, got %s - retries are disabled", + type(self.should_retry), + ) + self.should_retry = None + if self.should_retry is not None: + self.retry_times = settings.getint("CRAWLERA_FETCH_RETRY_TIMES") + if not self.retry_times: + self.retry_times = settings.getint("RETRY_TIMES") + def spider_opened(self, spider): try: spider_attr = getattr(spider, "crawlera_fetch_enabled") From c3543a93801179f1a3f9958101bd4a2963a40656 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Sat, 10 Apr 2021 15:36:57 -0300 Subject: [PATCH 02/14] Import get_retry_request, update settings format in readme --- README.md | 32 ++++++++++++++++++++++--------- crawlera_fetch/middleware.py | 37 ++++++++++++++++++++++-------------- 2 files changed, 46 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 8460106..9815c3b 100644 --- a/README.md +++
b/README.md @@ -43,32 +43,43 @@ Crawlera middleware won't be able to handle them. ### Settings -* `CRAWLERA_FETCH_ENABLED` (type `bool`, default `False`). Whether or not the middleware will be enabled, - i.e. requests should be downloaded using the Crawlera Fetch API. The `crawlera_fetch_enabled` spider - attribute takes precedence over this setting. +* `CRAWLERA_FETCH_ENABLED` (type `bool`, default `False`) -* `CRAWLERA_FETCH_APIKEY` (type `str`). API key to be used to authenticate against the Crawlera endpoint - (mandatory if enabled) + Whether or not the middleware will be enabled, + i.e. requests should be downloaded using the Crawlera Fetch API. The `crawlera_fetch_enabled` + spider attribute takes precedence over this setting. + +* `CRAWLERA_FETCH_APIKEY` (type `str`) + + API key to be used to authenticate against the Crawlera endpoint (mandatory if enabled) + +* `CRAWLERA_FETCH_URL` (type `str`, default `"http://fetch.crawlera.com:8010/fetch/v2/"`) -* `CRAWLERA_FETCH_URL` (type `str`, default `"http://fetch.crawlera.com:8010/fetch/v2/"`). The endpoint of a specific Crawlera instance -* `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`). Whether or not the middleware will +* `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`) + + Whether or not the middleware will raise an exception if an error occurs while downloading or decoding a response. If `False`, a warning will be logged and the raw upstream response will be returned upon encountering an error. * `CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY` (type `enum.Enum` - `crawlera_fetch.DownloadSlotPolicy`, - default `DownloadSlotPolicy.Domain`). + default `DownloadSlotPolicy.Domain`) + Possible values are `DownloadSlotPolicy.Domain`, `DownloadSlotPolicy.Single`, `DownloadSlotPolicy.Default` (Scrapy default). If set to `DownloadSlotPolicy.Domain`, please consider setting `SCHEDULER_PRIORITY_QUEUE="scrapy.pqueues.DownloaderAwarePriorityQueue"` to make better use of concurrency options and avoid delays. * `CRAWLERA_FETCH_DEFAULT_ARGS` (type `dict`, default `{}`) + Default values to be sent to the Crawlera Fetch API. For instance, set to `{"device": "mobile"}` to render all requests with a mobile profile. * `CRAWLERA_FETCH_SHOULD_RETRY` (type `Optional[Union[Callable, str]]`, default `None`) + + **_Requires Scrapy>=2.5_** + A callable returning a boolean that determines whether a request should be retried by the middleware. If the setting value is a `str`, an attribute by that name will be looked up on the spider object doing the crawl. The callable should accept the following arguments: @@ -76,12 +87,15 @@ Crawlera middleware won't be able to handle them. If the return value evaluates to `True`, the request will be retried by the middleware. * `CRAWLERA_FETCH_RETRY_TIMES` (type `Optional[int]`, default `None`) + The maximum number of times a request should be retried. If `None`, the value is taken from the `RETRY_TIMES` setting. ### Spider attributes -* `crawlera_fetch_enabled` (type `bool`, default `False`). Whether or not the middleware will be enabled. +* `crawlera_fetch_enabled` (type `bool`, default `False`) + + Whether or not the middleware will be enabled. Takes precedence over the `CRAWLERA_FETCH_ENABLED` setting.
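For reference, here is a minimal sketch of how the settings above fit together; the middleware priority, spider name, and retry heuristic are illustrative assumptions, not part of this patch series:

```python
# settings.py - hypothetical project configuration; everything except the
# documented CRAWLERA_FETCH_* settings is made up for this example
DOWNLOADER_MIDDLEWARES = {
    "crawlera_fetch.CrawleraFetchMiddleware": 585,  # assumed priority slot
}
CRAWLERA_FETCH_ENABLED = True
CRAWLERA_FETCH_APIKEY = "<your-apikey>"
CRAWLERA_FETCH_SHOULD_RETRY = "should_retry"  # looked up as a spider attribute
CRAWLERA_FETCH_RETRY_TIMES = 5  # if unset, the standard RETRY_TIMES is used


# myspider.py - the attribute named above is resolved on the spider object
import scrapy


class MySpider(scrapy.Spider):
    name = "example"

    def should_retry(self, response, request, spider):
        # made-up heuristic: retry bans that come back as ordinary 200 pages
        return b"Access denied" in response.body
```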
### Log formatter diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 7b7219c..58245d1 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -18,6 +18,11 @@ from scrapy.utils.reqser import request_from_dict, request_to_dict from w3lib.http import basic_auth_header +try: + from scrapy.downloadermiddlewares.retry import get_retry_request +except ImportError: + get_retry_request = None + logger = logging.getLogger("crawlera-fetch-middleware") @@ -78,22 +83,26 @@ def _read_settings(self, spider: Spider) -> None: self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY") - if isinstance(self.should_retry, str): - try: - self.should_retry = getattr(spider, self.should_retry) - except AttributeError: - logger.info( - "Could not find a callable named '%s' on the spider - retries are disabled", - self.should_retry, + if self.should_retry is not None: + if get_retry_request is None: + logger.warning("Retry feature disabled, Scrapy>=2.5 required") + self.should_retry = None + elif isinstance(self.should_retry, str): + try: + self.should_retry = getattr(spider, self.should_retry) + except AttributeError: + logger.warning( + "Could not find a '%s' callable on the spider - retries are disabled", + self.should_retry, + ) + self.should_retry = None + elif not isinstance(self.should_retry, Callable): # type: ignore + logger.warning( + "Invalid type for retry function: expected Callable" + " or str, got %s - retries are disabled", + type(self.should_retry), ) self.should_retry = None - elif not isinstance(self.should_retry, Callable): # type: ignore - logger.info( - "Invalid type for retry function: expected Callable" - " or str, got %s - retries are disabled", - type(self.should_retry), - ) - self.should_retry = None if self.should_retry is not None: self.retry_times = settings.getint("CRAWLERA_FETCH_RETRY_TIMES") if not self.retry_times: From dba07897517062cd0b9fafa444d48fa6ad4eaad4 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 12 Apr 2021 03:01:22 -0300 Subject: [PATCH 03/14] Deprecate CRAWLERA_FETCH_RAISE_ON_ERROR, add CRAWLERA_FETCH_ON_ERROR --- README.md | 11 +++++ crawlera_fetch/middleware.py | 81 +++++++++++++++++++++++++++++++----- 2 files changed, 81 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 292f2dc..8bcccc1 100644 --- a/README.md +++ b/README.md @@ -57,8 +57,19 @@ Crawlera middleware won't be able to handle them. The endpoint of a specific Crawlera instance +* `CRAWLERA_FETCH_ON_ERROR` (type `enum.Enum` - `crawlera_fetch.OnError`, + default `OnError.Raise`) + + What to do if an error occurs while downloading or decoding a response. Possible values are: + * `OnError.Raise` (raise a `crawlera_fetch.CrawleraFetchException` exception) + * `OnError.Warn` (log a warning and return the raw upstream response) + * `OnError.Retry` (retry the failed request, up to `CRAWLERA_FETCH_RETRY_TIMES` times - + Requires Scrapy>=2.5) + * `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`) + **_Deprecated, please use `CRAWLERA_FETCH_ON_ERROR`_** + Whether or not the middleware will raise an exception if an error occurs while downloading or decoding a response. If `False`, a warning will be logged and the raw upstream response will be returned upon encountering an error. 
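For reference, a minimal sketch of opting into the new error-handling behaviour, assuming `OnError` is importable as `crawlera_fetch.OnError` as documented above:

```python
# settings.py - illustrative only
from crawlera_fetch import OnError

# supersedes the deprecated boolean CRAWLERA_FETCH_RAISE_ON_ERROR
CRAWLERA_FETCH_ON_ERROR = OnError.Retry
CRAWLERA_FETCH_RETRY_TIMES = 3  # per the docs, falls back to RETRY_TIMES if None
```

When both settings are present, `CRAWLERA_FETCH_ON_ERROR` takes precedence: `_read_settings` evaluates the deprecated boolean first and then overwrites the chosen action if the new setting is also set.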
diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 58245d1..37222c5 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -4,11 +4,13 @@ import logging import os import time +import warnings from enum import Enum -from typing import Callable, Optional, Type, TypeVar +from typing import Callable, Optional, Type, TypeVar, Union import scrapy from scrapy.crawler import Crawler +from scrapy.exceptions import ScrapyDeprecationWarning from scrapy.http.request import Request from scrapy.http.response import Response from scrapy.responsetypes import responsetypes @@ -39,6 +41,12 @@ class DownloadSlotPolicy(Enum): Default = "default" +class OnError(Enum): + Warn = "warn" + Raise = "raise" + Retry = "retry" + + class CrawleraFetchException(Exception): pass @@ -79,9 +87,30 @@ def _read_settings(self, spider: Spider) -> None: self.download_slot_policy = settings.get( "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY", DownloadSlotPolicy.Domain ) - self.raise_on_error = settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR", True) self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) + # what to do when an error happens? + self.on_error_action: Optional[OnError] = None + if "CRAWLERA_FETCH_RAISE_ON_ERROR" in settings: + warnings.warn( + "CRAWLERA_FETCH_RAISE_ON_ERROR is deprecated, " + "please use CRAWLERA_FETCH_ON_ERROR instead", + category=ScrapyDeprecationWarning, + stacklevel=2, + ) + if settings.getbool("CRAWLERA_FETCH_RAISE_ON_ERROR"): + self.on_error_action = OnError.Raise + else: + self.on_error_action = OnError.Warn + if "CRAWLERA_FETCH_ON_ERROR" in settings: + self.on_error_action = settings.get("CRAWLERA_FETCH_ON_ERROR") + if self.on_error_action == OnError.Retry and get_retry_request is None: + logger.warning("Cannot retry on error, Scrapy>=2.5 required") + self.on_error_action = None + if self.on_error_action is None: + self.on_error_action = OnError.Raise + + # should retry?
self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY") if self.should_retry is not None: if get_retry_request is None: @@ -192,6 +221,18 @@ def process_request(self, request: Request, spider: Spider) -> Optional[Request] request.meta[META_KEY] = crawlera_meta return request.replace(url=self.url, method="POST", body=body_json) + def _get_retry_request( + self, request: Request, reason: Union[Exception, str], stats_base_key: str, + ) -> Optional[Request]: + return get_retry_request( + request=request, + reason=reason, + stats_base_key=stats_base_key, + spider=self.crawler.spider, + max_retry_times=self.retry_times, + logger=logger, + ) + def process_response(self, request: Request, response: Response, spider: Spider) -> Response: if not self.enabled: return response @@ -222,11 +263,17 @@ def process_response(self, request: Request, response: Response, spider: Spider) response.status, message, ) - if self.raise_on_error: + if self.on_error_action == OnError.Raise: raise CrawleraFetchException(log_msg) - else: + elif self.on_error_action == OnError.Warn: logger.warning(log_msg) return response + elif self.on_error_action == OnError.Retry: + return self._get_retry_request( + request=request, + reason=message, + stats_base_key="crawlera_fetch/retry/error", + ) try: json_response = json.loads(response.text) @@ -242,14 +289,22 @@ def process_response(self, request: Request, response: Response, spider: Spider) exc.lineno, exc.colno, ) - if self.raise_on_error: + if self.on_error_action == OnError.Raise: raise CrawleraFetchException(log_msg) from exc - else: + elif self.on_error_action == OnError.Warn: logger.warning(log_msg) return response + elif self.on_error_action == OnError.Retry: + return self._get_retry_request( + request=request, + reason=exc, + stats_base_key="crawlera_fetch/retry/error", + ) - server_error = json_response.get("crawlera_error") or json_response.get("error_code") original_status = json_response.get("original_status") + self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status)) + + server_error = json_response.get("crawlera_error") or json_response.get("error_code") request_id = json_response.get("id") or json_response.get("uncork_id") if server_error: message = json_response.get("body") or json_response.get("message") @@ -266,13 +321,17 @@ def process_response(self, request: Request, response: Response, spider: Spider) message, request_id or "unknown", ) - if self.raise_on_error: + if self.on_error_action == OnError.Raise: raise CrawleraFetchException(log_msg) - else: + elif self.on_error_action == OnError.Warn: logger.warning(log_msg) return response - - self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status)) + elif self.on_error_action == OnError.Retry: + return self._get_retry_request( + request=request, + reason=server_error, + stats_base_key="crawlera_fetch/retry/error", + ) crawlera_meta["upstream_response"] = { "status": response.status, From 6e56acf5dd456c15652abc8199a9ef342c519ac9 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 12 Apr 2021 03:07:32 -0300 Subject: [PATCH 04/14] Paint it black, fix type issue (py35) --- crawlera_fetch/middleware.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 37222c5..4912a2a 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -90,7 +90,7 @@ def _read_settings(self, spider: Spider) -> None: self.default_args = 
settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {}) # what to do when an error hapepns? - self.on_error_action: Optional[OnError] = None + self.on_error_action = None # type: Optional[OnError] if "CRAWLERA_FETCH_RAISE_ON_ERROR" in settings: warnings.warn( "CRAWLERA_FETCH_RAISE_ON_ERROR is deprecated, " @@ -222,7 +222,10 @@ def process_request(self, request: Request, spider: Spider) -> Optional[Request] return request.replace(url=self.url, method="POST", body=body_json) def _get_retry_request( - self, request: Request, reason: Union[Exception, str], stats_base_key: str, + self, + request: Request, + reason: Union[Exception, str], + stats_base_key: str, ) -> Optional[Request]: return get_retry_request( request=request, From 8e2b2d806439633973aa5fe1fd34287b9f9fc953 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 12 Apr 2021 03:18:45 -0300 Subject: [PATCH 05/14] CRAWLERA_FETCH_SHOULD_RETRY --- crawlera_fetch/middleware.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 4912a2a..2d606ae 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -351,7 +351,7 @@ def process_response(self, request: Request, response: Response, spider: Spider) url=json_response["url"], body=resp_body, ) - return response.replace( + response = response.replace( cls=respcls, request=original_request, headers=json_response["headers"], @@ -359,6 +359,14 @@ def process_response(self, request: Request, response: Response, spider: Spider) body=resp_body, status=original_status or 200, ) + if self.should_retry is not None: + if self.should_retry(response=response, request=request, spider=spider): + return self._get_retry_request( + request=request, + reason="should-retry", + stats_base_key="crawlera_fetch/retry/should-retry", + ) + return response def _set_download_slot(self, request: Request, spider: Spider) -> None: if self.download_slot_policy == DownloadSlotPolicy.Domain: From fd6b54d574afcfb129faa7b24e27b15617fd1667 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Mon, 12 Apr 2021 12:08:10 -0300 Subject: [PATCH 06/14] Fallback implementation of get_retry_request --- README.md | 5 +--- crawlera_fetch/middleware.py | 30 +++++++++---------- crawlera_fetch/utils.py | 56 ++++++++++++++++++++++++++++++++++++ tox.ini | 2 +- 4 files changed, 71 insertions(+), 22 deletions(-) create mode 100644 crawlera_fetch/utils.py diff --git a/README.md b/README.md index 8bcccc1..3fe5cdf 100644 --- a/README.md +++ b/README.md @@ -63,8 +63,7 @@ Crawlera middleware won't be able to handle them. What to do if an error occurs while downloading or decoding a response. Possible values are: * `OnError.Raise` (raise a `crawlera_fetch.CrawleraFetchException` exception) * `OnError.Warn` (log a warning and return the raw upstream response) - * `OnError.Retry` (retry the failed request, up to `CRAWLERA_FETCH_RETRY_TIMES` times - - Requires Scrapy>=2.5) + * `OnError.Retry` (retry the failed request, up to `CRAWLERA_FETCH_RETRY_TIMES` times) * `CRAWLERA_FETCH_RAISE_ON_ERROR` (type `bool`, default `True`) @@ -89,8 +88,6 @@ Crawlera middleware won't be able to handle them. * `CRAWLERA_FETCH_SHOULD_RETRY` (type `Optional[Callable, str]`, default `None`) - **_Requires Scrapy>=2.5_** - A boolean callable that determines whether a request should be retried by the middleware. If the setting value is a `str`, an attribute by that name will be looked up on the spider object doing the crawl. 
The callable should accept the following arguments: diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 2d606ae..060d1ee 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -23,15 +23,18 @@ try: from scrapy.downloadermiddlewares.retry import get_retry_request except ImportError: - get_retry_request = None + from crawlera_fetch.utils import _get_retry_request as get_retry_request -logger = logging.getLogger("crawlera-fetch-middleware") - +__all__ = [ + "CrawleraFetchException", + "CrawleraFetchMiddleware", + "DownloadSlotPolicy", + "OnError", +] +logger = logging.getLogger("crawlera-fetch-middleware") MiddlewareTypeVar = TypeVar("MiddlewareTypeVar", bound="CrawleraFetchMiddleware") - - META_KEY = "crawlera_fetch" @@ -104,19 +107,13 @@ def _read_settings(self, spider: Spider) -> None: self.on_error_action = OnError.Warn if "CRAWLERA_FETCH_ON_ERROR" in settings: self.on_error_action = settings.get("CRAWLERA_FETCH_ON_ERROR") - if self.on_error_action == OnError.Retry and get_retry_request is None: - logger.warning("Cannot retry on error, Scrapy>=2.5 required") - self.on_error_action = None if self.on_error_action is None: self.on_error_action = OnError.Raise # should retry? self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY") if self.should_retry is not None: - if get_retry_request is None: - logger.warning("Retry feature disabled, Scrapy>=2.5 required") - self.should_retry = None - elif isinstance(self.should_retry, str): + if isinstance(self.should_retry, str): try: self.should_retry = getattr(spider, self.should_retry) except AttributeError: @@ -125,17 +122,16 @@ def _read_settings(self, spider: Spider) -> None: self.should_retry, ) self.should_retry = None - elif not isinstance(self.should_retry, Callable): # type: ignore + elif not isinstance(self.should_retry, Callable): # type: ignore[arg-type] logger.warning( "Invalid type for retry function: expected Callable" " or str, got %s - retries are disabled", type(self.should_retry), ) self.should_retry = None - if self.should_retry is not None: - self.retry_times = settings.getint("CRAWLERA_FETCH_RETRY_TIMES") - if not self.retry_times: - self.retry_times = settings.getint("RETRY_TIMES") + self.retry_times = settings.getint("CRAWLERA_FETCH_RETRY_TIMES") + if not self.retry_times: + self.retry_times = settings.getint("RETRY_TIMES") def spider_opened(self, spider): try: diff --git a/crawlera_fetch/utils.py b/crawlera_fetch/utils.py new file mode 100644 index 0000000..6b20a09 --- /dev/null +++ b/crawlera_fetch/utils.py @@ -0,0 +1,56 @@ +from logging import Logger +from typing import Optional, Union + +from scrapy import Request, Spider +from scrapy.utils.python import global_object_name + + +def _get_retry_request( + request: Request, + *, + spider: Spider, + reason: Union[str, Exception] = "unspecified", + max_retry_times: Optional[int] = None, + priority_adjust: Optional[int] = None, + logger: Logger, + stats_base_key: str, +): + """ + Fallback implementation, taken verbatim from https://github.com/scrapy/scrapy/pull/4902 + """ + settings = spider.crawler.settings + stats = spider.crawler.stats + retry_times = request.meta.get("retry_times", 0) + 1 + if max_retry_times is None: + max_retry_times = request.meta.get("max_retry_times") + if max_retry_times is None: + max_retry_times = settings.getint("RETRY_TIMES") + if retry_times <= max_retry_times: + logger.debug( + "Retrying %(request)s (failed %(retry_times)d times): %(reason)s", + {"request": request, "retry_times": 
retry_times, "reason": reason}, + extra={"spider": spider}, + ) + new_request = request.copy() + new_request.meta["retry_times"] = retry_times + new_request.dont_filter = True + if priority_adjust is None: + priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST") + new_request.priority = request.priority + priority_adjust + + if callable(reason): + reason = reason() + if isinstance(reason, Exception): + reason = global_object_name(reason.__class__) + + stats.inc_value(f"{stats_base_key}/count") + stats.inc_value(f"{stats_base_key}/reason_count/{reason}") + return new_request + else: + stats.inc_value(f"{stats_base_key}/max_reached") + logger.error( + "Gave up retrying %(request)s (failed %(retry_times)d times): " "%(reason)s", + {"request": request, "retry_times": retry_times, "reason": reason}, + extra={"spider": spider}, + ) + return None diff --git a/tox.ini b/tox.ini index 8ffe602..04a1ea2 100644 --- a/tox.ini +++ b/tox.ini @@ -21,7 +21,7 @@ commands = black --check crawlera_fetch tests [testenv:typing] deps = mypy==0.770 basepython = python3.8 -commands = mypy --ignore-missing-imports --follow-imports=skip crawlera_fetch tests +commands = mypy --show-error-codes --ignore-missing-imports --follow-imports=skip crawlera_fetch tests [testenv:py35-pinned] deps = From e5c5feb1e801c475fe8bd18e695573b24020f54e Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 21 Apr 2021 14:00:42 -0300 Subject: [PATCH 07/14] Exclude _utils from coverage report --- .coveragerc | 2 ++ crawlera_fetch/{utils.py => _utils.py} | 0 crawlera_fetch/middleware.py | 2 +- tox.ini | 9 ++++++++- 4 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 .coveragerc rename crawlera_fetch/{utils.py => _utils.py} (100%) diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..7307f76 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,2 @@ +[run] +omit = crawlera_fetch/_utils.py diff --git a/crawlera_fetch/utils.py b/crawlera_fetch/_utils.py similarity index 100% rename from crawlera_fetch/utils.py rename to crawlera_fetch/_utils.py diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index 060d1ee..b1e5595 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -23,7 +23,7 @@ try: from scrapy.downloadermiddlewares.retry import get_retry_request except ImportError: - from crawlera_fetch.utils import _get_retry_request as get_retry_request + from crawlera_fetch._utils import _get_retry_request as get_retry_request __all__ = [ diff --git a/tox.ini b/tox.ini index 04a1ea2..4020719 100644 --- a/tox.ini +++ b/tox.ini @@ -6,7 +6,14 @@ envlist = flake8,black,typing,py35-pinned,py36,py37,py38 deps = -rtests/requirements.txt commands = - py.test --verbose --cov=crawlera_fetch --cov-report=term-missing --cov-report=html --cov-report=xml {posargs: crawlera_fetch tests} + py.test \ + --verbose \ + --cov=crawlera_fetch \ + --cov-config=.coveragerc \ + --cov-report=term-missing \ + --cov-report=html \ + --cov-report=xml \ + {posargs:crawlera_fetch tests} [testenv:flake8] deps = flake8>=3.7.9 From 1e36a0332b6f4de129670638fde60b5282a2f463 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 21 Apr 2021 15:51:17 -0300 Subject: [PATCH 08/14] Retry tests --- tests/data/responses.py | 371 ++++++++++++++++++++++++---------------- tests/test_requests.py | 16 +- tests/test_responses.py | 145 +++++++--------- tests/test_retry.py | 49 ++++++ tests/utils.py | 20 ++- 5 files changed, 352 insertions(+), 249 deletions(-) create mode 100644 tests/test_retry.py diff --git 
a/tests/data/responses.py b/tests/data/responses.py index 3af9ba7..4e86c7a 100644 --- a/tests/data/responses.py +++ b/tests/data/responses.py @@ -10,105 +10,241 @@ from tests.utils import foo_spider, mocked_time -test_responses = [] +def get_test_responses(include_unprocessed=True): + test_responses = [] -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], - status=200, - headers={ - "Content-Type": "application/json", - "Content-Encoding": "gzip", - "Transfer-Encoding": "chunked", - "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - "Proxy-Connection": "close", - "Connection": "close", - }, - request=Request( + test_responses.append( + { + "original": HtmlResponse( url=SETTINGS["CRAWLERA_FETCH_URL"], - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://fake.host.com"), - spider=foo_spider, - ), + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Transfer-Encoding": "chunked", + "Date": "Fri, 24 Apr 2020 18:06:42 GMT", + "Proxy-Connection": "close", + "Connection": "close", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("https://fake.host.com"), + spider=foo_spider, + ), + } + }, + ), + body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""", # noqa: E501 + ), + "expected": TextResponse( + url="https://fake.host.com", + status=123, + headers={"Fake-Header": "true"}, + body=b"""foobar""", # noqa: E501 + ), + } + ) + + test_responses.append( + { + "original": HtmlResponse( + url=SETTINGS["CRAWLERA_FETCH_URL"], + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Transfer-Encoding": "chunked", + "Date": "Fri, 24 Apr 2020 18:06:42 GMT", + "Proxy-Connection": "close", + "Connection": "close", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("https://httpbin.org/get"), + spider=foo_spider, + ), + } + }, + ), + body=b"""{"url":"https://httpbin.org/get","original_status":200,"headers":{"X-Crawlera-Slave":"196.16.27.20:8800","X-Crawlera-Version":"1.43.0-","status":"200","date":"Fri, 24 Apr 2020 18:06:42 GMT","content-type":"application/json","content-length":"756","server":"gunicorn/19.9.0","access-control-allow-origin":"*","access-control-allow-credentials":"true"},"crawlera_status":"success","body_encoding":"plain","body":"
{\\n  \\"args\\": {}, \\n  \\"headers\\": {\\n    \\"Accept\\": \\"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\\", \\n    \\"Accept-Encoding\\": \\"gzip, deflate, br\\", \\n    \\"Accept-Language\\": \\"en-US,en;q=0.9\\", \\n    \\"Cache-Control\\": \\"no-cache\\", \\n    \\"Host\\": \\"httpbin.org\\", \\n    \\"Pragma\\": \\"no-cache\\", \\n    \\"Sec-Fetch-Mode\\": \\"navigate\\", \\n    \\"Sec-Fetch-Site\\": \\"none\\", \\n    \\"Sec-Fetch-User\\": \\"?1\\", \\n    \\"Upgrade-Insecure-Requests\\": \\"1\\", \\n    \\"User-Agent\\": \\"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36\\", \\n    \\"X-Amzn-Trace-Id\\": \\"Root=1-5ea32ab2-93f521ee8238c744c88e3fec\\"\\n  }, \\n  \\"origin\\": \\"173.0.152.100\\", \\n  \\"url\\": \\"https://httpbin.org/get\\"\\n}\\n
"}""", # noqa: E501 + ), + "expected": HtmlResponse( + url="https://httpbin.org/get", + status=200, + headers={ + "X-Crawlera-Slave": "196.16.27.20:8800", + "X-Crawlera-Version": "1.43.0-", + "status": "200", + "date": "Fri, 24 Apr 2020 18:06:42 GMT", + "content-type": "application/json", + "content-length": "756", + "server": "gunicorn/19.9.0", + "access-control-allow-origin": "*", + "access-control-allow-credentials": "true", + }, + body=b"""
{\n  "args": {}, \n  "headers": {\n    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", \n    "Accept-Encoding": "gzip, deflate, br", \n    "Accept-Language": "en-US,en;q=0.9", \n    "Cache-Control": "no-cache", \n    "Host": "httpbin.org", \n    "Pragma": "no-cache", \n    "Sec-Fetch-Mode": "navigate", \n    "Sec-Fetch-Site": "none", \n    "Sec-Fetch-User": "?1", \n    "Upgrade-Insecure-Requests": "1", \n    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36", \n    "X-Amzn-Trace-Id": "Root=1-5ea32ab2-93f521ee8238c744c88e3fec"\n  }, \n  "origin": "173.0.152.100", \n  "url": "https://httpbin.org/get"\n}\n
""", # noqa: E501 + ), + } + ) + + test_responses.append( + { + "original": HtmlResponse( + url=SETTINGS["CRAWLERA_FETCH_URL"], + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Transfer-Encoding": "chunked", + "Date": "Fri, 24 Apr 2020 18:22:10 GMT", + "Proxy-Connection": "close", + "Connection": "close", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("https://example.org"), + spider=foo_spider, + ), + } + }, + ), + body=b"""{"url":"https://example.org","original_status":200,"headers":{"X-Crawlera-Slave":"192.241.80.236:3128","X-Crawlera-Version":"1.43.0-","status":"200","content-encoding":"gzip","accept-ranges":"bytes","age":"108944","cache-control":"max-age=604800","content-type":"text/html; charset=UTF-8","date":"Fri, 24 Apr 2020 18:22:10 GMT","etag":"\\"3147526947\\"","expires":"Fri, 01 May 2020 18:22:10 GMT","last-modified":"Thu, 17 Oct 2019 07:18:26 GMT","server":"ECS (dab/4B85)","vary":"Accept-Encoding","content-length":"648"},"crawlera_status":"success","body_encoding":"plain","body":"\\n Example Domain\\n\\n \\n \\n \\n \\n\\n\\n\\n
\\n

Example Domain

\\n

This domain is for use in illustrative examples in documents. You may use this\\n domain in literature without prior coordination or asking for permission.

\\n

More information...

\\n
\\n\\n\\n"}""", # noqa: E501 + ), + "expected": HtmlResponse( + url="https://example.org", + status=200, + headers={ + "X-Crawlera-Slave": "192.241.80.236:3128", + "X-Crawlera-Version": "1.43.0-", + "status": "200", + "content-encoding": "gzip", + "accept-ranges": "bytes", + "age": "108944", + "cache-control": "max-age=604800", + "content-type": "text/html; charset=UTF-8", + "date": "Fri, 24 Apr 2020 18:22:10 GMT", + "etag": '"3147526947"', + "expires": "Fri, 01 May 2020 18:22:10 GMT", + "last-modified": "Thu, 17 Oct 2019 07:18:26 GMT", + "server": "ECS (dab/4B85)", + "vary": "Accept-Encoding", + "content-length": "648", + }, + body=b"""\n Example Domain\n\n \n \n \n \n\n\n\n
\n

Example Domain

\n

This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.

\n

More information...

\n
\n\n\n""", # noqa: E501 + ), + } + ) + + response_body_test = b"Hello middleware test!" + test_responses.append( + { + "original": HtmlResponse( + url=SETTINGS["CRAWLERA_FETCH_URL"], + status=200, + headers={ + "Content-Type": "application/json", + "Content-Encoding": "gzip", + "Date": "Fri, 24 Apr 2020 18:22:10 GMT", + }, + request=Request( + url=SETTINGS["CRAWLERA_FETCH_URL"], + meta={ + "crawlera_fetch": { + "timing": {"start_ts": mocked_time()}, + "original_request": request_to_dict( + Request("http://httpbin.org/ip"), + spider=foo_spider, + ), + } + }, + ), + body=json.dumps( + { + "id": "8498642e-1de3-40dd-b32f-f6eb6131e45e", + "body": base64.b64encode(response_body_test).decode(), + "original_status": 200, + "url": "http://httpbin.org/ip", + "original_url": "http://httpbin.org/ip", + "headers": { + "Content-Encoding": "gzip", + "Content-Type": "text/html", + "Date": "Fri, 24 Apr 2020 18:06:42 GMT", + }, } + ).encode(), + ), + "expected": HtmlResponse( + url="http://httpbin.org/ip", + status=200, + headers={ + "content-encoding": "gzip", + "content-type": "text/html", + "date": "Fri, 24 Apr 2020 18:06:42 GMT", }, + body=response_body_test, ), - body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""", # noqa: E501 - ), - "expected": TextResponse( - url="https://fake.host.com", - status=123, - headers={"Fake-Header": "true"}, - body=b"""foobar""", # noqa: E501 - ), - } -) + } + ) -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], + if include_unprocessed: + unprocessed = HtmlResponse( + url="https://example.org", status=200, headers={ - "Content-Type": "application/json", + "Content-Type": "text/html", "Content-Encoding": "gzip", "Transfer-Encoding": "chunked", "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - "Proxy-Connection": "close", - "Connection": "close", }, + request=Request("https://example.org"), + body=b"""""", + ) + test_responses.append({"original": unprocessed, "expected": unprocessed}) + + return test_responses + + +def get_test_responses_error(): + return [ + TextResponse( + url="https://crawlera.com/fake/api/endpoint", request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url="https://crawlera.com/fake/api/endpoint", meta={ "crawlera_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( - Request("https://httpbin.org/get"), + Request("https://example.org"), spider=foo_spider, ), } }, ), - body=b"""{"url":"https://httpbin.org/get","original_status":200,"headers":{"X-Crawlera-Slave":"196.16.27.20:8800","X-Crawlera-Version":"1.43.0-","status":"200","date":"Fri, 24 Apr 2020 18:06:42 GMT","content-type":"application/json","content-length":"756","server":"gunicorn/19.9.0","access-control-allow-origin":"*","access-control-allow-credentials":"true"},"crawlera_status":"success","body_encoding":"plain","body":"
{\\n  \\"args\\": {}, \\n  \\"headers\\": {\\n    \\"Accept\\": \\"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9\\", \\n    \\"Accept-Encoding\\": \\"gzip, deflate, br\\", \\n    \\"Accept-Language\\": \\"en-US,en;q=0.9\\", \\n    \\"Cache-Control\\": \\"no-cache\\", \\n    \\"Host\\": \\"httpbin.org\\", \\n    \\"Pragma\\": \\"no-cache\\", \\n    \\"Sec-Fetch-Mode\\": \\"navigate\\", \\n    \\"Sec-Fetch-Site\\": \\"none\\", \\n    \\"Sec-Fetch-User\\": \\"?1\\", \\n    \\"Upgrade-Insecure-Requests\\": \\"1\\", \\n    \\"User-Agent\\": \\"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36\\", \\n    \\"X-Amzn-Trace-Id\\": \\"Root=1-5ea32ab2-93f521ee8238c744c88e3fec\\"\\n  }, \\n  \\"origin\\": \\"173.0.152.100\\", \\n  \\"url\\": \\"https://httpbin.org/get\\"\\n}\\n
"}""", # noqa: E501 - ), - "expected": HtmlResponse( - url="https://httpbin.org/get", - status=200, - headers={ - "X-Crawlera-Slave": "196.16.27.20:8800", - "X-Crawlera-Version": "1.43.0-", - "status": "200", - "date": "Fri, 24 Apr 2020 18:06:42 GMT", - "content-type": "application/json", - "content-length": "756", - "server": "gunicorn/19.9.0", - "access-control-allow-origin": "*", - "access-control-allow-credentials": "true", - }, - body=b"""
{\n  "args": {}, \n  "headers": {\n    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", \n    "Accept-Encoding": "gzip, deflate, br", \n    "Accept-Language": "en-US,en;q=0.9", \n    "Cache-Control": "no-cache", \n    "Host": "httpbin.org", \n    "Pragma": "no-cache", \n    "Sec-Fetch-Mode": "navigate", \n    "Sec-Fetch-Site": "none", \n    "Sec-Fetch-User": "?1", \n    "Upgrade-Insecure-Requests": "1", \n    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.44 Safari/537.36", \n    "X-Amzn-Trace-Id": "Root=1-5ea32ab2-93f521ee8238c744c88e3fec"\n  }, \n  "origin": "173.0.152.100", \n  "url": "https://httpbin.org/get"\n}\n
""", # noqa: E501 - ), - } -) - -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], - status=200, headers={ - "Content-Type": "application/json", - "Content-Encoding": "gzip", - "Transfer-Encoding": "chunked", - "Date": "Fri, 24 Apr 2020 18:22:10 GMT", + "X-Crawlera-Error": "bad_proxy_auth", + "Proxy-Authenticate": 'Basic realm="Crawlera"', + "Content-Length": "0", + "Date": "Mon, 04 May 2020 13:06:15 GMT", "Proxy-Connection": "close", "Connection": "close", }, + ), + TextResponse( + url="https://crawlera.com/fake/api/endpoint", request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url="https://crawlera.com/fake/api/endpoint", meta={ "crawlera_fetch": { "timing": {"start_ts": mocked_time()}, @@ -119,65 +255,17 @@ } }, ), - body=b"""{"url":"https://example.org","original_status":200,"headers":{"X-Crawlera-Slave":"192.241.80.236:3128","X-Crawlera-Version":"1.43.0-","status":"200","content-encoding":"gzip","accept-ranges":"bytes","age":"108944","cache-control":"max-age=604800","content-type":"text/html; charset=UTF-8","date":"Fri, 24 Apr 2020 18:22:10 GMT","etag":"\\"3147526947\\"","expires":"Fri, 01 May 2020 18:22:10 GMT","last-modified":"Thu, 17 Oct 2019 07:18:26 GMT","server":"ECS (dab/4B85)","vary":"Accept-Encoding","content-length":"648"},"crawlera_status":"success","body_encoding":"plain","body":"\\n Example Domain\\n\\n \\n \\n \\n \\n\\n\\n\\n
\\n

Example Domain

\\n

This domain is for use in illustrative examples in documents. You may use this\\n domain in literature without prior coordination or asking for permission.

\\n

More information...

\\n
\\n\\n\\n"}""", # noqa: E501 + body=b'{"Bad": "JSON', ), - "expected": HtmlResponse( - url="https://example.org", - status=200, - headers={ - "X-Crawlera-Slave": "192.241.80.236:3128", - "X-Crawlera-Version": "1.43.0-", - "status": "200", - "content-encoding": "gzip", - "accept-ranges": "bytes", - "age": "108944", - "cache-control": "max-age=604800", - "content-type": "text/html; charset=UTF-8", - "date": "Fri, 24 Apr 2020 18:22:10 GMT", - "etag": '"3147526947"', - "expires": "Fri, 01 May 2020 18:22:10 GMT", - "last-modified": "Thu, 17 Oct 2019 07:18:26 GMT", - "server": "ECS (dab/4B85)", - "vary": "Accept-Encoding", - "content-length": "648", - }, - body=b"""\n Example Domain\n\n \n \n \n \n\n\n\n
\n

Example Domain

\n

This domain is for use in illustrative examples in documents. You may use this\n domain in literature without prior coordination or asking for permission.

\n

More information...

\n
\n\n\n""", # noqa: E501 - ), - } -) - -unprocessed = HtmlResponse( - url="https://example.org", - status=200, - headers={ - "Content-Type": "text/html", - "Content-Encoding": "gzip", - "Transfer-Encoding": "chunked", - "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - }, - request=Request("https://example.org"), - body=b"""""", -) -test_responses.append({"original": unprocessed, "expected": unprocessed}) - -response_body_test = b"Hello middleware test!" -test_responses.append( - { - "original": HtmlResponse( - url=SETTINGS["CRAWLERA_FETCH_URL"], - status=200, - headers={ - "Content-Type": "application/json", - "Content-Encoding": "gzip", - "Date": "Fri, 24 Apr 2020 18:22:10 GMT", - }, + TextResponse( + url="https://crawlera.com/fake/api/endpoint", request=Request( - url=SETTINGS["CRAWLERA_FETCH_URL"], + url="https://crawlera.com/fake/api/endpoint", meta={ "crawlera_fetch": { "timing": {"start_ts": mocked_time()}, "original_request": request_to_dict( - Request("http://httpbin.org/ip"), + Request("https://example.org"), spider=foo_spider, ), } @@ -185,28 +273,15 @@ ), body=json.dumps( { - "id": "8498642e-1de3-40dd-b32f-f6eb6131e45e", - "body": base64.b64encode(response_body_test).decode(), - "original_status": 200, - "url": "http://httpbin.org/ip", - "original_url": "http://httpbin.org/ip", - "headers": { - "Content-Encoding": "gzip", - "Content-Type": "text/html", - "Date": "Fri, 24 Apr 2020 18:06:42 GMT", - }, + "url": "https://example.org", + "original_status": 503, + "headers": {}, + "crawlera_status": "fail", + "crawlera_error": "serverbusy", + "body_encoding": "plain", + "body": "Server busy: too many outstanding requests", } - ).encode(), - ), - "expected": HtmlResponse( - url="http://httpbin.org/ip", - status=200, - headers={ - "content-encoding": "gzip", - "content-type": "text/html", - "date": "Fri, 24 Apr 2020 18:06:42 GMT", - }, - body=response_body_test, + ), + encoding="utf8", ), - } -) + ] diff --git a/tests/test_requests.py b/tests/test_requests.py index b36423a..c7bbcc3 100644 --- a/tests/test_requests.py +++ b/tests/test_requests.py @@ -1,6 +1,4 @@ import json -import os -from contextlib import contextmanager from unittest.mock import patch from scrapy import Request @@ -8,19 +6,7 @@ from crawlera_fetch import DownloadSlotPolicy from tests.data.requests import get_test_requests -from tests.utils import foo_spider, get_test_middleware, mocked_time - - -@contextmanager -def shub_jobkey_env_variable(): - SHUB_JOBKEY_OLD = os.environ.get("SHUB_JOBKEY") - os.environ["SHUB_JOBKEY"] = "1/2/3" - try: - yield - finally: - del os.environ["SHUB_JOBKEY"] - if SHUB_JOBKEY_OLD: - os.environ["SHUB_JOBKEY"] = SHUB_JOBKEY_OLD +from tests.utils import foo_spider, get_test_middleware, mocked_time, shub_jobkey_env_variable def test_process_request_disabled(): diff --git a/tests/test_responses.py b/tests/test_responses.py index 2ddf87b..53670f4 100644 --- a/tests/test_responses.py +++ b/tests/test_responses.py @@ -3,18 +3,17 @@ import pytest from scrapy import Request from scrapy.http.response.text import TextResponse -from scrapy.utils.reqser import request_to_dict from testfixtures import LogCapture -from crawlera_fetch.middleware import CrawleraFetchException +from crawlera_fetch.middleware import CrawleraFetchException, OnError -from tests.data.responses import test_responses -from tests.utils import foo_spider, get_test_middleware, mocked_time +from tests.data.responses import get_test_responses, get_test_responses_error +from tests.utils import foo_spider, get_test_middleware def 
test_process_response_disabled(): middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ENABLED": False}) - for case in test_responses: + for case in get_test_responses(): response = case["original"] assert middleware.process_response(response.request, response, foo_spider) is response @@ -22,7 +21,7 @@ def test_process_response_disabled(): def test_process_response(): middleware = get_test_middleware() - for case in test_responses: + for case in get_test_responses(): original = case["original"] expected = case["expected"] @@ -64,89 +63,65 @@ def test_process_response_skip(): def test_process_response_error(): - response_list = [ - TextResponse( - url="https://crawlera.com/fake/api/endpoint", - request=Request( - url="https://crawlera.com/fake/api/endpoint", - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://example.org"), - spider=foo_spider, - ), - } - }, - ), - headers={ - "X-Crawlera-Error": "bad_proxy_auth", - "Proxy-Authenticate": 'Basic realm="Crawlera"', - "Content-Length": "0", - "Date": "Mon, 04 May 2020 13:06:15 GMT", - "Proxy-Connection": "close", - "Connection": "close", - }, + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Raise}) + for response in get_test_responses_error(): + with pytest.raises(CrawleraFetchException): + middleware.process_response(response.request, response, foo_spider) + + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + + +def test_process_response_warn(): + with LogCapture() as logs: + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Warn}) + for response in get_test_responses_error(): + processed = middleware.process_response(response.request, response, foo_spider) + assert response is processed + + logs.check_present( + ( + "crawlera-fetch-middleware", + "WARNING", + "Error downloading (status: 200, X-Crawlera-Error header: bad_proxy_auth)", # noqa: E501 ), - TextResponse( - url="https://crawlera.com/fake/api/endpoint", - request=Request( - url="https://crawlera.com/fake/api/endpoint", - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://example.org"), - spider=foo_spider, - ), - } - }, - ), - body=b'{"Bad": "JSON', + ( + "crawlera-fetch-middleware", + "WARNING", + "Error decoding (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)", # noqa: E501 ), - TextResponse( - url="https://crawlera.com/fake/api/endpoint", - request=Request( - url="https://crawlera.com/fake/api/endpoint", - meta={ - "crawlera_fetch": { - "timing": {"start_ts": mocked_time()}, - "original_request": request_to_dict( - Request("https://example.org"), - spider=foo_spider, - ), - } - }, - ), - body=json.dumps( - { - "url": "https://example.org", - "original_status": 503, - "headers": {}, - "crawlera_status": "fail", - "crawlera_error": "serverbusy", - "body_encoding": "plain", - "body": "Server busy: too many outstanding requests", - } - ), - encoding="utf8", + ( + "crawlera-fetch-middleware", + "WARNING", + "Error downloading (Original status: 503, Fetch API error message: Server busy: too many outstanding requests, Request ID: unknown)", # noqa: E501 ), - ] 
+ ) - middleware_raise = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True}) - for response in response_list: + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + + +def test_process_response_error_deprecated(): + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True}) + for response in get_test_responses_error(): with pytest.raises(CrawleraFetchException): - middleware_raise.process_response(response.request, response, foo_spider) + middleware.process_response(response.request, response, foo_spider) + + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error") == 3 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 - assert middleware_raise.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 - middleware_log = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False}) +def test_process_response_warn_deprecated(): with LogCapture() as logs: - for response in response_list: - processed = middleware_log.process_response(response.request, response, foo_spider) + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False}) + for response in get_test_responses_error(): + processed = middleware.process_response(response.request, response, foo_spider) assert response is processed logs.check_present( @@ -167,7 +142,7 @@ def test_process_response_error(): ), ) - assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 3 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 - assert middleware_log.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error") == 3 + assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1 + assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1 diff --git a/tests/test_retry.py b/tests/test_retry.py new file mode 100644 index 0000000..97687a9 --- /dev/null +++ b/tests/test_retry.py @@ -0,0 +1,49 @@ +from scrapy import Request + +from crawlera_fetch.middleware import OnError + +from tests.data.responses import get_test_responses, get_test_responses_error +from tests.utils import foo_spider, get_test_middleware + + +def test_process_response_should_retry_function(): + def retry_function(response, request, spider): + return True + + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": retry_function}) + for case in get_test_responses(include_unprocessed=False): + response = case["original"] + result = 
middleware.process_response(response.request, response, foo_spider) + assert isinstance(result, Request) + assert result.url == response.request.url + + base_key = "crawlera_fetch/retry/should-retry" + assert middleware.stats.get_value(f"{base_key}/count") == 4 + assert middleware.stats.get_value(f"{base_key}/reason_count/should-retry") == 4 + + +def test_process_response_should_retry_spider_method(): + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": "should_retry"}) + for case in get_test_responses(include_unprocessed=False): + response = case["original"] + result = middleware.process_response(response.request, response, foo_spider) + assert isinstance(result, Request) + assert result.url == response.request.url + + base_key = "crawlera_fetch/retry/should-retry" + assert middleware.stats.get_value(f"{base_key}/count") == 4 + assert middleware.stats.get_value(f"{base_key}/reason_count/should-retry") == 4 + + +def test_process_response_error(): + middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Retry}) + for response in get_test_responses_error(): + result = middleware.process_response(response.request, response, foo_spider) + assert isinstance(result, Request) + assert result.url == response.request.url + + base_key = "crawlera_fetch/retry/error" + assert middleware.stats.get_value(f"{base_key}/count") == 3 + assert middleware.stats.get_value(f"{base_key}/reason_count/bad_proxy_auth") == 1 + assert middleware.stats.get_value(f"{base_key}/reason_count/json.decoder.JSONDecodeError") == 1 + assert middleware.stats.get_value(f"{base_key}/reason_count/serverbusy") == 1 diff --git a/tests/utils.py b/tests/utils.py index 6b010c9..959f023 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,5 +1,7 @@ -from urllib.parse import urlparse +import os +from contextlib import contextmanager from unittest.mock import Mock +from urllib.parse import urlparse from scrapy import Spider from scrapy.utils.test import get_crawler @@ -27,6 +29,9 @@ class FooSpider(Spider): def foo_callback(self, response): pass + def should_retry(self, response, request, spider): + return True + foo_spider = FooSpider() @@ -37,6 +42,7 @@ def get_test_middleware(settings=None): foo_spider = FooSpider() foo_spider.crawler = get_crawler(FooSpider, settings_dict=settings_dict) + foo_spider.crawler.spider = foo_spider foo_spider.crawler.engine = MockEngine() middleware = CrawleraFetchMiddleware.from_crawler(foo_spider.crawler) @@ -46,3 +52,15 @@ def get_test_middleware(settings=None): mocked_time = Mock(return_value=1234567890.123) + + +@contextmanager +def shub_jobkey_env_variable(): + SHUB_JOBKEY_OLD = os.environ.get("SHUB_JOBKEY") + os.environ["SHUB_JOBKEY"] = "1/2/3" + try: + yield + finally: + del os.environ["SHUB_JOBKEY"] + if SHUB_JOBKEY_OLD: + os.environ["SHUB_JOBKEY"] = SHUB_JOBKEY_OLD From c25600a2e76cdb5ed4651edb642e25c219b21f35 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 21 Apr 2021 19:11:04 -0300 Subject: [PATCH 09/14] Add missing type hint --- crawlera_fetch/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawlera_fetch/_utils.py b/crawlera_fetch/_utils.py index 6b20a09..a85d042 100644 --- a/crawlera_fetch/_utils.py +++ b/crawlera_fetch/_utils.py @@ -14,7 +14,7 @@ def _get_retry_request( priority_adjust: Optional[int] = None, logger: Logger, stats_base_key: str, -): +) -> Optional[Request]: """ Fallback implementation, taken verbatim from https://github.com/scrapy/scrapy/pull/4902 """ From 
022f1fc4ba6e6a17c3798a36f71aa2358b002adc Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 21 Apr 2021 19:12:32 -0300 Subject: [PATCH 10/14] Add another missing type hint --- crawlera_fetch/middleware.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py index b1e5595..672819f 100644 --- a/crawlera_fetch/middleware.py +++ b/crawlera_fetch/middleware.py @@ -133,7 +133,7 @@ def _read_settings(self, spider: Spider) -> None: if not self.retry_times: self.retry_times = settings.getint("RETRY_TIMES") - def spider_opened(self, spider): + def spider_opened(self, spider: Spider) -> None: try: spider_attr = getattr(spider, "crawlera_fetch_enabled") except AttributeError: From dcd4efa0e7a39b100699aed716348270c6e4e597 Mon Sep 17 00:00:00 2001 From: Eugenio Lacuesta Date: Wed, 21 Apr 2021 19:24:55 -0300 Subject: [PATCH 11/14] py35 compliance --- crawlera_fetch/_utils.py | 11 +++++++---- tests/test_retry.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/crawlera_fetch/_utils.py b/crawlera_fetch/_utils.py index a85d042..f33cf6e 100644 --- a/crawlera_fetch/_utils.py +++ b/crawlera_fetch/_utils.py @@ -5,6 +5,8 @@ from scrapy.utils.python import global_object_name +# disable black formatting to avoid syntax error on py35 +# fmt: off def _get_retry_request( request: Request, *, @@ -13,8 +15,9 @@ def _get_retry_request( max_retry_times: Optional[int] = None, priority_adjust: Optional[int] = None, logger: Logger, - stats_base_key: str, + stats_base_key: str # black wants to put a comma at the end, but py35 doesn't like it ) -> Optional[Request]: + # fmt: on """ Fallback implementation, taken verbatim from https://github.com/scrapy/scrapy/pull/4902 """ @@ -43,11 +46,11 @@ def _get_retry_request( if isinstance(reason, Exception): reason = global_object_name(reason.__class__) - stats.inc_value(f"{stats_base_key}/count") - stats.inc_value(f"{stats_base_key}/reason_count/{reason}") + stats.inc_value("{}/count".format(stats_base_key)) + stats.inc_value("{}/reason_count/{}".format(stats_base_key, reason)) return new_request else: - stats.inc_value(f"{stats_base_key}/max_reached") + stats.inc_value("{}/max_reached".format(stats_base_key)) logger.error( "Gave up retrying %(request)s (failed %(retry_times)d times): " "%(reason)s", {"request": request, "retry_times": retry_times, "reason": reason}, diff --git a/tests/test_retry.py b/tests/test_retry.py index 97687a9..d227dd7 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -18,8 +18,8 @@ def retry_function(response, request, spider): assert result.url == response.request.url base_key = "crawlera_fetch/retry/should-retry" - assert middleware.stats.get_value(f"{base_key}/count") == 4 - assert middleware.stats.get_value(f"{base_key}/reason_count/should-retry") == 4 + assert middleware.stats.get_value(base_key + "/count") == 4 + assert middleware.stats.get_value(base_key + "/reason_count/should-retry") == 4 def test_process_response_should_retry_spider_method(): @@ -31,8 +31,8 @@ def test_process_response_should_retry_spider_method(): assert result.url == response.request.url base_key = "crawlera_fetch/retry/should-retry" - assert middleware.stats.get_value(f"{base_key}/count") == 4 - assert middleware.stats.get_value(f"{base_key}/reason_count/should-retry") == 4 + assert middleware.stats.get_value(base_key + "/count") == 4 + assert middleware.stats.get_value(base_key + "/reason_count/should-retry") == 4 def test_process_response_error(): @@ 
From 872dcda3b77d6ccc06f15e9de92785ca4046de50 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta
Date: Wed, 21 Apr 2021 19:39:06 -0300
Subject: [PATCH 12/14] Full diff coverage

---
 crawlera_fetch/middleware.py |  4 +--
 tests/test_retry.py          | 61 +++++++++++++++++++++++++++++++++++-
 2 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py
index 672819f..a2707a7 100644
--- a/crawlera_fetch/middleware.py
+++ b/crawlera_fetch/middleware.py
@@ -118,14 +118,14 @@ def _read_settings(self, spider: Spider) -> None:
                     self.should_retry = getattr(spider, self.should_retry)
                 except AttributeError:
                     logger.warning(
-                        "Could not find a '%s' callable on the spider - retries are disabled",
+                        "Could not find a '%s' callable on the spider - user retries are disabled",
                         self.should_retry,
                     )
                     self.should_retry = None
             elif not isinstance(self.should_retry, Callable):  # type: ignore[arg-type]
                 logger.warning(
                     "Invalid type for retry function: expected Callable"
-                    " or str, got %s - retries are disabled",
+                    " or str, got %s - user retries are disabled",
                     type(self.should_retry),
                 )
                 self.should_retry = None

diff --git a/tests/test_retry.py b/tests/test_retry.py
index d227dd7..b255020 100644
--- a/tests/test_retry.py
+++ b/tests/test_retry.py
@@ -1,4 +1,6 @@
-from scrapy import Request
+from scrapy.http.request import Request
+from scrapy.http.response import Response
+from testfixtures import LogCapture

 from crawlera_fetch.middleware import OnError

@@ -11,6 +13,7 @@ def retry_function(response, request, spider):
         return True

     middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": retry_function})
+    assert middleware.should_retry is not None
     for case in get_test_responses(include_unprocessed=False):
         response = case["original"]
         result = middleware.process_response(response.request, response, foo_spider)
@@ -24,6 +27,7 @@ def retry_function(response, request, spider):

 def test_process_response_should_retry_spider_method():
     middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": "should_retry"})
+    assert middleware.should_retry is not None
     for case in get_test_responses(include_unprocessed=False):
         response = case["original"]
         result = middleware.process_response(response.request, response, foo_spider)
@@ -35,6 +39,61 @@ def test_process_response_should_retry_spider_method():
     assert middleware.stats.get_value(base_key + "/reason_count/should-retry") == 4


+def test_process_response_should_retry_spider_method_non_existent():
+    with LogCapture() as logs:
+        middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": "not_found"})
+
+    assert middleware.should_retry is None
+    logs.check_present(
+        (
+            "crawlera-fetch-middleware",
+            "WARNING",
+            "Could not find a 'not_found' callable on the spider - user retries are disabled",
+        )
+    )
+
+    for case in get_test_responses(include_unprocessed=False):
+        response = case["original"]
+        expected = case["expected"]
+        result = middleware.process_response(response.request, response, foo_spider)
+        assert isinstance(result, Response)
+        assert type(result) is type(expected)
+        assert result.url == expected.url
+        assert result.status == expected.status
+
+    base_key = "crawlera_fetch/retry/should-retry"
+    assert middleware.stats.get_value(base_key + "/count") is None
+    assert middleware.stats.get_value(base_key + "/reason_count/should-retry") is None
+
+
+def test_process_response_should_retry_invalid_type():
+    with LogCapture() as logs:
+        middleware = get_test_middleware(settings={"CRAWLERA_FETCH_SHOULD_RETRY": 1})
+
+    assert middleware.should_retry is None
+    logs.check_present(
+        (
+            "crawlera-fetch-middleware",
+            "WARNING",
+            "Invalid type for retry function: expected Callable"
+            " or str, got <class 'int'> - user retries are disabled",
+        )
+    )
+
+    for case in get_test_responses(include_unprocessed=False):
+        response = case["original"]
+        expected = case["expected"]
+        result = middleware.process_response(response.request, response, foo_spider)
+        assert isinstance(result, Response)
+        assert type(result) is type(expected)
+        assert result.url == expected.url
+        assert result.status == expected.status
+
+    base_key = "crawlera_fetch/retry/should-retry"
+    assert middleware.stats.get_value(base_key + "/count") is None
+    assert middleware.stats.get_value(base_key + "/reason_count/should-retry") is None
+
+
 def test_process_response_error():
     middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Retry})
     for response in get_test_responses_error():
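The three code paths pinned down by these tests map directly onto the documented forms of `CRAWLERA_FETCH_SHOULD_RETRY`: a callable, a string naming a spider attribute, or anything else (which disables user retries with a warning). As a usage sketch, assuming a hypothetical spider (its name, predicate logic, and status codes are illustrative, not part of the patch), the string form resolves via `getattr` on the running spider:

```python
from scrapy import Spider
from scrapy.http.request import Request
from scrapy.http.response import Response


class QuotesSpider(Spider):  # hypothetical spider, for illustration only
    name = "quotes"
    custom_settings = {
        "CRAWLERA_FETCH_ENABLED": True,
        # A str is resolved via getattr(spider, "should_retry") when the
        # middleware reads its settings; passing the callable itself works
        # the same way. Any other type triggers the "Invalid type for retry
        # function" warning asserted in the tests above.
        "CRAWLERA_FETCH_SHOULD_RETRY": "should_retry",
        "CRAWLERA_FETCH_RETRY_TIMES": 3,
    }

    def should_retry(self, response: Response, request: Request, spider: Spider) -> bool:
        # Retry when the upstream site answered with a server error.
        return response.status in {500, 503}
```

Resolving the predicate once at settings-read time, rather than per response, is what lets the two invalid cases above fail fast with a single warning instead of erroring on every response.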
From 9b9560ed729a764693753a7928710b704cda38e3 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta
Date: Fri, 5 Nov 2021 11:59:03 -0300
Subject: [PATCH 13/14] Comment about excluding upstream code from test coverage reports

---
 .coveragerc                  | 2 +-
 crawlera_fetch/middleware.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.coveragerc b/.coveragerc
index 7307f76..8dd2a30 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,2 +1,2 @@
 [run]
-omit = crawlera_fetch/_utils.py
+omit = crawlera_fetch/_utils.py  # already tested in upstream Scrapy

diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py
index a2707a7..f805c16 100644
--- a/crawlera_fetch/middleware.py
+++ b/crawlera_fetch/middleware.py
@@ -21,7 +21,7 @@
 from w3lib.http import basic_auth_header

 try:
-    from scrapy.downloadermiddlewares.retry import get_retry_request
+    from scrapy.downloadermiddlewares.retry import get_retry_request  # available on Scrapy >= 2.5
 except ImportError:
     from crawlera_fetch._utils import _get_retry_request as get_retry_request
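The comment added here documents a standard guarded-import shim. A small self-contained sketch of the same idea, for reference (the `IMPL` marker is illustrative and not part of the middleware):

```python
try:
    # Scrapy >= 2.5 ships the helper; prefer the upstream implementation
    from scrapy.downloadermiddlewares.retry import get_retry_request
    IMPL = "scrapy"
except ImportError:
    # Older Scrapy: fall back to the vendored copy, re-bound under the
    # upstream name so call sites never need a version check
    from crawlera_fetch._utils import _get_retry_request as get_retry_request
    IMPL = "vendored"

print("using the {} implementation of get_retry_request".format(IMPL))
```

Re-binding the vendored copy under the upstream name keeps the rest of the middleware free of version checks, which is also why `.coveragerc` can exclude `crawlera_fetch/_utils.py` wholesale: that code is already exercised by upstream Scrapy's own test suite.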
From c77983ee4a5716548eddc03ae70c3718226d6e02 Mon Sep 17 00:00:00 2001
From: Eugenio Lacuesta
Date: Fri, 5 Nov 2021 12:21:50 -0300
Subject: [PATCH 14/14] Validate CRAWLERA_FETCH_ON_ERROR setting

---
 crawlera_fetch/__init__.py   |  2 +-
 crawlera_fetch/middleware.py | 19 ++++++++++++++++---
 tests/test_responses.py      | 21 +++++++++++++++++++++
 3 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/crawlera_fetch/__init__.py b/crawlera_fetch/__init__.py
index f04b93b..e87e23b 100644
--- a/crawlera_fetch/__init__.py
+++ b/crawlera_fetch/__init__.py
@@ -1,2 +1,2 @@
 from .logformatter import CrawleraFetchLogFormatter  # noqa: F401
-from .middleware import CrawleraFetchMiddleware, DownloadSlotPolicy  # noqa: F401
+from .middleware import CrawleraFetchMiddleware, DownloadSlotPolicy, OnError  # noqa: F401

diff --git a/crawlera_fetch/middleware.py b/crawlera_fetch/middleware.py
index f805c16..b0e871f 100644
--- a/crawlera_fetch/middleware.py
+++ b/crawlera_fetch/middleware.py
@@ -92,7 +92,7 @@ def _read_settings(self, spider: Spider) -> None:
         )
         self.default_args = settings.getdict("CRAWLERA_FETCH_DEFAULT_ARGS", {})

-        # what to do when an error hapepns?
+        # what to do when errors happen?
        self.on_error_action = None  # type: Optional[OnError]
         if "CRAWLERA_FETCH_RAISE_ON_ERROR" in settings:
             warnings.warn(
@@ -106,11 +106,18 @@ def _read_settings(self, spider: Spider) -> None:
             else:
                 self.on_error_action = OnError.Warn
         if "CRAWLERA_FETCH_ON_ERROR" in settings:
-            self.on_error_action = settings.get("CRAWLERA_FETCH_ON_ERROR")
+            if isinstance(settings["CRAWLERA_FETCH_ON_ERROR"], OnError):
+                self.on_error_action = settings["CRAWLERA_FETCH_ON_ERROR"]
+            else:
+                logger.warning(
+                    "Invalid type for CRAWLERA_FETCH_ON_ERROR setting:"
+                    " expected crawlera_fetch.OnError, got %s",
+                    type(settings["CRAWLERA_FETCH_ON_ERROR"]),
+                )
         if self.on_error_action is None:
             self.on_error_action = OnError.Raise

-        # should retry?
+        # should we retry?
         self.should_retry = settings.get("CRAWLERA_FETCH_SHOULD_RETRY")
         if self.should_retry is not None:
             if isinstance(self.should_retry, str):
@@ -273,6 +280,8 @@ def process_response(self, request: Request, response: Response, spider: Spider)
                    reason=message,
                    stats_base_key="crawlera_fetch/retry/error",
                )
+            else:
+                raise Exception("Invalid CRAWLERA_FETCH_ON_ERROR setting")

         try:
             json_response = json.loads(response.text)
@@ -299,6 +308,8 @@ def process_response(self, request: Request, response: Response, spider: Spider)
                    reason=exc,
                    stats_base_key="crawlera_fetch/retry/error",
                )
+            else:
+                raise Exception("Invalid CRAWLERA_FETCH_ON_ERROR setting")

         original_status = json_response.get("original_status")
         self.stats.inc_value("crawlera_fetch/response_status_count/{}".format(original_status))
@@ -331,6 +342,8 @@ def process_response(self, request: Request, response: Response, spider: Spider)
                    reason=server_error,
                    stats_base_key="crawlera_fetch/retry/error",
                )
+            else:
+                raise Exception("Invalid CRAWLERA_FETCH_ON_ERROR setting")

         crawlera_meta["upstream_response"] = {
             "status": response.status,

diff --git a/tests/test_responses.py b/tests/test_responses.py
index 53670f4..0aea814 100644
--- a/tests/test_responses.py
+++ b/tests/test_responses.py
@@ -74,6 +74,27 @@ def test_process_response_error():
     assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1


+def test_process_response_error_default_raise():
+    with LogCapture() as logs:
+        middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": "invalid"})
+    logs.check_present(
+        (
+            "crawlera-fetch-middleware",
+            "WARNING",
+            "Invalid type for CRAWLERA_FETCH_ON_ERROR setting: expected crawlera_fetch.OnError, got <class 'str'>",  # noqa: E501
+        ),
+    )
+
+    for response in get_test_responses_error():
+        with pytest.raises(CrawleraFetchException):
+            middleware.process_response(response.request, response, foo_spider)
+
+    assert middleware.stats.get_value("crawlera_fetch/response_error") == 3
+    assert middleware.stats.get_value("crawlera_fetch/response_error/bad_proxy_auth") == 1
+    assert middleware.stats.get_value("crawlera_fetch/response_error/JSONDecodeError") == 1
+    assert middleware.stats.get_value("crawlera_fetch/response_error/serverbusy") == 1
+
+
 def test_process_response_warn():
     with LogCapture() as logs:
         middleware = get_test_middleware(settings={"CRAWLERA_FETCH_ON_ERROR": OnError.Warn})