From 9caf83d39d3d95173a27b989a6fdf8d5b778b4f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 5 Apr 2019 07:04:46 +0200 Subject: [PATCH] Fail gracefully on SPLASH_URL without protocol --- scrapy_splash/middleware.py | 6 ++++++ tests/test_middleware.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/scrapy_splash/middleware.py b/scrapy_splash/middleware.py index 24ab23a..0325648 100644 --- a/scrapy_splash/middleware.py +++ b/scrapy_splash/middleware.py @@ -4,6 +4,7 @@ import copy import json import logging +import re import warnings from collections import defaultdict @@ -233,6 +234,11 @@ def __init__(self, crawler, splash_base_url, slot_policy, log_400): def from_crawler(cls, crawler): splash_base_url = crawler.settings.get('SPLASH_URL', cls.default_splash_url) + if not re.match('^https?://', splash_base_url): + raise NotConfigured( + 'The SPLASH_URL setting does not start with http:// or ' + 'https://: {}'.format(splash_base_url) + ) log_400 = crawler.settings.getbool('SPLASH_LOG_400', True) slot_policy = crawler.settings.get('SPLASH_SLOT_POLICY', cls.default_policy) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 66b79ce..32f6bd2 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -4,8 +4,10 @@ import json import base64 +from pytest import raises import scrapy from scrapy.core.engine import ExecutionEngine +from scrapy.exceptions import NotConfigured from scrapy.utils.test import get_crawler from scrapy.http import Response, TextResponse from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware @@ -765,3 +767,15 @@ def test_adjust_timeout(): }) req2 = mw.process_request(req2, None) assert req2.meta['download_timeout'] == 30 + + +def test_bad_splash_url(): + crawler = _get_crawler({'SPLASH_URL': 'localhost:1234'}) + with raises(NotConfigured): + mw = SplashMiddleware.from_crawler(crawler) + + +def test_bad_slot_policy(): + crawler = _get_crawler({'SPLASH_SLOT_POLICY': 'asdf'}) + with raises(NotConfigured): + mw = SplashMiddleware.from_crawler(crawler)