Skip to content
14 changes: 14 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,20 @@ Alternatively, you can use regular scrapy.Request and
}
})

It is also possible to configure Splash for all requests in a Spider by default
using a ``splash`` spider attribute::

class MySpider(Spider):
name = 'myspider'
splash = {
# …
}

If you use a ``splash`` spider attribute, you can still override those Splash
settings for specific requests using the ``splash`` request meta key, or
disable Splash completely by setting the ``dont_splash`` request meta key to
``True``.

Use ``request.meta['splash']`` API in middlewares or when scrapy.Request
subclasses are used (there is also ``SplashFormRequest`` described below).
For example, ``meta['splash']`` allows to create a middleware which enables
Expand Down
13 changes: 10 additions & 3 deletions scrapy_splash/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,16 @@ def _argument_values(self):
def _remote_keys(self):
return self.crawler.spider.state[self.remote_keys_key]

def _get_splash_options(self, request, spider):
if request.meta.get("dont_splash") is True:
return
spider_options = getattr(spider, "splash", {})
request_options = request.meta.get("splash")
return request_options or spider_options

def process_request(self, request, spider):
if 'splash' not in request.meta:
splash_options = self._get_splash_options(request, spider)
if not splash_options:
return

if request.method not in {'GET', 'POST'}:
Expand All @@ -274,7 +282,6 @@ def process_request(self, request, spider):
# don't process the same request more than once
return

splash_options = request.meta['splash']
request.meta['_splash_processed'] = True

slot_policy = splash_options.get('slot_policy', self.slot_policy)
Expand Down Expand Up @@ -368,7 +375,7 @@ def process_response(self, request, response, spider):
if not request.meta.get("_splash_processed"):
return response

splash_options = request.meta['splash']
splash_options = self._get_splash_options(request, spider)
if not splash_options:
return response

Expand Down
56 changes: 56 additions & 0 deletions tests/test_middleware.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import
import copy
import json
Expand Down Expand Up @@ -765,3 +766,58 @@ def test_adjust_timeout():
})
req2 = mw.process_request(req2, None)
assert req2.meta['download_timeout'] == 30


def test_spider_attribute():
    """Splash options declared via a ``splash`` spider attribute are applied
    to plain requests that do not set ``meta['splash']`` themselves.

    Fix: the original created the middleware twice (``_get_mw()`` was called
    both before and after setting ``spider.splash``); the first instance was
    dead code and has been removed.
    """
    req_url = "http://scrapy.org"
    spider = scrapy.Spider("example")
    spider.splash = {"args": {"images": 0}}
    mw = _get_mw()

    req1 = scrapy.Request(req_url)
    req2 = mw.process_request(req1, spider)

    # The middleware must rewrite the request to go through Splash.
    assert "_splash_processed" in req2.meta
    assert "render.json" in req2.url
    request_data = json.loads(req2.body.decode('utf8'))
    assert "url" in request_data
    assert request_data.get("url") == req_url
    # Spider-level args (images=0) propagate into the Splash payload.
    assert "images" in request_data
    assert req2.method == 'POST'

    # The response is post-processed (wrapped), not passed through as-is.
    response = Response(req_url, request=req2)
    response2 = mw.process_response(req2, response, spider)
    assert response2 is not response


def test_spider_attribute_dont_splash():
    """``dont_splash=True`` in the request meta disables Splash even when
    the spider declares a ``splash`` attribute: the middleware must leave
    both the request and the response untouched."""
    url = "http://scrapy.org"
    spider = scrapy.Spider("example")
    spider.splash = {"args": {"images": 0}}
    mw = _get_mw()

    request = scrapy.Request(url, meta={'dont_splash': True})
    assert mw.process_request(request, spider) is None

    raw_response = Response(url, request=request)
    assert mw.process_response(request, raw_response, spider) is raw_response


def test_spider_attribute_blank():
    """An empty ``splash`` spider attribute is falsy, so the middleware
    behaves exactly as if Splash were not configured at all."""
    url = "http://scrapy.org"
    spider = scrapy.Spider("example")
    spider.splash = {}
    mw = _get_mw()

    request = scrapy.Request(url)
    assert mw.process_request(request, spider) is None

    plain_response = Response(url, request=request)
    assert mw.process_response(request, plain_response, spider) is plain_response