This repository was archived by the owner on Jul 5, 2023. It is now read-only.

Commit e920e45

Merge pull request #2 from marianobrc/master

Request class with parameters and automatic URL handling

2 parents 5f34e89 + 7776635

File tree

4 files changed: +181 −3 lines

README.md

Lines changed: 21 additions & 0 deletions

````diff
@@ -27,6 +27,27 @@ DOWNLOADER_MIDDLEWARES = {
     'scrapy_proxycrawl.ProxyCrawlMiddleware': 610
 }
 ```
+
+## Usage
+
+Use scrapy_proxycrawl.ProxyCrawlRequest instead of the Scrapy built-in Request.
+ProxyCrawlRequest accepts additional arguments, which are used by the ProxyCrawl API:
+
+```python
+from scrapy_proxycrawl import ProxyCrawlRequest
+
+yield ProxyCrawlRequest(
+    "http://target-url",
+    callback=self.parse_result,
+    device='desktop',
+    country='US',
+    page_wait=1000,
+    ajax_wait=True,
+    dont_filter=True
+)
+```
+
+The target URL will be replaced with the ProxyCrawl API URL, and the parameters will be encoded into it by the middleware automatically.
 
 If you have questions or need help using the library, please open an issue or [contact us](https://proxycrawl.com/contact).
````
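For orientation, here is how the new request class would typically be wired into a spider. This is a minimal sketch, not part of the commit: the spider name, start URL, and the extraction logic in `parse_result` are assumptions for illustration.

```python
import scrapy

from scrapy_proxycrawl import ProxyCrawlRequest


class ExampleSpider(scrapy.Spider):
    """Hypothetical spider showing ProxyCrawlRequest in context."""
    name = 'example'

    def start_requests(self):
        # The ProxyCrawlMiddleware rewrites this URL into a ProxyCrawl
        # API call and encodes the extra parameters into the query string.
        yield ProxyCrawlRequest(
            'http://target-url',
            callback=self.parse_result,
            device='desktop',
            country='US',
        )

    def parse_result(self, response):
        # By this point the middleware's process_response has restored
        # response.url to the original target URL.
        yield {'title': response.css('title::text').get()}
```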

scrapy_proxycrawl/__init__.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -1,6 +1,9 @@
 try:
     # Python 2
     from proxycrawl import ProxyCrawlMiddleware
+    from request import ProxyCrawlRequest
+    from response import ProxyCrawlResponse, ProxyCrawlTextResponse
 except ImportError:
     # Python 3
     from .proxycrawl import ProxyCrawlMiddleware
+    from .request import ProxyCrawlRequest
```

scrapy_proxycrawl/proxycrawl.py

Lines changed: 38 additions & 3 deletions

```diff
@@ -1,4 +1,6 @@
 import logging
+from .request import ProxyCrawlRequest
+
 try:
     # For Python 3.0 and later
     from urllib.parse import quote_plus
@@ -8,6 +10,7 @@
 
 log = logging.getLogger('scrapy.proxycrawl')
 
+
 class ProxyCrawlMiddleware(object):
     def __init__(self, settings):
         self.proxycrawl_enabled = settings.get('PROXYCRAWL_ENABLED', True)
@@ -19,10 +22,42 @@ def from_crawler(cls, crawler):
         return cls(crawler.settings)
 
     def process_request(self, request, spider):
+        """Process a request using the proxycrawl API if applicable"""
+
         if not self.proxycrawl_enabled:
             log.warning('Skipping ProxyCrawl API CALL disabled!')
-            return
+            return None
+
+        if not isinstance(request, ProxyCrawlRequest):
+            return None
+
         if self.proxycrawl_url not in request.url:
-            new_url = 'https://api.proxycrawl.com/?token=%s&url=%s' % (self.proxycrawl_token, quote_plus(request.url))
-            log.debug('Using ProxyCrawl API, overridden URL is: %s' % (new_url))
+            new_url = self._get_proxied_url(request.url, request.query_params_str)
+            log.debug('Using ProxyCrawl API, Request overridden with URL: {}'.format(new_url))
             return request.replace(url=new_url)
+
+    def process_response(self, request, response, spider):
+        """Process a response coming from the proxycrawl API if applicable"""
+
+        if not isinstance(request, ProxyCrawlRequest):
+            return response
+
+        # Replace the URL again with the original URL saved in the request
+        log.debug('Using ProxyCrawl API, Response overridden with URL: {}'.format(request.original_url))
+        return response.replace(url=request.original_url)
+
+    def _get_proxied_url(self, url, query_params):
+        """
+        Transform the URL into a call to the proxy crawl API, sending the target URL as a query parameter.
+        """
+        original_url_encoded = quote_plus(url, safe='')
+        proxycrawl_url = self.proxycrawl_url
+        proxycrawl_token = self.proxycrawl_token
+        proxycrawl_query_params = query_params
+        proxied_url = '{}/?token={}&{}&url={}'.format(
+            proxycrawl_url,
+            proxycrawl_token,
+            proxycrawl_query_params,
+            original_url_encoded
+        )
+        return proxied_url
```
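To make the URL rewriting concrete, the following standalone sketch reproduces the string construction from `_get_proxied_url` with made-up values; the token and query parameters are placeholders, not real credentials.

```python
from urllib.parse import quote_plus

# Placeholder values for illustration only
proxycrawl_url = 'https://api.proxycrawl.com'
proxycrawl_token = 'YOUR_TOKEN'
query_params = 'device=desktop&country=US&format=html'
target_url = 'http://target-url/some/page?q=1'

# Same format string as _get_proxied_url: the target URL is fully
# percent-encoded (safe='') so it survives as a single query parameter.
proxied_url = '{}/?token={}&{}&url={}'.format(
    proxycrawl_url,
    proxycrawl_token,
    query_params,
    quote_plus(target_url, safe=''),
)
print(proxied_url)
# https://api.proxycrawl.com/?token=YOUR_TOKEN&device=desktop&country=US&format=html&url=http%3A%2F%2Ftarget-url%2Fsome%2Fpage%3Fq%3D1
```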

scrapy_proxycrawl/request.py

Lines changed: 119 additions & 0 deletions (new file)

```python
import urllib.parse
import copy
from json import JSONEncoder
from scrapy import Request


class ProxyCrawlRequest(Request):
    """Scrapy ``Request`` subclass providing additional arguments for ProxyCrawl"""

    def __init__(self, url, original_url=None, response_format='html', user_agent=None, page_wait=None, ajax_wait=False,
                 css_click_selector=None, device='desktop', get_cookies=False,
                 get_headers=False, proxy_session=None, cookies_session=None,
                 screenshot=False, scraper=None, autoparse=False, country=None, **kwargs):
        """
        Initialize a new request.

        Docs: https://proxycrawl.com/dashboard/api/docs

        response_format: str
            Indicates the response format, either json or html. Defaults to html.
        user_agent: str
            If you want to make the request with a custom user agent, you can pass it here.
        page_wait: int
            If you are using the javascript token, you can optionally pass the page_wait parameter to wait
            a number of milliseconds before the browser captures the resulting html code.
        ajax_wait: boolean
            If you are using the javascript token, you can optionally pass the ajax_wait parameter to wait
            for the ajax requests to finish before getting the html response.
        css_click_selector: str
            If you are using the javascript token, you can optionally pass the css_click_selector
            parameter to click an element on the page before the browser captures the resulting html code.
        device: str
            Optionally, if you don't want to specify a user_agent but you want the requests to come from
            a specific device, you can use this parameter. There are two options available: desktop and mobile.
        get_cookies: boolean
            Optionally, if you need to get the cookies that the original website sets on the response,
            you can use the get_cookies=True parameter.
        get_headers: boolean
            Optionally, if you need to get the headers that the original website sets on the response,
            you can use the get_headers=True parameter.
        proxy_session: str
            If you need to use the same proxy for subsequent requests, you can use the proxy_session
            parameter. It can be any value; simply send a new value to create a new proxy session
            (this will allow you to continue using the same proxy for all subsequent requests with
            that proxy session value). Sessions expire 30 seconds after the last API call.
        cookies_session: str
            If you need to send the cookies that come back on every request to all subsequent calls,
            you can use the cookies_session parameter. It can be any value; simply send a
            new value to create a new cookies session (this will allow you to send the returned cookies from
            subsequent calls to the next API calls with that cookies session value).
            Sessions expire 300 seconds after the last API call.
        screenshot: boolean
            If you are using the javascript token, you can optionally pass the screenshot=True parameter to
            get a screenshot in JPEG format of the whole crawled page. ProxyCrawl will send back the
            screenshot_url in the response headers (or in the json response if you use
            response_format='json'). The screenshot_url expires in one hour.
        scraper: str
            Returns the information parsed according to the specified scraper. Check the list of all
            available data scrapers to see which one to choose. The response will come back as JSON.
            If you don't use it, you will receive the full HTML of the page so you can scrape it freely.
        autoparse: boolean
            Optionally, if you need the scraped data of the page that you requested, you can pass the
            autoparse=True parameter. The response will come back as JSON, and its structure varies
            depending on the URL that you sent. If you don't use it, you will receive the full HTML
            of the page so you can scrape it freely.
        country: str
            If you want your requests to be geolocated in a specific country, you can use the country
            parameter, e.g. country='US' (two-character country code). Please take into account that
            specifying a country can reduce the number of successful requests you get back, so use it
            wisely and only when geolocated crawls are required.

        :param kwargs: other kwargs to be passed to the Scrapy base Request constructor
        """
        self.original_url = original_url if original_url else url  # Save the URL to restore it in the response later
        self.response_format = response_format
        self.user_agent = user_agent
        self.page_wait = page_wait
        self.ajax_wait = ajax_wait
        self.css_click_selector = css_click_selector
        self.device = device
        self.get_cookies = get_cookies
        self.get_headers = get_headers
        self.proxy_session = proxy_session
        self.cookies_session = cookies_session
        self.screenshot = screenshot
        self.scraper = scraper
        self.autoparse = autoparse
        self.country = country
        self.query_params_str = self._build_query_params()
        super().__init__(url, **kwargs)

    def replace(self, *args, **kwargs):
        """Create a new Request with the same attributes except for those
        given new values.
        """
        for x in ['url', 'original_url', 'response_format', 'user_agent', 'page_wait',
                  'ajax_wait', 'css_click_selector', 'device', 'get_cookies', 'get_headers',
                  'proxy_session', 'cookies_session', 'screenshot', 'scraper', 'autoparse',
                  'country', 'method', 'headers', 'body', 'cookies', 'meta', 'flags',
                  'encoding', 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs']:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)

    def _build_query_params(self):
        encoder = JSONEncoder()
        # Prepare params from the instance attributes
        params = copy.deepcopy(self.__dict__)
        params.pop('original_url')  # Not an API parameter
        params['format'] = params.pop('response_format')  # The API expects 'format'
        # Convert values to ProxyCrawl-compatible values (json-like, i.e. True -> 'true')
        # and ignore params with a None or False value
        params = [(k, encoder.encode(v).strip('"')) for k, v in params.items() if v]
        # Build the query string
        query_params = urllib.parse.urlencode(params)
        return query_params
```
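The encoding behavior of `_build_query_params` (JSON-style literals so `True` becomes `true`, falsy parameters dropped, `response_format` renamed to `format`) can be checked with a small standalone sketch of the same logic; the parameter values below are made up.

```python
import urllib.parse
from json import JSONEncoder

encoder = JSONEncoder()

# Example attribute values a ProxyCrawlRequest might carry (made up)
params = {
    'response_format': 'html',
    'device': 'desktop',
    'country': 'US',
    'page_wait': 1000,
    'ajax_wait': True,    # booleans are encoded JSON-style: True -> 'true'
    'screenshot': False,  # falsy values are dropped from the query string
    'user_agent': None,   # None is dropped as well
}

params['format'] = params.pop('response_format')  # the API expects 'format'
pairs = [(k, encoder.encode(v).strip('"')) for k, v in params.items() if v]
print(urllib.parse.urlencode(pairs))
# device=desktop&country=US&page_wait=1000&ajax_wait=true&format=html
```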
