This repository was archived by the owner on Jul 5, 2023. It is now read-only.

Commit ae112e4

improve method to get final url
1 parent 1f5473c commit ae112e4

File tree

1 file changed (+8 −5 lines)

scrapy_proxycrawl/proxycrawl.py

Lines changed: 8 additions & 5 deletions
@@ -32,7 +32,7 @@ def process_request(self, request, spider):
             return None
 
         if self.proxycrawl_url not in request.url:
-            new_url = self._get_proxied_url(request)
+            new_url = self._get_proxied_url(request.url, request.query_params_str)
             log.debug('Using ProxyCrawl API, Request overridden with URL: {}'.format(new_url))
             return request.replace(url=new_url)
 
@@ -46,15 +46,18 @@ def process_response(self, request, response, spider):
             log.debug('Using ProxyCrawl API, Response overridden with URL: {}'.format(request.original_url))
             return response.replace(url=request.original_url)
 
-    def _get_proxied_url(self, request):
-        original_url_encoded = quote_plus(request.url, safe='')
+    def _get_proxied_url(self, url, query_params):
+        """
+        Transform the url into a call to proxy crawl api, sending the target url as query parameter.
+        """
+        original_url_encoded = quote_plus(url, safe='')
         proxycrawl_url = self.proxycrawl_url
         proxycrawl_token = self.proxycrawl_token
-        proxycrawl_query_params = request.query_params_str  # 'country=US&device=desktop&page_wait=5000&ajax_wait=true'
+        proxycrawl_query_params = query_params
         proxied_url = '{}/?token={}&{}&url={}'.format(
             proxycrawl_url,
             proxycrawl_token,
             proxycrawl_query_params,
             original_url_encoded
         )
-        return proxied_url
+        return proxied_url
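For context, here is a minimal standalone sketch of what the reworked helper computes: it wraps the spider's target URL inside a call to the ProxyCrawl API, passing the original URL as a percent-encoded query parameter. The endpoint and token values below are illustrative placeholders, not part of the commit.

# Standalone sketch of the new _get_proxied_url logic (not the middleware itself).
# The endpoint and token are assumptions/placeholders for illustration only.
from urllib.parse import quote_plus

def get_proxied_url(url, query_params,
                    proxycrawl_url='https://api.proxycrawl.com',  # assumed endpoint
                    proxycrawl_token='YOUR_TOKEN'):               # placeholder token
    """Build a ProxyCrawl API URL that forwards the target url as a query parameter."""
    original_url_encoded = quote_plus(url, safe='')
    return '{}/?token={}&{}&url={}'.format(
        proxycrawl_url,
        proxycrawl_token,
        query_params,
        original_url_encoded,
    )

# Example: the request URL plus extra ProxyCrawl options become one API call.
print(get_proxied_url('https://example.com/page?id=1', 'country=US&device=desktop'))
# -> https://api.proxycrawl.com/?token=YOUR_TOKEN&country=US&device=desktop&url=https%3A%2F%2Fexample.com%2Fpage%3Fid%3D1

Passing the plain url and query-string through as separate arguments (rather than the whole request object) keeps the helper free of Scrapy-specific types, which is what the commit's signature change accomplishes.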
