import copy
import urllib.parse
from json import JSONEncoder

from scrapy import Request


class ProxyCrawlRequest(Request):
    """Scrapy ``Request`` subclass providing additional arguments for ProxyCrawl."""

    def __init__(self, url, original_url=None, response_format='html', user_agent=None,
                 page_wait=None, ajax_wait=False, css_click_selector=None, device='desktop',
                 get_cookies=False, get_headers=False, proxy_session=None, cookies_session=None,
                 screenshot=False, scraper=None, autoparse=False, country=None, **kwargs):
        """
        Initialize a new request.

        Docs: https://proxycrawl.com/dashboard/api/docs

        original_url: str
            The URL this request originally targeted; saved so it can be restored on the
            response later. Defaults to ``url``.
        response_format: str
            Indicates the response format, either json or html. Defaults to html.
        user_agent: str
            If you want to make the request with a custom user agent, you can pass it here.
        page_wait: int
            If you are using the JavaScript token, you can optionally pass the page_wait
            parameter to wait an amount of milliseconds before the browser captures the
            resulting HTML code.
        ajax_wait: bool
            If you are using the JavaScript token, you can optionally pass the ajax_wait
            parameter to wait for the AJAX requests to finish before getting the HTML
            response.
        css_click_selector: str
            If you are using the JavaScript token, you can optionally pass the
            css_click_selector parameter to click an element in the page before the browser
            captures the resulting HTML code.
        device: str
            Optionally, if you don't want to specify a user_agent but you want the requests
            to come from a specific device, you can use this parameter. There are two
            options available: desktop and mobile.
        get_cookies: bool
            Optionally, if you need to get the cookies that the original website sets on
            the response, you can use the get_cookies=True parameter.
        get_headers: bool
            Optionally, if you need to get the headers that the original website sets on
            the response, you can use the get_headers=True parameter.
        proxy_session: str
            If you need to use the same proxy for subsequent requests, you can use the
            proxy_session parameter. It can be any value; simply send a new value to create
            a new proxy session, and all subsequent requests with that value will keep
            using the same proxy. Proxy sessions expire 30 seconds after the last API call.
        cookies_session: str
            If you need to send the cookies that come back on every request to all
            subsequent calls, you can use the cookies_session parameter. It can be any
            value; simply send a new value to create a new cookies session, and the cookies
            returned by previous calls with that value will be sent along with the next
            ones. Cookies sessions expire 300 seconds after the last API call.
        screenshot: bool
            If you are using the JavaScript token, you can optionally pass screenshot=True
            to get a JPEG screenshot of the whole crawled page. ProxyCrawl will send back
            the screenshot_url in the response headers (or in the JSON response if you use
            response_format='json'). The screenshot_url expires in one hour.
        scraper: str
            Returns the information parsed according to the specified scraper. Check the
            list of all the available data scrapers to see which one to choose. The
            response will come back as JSON. If you don't use it, you will receive the full
            HTML of the page so you can scrape it freely.
        autoparse: bool
            Optionally, if you need to get the scraped data of the page that you requested,
            you can pass the autoparse=True parameter. The response will come back as JSON
            and its structure varies depending on the URL that you sent. If you don't use
            it, you will receive the full HTML of the page so you can scrape it freely.
        country: str
            If you want your requests to be geolocated from a specific country, you can use
            the country parameter, e.g. country='US' (two-character country code). Please
            take into account that specifying a country can reduce the number of successful
            requests you get back, so use it wisely and only when geolocated crawls are
            required.

        :param kwargs: other kwargs to be passed to the Scrapy base ``Request`` constructor

        A usage sketch is included at the bottom of this module.
        """
        self.original_url = original_url if original_url else url  # Saved so the URL can be restored on the response later
        self.response_format = response_format
        self.user_agent = user_agent
        self.page_wait = page_wait
        self.ajax_wait = ajax_wait
        self.css_click_selector = css_click_selector
        self.device = device
        self.get_cookies = get_cookies
        self.get_headers = get_headers
        self.proxy_session = proxy_session
        self.cookies_session = cookies_session
        self.screenshot = screenshot
        self.scraper = scraper
        self.autoparse = autoparse
        self.country = country
        # Build the query string now, while __dict__ holds only the ProxyCrawl
        # attributes above (super().__init__ adds Scrapy's own internals)
        self.query_params_str = self._build_query_params()
        super().__init__(url, **kwargs)

    def replace(self, *args, **kwargs):
        """Create a new Request with the same attributes, except for those
        given new values.
        """
        for x in ['url', 'original_url', 'response_format', 'user_agent', 'page_wait',
                  'ajax_wait', 'css_click_selector', 'device', 'get_cookies', 'get_headers',
                  'proxy_session', 'cookies_session', 'screenshot', 'scraper', 'autoparse',
                  'country', 'method', 'headers', 'body', 'cookies', 'meta', 'flags',
                  'encoding', 'priority', 'dont_filter', 'callback', 'errback', 'cb_kwargs']:
            kwargs.setdefault(x, getattr(self, x))
        cls = kwargs.pop('cls', self.__class__)
        return cls(*args, **kwargs)

    def _build_query_params(self):
        encoder = JSONEncoder()
        # Prepare params from the attributes set in __init__
        params = copy.deepcopy(self.__dict__)
        params.pop('original_url')  # Not an API parameter; only kept to restore the response URL
        params['format'] = params.pop('response_format')  # The API expects 'format'
        # Convert values to ProxyCrawl-compatible values (JSON-like, e.g. True -> 'true')
        # and drop params whose value is None or False
        params = [(k, encoder.encode(v).strip('"')) for k, v in params.items() if v]
        # Build the query string
        query_params = urllib.parse.urlencode(params)
        return query_params
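    # Example of the resulting query string (illustrative values; key order
    # follows attribute insertion order, with 'format' re-appended last, and
    # the default device='desktop' survives the truthiness filter):
    #   ProxyCrawlRequest('https://example.com', response_format='json',
    #                     page_wait=1000, ajax_wait=True).query_params_str
    #   -> 'page_wait=1000&ajax_wait=true&device=desktop&format=json'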
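# Usage sketch (not part of the original module): a minimal spider built on
# the class above. The spider name and target URL are hypothetical, and it
# assumes a downloader middleware elsewhere in this project rewrites the
# outgoing URL to the ProxyCrawl API endpoint using ``query_params_str``.
if __name__ == '__main__':
    import scrapy
    from scrapy.crawler import CrawlerProcess

    class ExampleSpider(scrapy.Spider):
        name = 'proxycrawl_example'

        def start_requests(self):
            request = ProxyCrawlRequest(
                'https://example.com',
                response_format='json',
                page_wait=1000,   # wait 1000 ms before capturing the page
                ajax_wait=True,   # wait for AJAX requests to finish
                country='US',     # geolocate the request from the US
                callback=self.parse,
            )
            yield request
            # replace() keeps every ProxyCrawl attribute and overrides only the
            # ones passed, here re-issuing the request geolocated from GB
            yield request.replace(country='GB', dont_filter=True)

        def parse(self, response):
            self.logger.info('Crawled %s (%d bytes)', response.url, len(response.body))

    process = CrawlerProcess()
    process.crawl(ExampleSpider)
    process.start()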