Closed as duplicate of #62
Cannot close spider through SIGINT (ctrl+c)
My code:
```python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy_playwright.page import PageMethod

meta = {
    'playwright': True,
    'playwright_include_page': True,
    'playwright_page_methods': [PageMethod('wait_for_load_state', 'networkidle')],
}


async def error_back(failure):
    page = failure.request.meta["playwright_page"]
    await page.close()


class MsuSpider(scrapy.Spider):
    name = "msu"
    allowed_domains = ["msu.ru"]
    start_urls = ["https://msu.ru"]
    custom_settings = {
        'PLAYWRIGHT_BROWSER_TYPE': 'chromium',
        'HTTPCACHE_ENABLED': False,
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.link_extractor = LinkExtractor(unique=True)

    def start_requests(self):
        start_urls = ["https://msu.ru"]
        for url in start_urls:
            yield scrapy.Request(
                url=url,
                meta=meta,
                callback=self.parse,
                errback=error_back,
            )

    async def parse(self, response, **kwargs):
        page = response.meta["playwright_page"]
        screenshot = await page.screenshot(path=f"{response.url}.png", full_page=True)
        await page.close()
        links_on_current_page = self.link_extractor.extract_links(response)
        for link in links_on_current_page:
            yield response.follow(link.url, callback=self.parse, meta=meta, errback=error_back)
```
Logs:
```
2024-10-17 14:08:04 [scrapy.crawler] INFO: Received SIGINT, shutting down gracefully. Send again to force
2024-10-17 14:08:04 [scrapy.core.engine] INFO: Closing spider (shutdown)
2024-10-17 14:25:12 [scrapy-playwright] INFO: Launching browser chromium
2024-10-17 14:25:12 [scrapy.core.scraper] ERROR: Error downloading <GET https://msu.ru/ch>
Traceback (most recent call last):
File "/home/../../../venv/lib/python3.12/site-packages/twisted/internet/defer.py", line 1999, in _inlineCallbacks
result = context.run(
File "/home/../../../venv/lib/python3.12/site-packages/twisted/python/failure.py", line 519, in throwExceptionIntoGenerator
return g.throw(self.value.with_traceback(self.tb))
File "/home/../../../venv/lib/python3.12/site-packages/scrapy/core/downloader/middleware.py", line 54, in process_request
return (yield download_func(request=request, spider=spider))
File "/home/../../../venv/lib/python3.12/site-packages/twisted/internet/defer.py", line 1251, in adapt
extracted: _SelfResultT | Failure = result.result()
File "/home/../../../venv/lib/python3.12/site-packages/scrapy_playwright/handler.py", line 378, in _download_request
return await self._download_request_with_retry(request=request, spider=spider)
File "/home/../../../venv/lib/python3.12/site-packages/scrapy_playwright/handler.py", line 397, in _download_request_with_retry
page = await self._create_page(request=request, spider=spider)
File "/home/../../.../venv/lib/python3.12/site-packages/scrapy_playwright/handler.py", line 296, in _create_page
ctx_wrapper = await self._create_browser_context(
File "/home/../../../venv/lib/python3.12/site-packages/scrapy_playwright/handler.py", line 257, in _create_browser_context
await self._maybe_launch_browser()
File "/home/../../../venv/lib/python3.12/site-packages/scrapy_playwright/handler.py", line 205, in _maybe_launch_browser
self.browser = await self.browser_type.launch(**self.config.launch_options)
File "/home/../../../venv/lib/python3.12/site-packages/playwright/async_api/_generated.py", line 14115, in launch
await self._impl_obj.launch(
File "/home/../../../venv/lib/python3.12/site-packages/playwright/_impl/_browser_type.py", line 95, in launch
Browser, from_channel(await self._channel.send("launch", params))
File "/home/../../../venv/lib/python3.12/site-packages/playwright/_impl/_connection.py", line 59, in send
return await self._connection.wrap_api_call(
File "/home/../../../venv/lib/python3.12/site-packages/playwright/_impl/_connection.py", line 514, in wrap_api_call
raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None
Exception: BrowserType.launch: Connection closed while reading from the driver
2024-10-17 14:25:12 [scrapy.core.scraper] ERROR: Spider error processing <GET https://msu.ru/ch> (referer: https://msu.ru/)
Traceback (most recent call last):
File "/home/../../../venv/lib/python3.12/site-packages/twisted/internet/defer.py", line 1251, in adapt
extracted: _SelfResultT | Failure = result.result()
^^^^^^^^^^^^^^^
File "/home/../../../../crawlers/spiders/msu.py", line 28, in error_back
page = failure.request.meta["playwright_page"]
~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^
KeyError: 'playwright_page'
playwright._impl._errors.TargetClosedError: Page.screenshot: Target page, context or browser has been closed
Call log:
taking page screenshot
- waiting for fonts to load...
- fonts loaded
/home/../../../venv/lib/python3.12/site-packages/playwright/driver/package/lib/server/chromium/crPage.js:491
this._firstNonInitialNavigationCommittedReject(new _errors.TargetClosedError());
^
TargetClosedError: Target page, context or browser has been closed
at FrameSession.dispose (/home/../../../venv/lib/python3.12/site-packages/playwright/driver/package/lib/server/chromium/crPage.js:491:52)
at CRPage.didClose (/home/../../../venv/lib/python3.12/site-packages/playwright/driver/package/lib/server/chromium/crPage.js:162:60)
at CRBrowser._onDetachedFromTarget (/home/../../../venv/lib/python3.12/site-packages/playwright/driver/package/lib/server/chromium/crBrowser.js:200:14)
at CRSession.emit (node:events:519:28)
at /home/../../../venv/lib/python3.12/site-packages/playwright/driver/package/lib/server/chromium/crConnection.js:160:14
Node.js v20.17.0
2024-10-17 14:08:25 [asyncio] WARNING: pipe closed by peer or os.write(pipe, data) raised exception.
2024-10-17 14:08:28 [asyncio] WARNING: pipe closed by peer or os.write(pipe, data) raised exception.
...
2024-10-17 14:13:58 [scrapy.extensions.logstats] INFO: Crawled 66 pages (at 0 pages/min), scraped 62 items (at 0 items/min)
```
The last lines keep repeating forever; the process never exits.
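As a side note, the `KeyError` inside `error_back` appears because a request that fails before scrapy-playwright attaches a page (here, during shutdown) never gets a `"playwright_page"` entry in its meta. A more defensive errback along the lines of the sketch below (my own guess at a workaround, not a fix for the SIGINT shutdown problem itself) avoids that secondary traceback:

```python
# Sketch of a more tolerant errback: only close the page if one was
# actually attached to the request and is still open. This silences the
# KeyError/TargetClosedError noise but does not solve the shutdown hang.
async def error_back(failure):
    page = failure.request.meta.get("playwright_page")
    if page is not None and not page.is_closed():
        await page.close()
```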