@@ -22,7 +22,12 @@ import {
   matchesNoProxy,
   validateUrl,
 } from "network";
-import { Browser, BrowserContextOptions } from "playwright";
+import {
+  Browser,
+  BrowserContext,
+  BrowserContextOptions,
+  Page,
+} from "playwright";
 import { chromium } from "playwright-extra";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
 import { withWorkerTracing } from "workerTracing";
@@ -191,6 +196,73 @@ let globalCookies: Cookie[] = [];
 // This is needed given that most of the browser APIs are async.
 const browserMutex = new Mutex();
 
+// Tracks active browser contexts so we can reap leaked ones.
+const activeContexts = new Map<
+  string,
+  { context: BrowserContext; createdAt: number }
+>();
+
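+// Upper bounds on how long we wait for close() calls before giving up on them.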
+const CONTEXT_CLOSE_TIMEOUT_MS = 10_000;
+const PAGE_CLOSE_TIMEOUT_MS = 5_000;
+
+/**
+ * Reaps browser contexts that have been open longer than the max job timeout.
+ * This is a safety net for cases where context.close() hangs or is never called.
+ */
+function startContextReaper() {
+  const maxContextAgeMs =
+    (serverConfig.crawler.jobTimeoutSec + 30) * 1000 +
+    60_000 * 5; /* 5 minutes buffer */
+  const intervalId = setInterval(() => {
+    try {
+      const now = Date.now();
+      for (const [id, entry] of activeContexts) {
+        if (now - entry.createdAt > maxContextAgeMs) {
+          logger.warn(
+            `[Crawler] Reaping stale browser context for job ${id} (age: ${Math.round((now - entry.createdAt) / 1000)}s)`,
+          );
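+          // Race close() against a timeout; a rejected close() still counts
+          // as closed, since the context is most likely already gone.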
+          void Promise.race([
+            entry.context
+              .close()
+              .then(() => true)
+              .catch((e: unknown) => {
+                logger.warn(
+                  `[Crawler] Failed to close stale context for job ${id}: ${e}`,
+                );
+                return true;
+              }),
+            new Promise<false>((r) =>
+              setTimeout(() => r(false), CONTEXT_CLOSE_TIMEOUT_MS),
+            ),
+          ]).then((contextClosed) => {
+            // Protect against deleting a newer context if the job id gets reused.
+            if (!contextClosed) {
+              logger.warn(
+                `[Crawler] Timed out closing stale context for job ${id} — keeping in active set for retry`,
+              );
+              return;
+            }
+            if (activeContexts.get(id) === entry) {
+              activeContexts.delete(id);
+            }
+          });
+        }
+      }
+    } catch (e) {
+      logger.error(
+        `[Crawler] caught an unexpected error while reaping stale browser contexts: ${e}`,
+      );
+    }
+  }, 60_000 * 5);
+  exitAbortController.signal.addEventListener(
+    "abort",
+    () => clearInterval(intervalId),
+    {
+      once: true,
+    },
+  );
+}
+
 async function startBrowserInstance() {
   if (serverConfig.crawler.browserWebSocketUrl) {
     logger.info(
@@ -289,6 +361,7 @@ export class CrawlerWorker {
           );
         }
         await loadCookiesFromFile();
+        startContextReaper();
       })();
     }
     return CrawlerWorker.initPromise;
@@ -519,6 +592,8 @@ async function crawlPage(
         }),
       );
 
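+      // Track the context so the reaper can reclaim it if this job never
+      // reaches its cleanup path.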
+      activeContexts.set(jobId, { context, createdAt: Date.now() });
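+      // Declared before the try block so the finally cleanup can close it.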
+      let page: Page | undefined;
       try {
         if (globalCookies.length > 0) {
           await context.addCookies(globalCookies);
@@ -527,7 +602,7 @@ async function crawlPage(
           );
         }
 
-        const page = await withSpan(
+        page = await withSpan(
           tracer,
           "crawlerWorker.crawlPage.setupPage",
           {
@@ -598,10 +673,27 @@ async function crawlPage(
               // Continue with other requests
               await route.continue();
             });
+
+            // On abort, immediately stop intercepting requests so that
+            // in-flight route handlers don't block page/context closure.
+            abortSignal.addEventListener(
+              "abort",
+              () => {
+                nextPage.unrouteAll({ behavior: "ignoreErrors" }).catch(() => {
+                  // Ignore errors — the page may already be closed.
+                });
+              },
+              { once: true },
+            );
+
             return nextPage;
           },
         );
 
+        // page is guaranteed to be assigned here; alias to a const for
+        // TypeScript narrowing so the rest of the try block sees `Page`.
+        const activePage = page;
+
         // Navigate to the target URL
         const navigationValidation = await withSpan(
           tracer,
@@ -634,7 +726,7 @@ async function crawlPage(
           },
           async () =>
             Promise.race([
-              page.goto(targetUrl, {
+              activePage.goto(targetUrl, {
                 timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
                 waitUntil: "domcontentloaded",
               }),
@@ -662,7 +754,7 @@ async function crawlPage(
           },
           async () => {
             await Promise.race([
-              page
+              activePage
                 .waitForLoadState("networkidle", { timeout: 5000 })
                 .catch(() => ({})),
               new Promise((resolve) => setTimeout(resolve, 5000)),
@@ -695,7 +787,7 @@ async function crawlPage(
             },
           },
           async () => {
-            const content = await page.content();
+            const content = await activePage.content();
             abortSignal.throwIfAborted();
             logger.info(
               `[Crawler][${jobId}] Successfully fetched the page content.`,
@@ -719,7 +811,7 @@ async function crawlPage(
             const { data: screenshotData, error: screenshotError } =
               await tryCatch(
                 Promise.race<Buffer>([
-                  page.screenshot({
+                  activePage.screenshot({
                     // If you change this, you need to change the asset type in the store function.
                     type: "jpeg",
                     fullPage: serverConfig.crawler.fullPageScreenshot,
@@ -769,7 +861,7 @@ async function crawlPage(
           async () => {
             const { data: pdfData, error: pdfError } = await tryCatch(
               Promise.race<Buffer>([
-                page.pdf({
+                activePage.pdf({
                   format: "A4",
                   printBackground: true,
                 }),
@@ -818,14 +910,105 @@ async function crawlPage(
           statusCode: response?.status() ?? 0,
           screenshot,
           pdf,
-          url: page.url(),
+          url: activePage.url(),
         };
       } finally {
-        await context.close();
-        // Only close the browser if it was created on demand
-        if (serverConfig.crawler.browserConnectOnDemand) {
-          await browser.close();
-        }
+        await withSpan(
+          tracer,
+          "crawlerWorker.crawlPage.cleanup",
+          {
+            attributes: {
+              "job.id": jobId,
+              "crawler.cleanup.hasPage": !!page,
+            },
+          },
+          async () => {
+            // Explicitly close the page first (with timeout) to release
+            // resources even if context.close() later hangs.
+            if (page) {
+              const pageToClose = page;
+              const pageClosed = await withSpan(
+                tracer,
+                "crawlerWorker.crawlPage.cleanup.closePage",
+                { attributes: { "job.id": jobId } },
+                async () =>
+                  Promise.race([
+                    pageToClose
+                      .close()
+                      .then(() => true)
+                      .catch((e: unknown) => {
+                        logger.warn(
+                          `[Crawler][${jobId}] page.close() failed: ${e}`,
+                        );
+                        return true;
+                      }),
+                    new Promise<false>((r) =>
+                      setTimeout(() => r(false), PAGE_CLOSE_TIMEOUT_MS),
+                    ),
+                  ]),
+              );
+              setSpanAttributes({ "crawler.cleanup.pageClosed": pageClosed });
+              if (!pageClosed) {
+                logger.warn(`[Crawler][${jobId}] page.close() timed out`);
+              }
+            }
+
+            // Close the context (with timeout) to avoid hanging on in-flight
+            // ops. Only remove from tracking if close actually succeeded;
+            // otherwise the reaper will retry the close later.
+            const contextClosed = await withSpan(
+              tracer,
+              "crawlerWorker.crawlPage.cleanup.closeContext",
+              { attributes: { "job.id": jobId } },
+              async () =>
+                Promise.race([
+                  context
+                    .close()
+                    .then(() => true)
+                    .catch((e: unknown) => {
+                      logger.warn(
+                        `[Crawler][${jobId}] context.close() failed: ${e}`,
+                      );
+                      return true; // Error means it's likely already closed
+                    }),
+                  new Promise<false>((r) =>
+                    setTimeout(() => r(false), CONTEXT_CLOSE_TIMEOUT_MS),
+                  ),
+                ]),
+            );
+            setSpanAttributes({
+              "crawler.cleanup.contextClosed": contextClosed,
+            });
+
+            if (contextClosed) {
+              activeContexts.delete(jobId);
+            } else {
+              logger.warn(
+                `[Crawler][${jobId}] context.close() timed out — leaving in active set for reaper`,
+              );
+            }
+
+            // Only close the browser if it was created on demand
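+            // browser.close() tears down every context it owns, so the job
+            // can be untracked once the close succeeds (see the .then below).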
+            if (serverConfig.crawler.browserConnectOnDemand) {
+              await withSpan(
+                tracer,
+                "crawlerWorker.crawlPage.cleanup.closeBrowser",
+                { attributes: { "job.id": jobId } },
+                async () =>
+                  browser
+                    .close()
+                    .then(() => {
+                      activeContexts.delete(jobId);
+                    })
+                    .catch((e: unknown) => {
+                      logger.warn(
+                        `[Crawler][${jobId}] browser.close() failed: ${e}`,
+                      );
+                    }),
+              );
+            }
+          },
+        );
       }
     },
   );