Skip to content

Commit 627faf5

Browse files
fix(workers): mitigate leaking browser contexts and set up an auto reaper (#2503)
* fix(workers): mitigate leaking browser contexts and set up an auto reaper * comments
1 parent ec0aaad commit 627faf5

File tree

2 files changed

+199
-13
lines changed

2 files changed

+199
-13
lines changed

apps/workers/workers/crawlerWorker.ts

Lines changed: 196 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,12 @@ import {
2222
matchesNoProxy,
2323
validateUrl,
2424
} from "network";
25-
import { Browser, BrowserContextOptions } from "playwright";
25+
import {
26+
Browser,
27+
BrowserContext,
28+
BrowserContextOptions,
29+
Page,
30+
} from "playwright";
2631
import { chromium } from "playwright-extra";
2732
import StealthPlugin from "puppeteer-extra-plugin-stealth";
2833
import { withWorkerTracing } from "workerTracing";
@@ -191,6 +196,73 @@ let globalCookies: Cookie[] = [];
191196
// This is needed given that most of the browser APIs are async.
192197
const browserMutex = new Mutex();
193198

199+
// Tracks active browser contexts so we can reap leaked ones.
200+
const activeContexts = new Map<
201+
string,
202+
{ context: BrowserContext; createdAt: number }
203+
>();
204+
205+
const CONTEXT_CLOSE_TIMEOUT_MS = 10_000;
206+
const PAGE_CLOSE_TIMEOUT_MS = 5_000;
207+
208+
/**
209+
* Reaps browser contexts that have been open longer than the max job timeout.
210+
* This is a safety net for cases where context.close() hangs or is never called.
211+
*/
212+
function startContextReaper() {
213+
const maxContextAgeMs =
214+
(serverConfig.crawler.jobTimeoutSec + 30) * 1000 +
215+
60_000 * 5; /* 5 minutes buffer */
216+
const intervalId = setInterval(() => {
217+
try {
218+
const now = Date.now();
219+
for (const [id, entry] of activeContexts) {
220+
if (now - entry.createdAt > maxContextAgeMs) {
221+
logger.warn(
222+
`[Crawler] Reaping stale browser context for job ${id} (age: ${Math.round((now - entry.createdAt) / 1000)}s)`,
223+
);
224+
void Promise.race([
225+
entry.context
226+
.close()
227+
.then(() => true)
228+
.catch((e: unknown) => {
229+
logger.warn(
230+
`[Crawler] Failed to close stale context for job ${id}: ${e}`,
231+
);
232+
return true;
233+
}),
234+
new Promise<false>((r) =>
235+
setTimeout(() => r(false), CONTEXT_CLOSE_TIMEOUT_MS),
236+
),
237+
]).then((contextClosed) => {
238+
// Protect against deleting a newer context if the job id gets reused.
239+
if (!contextClosed) {
240+
logger.warn(
241+
`[Crawler] Timed out closing stale context for job ${id} — keeping in active set for retry`,
242+
);
243+
return;
244+
}
245+
if (activeContexts.get(id) === entry) {
246+
activeContexts.delete(id);
247+
}
248+
});
249+
}
250+
}
251+
} catch (e) {
252+
logger.error(
253+
`[Crawler] caught an unexpected error while reaping stale browser contexts: ${e}`,
254+
);
255+
}
256+
}, 60_000 * 5);
257+
exitAbortController.signal.addEventListener(
258+
"abort",
259+
() => clearInterval(intervalId),
260+
{
261+
once: true,
262+
},
263+
);
264+
}
265+
194266
async function startBrowserInstance() {
195267
if (serverConfig.crawler.browserWebSocketUrl) {
196268
logger.info(
@@ -289,6 +361,7 @@ export class CrawlerWorker {
289361
);
290362
}
291363
await loadCookiesFromFile();
364+
startContextReaper();
292365
})();
293366
}
294367
return CrawlerWorker.initPromise;
@@ -519,6 +592,8 @@ async function crawlPage(
519592
}),
520593
);
521594

595+
activeContexts.set(jobId, { context, createdAt: Date.now() });
596+
let page: Page | undefined;
522597
try {
523598
if (globalCookies.length > 0) {
524599
await context.addCookies(globalCookies);
@@ -527,7 +602,7 @@ async function crawlPage(
527602
);
528603
}
529604

530-
const page = await withSpan(
605+
page = await withSpan(
531606
tracer,
532607
"crawlerWorker.crawlPage.setupPage",
533608
{
@@ -598,10 +673,27 @@ async function crawlPage(
598673
// Continue with other requests
599674
await route.continue();
600675
});
676+
677+
// On abort, immediately stop intercepting requests so that
678+
// in-flight route handlers don't block page/context closure.
679+
abortSignal.addEventListener(
680+
"abort",
681+
() => {
682+
nextPage.unrouteAll({ behavior: "ignoreErrors" }).catch(() => {
683+
// Ignore errors — the page may already be closed.
684+
});
685+
},
686+
{ once: true },
687+
);
688+
601689
return nextPage;
602690
},
603691
);
604692

693+
// page is guaranteed to be assigned here; alias to a const for
694+
// TypeScript narrowing so the rest of the try block sees `Page`.
695+
const activePage = page;
696+
605697
// Navigate to the target URL
606698
const navigationValidation = await withSpan(
607699
tracer,
@@ -634,7 +726,7 @@ async function crawlPage(
634726
},
635727
async () =>
636728
Promise.race([
637-
page.goto(targetUrl, {
729+
activePage.goto(targetUrl, {
638730
timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
639731
waitUntil: "domcontentloaded",
640732
}),
@@ -662,7 +754,7 @@ async function crawlPage(
662754
},
663755
async () => {
664756
await Promise.race([
665-
page
757+
activePage
666758
.waitForLoadState("networkidle", { timeout: 5000 })
667759
.catch(() => ({})),
668760
new Promise((resolve) => setTimeout(resolve, 5000)),
@@ -695,7 +787,7 @@ async function crawlPage(
695787
},
696788
},
697789
async () => {
698-
const content = await page.content();
790+
const content = await activePage.content();
699791
abortSignal.throwIfAborted();
700792
logger.info(
701793
`[Crawler][${jobId}] Successfully fetched the page content.`,
@@ -719,7 +811,7 @@ async function crawlPage(
719811
const { data: screenshotData, error: screenshotError } =
720812
await tryCatch(
721813
Promise.race<Buffer>([
722-
page.screenshot({
814+
activePage.screenshot({
723815
// If you change this, you need to change the asset type in the store function.
724816
type: "jpeg",
725817
fullPage: serverConfig.crawler.fullPageScreenshot,
@@ -769,7 +861,7 @@ async function crawlPage(
769861
async () => {
770862
const { data: pdfData, error: pdfError } = await tryCatch(
771863
Promise.race<Buffer>([
772-
page.pdf({
864+
activePage.pdf({
773865
format: "A4",
774866
printBackground: true,
775867
}),
@@ -818,14 +910,105 @@ async function crawlPage(
818910
statusCode: response?.status() ?? 0,
819911
screenshot,
820912
pdf,
821-
url: page.url(),
913+
url: activePage.url(),
822914
};
823915
} finally {
824-
await context.close();
825-
// Only close the browser if it was created on demand
826-
if (serverConfig.crawler.browserConnectOnDemand) {
827-
await browser.close();
828-
}
916+
await withSpan(
917+
tracer,
918+
"crawlerWorker.crawlPage.cleanup",
919+
{
920+
attributes: {
921+
"job.id": jobId,
922+
"crawler.cleanup.hasPage": !!page,
923+
},
924+
},
925+
async () => {
926+
// Explicitly close the page first (with timeout) to release resources
927+
// even if context.close() later hangs.
928+
if (page) {
929+
const pageToClose = page;
930+
const pageClosed = await withSpan(
931+
tracer,
932+
"crawlerWorker.crawlPage.cleanup.closePage",
933+
{ attributes: { "job.id": jobId } },
934+
async () =>
935+
Promise.race([
936+
pageToClose
937+
.close()
938+
.then(() => true)
939+
.catch((e: unknown) => {
940+
logger.warn(
941+
`[Crawler][${jobId}] page.close() failed: ${e}`,
942+
);
943+
return true;
944+
}),
945+
new Promise<false>((r) =>
946+
setTimeout(() => r(false), PAGE_CLOSE_TIMEOUT_MS),
947+
),
948+
]),
949+
);
950+
setSpanAttributes({ "crawler.cleanup.pageClosed": pageClosed });
951+
if (!pageClosed) {
952+
logger.warn(`[Crawler][${jobId}] page.close() timed out`);
953+
}
954+
}
955+
956+
// Close the context (with timeout) to avoid hanging on in-flight ops.
957+
// Only remove from tracking if close actually succeeded; otherwise
958+
// the reaper will retry the close later.
959+
const contextClosed = await withSpan(
960+
tracer,
961+
"crawlerWorker.crawlPage.cleanup.closeContext",
962+
{ attributes: { "job.id": jobId } },
963+
async () =>
964+
Promise.race([
965+
context
966+
.close()
967+
.then(() => true)
968+
.catch((e: unknown) => {
969+
logger.warn(
970+
`[Crawler][${jobId}] context.close() failed: ${e}`,
971+
);
972+
return true; // Error means it's likely already closed
973+
}),
974+
new Promise<false>((r) =>
975+
setTimeout(() => r(false), CONTEXT_CLOSE_TIMEOUT_MS),
976+
),
977+
]),
978+
);
979+
setSpanAttributes({
980+
"crawler.cleanup.contextClosed": contextClosed,
981+
});
982+
983+
if (contextClosed) {
984+
activeContexts.delete(jobId);
985+
} else {
986+
logger.warn(
987+
`[Crawler][${jobId}] context.close() timed out — leaving in active set for reaper`,
988+
);
989+
}
990+
991+
// Only close the browser if it was created on demand
992+
if (serverConfig.crawler.browserConnectOnDemand) {
993+
await withSpan(
994+
tracer,
995+
"crawlerWorker.crawlPage.cleanup.closeBrowser",
996+
{ attributes: { "job.id": jobId } },
997+
async () =>
998+
browser
999+
.close()
1000+
.then(() => {
1001+
activeContexts.delete(jobId);
1002+
})
1003+
.catch((e: unknown) => {
1004+
logger.warn(
1005+
`[Crawler][${jobId}] browser.close() failed: ${e}`,
1006+
);
1007+
}),
1008+
);
1009+
}
1010+
},
1011+
);
8291012
}
8301013
},
8311014
);

packages/shared-server/src/tracingTypes.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ export type TracingAttributeKey =
2929
| "crawler.getContentType.statusCode"
3030
| "crawler.contentType"
3131
| "crawler.statusCode"
32+
| "crawler.cleanup.hasPage"
33+
| "crawler.cleanup.pageClosed"
34+
| "crawler.cleanup.contextClosed"
3235
// Database attributes
3336
| "db.system"
3437
| "db.statement"

0 commit comments

Comments (0)