Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions src/crawlers/commonCrawlerFunc.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1159,6 +1159,16 @@ export const postNavigationHooks = [
},
];

/**
 * Builds a pre-launch hook that makes every launched browser use
 * `userDataDirectory` as its user data directory.
 *
 * Each time the returned hook runs it:
 *  1. creates `userDataDirectory`, including any missing parent directories;
 *  2. removes the `SingletonLock` entry directly inside that directory if one
 *     exists (presumably a stale browser profile lock left by a previous
 *     run — TODO confirm), without failing when it is absent;
 *  3. sets `launchContext.userDataDir` to `userDataDirectory`.
 *
 * @param userDataDirectory - Path of the directory to prepare and assign.
 * @returns An async hook taking `(pageId, launchContext)`; `pageId` is unused
 *   and `launchContext` is mutated in place.
 * @throws Rejects with the underlying `fs` error if the directory cannot be
 *   created or the lock entry cannot be removed.
 */
export const getPreLaunchHook = (userDataDirectory: string) => {
  return async (
    _pageId: string,
    // Only the property this hook writes is required, so any launch context
    // exposing a writable `userDataDir` is accepted without resorting to `any`.
    launchContext: { userDataDir?: string },
  ): Promise<void> => {
    // Loaded lazily so the module does not need a top-level fs/promises import.
    const { mkdir, rm } = await import('fs/promises');

    await mkdir(userDataDirectory, { recursive: true });
    // `force: true` makes this a no-op when no lock entry is present.
    await rm(path.join(userDataDirectory, 'SingletonLock'), { force: true });

    // The hook contract is to mutate the launch context in place.
    // eslint-disable-next-line no-param-reassign
    launchContext.userDataDir = userDataDirectory;
  };
};

export const failedRequestHandler = async ({ request }: { request: Request }) => {
guiInfoLog(guiInfoStatusTypes.ERROR, { numScanned: 0, urlScanned: request.url });
log.error(`Failed Request - ${request.url}: ${request.errorMessages}`);
Expand Down
28 changes: 3 additions & 25 deletions src/crawlers/crawlDomain.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,9 @@ import crawlee, { EnqueueStrategy } from 'crawlee';
import { CrawlRateController } from './crawlRateController.js';
import type { BrowserContext, ElementHandle, Frame, Page } from 'playwright';
import type { PlaywrightCrawlingContext, RequestOptions } from 'crawlee';
import * as path from 'path';
import fsp from 'fs/promises';
import {
createCrawleeSubFolders,
getPreLaunchHook,
runAxeScript,
isUrlPdf,
shouldSkipClickDueToDisallowedHref,
Expand Down Expand Up @@ -391,43 +390,22 @@ const crawlDomain = async ({
launchContext: {
launcher: constants.launcher,
launchOptions: getPlaywrightLaunchOptions(browser),
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
},
retryOnBlocked: true,
browserPoolOptions: {
useFingerprints: false,
preLaunchHooks: [
getPreLaunchHook(userDataDirectory),
async (_pageId, launchContext) => {
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...

// Ensure base exists
await fsp.mkdir(baseDir, { recursive: true });

// Create a unique subdir per browser
const subProfileDir = path.join(
baseDir,
`profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
);
await fsp.mkdir(subProfileDir, { recursive: true });

// Assign to Crawlee's launcher
// Crawlee preLaunchHooks expects launchContext to be mutated in-place.
// eslint-disable-next-line no-param-reassign
launchContext.userDataDir = subProfileDir;

// Safely extend launchOptions
// eslint-disable-next-line no-param-reassign
launchContext.launchOptions = {
...launchContext.launchOptions,
ignoreHTTPSErrors: true,
...playwrightDeviceDetailsObject,
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
...(extraHTTPHeaders && { extraHTTPHeaders }),
};

// Optionally log for debugging
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
},
],
},
Expand Down
26 changes: 3 additions & 23 deletions src/crawlers/crawlSitemap.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import crawlee, { EnqueueStrategy, LaunchContext, Request, RequestList, Dataset } from 'crawlee';
import { CrawlRateController } from './crawlRateController.js';
import fs from 'fs';
import * as path from 'path';
import fsp from 'fs/promises';
import {
createCrawleeSubFolders,
getPreLaunchHook,
preNavigationHooks,
runAxeScript,
isUrlPdf,
Expand Down Expand Up @@ -130,39 +129,20 @@ const crawlSitemap = async ({
launchContext: {
launcher: constants.launcher,
launchOptions: getPlaywrightLaunchOptions(browser),
// Bug in Chrome which causes browser pool crash when userDataDirectory is set in non-headless mode
...(process.env.CRAWLEE_HEADLESS === '1' && { userDataDir: userDataDirectory }),
},
retryOnBlocked: true,
browserPoolOptions: {
useFingerprints: false,
preLaunchHooks: [
getPreLaunchHook(userDataDirectory),
async (_pageId, launchContext) => {
const baseDir = userDataDirectory; // e.g., /Users/young/.../Chrome/oobee-...

// Ensure base exists
await fsp.mkdir(baseDir, { recursive: true });

// Create a unique subdir per browser
const subProfileDir = path.join(
baseDir,
`profile-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
);
await fsp.mkdir(subProfileDir, { recursive: true });

// Assign to Crawlee's launcher
launchContext.userDataDir = subProfileDir;

// Safely extend launchOptions
launchContext.launchOptions = {
...launchContext.launchOptions,
ignoreHTTPSErrors: true,
...playwrightDeviceDetailsObject,
...(process.env.OOBEE_USER_AGENT && { userAgent: process.env.OOBEE_USER_AGENT }),
...(process.env.OOBEE_DISABLE_BROWSER_DOWNLOAD && { acceptDownloads: false }),
};

// Optionally log for debugging
// console.log(`[HOOK] Using userDataDir: ${subProfileDir}`);
},
],
},
Expand Down
Loading