From eb15ee941ae9865ae712acc588f4e2a8d9dd0160 Mon Sep 17 00:00:00 2001 From: younglim Date: Tue, 28 Apr 2026 16:27:16 +0800 Subject: [PATCH] New scan strategy only-subpages --- README.md | 17 +++++++++++--- src/constants/cliFunctions.ts | 4 ++-- src/constants/common.ts | 6 ++++- src/crawlers/crawlDomain.ts | 23 ++++++++++++++----- src/crawlers/crawlIntelligentSitemap.ts | 2 +- src/generateHtmlReport.ts | 1 + src/index.ts | 2 +- src/mergeAxeResults.ts | 1 + .../header/aboutScanModal/ScanDetails.ejs | 3 +++ src/utils.ts | 15 ++++++++++++ 10 files changed, 60 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 6f70622c..d005776f 100644 --- a/README.md +++ b/README.md @@ -224,6 +224,16 @@ If the website URL provided is invalid, an error message will be prompted for yo >> Cannot resolve URL. Please provide a valid URL. ``` +### Scan Strategy + +When running a website scan, the `-s` / `--strategy` option controls which URLs are enqueued for scanning: + +| Strategy | Description | +|---|---| +| `same-domain` (default) | Crawls all pages sharing the same registered domain, including subdomains (e.g. `docs.example.com` and `www.example.com` are both crawled from `example.com`). | +| `same-hostname` | Restricts crawling to the exact hostname of the provided URL. Subdomains are not followed. | +| `same-path` | Only enqueues URLs whose path starts with the directory path of the provided URL. For example, scanning `https://example.com/docs/guide` will only crawl pages under `https://example.com/docs/`. | + ### Customised Mobile Device Scan ```shell @@ -325,9 +335,10 @@ Options: Chrome, 3) Edge. Defaults to Chromium. [choices: "chromium", "chrome", "edge"] [default: "chrome"] -s, --strategy Crawls up to general (same parent) domains, - or only specific hostname. Defaults to "sa - me-domain". - [choices: "same-domain", "same-hostname"] + only specific hostname, or only URLs under + the same path as the provided URL. Defaults + to "same-domain". 
+ [choices: "same-domain", "same-hostname", "same-path"] -e, --exportDirectory Preferred directory to store scan results. Path is relative to your home directory. [string] diff --git a/src/constants/cliFunctions.ts b/src/constants/cliFunctions.ts index e85f6f1e..9b45436a 100644 --- a/src/constants/cliFunctions.ts +++ b/src/constants/cliFunctions.ts @@ -168,8 +168,8 @@ export const cliOptions: { [key: string]: Options } = { s: { alias: 'strategy', describe: - 'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".', - choices: ['same-domain', 'same-hostname'], + 'Crawls up to general (same parent) domains, only specific hostname, or only URLs matching the path of the provided URL. Defaults to "same-domain".', + choices: ['same-domain', 'same-hostname', 'same-path'], requiresArg: true, demandOption: false, }, diff --git a/src/constants/common.ts b/src/constants/common.ts index 9d6b6094..2c091739 100644 --- a/src/constants/common.ts +++ b/src/constants/common.ts @@ -746,7 +746,11 @@ export const prepareData = async (argv: Answers): Promise => { playwrightDeviceDetailsObject, maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl, strategy: - strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain, + strategy === 'same-hostname' + ? EnqueueStrategy.SameHostname + : strategy === 'same-path' + ? 
'same-path' + : EnqueueStrategy.SameDomain, isLocalFileScan, browser: browserToRun, nameEmail, diff --git a/src/crawlers/crawlDomain.ts b/src/crawlers/crawlDomain.ts index a2ee92d2..48465b81 100644 --- a/src/crawlers/crawlDomain.ts +++ b/src/crawlers/crawlDomain.ts @@ -29,7 +29,7 @@ import { getUrlsFromRobotsTxt, waitForPageLoaded, } from '../constants/common.js'; -import { areLinksEqual, isFollowStrategy, register } from '../utils.js'; +import { areLinksEqual, getMatchPathPrefix, isFollowStrategy, register } from '../utils.js'; import { handlePdfDownload, runPdfScan, @@ -84,7 +84,7 @@ const crawlDomain = async ({ maxRequestsPerCrawl: number; browser: string; userDataDirectory: string; - strategy: EnqueueStrategy; + strategy: EnqueueStrategy | string; specifiedMaxConcurrency: number; fileTypes: FileTypes; blacklistedPatterns: string[]; @@ -122,6 +122,7 @@ const crawlDomain = async ({ ); const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes); const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes); + const matchPathPrefix = strategy === 'same-path' ? getMatchPathPrefix(url) : ''; const { maxConcurrency } = constants; const { playwrightDeviceDetailsObject } = viewportSettings; @@ -168,7 +169,10 @@ const crawlDomain = async ({ const isExcluded = (newPageUrl: string): boolean => { const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl); const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns); - const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy); + const isNotFollowStrategy: boolean = + strategy === 'same-path' + ? 
!newPageUrl.startsWith(matchPathPrefix) + : !isFollowStrategy(newPageUrl, initialPageUrl, strategy); const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern => newPageUrl.toLowerCase().startsWith(pattern), ); @@ -333,7 +337,8 @@ const crawlDomain = async ({ await enqueueLinks({ // set selector matches anchor elements with href but not contains # or starting with mailto: selector: `a:not(${disallowedSelectorPatterns})`, - strategy, + strategy: + strategy === 'same-path' ? EnqueueStrategy.SameHostname : (strategy as EnqueueStrategy), requestQueue, transformRequestFunction: (req: RequestOptions): RequestOptions | null => { try { @@ -341,6 +346,7 @@ const crawlDomain = async ({ } catch (e) { consoleLogger.error(e); } + if (strategy === 'same-path' && !req.url.startsWith(matchPathPrefix)) return null; if (scannedUrlSet.has(req.url)) { req.skipNavigation = true; } @@ -475,7 +481,10 @@ const crawlDomain = async ({ const requestLabelUrl = request.label; // to handle scenario where the redirected link is not within the scanning website - const isLoadedUrlFollowStrategy = isFollowStrategy(finalUrl, requestLabelUrl, strategy); + const isLoadedUrlFollowStrategy = + strategy === 'same-path' + ? finalUrl.startsWith(matchPathPrefix) + : isFollowStrategy(finalUrl, requestLabelUrl, strategy); if (!isLoadedUrlFollowStrategy) { finalUrl = requestLabelUrl; } @@ -513,7 +522,9 @@ const crawlDomain = async ({ } if ( - !isFollowStrategy(url, actualUrl, strategy) && + !(strategy === 'same-path' + ? 
actualUrl.startsWith(matchPathPrefix) + : isFollowStrategy(url, actualUrl, strategy)) && (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs)) ) { guiInfoLog(guiInfoStatusTypes.SKIPPED, { diff --git a/src/crawlers/crawlIntelligentSitemap.ts b/src/crawlers/crawlIntelligentSitemap.ts index 8b5f2469..8c792561 100644 --- a/src/crawlers/crawlIntelligentSitemap.ts +++ b/src/crawlers/crawlIntelligentSitemap.ts @@ -18,7 +18,7 @@ const crawlIntelligentSitemap = async ( maxRequestsPerCrawl: number, browser: string, userDataDirectory: string, - strategy: EnqueueStrategy, + strategy: EnqueueStrategy | string, specifiedMaxConcurrency: number, fileTypes: FileTypes, blacklistedPatterns: string[], diff --git a/src/generateHtmlReport.ts b/src/generateHtmlReport.ts index 49975466..1113ff5c 100644 --- a/src/generateHtmlReport.ts +++ b/src/generateHtmlReport.ts @@ -186,6 +186,7 @@ export const generateHtmlReport = async (resultDir: string): Promise => advancedScanOptionsSummaryItems: { showIncludeScreenshots: !!scanData.advancedScanOptionsSummaryItems?.showIncludeScreenshots, showAllowSubdomains: !!scanData.advancedScanOptionsSummaryItems?.showAllowSubdomains, + showOnlySubpages: !!scanData.advancedScanOptionsSummaryItems?.showOnlySubpages, showEnableCustomChecks: !!scanData.advancedScanOptionsSummaryItems?.showEnableCustomChecks, showEnableWcagAaa: !!scanData.advancedScanOptionsSummaryItems?.showEnableWcagAaa, showSlowScanMode: !!scanData.advancedScanOptionsSummaryItems?.showSlowScanMode, diff --git a/src/index.ts b/src/index.ts index 76529731..01365e16 100644 --- a/src/index.ts +++ b/src/index.ts @@ -64,7 +64,7 @@ export type Data = { viewportWidth: number; playwrightDeviceDetailsObject: DeviceDescriptor; maxRequestsPerCrawl: number; - strategy: EnqueueStrategy; + strategy: EnqueueStrategy | string; isLocalFileScan: boolean; browser: string; nameEmail: string; diff --git a/src/mergeAxeResults.ts b/src/mergeAxeResults.ts index 41bc5e98..08d20be1 
100644 --- a/src/mergeAxeResults.ts +++ b/src/mergeAxeResults.ts @@ -798,6 +798,7 @@ const generateArtifacts = async ( advancedScanOptionsSummaryItems: { showIncludeScreenshots: [true].includes(scanDetails.isIncludeScreenshots), showAllowSubdomains: ['same-domain'].includes(scanDetails.isAllowSubdomains), + showOnlySubpages: ['same-path'].includes(scanDetails.isAllowSubdomains), showEnableCustomChecks: ['default', 'enable-wcag-aaa'].includes( scanDetails.isEnableCustomChecks?.[0], ), diff --git a/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs b/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs index bd048df4..53846402 100644 --- a/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs +++ b/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs @@ -93,6 +93,9 @@ <% if (advancedScanOptionsSummaryItems.showAllowSubdomains) { %>
  • Allow subdomains for scans
  • <% } %> + <% if (advancedScanOptionsSummaryItems.showOnlySubpages) { %> +
  • Only scan pages under the same path
  • + <% } %> <% if (advancedScanOptionsSummaryItems.showEnableCustomChecks) { %>
  • Enable custom checks
  • <% } %> diff --git a/src/utils.ts b/src/utils.ts index 80de109f..bc236a0c 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -1078,6 +1078,21 @@ export const randomThreeDigitNumberString = () => { return String(threeDigitNumber); }; +export const getMatchPathPrefix = (url: string): string => { + try { + const parsed = new URL(url); + const pathname = parsed.pathname; + if (pathname === '/' || pathname.endsWith('/')) { + return parsed.origin + pathname; + } + const lastSlash = pathname.lastIndexOf('/'); + const dirPath = lastSlash >= 0 ? pathname.substring(0, lastSlash + 1) : '/'; + return parsed.origin + dirPath; + } catch { + return url; + } +}; + export const isFollowStrategy = (link1: string, link2: string, rule: string): boolean => { try { const parsedLink1 = new URL(link1);