From eb15ee941ae9865ae712acc588f4e2a8d9dd0160 Mon Sep 17 00:00:00 2001
From: younglim <younglim@users.noreply.github.com>
Date: Tue, 28 Apr 2026 16:27:16 +0800
Subject: [PATCH] New scan strategy only-subpages

---
 README.md                                     | 17 +++++++++++---
 src/constants/cliFunctions.ts                 |  4 ++--
 src/constants/common.ts                       |  6 ++++-
 src/crawlers/crawlDomain.ts                   | 23 ++++++++++++++-----
 src/crawlers/crawlIntelligentSitemap.ts       |  2 +-
 src/generateHtmlReport.ts                     |  1 +
 src/index.ts                                  |  2 +-
 src/mergeAxeResults.ts                        |  1 +
 .../header/aboutScanModal/ScanDetails.ejs     |  3 +++
 src/utils.ts                                  | 15 ++++++++++++
 10 files changed, 60 insertions(+), 14 deletions(-)
diff --git a/README.md b/README.md
index 6f70622c..d005776f 100644
--- a/README.md
+++ b/README.md
@@ -224,6 +224,16 @@ If the website URL provided is invalid, an error message will be prompted for yo
 >> Cannot resolve URL. Please provide a valid URL.
 ```
 
+### Scan Strategy
+
+When running a website scan, the `-s` / `--strategy` option controls which URLs are enqueued for scanning:
+
+| Strategy | Description |
+|---|---|
+| `same-domain` (default) | Crawls all pages sharing the same registered domain, including subdomains (e.g. when starting from `example.com`, pages on both `docs.example.com` and `www.example.com` are crawled). |
+| `same-hostname` | Restricts crawling to the exact hostname of the provided URL. Subdomains are not followed. |
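+| `same-path` | Only enqueues URLs on the same origin whose path starts with the directory path of the provided URL. For example, scanning `https://example.com/docs/guide` will only crawl pages under `https://example.com/docs/`. A URL without a trailing slash is cut at its last `/`, so scanning `https://example.com/docs` matches everything under `https://example.com/`; use `https://example.com/docs/` to limit the crawl to that directory. |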
+
 ### Customised Mobile Device Scan
 
 ```shell
@@ -325,9 +335,10 @@ Options:
                                       Chrome, 3) Edge. Defaults to Chromium.
                      [choices: "chromium", "chrome", "edge"] [default: "chrome"]
   -s, --strategy                     Crawls up to general (same parent) domains,
-                                      or only specific hostname. Defaults to "sa
-                                     me-domain".
-                                       [choices: "same-domain", "same-hostname"]
+                                     only specific hostname, or only URLs under
+                                     the same path as the provided URL. Defaults
+                                      to "same-domain".
+                       [choices: "same-domain", "same-hostname", "same-path"]
   -e, --exportDirectory              Preferred directory to store scan results.
                                      Path is relative to your home directory.
                                                                         [string]
diff --git a/src/constants/cliFunctions.ts b/src/constants/cliFunctions.ts
index e85f6f1e..9b45436a 100644
--- a/src/constants/cliFunctions.ts
+++ b/src/constants/cliFunctions.ts
@@ -168,8 +168,8 @@ export const cliOptions: { [key: string]: Options } = {
   s: {
     alias: 'strategy',
     describe:
-      'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
-    choices: ['same-domain', 'same-hostname'],
+      'Crawls up to general (same parent) domains, only specific hostname, or only URLs matching the path of the provided URL. Defaults to "same-domain".',
+    choices: ['same-domain', 'same-hostname', 'same-path'],
     requiresArg: true,
     demandOption: false,
   },
diff --git a/src/constants/common.ts b/src/constants/common.ts
index 9d6b6094..2c091739 100644
--- a/src/constants/common.ts
+++ b/src/constants/common.ts
@@ -746,7 +746,11 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
     playwrightDeviceDetailsObject,
     maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
     strategy:
-      strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain,
+      strategy === 'same-hostname'
+        ? EnqueueStrategy.SameHostname
+        : strategy === 'same-path'
+          ? 'same-path'
+          : EnqueueStrategy.SameDomain,
     isLocalFileScan,
     browser: browserToRun,
     nameEmail,
diff --git a/src/crawlers/crawlDomain.ts b/src/crawlers/crawlDomain.ts
index a2ee92d2..48465b81 100644
--- a/src/crawlers/crawlDomain.ts
+++ b/src/crawlers/crawlDomain.ts
@@ -29,7 +29,7 @@ import {
   getUrlsFromRobotsTxt,
   waitForPageLoaded,
 } from '../constants/common.js';
-import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
+import { areLinksEqual, getMatchPathPrefix, isFollowStrategy, register } from '../utils.js';
 import {
   handlePdfDownload,
   runPdfScan,
@@ -84,7 +84,7 @@ const crawlDomain = async ({
   maxRequestsPerCrawl: number;
   browser: string;
   userDataDirectory: string;
-  strategy: EnqueueStrategy;
+  strategy: EnqueueStrategy | string;
   specifiedMaxConcurrency: number;
   fileTypes: FileTypes;
   blacklistedPatterns: string[];
@@ -122,6 +122,7 @@ const crawlDomain = async ({
   );
   const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
   const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
+  const matchPathPrefix = strategy === 'same-path' ? getMatchPathPrefix(url) : '';
   const { maxConcurrency } = constants;
   const { playwrightDeviceDetailsObject } = viewportSettings;
 
@@ -168,7 +169,10 @@ const crawlDomain = async ({
     const isExcluded = (newPageUrl: string): boolean => {
       const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
       const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
-      const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
+      const isNotFollowStrategy: boolean =
+        strategy === 'same-path'
+          ? !newPageUrl.startsWith(matchPathPrefix)
+          : !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
       const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern =>
         newPageUrl.toLowerCase().startsWith(pattern),
       );
@@ -333,7 +337,8 @@ const crawlDomain = async ({
       await enqueueLinks({
         // set selector matches anchor elements with href but not contains # or starting with mailto:
         selector: `a:not(${disallowedSelectorPatterns})`,
-        strategy,
+        strategy:
+          strategy === 'same-path' ? EnqueueStrategy.SameHostname : (strategy as EnqueueStrategy),
         requestQueue,
         transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
           try {
@@ -341,6 +346,7 @@ const crawlDomain = async ({
           } catch (e) {
             consoleLogger.error(e);
           }
+          if (strategy === 'same-path' && !req.url.startsWith(matchPathPrefix)) return null;
           if (scannedUrlSet.has(req.url)) {
             req.skipNavigation = true;
           }
@@ -475,7 +481,10 @@ const crawlDomain = async ({
           const requestLabelUrl = request.label;
 
           // to handle scenario where the redirected link is not within the scanning website
-          const isLoadedUrlFollowStrategy = isFollowStrategy(finalUrl, requestLabelUrl, strategy);
+          const isLoadedUrlFollowStrategy =
+            strategy === 'same-path'
+              ? finalUrl.startsWith(matchPathPrefix)
+              : isFollowStrategy(finalUrl, requestLabelUrl, strategy);
           if (!isLoadedUrlFollowStrategy) {
             finalUrl = requestLabelUrl;
           }
@@ -513,7 +522,9 @@ const crawlDomain = async ({
           }
 
           if (
-            !isFollowStrategy(url, actualUrl, strategy) &&
+            !(strategy === 'same-path'
+              ? actualUrl.startsWith(matchPathPrefix)
+              : isFollowStrategy(url, actualUrl, strategy)) &&
             (isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))
           ) {
             guiInfoLog(guiInfoStatusTypes.SKIPPED, {
diff --git a/src/crawlers/crawlIntelligentSitemap.ts b/src/crawlers/crawlIntelligentSitemap.ts
index 8b5f2469..8c792561 100644
--- a/src/crawlers/crawlIntelligentSitemap.ts
+++ b/src/crawlers/crawlIntelligentSitemap.ts
@@ -18,7 +18,7 @@ const crawlIntelligentSitemap = async (
   maxRequestsPerCrawl: number,
   browser: string,
   userDataDirectory: string,
-  strategy: EnqueueStrategy,
+  strategy: EnqueueStrategy | string,
   specifiedMaxConcurrency: number,
   fileTypes: FileTypes,
   blacklistedPatterns: string[],
diff --git a/src/generateHtmlReport.ts b/src/generateHtmlReport.ts
index 49975466..1113ff5c 100644
--- a/src/generateHtmlReport.ts
+++ b/src/generateHtmlReport.ts
@@ -186,6 +186,7 @@ export const generateHtmlReport = async (resultDir: string): Promise<string> =>
       advancedScanOptionsSummaryItems: {
         showIncludeScreenshots: !!scanData.advancedScanOptionsSummaryItems?.showIncludeScreenshots,
         showAllowSubdomains: !!scanData.advancedScanOptionsSummaryItems?.showAllowSubdomains,
+        showOnlySubpages: !!scanData.advancedScanOptionsSummaryItems?.showOnlySubpages,
         showEnableCustomChecks: !!scanData.advancedScanOptionsSummaryItems?.showEnableCustomChecks,
         showEnableWcagAaa: !!scanData.advancedScanOptionsSummaryItems?.showEnableWcagAaa,
         showSlowScanMode: !!scanData.advancedScanOptionsSummaryItems?.showSlowScanMode,
diff --git a/src/index.ts b/src/index.ts
index 76529731..01365e16 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -64,7 +64,7 @@ export type Data = {
   viewportWidth: number;
   playwrightDeviceDetailsObject: DeviceDescriptor;
   maxRequestsPerCrawl: number;
-  strategy: EnqueueStrategy;
+  strategy: EnqueueStrategy | string;
   isLocalFileScan: boolean;
   browser: string;
   nameEmail: string;
diff --git a/src/mergeAxeResults.ts b/src/mergeAxeResults.ts
index 41bc5e98..08d20be1 100644
--- a/src/mergeAxeResults.ts
+++ b/src/mergeAxeResults.ts
@@ -798,6 +798,7 @@ const generateArtifacts = async (
     advancedScanOptionsSummaryItems: {
       showIncludeScreenshots: [true].includes(scanDetails.isIncludeScreenshots),
       showAllowSubdomains: ['same-domain'].includes(scanDetails.isAllowSubdomains),
+      showOnlySubpages: ['same-path'].includes(scanDetails.isAllowSubdomains),
       showEnableCustomChecks: ['default', 'enable-wcag-aaa'].includes(
         scanDetails.isEnableCustomChecks?.[0],
       ),
diff --git a/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs b/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs
index bd048df4..53846402 100644
--- a/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs
+++ b/src/static/ejs/partials/components/header/aboutScanModal/ScanDetails.ejs
@@ -93,6 +93,9 @@
       <% if (advancedScanOptionsSummaryItems.showAllowSubdomains) { %>
         <li class="advanced-sublist-li">Allow subdomains for scans</li>
       <% } %>
+      <% if (advancedScanOptionsSummaryItems.showOnlySubpages) { %>
+        <li class="advanced-sublist-li">Only subpages</li>
+      <% } %>
       <% if (advancedScanOptionsSummaryItems.showEnableCustomChecks) { %>
         <li class="advanced-sublist-li">Enable custom checks</li>
       <% } %>
diff --git a/src/utils.ts b/src/utils.ts
index 80de109f..bc236a0c 100644
--- a/src/utils.ts
+++ b/src/utils.ts
@@ -1078,6 +1078,21 @@ export const randomThreeDigitNumberString = () => {
   return String(threeDigitNumber);
 };
 
+export const getMatchPathPrefix = (url: string): string => { // Returns origin + directory path of `url`; crawlDomain compares candidate URLs against it with startsWith() for the 'same-path' strategy.
+  try {
+    const parsed = new URL(url); // throws on an unparsable URL; handled by the catch below
+    const pathname = parsed.pathname;
+    if (pathname === '/' || pathname.endsWith('/')) { // path already ends at a directory boundary: keep it whole
+      return parsed.origin + pathname; // query string and fragment are not part of the prefix
+    }
+    const lastSlash = pathname.lastIndexOf('/'); // otherwise cut just after the last '/', dropping the final segment
+    const dirPath = lastSlash >= 0 ? pathname.substring(0, lastSlash + 1) : '/'; // e.g. '/docs/guide' -> '/docs/'; '/' if no slash is found
+    return parsed.origin + dirPath;
+  } catch {
+    return url; // unparsable input: the raw string is returned unchanged as the prefix
+  }
+};
+
 export const isFollowStrategy = (link1: string, link2: string, rule: string): boolean => {
   try {
     const parsedLink1 = new URL(link1);