Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,16 @@ If the website URL provided is invalid, an error message will be prompted for yo
>> Cannot resolve URL. Please provide a valid URL.
```

### Scan Strategy

When running a website scan, the `-s` / `--strategy` option controls which URLs are enqueued for scanning:

| Strategy | Description |
|---|---|
| `same-domain` (default) | Crawls all pages sharing the same registered domain, including subdomains (e.g. when scanning `example.com`, both `docs.example.com` and `www.example.com` are crawled). |
| `same-hostname` | Restricts crawling to the exact hostname of the provided URL. Subdomains are not followed. |
| `same-path` | Only enqueues URLs on the same origin whose path starts with the directory path of the provided URL. For example, scanning `https://example.com/docs/guide` will only crawl pages under `https://example.com/docs/`. |

### Customised Mobile Device Scan

```shell
Expand Down Expand Up @@ -325,9 +335,10 @@ Options:
Chrome, 3) Edge. Defaults to Chromium.
[choices: "chromium", "chrome", "edge"] [default: "chrome"]
-s, --strategy Crawls up to general (same parent) domains,
or only specific hostname. Defaults to "sa
me-domain".
[choices: "same-domain", "same-hostname"]
only specific hostname, or only URLs under
the same path as the provided URL. Defaults
to "same-domain".
[choices: "same-domain", "same-hostname", "same-path"]
-e, --exportDirectory Preferred directory to store scan results.
Path is relative to your home directory.
[string]
Expand Down
4 changes: 2 additions & 2 deletions src/constants/cliFunctions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,8 @@ export const cliOptions: { [key: string]: Options } = {
s: {
alias: 'strategy',
describe:
'Crawls up to general (same parent) domains, or only specific hostname. Defaults to "same-domain".',
choices: ['same-domain', 'same-hostname'],
'Crawls up to general (same parent) domains, only specific hostname, or only URLs matching the path of the provided URL. Defaults to "same-domain".',
choices: ['same-domain', 'same-hostname', 'same-path'],
requiresArg: true,
demandOption: false,
},
Expand Down
6 changes: 5 additions & 1 deletion src/constants/common.ts
Original file line number Diff line number Diff line change
Expand Up @@ -746,7 +746,11 @@ export const prepareData = async (argv: Answers): Promise<Data> => {
playwrightDeviceDetailsObject,
maxRequestsPerCrawl: maxpages || constants.maxRequestsPerCrawl,
strategy:
strategy === 'same-hostname' ? EnqueueStrategy.SameHostname : EnqueueStrategy.SameDomain,
strategy === 'same-hostname'
? EnqueueStrategy.SameHostname
: strategy === 'same-path'
? 'same-path'
: EnqueueStrategy.SameDomain,
isLocalFileScan,
browser: browserToRun,
nameEmail,
Expand Down
23 changes: 17 additions & 6 deletions src/crawlers/crawlDomain.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import {
getUrlsFromRobotsTxt,
waitForPageLoaded,
} from '../constants/common.js';
import { areLinksEqual, isFollowStrategy, register } from '../utils.js';
import { areLinksEqual, getMatchPathPrefix, isFollowStrategy, register } from '../utils.js';
import {
handlePdfDownload,
runPdfScan,
Expand Down Expand Up @@ -84,7 +84,7 @@ const crawlDomain = async ({
maxRequestsPerCrawl: number;
browser: string;
userDataDirectory: string;
strategy: EnqueueStrategy;
strategy: EnqueueStrategy | string;
specifiedMaxConcurrency: number;
fileTypes: FileTypes;
blacklistedPatterns: string[];
Expand Down Expand Up @@ -122,6 +122,7 @@ const crawlDomain = async ({
);
const isScanHtml = [FileTypes.All, FileTypes.HtmlOnly].includes(fileTypes as FileTypes);
const isScanPdfs = [FileTypes.All, FileTypes.PdfOnly].includes(fileTypes as FileTypes);
const matchPathPrefix = strategy === 'same-path' ? getMatchPathPrefix(url) : '';
const { maxConcurrency } = constants;
const { playwrightDeviceDetailsObject } = viewportSettings;

Expand Down Expand Up @@ -168,7 +169,10 @@ const crawlDomain = async ({
const isExcluded = (newPageUrl: string): boolean => {
const isAlreadyScanned: boolean = urlsCrawled.scanned.some(item => item.url === newPageUrl);
const isBlacklistedUrl: boolean = isBlacklisted(newPageUrl, blacklistedPatterns);
const isNotFollowStrategy: boolean = !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
const isNotFollowStrategy: boolean =
strategy === 'same-path'
? !newPageUrl.startsWith(matchPathPrefix)
: !isFollowStrategy(newPageUrl, initialPageUrl, strategy);
const isNotSupportedDocument: boolean = disallowedListOfPatterns.some(pattern =>
newPageUrl.toLowerCase().startsWith(pattern),
);
Expand Down Expand Up @@ -333,14 +337,16 @@ const crawlDomain = async ({
await enqueueLinks({
// set selector matches anchor elements with href but not contains # or starting with mailto:
selector: `a:not(${disallowedSelectorPatterns})`,
strategy,
strategy:
strategy === 'same-path' ? EnqueueStrategy.SameHostname : (strategy as EnqueueStrategy),
requestQueue,
transformRequestFunction: (req: RequestOptions): RequestOptions | null => {
try {
req.url = req.url.replace(/(?<=&|\?)utm_.*?(&|$)/gim, '');
} catch (e) {
consoleLogger.error(e);
}
if (strategy === 'same-path' && !req.url.startsWith(matchPathPrefix)) return null;
if (scannedUrlSet.has(req.url)) {
req.skipNavigation = true;
}
Expand Down Expand Up @@ -475,7 +481,10 @@ const crawlDomain = async ({
const requestLabelUrl = request.label;

// to handle scenario where the redirected link is not within the scanning website
const isLoadedUrlFollowStrategy = isFollowStrategy(finalUrl, requestLabelUrl, strategy);
const isLoadedUrlFollowStrategy =
strategy === 'same-path'
? finalUrl.startsWith(matchPathPrefix)
: isFollowStrategy(finalUrl, requestLabelUrl, strategy);
if (!isLoadedUrlFollowStrategy) {
finalUrl = requestLabelUrl;
}
Expand Down Expand Up @@ -513,7 +522,9 @@ const crawlDomain = async ({
}

if (
!isFollowStrategy(url, actualUrl, strategy) &&
!(strategy === 'same-path'
? actualUrl.startsWith(matchPathPrefix)
: isFollowStrategy(url, actualUrl, strategy)) &&
(isBlacklisted(actualUrl, blacklistedPatterns) || (isUrlPdf(actualUrl) && !isScanPdfs))
) {
guiInfoLog(guiInfoStatusTypes.SKIPPED, {
Expand Down
2 changes: 1 addition & 1 deletion src/crawlers/crawlIntelligentSitemap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ const crawlIntelligentSitemap = async (
maxRequestsPerCrawl: number,
browser: string,
userDataDirectory: string,
strategy: EnqueueStrategy,
strategy: EnqueueStrategy | string,
specifiedMaxConcurrency: number,
fileTypes: FileTypes,
blacklistedPatterns: string[],
Expand Down
1 change: 1 addition & 0 deletions src/generateHtmlReport.ts
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ export const generateHtmlReport = async (resultDir: string): Promise<string> =>
advancedScanOptionsSummaryItems: {
showIncludeScreenshots: !!scanData.advancedScanOptionsSummaryItems?.showIncludeScreenshots,
showAllowSubdomains: !!scanData.advancedScanOptionsSummaryItems?.showAllowSubdomains,
showOnlySubpages: !!scanData.advancedScanOptionsSummaryItems?.showOnlySubpages,
showEnableCustomChecks: !!scanData.advancedScanOptionsSummaryItems?.showEnableCustomChecks,
showEnableWcagAaa: !!scanData.advancedScanOptionsSummaryItems?.showEnableWcagAaa,
showSlowScanMode: !!scanData.advancedScanOptionsSummaryItems?.showSlowScanMode,
Expand Down
2 changes: 1 addition & 1 deletion src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ export type Data = {
viewportWidth: number;
playwrightDeviceDetailsObject: DeviceDescriptor;
maxRequestsPerCrawl: number;
strategy: EnqueueStrategy;
strategy: EnqueueStrategy | string;
isLocalFileScan: boolean;
browser: string;
nameEmail: string;
Expand Down
1 change: 1 addition & 0 deletions src/mergeAxeResults.ts
Original file line number Diff line number Diff line change
Expand Up @@ -798,6 +798,7 @@ const generateArtifacts = async (
advancedScanOptionsSummaryItems: {
showIncludeScreenshots: [true].includes(scanDetails.isIncludeScreenshots),
showAllowSubdomains: ['same-domain'].includes(scanDetails.isAllowSubdomains),
showOnlySubpages: ['same-path'].includes(scanDetails.isAllowSubdomains),
showEnableCustomChecks: ['default', 'enable-wcag-aaa'].includes(
scanDetails.isEnableCustomChecks?.[0],
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@
<% if (advancedScanOptionsSummaryItems.showAllowSubdomains) { %>
<li class="advanced-sublist-li">Allow subdomains for scans</li>
<% } %>
<% if (advancedScanOptionsSummaryItems.showOnlySubpages) { %>
<li class="advanced-sublist-li">Only subpages</li>
<% } %>
<% if (advancedScanOptionsSummaryItems.showEnableCustomChecks) { %>
<li class="advanced-sublist-li">Enable custom checks</li>
<% } %>
Expand Down
15 changes: 15 additions & 0 deletions src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1078,6 +1078,21 @@ export const randomThreeDigitNumberString = () => {
return String(threeDigitNumber);
};

/**
 * Builds the URL prefix that candidate links are compared against for the
 * `same-path` strategy.
 *
 * The prefix is the origin of `url` followed by its pathname cut off right after
 * the last `/`. A pathname that already ends in `/` is kept whole; otherwise the
 * final segment is dropped. Query string and fragment are never part of the
 * result because only the origin and pathname are used.
 *
 * @param url - Absolute URL to derive the prefix from.
 * @returns The origin plus directory path, or `url` unchanged when it cannot be
 *   parsed as a URL.
 */
export const getMatchPathPrefix = (url: string): string => {
  try {
    const { origin, pathname } = new URL(url);
    const cut = pathname.lastIndexOf('/');
    // Keep everything up to and including the final slash; fall back to the
    // root directory when the pathname contains no slash at all.
    const directory = cut < 0 ? '/' : pathname.slice(0, cut + 1);
    return `${origin}${directory}`;
  } catch {
    // Unparseable input is handed back as-is.
    return url;
  }
};

export const isFollowStrategy = (link1: string, link2: string, rule: string): boolean => {
try {
const parsedLink1 = new URL(link1);
Expand Down