From 971bc0bc1b4ffe946fb77ecbd126c8d77877feaa Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 30 Jul 2025 11:23:54 -0400
Subject: [PATCH 1/4] Add shareable flag to workflows

---
 backend/btrixcloud/crawlconfigs.py | 4 ++++
 backend/btrixcloud/models.py       | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 6087337411..8a7f1e4f55 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -334,6 +334,7 @@ async def add_crawl_config(
             proxyId=config_in.proxyId,
             firstSeed=first_seed,
             seedCount=seed_count,
+            shareable=config_in.shareable,
         )
 
         if config_in.runNow:
@@ -545,6 +546,9 @@ async def update_crawl_config(
         changed = changed or self.check_attr_changed(
             orig_crawl_config, update, "browserWindows"
         )
+        changed = changed or (
+            self.check_attr_changed(orig_crawl_config, update, "shareable")
+        )
 
         schedule_changed = self.check_attr_changed(
             orig_crawl_config, update, "schedule"
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index e31ac0975d..9cd904595a 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -403,6 +403,8 @@ class CrawlConfigIn(BaseModel):
 
     crawlFilenameTemplate: Optional[str] = None
 
+    shareable: bool = False
+
 
 # ============================================================================
 class ConfigRevision(BaseMongoModel):
@@ -494,6 +496,8 @@ class CrawlConfigAdditional(BaseModel):
 
     crawlFilenameTemplate: Optional[str] = None
 
+    shareable: Optional[bool] = False
+
 
 # ============================================================================
 class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
@@ -552,6 +556,7 @@ class UpdateCrawlConfig(BaseModel):
     browserWindows: Optional[BrowserWindowCount] = None
     crawlFilenameTemplate: Optional[str] = None
     config: Optional[RawCrawlConfig] = None
+    shareable: Optional[bool] = None
 
 
 # ============================================================================
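With the flag threaded through `CrawlConfigIn`, `CrawlConfigAdditional`, and `UpdateCrawlConfig`, a client can flip it via the existing workflow PATCH endpoint. A minimal sketch, mirroring the PATCH call used in the tests later in this series; the base URL, org ID, workflow ID, and auth headers are assumed to come from the caller:

```python
# Minimal sketch: toggle the new `shareable` flag on an existing workflow.
# Route and payload shape mirror the test in PATCH 2/4 below.
import requests


def set_workflow_shareable(
    base_url: str, org_id: str, cid: str, headers: dict, shareable: bool
) -> bool:
    r = requests.patch(
        f"{base_url}/orgs/{org_id}/crawlconfigs/{cid}/",
        headers=headers,
        json={"shareable": shareable},
    )
    r.raise_for_status()
    # update_crawl_config reports the flag flip as a settings change
    return r.json().get("settings_changed", False)
```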
From da801c1a23241b16a24c5cf96facf6717543cc44 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 30 Jul 2025 11:46:07 -0400
Subject: [PATCH 2/4] Add public replay.json endpoint for shareable workflows

---
 backend/btrixcloud/crawlconfigs.py | 61 +++++++++++++++++++++++++++++-
 backend/test/test_crawlconfigs.py  | 60 +++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 8a7f1e4f55..0aba0dfc0e 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -16,7 +16,7 @@ import urllib.parse
 
 import aiohttp
-from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response
 import pymongo
 
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -27,17 +27,20 @@
     CrawlConfigOut,
     CrawlConfigTags,
     CrawlOut,
+    CrawlOutWithResources,
     UpdateCrawlConfig,
     Organization,
     User,
     PaginatedCrawlConfigOutResponse,
     PaginatedSeedResponse,
     PaginatedConfigRevisionResponse,
+    SUCCESSFUL_STATES,
     FAILED_STATES,
     CrawlerChannel,
     CrawlerChannels,
     StartedResponse,
     SuccessResponse,
+    EmptyResponse,
     CrawlConfigAddedResponse,
     CrawlConfigSearchValues,
     CrawlConfigUpdateResponse,
@@ -825,6 +828,29 @@ async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
 
         return None
 
+    async def get_last_successful_crawl_out(
+        self,
+        cid: UUID,
+        org: Organization,
+        request: Request,
+    ) -> Optional[CrawlOutWithResources]:
+        """Return the last successful crawl out with resources for this config, if any"""
+        match_query = {
+            "cid": cid,
+            "oid": org.id,
+            "finished": {"$ne": None},
+            "state": {"$in": SUCCESSFUL_STATES},
+        }
+        last_crawl = await self.crawls.find_one(
+            match_query, sort=[("finished", pymongo.DESCENDING)]
+        )
+        if last_crawl:
+            return await self.crawl_ops.get_crawl_out(
+                last_crawl["_id"], org, "crawl", headers=dict(request.headers)
+            )
+
+        return None
+
     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         """recompute stats by incrementing size counter and number of crawls"""
         update_query: dict[str, object] = {}
@@ -1479,6 +1505,7 @@ def init_crawl_config_api(
 
     org_crawl_dep = org_ops.org_crawl_dep
     org_viewer_dep = org_ops.org_viewer_dep
+    org_public = org_ops.org_public
 
     @router.get("", response_model=PaginatedCrawlConfigOutResponse)
     async def get_crawl_configs(
@@ -1595,6 +1622,38 @@ async def get_all_crawler_proxies(
 
         return ops.get_crawler_proxies()
 
+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=CrawlOutWithResources,
+    )
+    async def get_crawl_config_latest_crawl_public_replay(
+        request: Request,
+        response: Response,
+        cid: UUID,
+        org: Organization = Depends(org_public),
+    ):
+        crawl_config = await ops.get_crawl_config(cid, org.id, active_only=True)
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = await ops.get_last_successful_crawl_out(
+            cid, org, request
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return last_successful_crawl_out
+
+    @app.options(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
     @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
     async def get_crawl_config_seeds(
         cid: UUID,
diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py
index a2d8871a3e..4567737f28 100644
--- a/backend/test/test_crawlconfigs.py
+++ b/backend/test/test_crawlconfigs.py
@@ -995,3 +995,63 @@ def test_delete_in_use_seed_file(
     )
     assert r.status_code == 200
     assert r.json()["id"] == seed_file_id
+
+
+def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
+    # Verify workflow is not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"] is False
+
+    # Verify public replay.json returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "crawl_config_not_found"
+
+    # Mark workflow as shareable
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
+        headers=admin_auth_headers,
+        json={"shareable": True},
+    )
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["updated"]
+    assert data["settings_changed"]
+    assert data["metadata_changed"] is False
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"]
+
+    # Verify public replay.json returns last successful crawl while shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == admin_crawl_id
+    assert data["oid"] == default_org_id
+    assert data["cid"] == _admin_crawl_cid
+    assert data["type"] == "crawl"
+    assert data["state"] == "complete"
+
+    resources = data["resources"]
+    assert resources
+    assert resources[0]["path"]
+
+    assert len(data["initialPages"]) == 4
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pagesSearch"
+    )
+    assert data["downloadUrl"] is None
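Once a workflow is shareable, the latest successful crawl can be fetched anonymously, and the CORS headers allow the JSON to be loaded cross-origin (e.g., by an embedded ReplayWeb.page viewer). A sketch of an external consumer, assuming only the endpoint added above; note the 404 covers missing, inactive, and non-shareable workflows alike:

```python
# Sketch of an unauthenticated consumer of the public replay.json endpoint.
# No auth headers: access is gated only by the `shareable` flag.
import requests


def fetch_public_replay(base_url: str, org_id: str, cid: str) -> dict | None:
    r = requests.get(
        f"{base_url}/orgs/{org_id}/crawlconfigs/{cid}/public/replay.json"
    )
    if r.status_code == 404:
        # workflow not found, deactivated, or not marked shareable
        return None
    r.raise_for_status()
    # `resources` holds the WACZ file paths a replay embed would load
    return r.json()
```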
From f4d0d9469c1b434d2fd90494aea5fbf3726190ed Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Wed, 30 Jul 2025 16:14:21 -0400
Subject: [PATCH 3/4] Add and return public page search for workflow

---
 backend/btrixcloud/basecrawls.py   | 15 +++++++--
 backend/btrixcloud/crawlconfigs.py |  5 +--
 backend/btrixcloud/main.py         |  1 +
 backend/btrixcloud/pages.py        | 51 +++++++++++++++++++++++++++++-
 backend/test/test_crawlconfigs.py  | 23 ++++++++++++--
 5 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 035e4a2f34..6b09a80f9f 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -162,6 +162,7 @@ async def get_crawl_out(
         type_: Optional[str] = None,
         skip_resources=False,
         headers: Optional[dict] = None,
+        cid: Optional[UUID] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -183,9 +184,17 @@ async def get_crawl_out(
         oid = res.get("oid")
         if oid:
             origin = get_origin(headers)
-            res["pagesQueryUrl"] = (
-                origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
-            )
+            # If cid is passed, construct pagesSearch query for public
+            # shareable workflow
+            if cid:
+                res["pagesQueryUrl"] = (
+                    origin
+                    + f"/api/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch"
+                )
+            else:
+                res["pagesQueryUrl"] = (
+                    origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
+                )
 
         # this will now disable the downloadUrl in RWP
         res["downloadUrl"] = None
diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 0aba0dfc0e..b291c325c2 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -832,9 +832,10 @@ async def get_last_successful_crawl_out(
         self,
         cid: UUID,
         org: Organization,
-        request: Request,
+        request: Optional[Request] = None,
     ) -> Optional[CrawlOutWithResources]:
         """Return the last successful crawl out with resources for this config, if any"""
+        headers = dict(request.headers) if request else None
         match_query = {
             "cid": cid,
             "oid": org.id,
@@ -846,7 +847,7 @@ async def get_last_successful_crawl_out(
         )
         if last_crawl:
             return await self.crawl_ops.get_crawl_out(
-                last_crawl["_id"], org, "crawl", headers=dict(request.headers)
+                last_crawl["_id"], org, "crawl", headers=headers, cid=cid
             )
 
         return None
diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index 6ab720a65f..3e4f2f80a6 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -266,6 +266,7 @@ def main() -> None:
         storage_ops,
         background_job_ops,
         coll_ops,
+        crawl_config_ops,
        current_active_user,
     )
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 576cba0041..a9c802df3a 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -1084,7 +1084,15 @@ async def process_finished_crawls():
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
 def init_pages_api(
-    app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
+    app,
+    mdb,
+    crawl_ops,
+    org_ops,
+    storage_ops,
+    background_job_ops,
+    coll_ops,
+    crawl_config_ops,
+    user_dep,
 ) -> PageOps:
     """init pages API"""
     # pylint: disable=invalid-name
@@ -1336,6 +1344,47 @@ async def get_search_pages_list(
         )
         return {"items": pages}
 
+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch",
+        tags=["pages", "crawlconfigs"],
+        response_model=PageOutItemsResponse,
+    )
+    async def get_search_pages_list_shareable_crawl_config(
+        cid: UUID,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+    ):
+        """Retrieve paginated list of pages for last successful crawl of workflow"""
+        crawl_config = await crawl_config_ops.get_crawl_config(
+            cid, org.id, active_only=True
+        )
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = (
+            await crawl_config_ops.get_last_successful_crawl_out(cid, org)
+        )
+
+        pages, _ = await ops.list_pages(
+            crawl_ids=[last_successful_crawl_out.id],
+            search=search,
+            url=url,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            org=org,
+            page_size=pageSize,
+            page=page,
+            include_total=False,
+        )
+        return {"items": pages}
+
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],
diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py
index 4567737f28..d89200e311 100644
--- a/backend/test/test_crawlconfigs.py
+++ b/backend/test/test_crawlconfigs.py
@@ -1013,6 +1013,12 @@ def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
     assert r.status_code == 404
     assert r.json()["detail"] == "crawl_config_not_found"
 
+    # Verify public pagesSearch endpoint returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+    )
+    assert r.status_code == 404
+
     # Mark workflow as shareable
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
@@ -1051,7 +1057,20 @@ def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
     assert resources[0]["path"]
 
     assert len(data["initialPages"]) == 4
-    assert data["pagesQueryUrl"].endswith(
-        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pagesSearch"
+
+    pages_query_url = data["pagesQueryUrl"]
+    assert pages_query_url.endswith(
+        f"/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
     )
     assert data["downloadUrl"] is None
+
+    # Verify pages search endpoint is accessible and works
+    r = requests.get(pages_query_url)
+    assert r.status_code == 200
+    data = r.json()
+    assert data["items"]
+    for page in data["items"]:
+        assert page["id"]
+        assert page["oid"] == default_org_id
+        assert page["crawl_id"] == admin_crawl_id
+        assert page["url"]
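The `pagesQueryUrl` returned by the public replay.json now points at this public `pagesSearch` route, so a replay embed can page through results without credentials. A consumer sketch, assuming the query parameters in the handler signature above (`search`, `url`, `ts`, `isSeed`, `depth`, `pageSize`, `page`); because the endpoint is called with `include_total=False`, stopping on a short page is an assumption for illustration:

```python
# Sketch: iterate all pages of the last successful crawl through the
# public pagesSearch endpoint. pages_query_url is the `pagesQueryUrl`
# value from the public replay.json response.
import requests


def iter_public_pages(
    pages_query_url: str, search: str | None = None, page_size: int = 100
):
    page = 1
    while True:
        params = {"pageSize": page_size, "page": page}
        if search:
            params["search"] = search
        r = requests.get(pages_query_url, params=params)
        r.raise_for_status()
        items = r.json()["items"]
        yield from items
        # no total is returned, so a short page signals the end
        if len(items) < page_size:
            return
        page += 1
```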
From 83ced38ade082adccf38796976bc0ec79260d54b Mon Sep 17 00:00:00 2001
From: Emma Segal-Grossman
Date: Mon, 18 Aug 2025 13:53:15 -0400
Subject: [PATCH 4/4] Add "shareable" notices to workflow list & detail (#2788)

Closes #2785
--- .../crawl-workflows/templates/shareable-notice.ts | 14 ++++++++++++++ .../src/features/crawl-workflows/workflow-list.ts | 6 +++++- frontend/src/pages/org/workflow-detail.ts | 14 +++++++------- frontend/src/types/crawler.ts | 1 + 4 files changed, 27 insertions(+), 8 deletions(-) create mode 100644 frontend/src/features/crawl-workflows/templates/shareable-notice.ts diff --git a/frontend/src/features/crawl-workflows/templates/shareable-notice.ts b/frontend/src/features/crawl-workflows/templates/shareable-notice.ts new file mode 100644 index 0000000000..8e2a501f71 --- /dev/null +++ b/frontend/src/features/crawl-workflows/templates/shareable-notice.ts @@ -0,0 +1,14 @@ +import { msg } from "@lit/localize"; +import { html } from "lit"; + +export const ShareableNotice = () => + html` + + + ${msg("Public")} + + `; diff --git a/frontend/src/features/crawl-workflows/workflow-list.ts b/frontend/src/features/crawl-workflows/workflow-list.ts index ef416e8d02..b800d03111 100644 --- a/frontend/src/features/crawl-workflows/workflow-list.ts +++ b/frontend/src/features/crawl-workflows/workflow-list.ts @@ -19,6 +19,9 @@ import { query, queryAssignedElements, } from "lit/decorators.js"; +import { when } from "lit/directives/when.js"; + +import { ShareableNotice } from "./templates/shareable-notice"; import { BtrixElement } from "@/classes/BtrixElement"; import type { OverflowDropdown } from "@/components/ui/overflow-dropdown"; @@ -250,7 +253,8 @@ export class WorkflowListItem extends BtrixElement { }} >
-
+
+ ${when(this.workflow?.shareable, ShareableNotice)} ${this.safeRender(this.renderName)}
diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts index 5777f94ffa..a2d8c6367b 100644 --- a/frontend/src/pages/org/workflow-detail.ts +++ b/frontend/src/pages/org/workflow-detail.ts @@ -23,6 +23,7 @@ import { import { ClipboardController } from "@/controllers/clipboard"; import { CrawlStatus } from "@/features/archived-items/crawl-status"; import { ExclusionEditor } from "@/features/crawl-workflows/exclusion-editor"; +import { ShareableNotice } from "@/features/crawl-workflows/templates/shareable-notice"; import { pageError } from "@/layouts/pageError"; import { pageNav, type Breadcrumb } from "@/layouts/pageHeader"; import { WorkflowTab } from "@/routes"; @@ -445,16 +446,17 @@ export class WorkflowDetail extends BtrixElement {
-
+
+ ${when(this.workflow?.shareable, ShareableNotice)} ${when( this.workflow?.inactive, () => html` - ${msg("Inactive")} `, @@ -471,9 +473,7 @@ export class WorkflowDetail extends BtrixElement { )}
-
+
              ${when(
                this.isCrawler && this.workflow && !this.workflow.inactive,
                this.renderActions,
              )}
diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts
index 0fefb91525..3c135de40c 100644
--- a/frontend/src/types/crawler.ts
+++ b/frontend/src/types/crawler.ts
@@ -107,6 +107,7 @@ export type Workflow = CrawlConfig & {
   isCrawlRunning: boolean | null;
   autoAddCollections: string[];
   seedCount: number;
+  shareable?: boolean;
 };
 
 export type ListWorkflow = Omit<Workflow, "config">;
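The notice in both views keys off the optional `shareable` field the backend now includes on workflow responses. A rough sketch of the equivalent check against the workflow list endpoint, assuming the paginated `items` shape used elsewhere in this API and eliding pagination:

```python
# Sketch: find an org's publicly shareable workflows from the list endpoint.
# Field names and response shape follow this patch series; only the first
# page of results is inspected here.
import requests


def list_shareable_workflow_ids(
    base_url: str, org_id: str, headers: dict
) -> list[str]:
    r = requests.get(f"{base_url}/orgs/{org_id}/crawlconfigs", headers=headers)
    r.raise_for_status()
    # keep only workflows that would show the "Public" notice in the UI
    return [w["id"] for w in r.json()["items"] if w.get("shareable")]
```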