diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 035e4a2f34..6b09a80f9f 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -162,6 +162,7 @@ async def get_crawl_out(
         type_: Optional[str] = None,
         skip_resources=False,
         headers: Optional[dict] = None,
+        cid: Optional[UUID] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -183,9 +184,17 @@ async def get_crawl_out(
         oid = res.get("oid")
         if oid:
             origin = get_origin(headers)
-            res["pagesQueryUrl"] = (
-                origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
-            )
+            # If cid is passed, construct pagesSearch query for public
+            # shareable workflow
+            if cid:
+                res["pagesQueryUrl"] = (
+                    origin
+                    + f"/api/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch"
+                )
+            else:
+                res["pagesQueryUrl"] = (
+                    origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
+                )
 
         # this will now disable the downloadUrl in RWP
         res["downloadUrl"] = None
diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 6087337411..b291c325c2 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -16,7 +16,7 @@
 import urllib.parse
 
 import aiohttp
-from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response
 import pymongo
 
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -27,17 +27,20 @@
     CrawlConfigOut,
     CrawlConfigTags,
     CrawlOut,
+    CrawlOutWithResources,
     UpdateCrawlConfig,
     Organization,
     User,
     PaginatedCrawlConfigOutResponse,
     PaginatedSeedResponse,
     PaginatedConfigRevisionResponse,
+    SUCCESSFUL_STATES,
    FAILED_STATES,
     CrawlerChannel,
     CrawlerChannels,
     StartedResponse,
     SuccessResponse,
+    EmptyResponse,
     CrawlConfigAddedResponse,
     CrawlConfigSearchValues,
     CrawlConfigUpdateResponse,
@@ -334,6 +337,7 @@ async def add_crawl_config(
             proxyId=config_in.proxyId,
             firstSeed=first_seed,
             seedCount=seed_count,
+            shareable=config_in.shareable,
         )
 
         if config_in.runNow:
@@ -545,6 +549,9 @@ async def update_crawl_config(
         changed = changed or self.check_attr_changed(
             orig_crawl_config, update, "browserWindows"
         )
+        changed = changed or (
+            self.check_attr_changed(orig_crawl_config, update, "shareable")
+        )
 
         schedule_changed = self.check_attr_changed(
             orig_crawl_config, update, "schedule"
@@ -821,6 +828,30 @@ async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
 
         return None
 
+    async def get_last_successful_crawl_out(
+        self,
+        cid: UUID,
+        org: Organization,
+        request: Optional[Request] = None,
+    ) -> Optional[CrawlOutWithResources]:
+        """Return the last successful crawl out with resources for this config, if any"""
+        headers = dict(request.headers) if request else None
+        match_query = {
+            "cid": cid,
+            "oid": org.id,
+            "finished": {"$ne": None},
+            "state": {"$in": SUCCESSFUL_STATES},
+        }
+        last_crawl = await self.crawls.find_one(
+            match_query, sort=[("finished", pymongo.DESCENDING)]
+        )
+        if last_crawl:
+            return await self.crawl_ops.get_crawl_out(
+                last_crawl["_id"], org, "crawl", headers=headers, cid=cid
+            )
+
+        return None
+
     async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
         """recompute stats by incrementing size counter and number of crawls"""
         update_query: dict[str, object] = {}
@@ -1475,6 +1506,7 @@ def init_crawl_config_api(
 
     org_crawl_dep = org_ops.org_crawl_dep
     org_viewer_dep = org_ops.org_viewer_dep
+    org_public = org_ops.org_public
 
     @router.get("", response_model=PaginatedCrawlConfigOutResponse)
     async def get_crawl_configs(
@@ -1591,6 +1623,41 @@ async def get_all_crawler_proxies(
 
         return ops.get_crawler_proxies()
 
+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=CrawlOutWithResources,
+    )
+    async def get_crawl_config_latest_crawl_public_replay(
+        request: Request,
+        response: Response,
+        cid: UUID,
+        org: Organization = Depends(org_public),
+    ):
+        crawl_config = await ops.get_crawl_config(cid, org.id, active_only=True)
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = await ops.get_last_successful_crawl_out(
+            cid, org, request
+        )
+        # guard against shareable workflows with no successful crawls yet
+        if not last_successful_crawl_out:
+            raise HTTPException(status_code=404, detail="crawl_not_found")
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return last_successful_crawl_out
+
+    @app.options(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
     @router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
     async def get_crawl_config_seeds(
         cid: UUID,
diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index 6ab720a65f..3e4f2f80a6 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -266,6 +266,7 @@ def main() -> None:
         storage_ops,
         background_job_ops,
         coll_ops,
+        crawl_config_ops,
         current_active_user,
     )
 
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index 7996a67a97..b66ab508a4 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -405,6 +405,8 @@ class CrawlConfigIn(BaseModel):
 
     crawlFilenameTemplate: Optional[str] = None
 
+    shareable: bool = False
+
 
 # ============================================================================
 class ConfigRevision(BaseMongoModel):
@@ -496,6 +498,8 @@ class CrawlConfigAdditional(BaseModel):
 
     crawlFilenameTemplate: Optional[str] = None
 
+    shareable: Optional[bool] = False
+
 
 # ============================================================================
 class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
@@ -554,6 +558,7 @@ class UpdateCrawlConfig(BaseModel):
     browserWindows: Optional[BrowserWindowCount] = None
     crawlFilenameTemplate: Optional[str] = None
     config: Optional[RawCrawlConfig] = None
+    shareable: Optional[bool] = None
 
 
 # ============================================================================
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 576cba0041..a9c802df3a 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -1084,7 +1084,15 @@ async def process_finished_crawls():
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
 def init_pages_api(
-    app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
+    app,
+    mdb,
+    crawl_ops,
+    org_ops,
+    storage_ops,
+    background_job_ops,
+    coll_ops,
+    crawl_config_ops,
+    user_dep,
 ) -> PageOps:
     """init pages API"""
     # pylint: disable=invalid-name
@@ -1336,6 +1344,50 @@ async def get_search_pages_list(
         )
         return {"items": pages}
 
+    @app.get(
+        "/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch",
+        tags=["pages", "crawlconfigs"],
+        response_model=PageOutItemsResponse,
+    )
+    async def get_search_pages_list_shareable_crawl_config(
+        cid: UUID,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+    ):
+        """Retrieve paginated list of pages for last successful crawl of workflow"""
+        crawl_config = await crawl_config_ops.get_crawl_config(
+            cid, org.id, active_only=True
+        )
+        if not crawl_config.shareable:
+            raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = (
+            await crawl_config_ops.get_last_successful_crawl_out(cid, org)
+        )
+        # guard against shareable workflows with no successful crawls yet
+        if not last_successful_crawl_out:
+            raise HTTPException(status_code=404, detail="crawl_not_found")
+
+        pages, _ = await ops.list_pages(
+            crawl_ids=[last_successful_crawl_out.id],
+            search=search,
+            url=url,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            org=org,
+            page_size=pageSize,
+            page=page,
+            include_total=False,
+        )
+        return {"items": pages}
+
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],
diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py
index a2d8871a3e..d89200e311 100644
--- a/backend/test/test_crawlconfigs.py
+++ b/backend/test/test_crawlconfigs.py
@@ -995,3 +995,82 @@ def test_delete_in_use_seed_file(
     )
     assert r.status_code == 200
     assert r.json()["id"] == seed_file_id
+
+
+def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
+    # Verify workflow is not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"] is False
+
+    # Verify public replay.json returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 404
+    assert r.json()["detail"] == "crawl_config_not_found"
+
+    # Verify public pagesSearch endpoint returns 404 while not shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+    )
+    assert r.status_code == 404
+
+    # Mark workflow as shareable
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
+        headers=admin_auth_headers,
+        json={"shareable": True},
+    )
+    assert r.status_code == 200
+
+    data = r.json()
+    assert data["updated"]
+    assert data["settings_changed"]
+    assert data["metadata_changed"] is False
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    assert r.json()["shareable"]
+
+    # Verify public replay.json returns last successful crawl while shareable
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["id"] == admin_crawl_id
+    assert data["oid"] == default_org_id
+    assert data["cid"] == _admin_crawl_cid
+    assert data["type"] == "crawl"
+    assert data["state"] == "complete"
+
+    resources = data["resources"]
+    assert resources
+    assert resources[0]["path"]
+
+    assert len(data["initialPages"]) == 4
+
+    pages_query_url = data["pagesQueryUrl"]
+    assert pages_query_url.endswith(
+        f"/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+    )
+    assert data["downloadUrl"] is None
+
+    # Verify pages search endpoint is accessible and works
+    r = requests.get(pages_query_url)
+    assert r.status_code == 200
+    data = r.json()
+    assert data["items"]
+    for page in data["items"]:
+        assert page["id"]
+        assert page["oid"] == default_org_id
+        assert page["crawl_id"] == admin_crawl_id
+        assert page["url"]
diff --git a/frontend/src/features/crawl-workflows/templates/shareable-notice.ts b/frontend/src/features/crawl-workflows/templates/shareable-notice.ts
new file mode 100644
index 0000000000..8e2a501f71
--- /dev/null
+++ b/frontend/src/features/crawl-workflows/templates/shareable-notice.ts
@@ -0,0 +1,14 @@
+import { msg } from "@lit/localize";
+import { html } from "lit";
+
+export const ShareableNotice = () =>
+  html`
+
+
+        ${msg("Public")}
+
+
+  `;
diff --git a/frontend/src/features/crawl-workflows/workflow-list.ts b/frontend/src/features/crawl-workflows/workflow-list.ts
index ef416e8d02..b800d03111 100644
--- a/frontend/src/features/crawl-workflows/workflow-list.ts
+++ b/frontend/src/features/crawl-workflows/workflow-list.ts
@@ -19,6 +19,9 @@ import {
   query,
   queryAssignedElements,
 } from "lit/decorators.js";
+import { when } from "lit/directives/when.js";
+
+import { ShareableNotice } from "./templates/shareable-notice";
 
 import { BtrixElement } from "@/classes/BtrixElement";
 import type { OverflowDropdown } from "@/components/ui/overflow-dropdown";
@@ -250,7 +253,8 @@ export class WorkflowListItem extends BtrixElement {
         }}
       >
-
+
+          ${when(this.workflow?.shareable, ShareableNotice)}
           ${this.safeRender(this.renderName)}
diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts
index 7fcb741095..4fc5167087 100644
--- a/frontend/src/pages/org/workflow-detail.ts
+++ b/frontend/src/pages/org/workflow-detail.ts
@@ -25,6 +25,7 @@ import {
 import { ClipboardController } from "@/controllers/clipboard";
 import { CrawlStatus } from "@/features/archived-items/crawl-status";
 import { ExclusionEditor } from "@/features/crawl-workflows/exclusion-editor";
+import { ShareableNotice } from "@/features/crawl-workflows/templates/shareable-notice";
 import {
   Action,
   type BtrixSelectActionEvent,
@@ -454,16 +455,17 @@ export class WorkflowDetail extends BtrixElement {
-
+
+            ${when(this.workflow?.shareable, ShareableNotice)}
             ${when(
               this.workflow?.inactive,
               () => html`
-
                 ${msg("Inactive")}
               `,
@@ -480,9 +482,7 @@ export class WorkflowDetail extends BtrixElement {
             )}
-
+
             ${when(
               this.isCrawler && this.workflow && !this.workflow.inactive,
               this.renderActions,
             )}
diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts
index b00b81fe57..ae3cc49d98 100644
--- a/frontend/src/types/crawler.ts
+++ b/frontend/src/types/crawler.ts
@@ -108,6 +108,7 @@ export type Workflow = CrawlConfig & {
   isCrawlRunning: boolean | null;
   autoAddCollections: string[];
   seedCount: number;
+  shareable?: boolean;
 };
 
 export type ListWorkflow = Omit<Workflow, "config">;
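
Usage sketch (not part of the diff): once a workflow has `shareable=True`, the two new public endpoints can be fetched without auth headers. `BASE_URL`, `ORG_ID`, and `CID` below are illustrative placeholders, and the response fields used (`resources`, `pagesQueryUrl`, `items`) follow the assertions in the test above.

```python
# Hypothetical client for the public endpoints added in this diff.
# BASE_URL, ORG_ID, and CID are placeholders, not values from the PR.
import requests

BASE_URL = "https://app.example.com/api"  # assumed Browsertrix API origin
ORG_ID = "<org-uuid>"
CID = "<workflow-uuid>"

# Replay metadata for the last successful crawl; no Authorization header
# is needed once the workflow is marked shareable.
r = requests.get(f"{BASE_URL}/orgs/{ORG_ID}/crawlconfigs/{CID}/public/replay.json")
r.raise_for_status()
crawl = r.json()

# WACZ resources that a replay embed (e.g. ReplayWeb.page) would load
for resource in crawl["resources"]:
    print(resource["name"], resource["path"])

# The API hands back the public pagesSearch URL; query it for pages
r = requests.get(crawl["pagesQueryUrl"], params={"search": "example", "pageSize": 25})
r.raise_for_status()
for page in r.json()["items"]:
    print(page["ts"], page["url"])
```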