diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 035e4a2f34..6b09a80f9f 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -162,6 +162,7 @@ async def get_crawl_out(
type_: Optional[str] = None,
skip_resources=False,
headers: Optional[dict] = None,
+ cid: Optional[UUID] = None,
) -> CrawlOutWithResources:
"""Get crawl data for api output"""
res = await self.get_crawl_raw(crawlid, org, type_)
@@ -183,9 +184,17 @@ async def get_crawl_out(
oid = res.get("oid")
if oid:
origin = get_origin(headers)
- res["pagesQueryUrl"] = (
- origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
- )
+ # If cid is passed, construct pagesSearch query for public
+ # shareable workflow
+ if cid:
+ res["pagesQueryUrl"] = (
+ origin
+ + f"/api/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch"
+ )
+ else:
+ res["pagesQueryUrl"] = (
+ origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
+ )
# this will now disable the downloadUrl in RWP
res["downloadUrl"] = None
diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 6087337411..b291c325c2 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -16,7 +16,7 @@
import urllib.parse
import aiohttp
-from fastapi import APIRouter, Depends, HTTPException, Query
+from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response
import pymongo
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -27,17 +27,20 @@
CrawlConfigOut,
CrawlConfigTags,
CrawlOut,
+ CrawlOutWithResources,
UpdateCrawlConfig,
Organization,
User,
PaginatedCrawlConfigOutResponse,
PaginatedSeedResponse,
PaginatedConfigRevisionResponse,
+ SUCCESSFUL_STATES,
FAILED_STATES,
CrawlerChannel,
CrawlerChannels,
StartedResponse,
SuccessResponse,
+ EmptyResponse,
CrawlConfigAddedResponse,
CrawlConfigSearchValues,
CrawlConfigUpdateResponse,
@@ -334,6 +337,7 @@ async def add_crawl_config(
proxyId=config_in.proxyId,
firstSeed=first_seed,
seedCount=seed_count,
+ shareable=config_in.shareable,
)
if config_in.runNow:
@@ -545,6 +549,9 @@ async def update_crawl_config(
changed = changed or self.check_attr_changed(
orig_crawl_config, update, "browserWindows"
)
+        changed = changed or self.check_attr_changed(
+            orig_crawl_config, update, "shareable"
+        )
schedule_changed = self.check_attr_changed(
orig_crawl_config, update, "schedule"
@@ -821,6 +828,30 @@ async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:
return None
+ async def get_last_successful_crawl_out(
+ self,
+ cid: UUID,
+ org: Organization,
+ request: Optional[Request] = None,
+ ) -> Optional[CrawlOutWithResources]:
+ """Return the last successful crawl out with resources for this config, if any"""
+ headers = dict(request.headers) if request else None
+ match_query = {
+ "cid": cid,
+ "oid": org.id,
+ "finished": {"$ne": None},
+ "state": {"$in": SUCCESSFUL_STATES},
+ }
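+        # Take the most recently finished crawl that matches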
+ last_crawl = await self.crawls.find_one(
+ match_query, sort=[("finished", pymongo.DESCENDING)]
+ )
+ if last_crawl:
+ return await self.crawl_ops.get_crawl_out(
+ last_crawl["_id"], org, "crawl", headers=headers, cid=cid
+ )
+
+ return None
+
async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
"""recompute stats by incrementing size counter and number of crawls"""
update_query: dict[str, object] = {}
@@ -1475,6 +1506,7 @@ def init_crawl_config_api(
org_crawl_dep = org_ops.org_crawl_dep
org_viewer_dep = org_ops.org_viewer_dep
+ org_public = org_ops.org_public
@router.get("", response_model=PaginatedCrawlConfigOutResponse)
async def get_crawl_configs(
@@ -1591,6 +1623,38 @@ async def get_all_crawler_proxies(
return ops.get_crawler_proxies()
+ @app.get(
+ "/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+ response_model=CrawlOutWithResources,
+ )
+ async def get_crawl_config_latest_crawl_public_replay(
+ request: Request,
+ response: Response,
+ cid: UUID,
+ org: Organization = Depends(org_public),
+ ):
+ crawl_config = await ops.get_crawl_config(cid, org.id, active_only=True)
+ if not crawl_config.shareable:
+ raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = await ops.get_last_successful_crawl_out(
+            cid, org, request
+        )
+        if not last_successful_crawl_out:
+            raise HTTPException(status_code=404, detail="crawl_not_found")
+
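+        # Permissive CORS headers so the public replay.json can be fetched
+        # cross-origin, e.g. by an embedded replay viewer on another site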
+ response.headers["Access-Control-Allow-Origin"] = "*"
+ response.headers["Access-Control-Allow-Headers"] = "*"
+ return last_successful_crawl_out
+
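+    # CORS preflight handler for the public replay.json endpoint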
+ @app.options(
+ "orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
+ response_model=EmptyResponse,
+ )
+ async def get_replay_preflight(response: Response):
+ response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+ response.headers["Access-Control-Allow-Origin"] = "*"
+ response.headers["Access-Control-Allow-Headers"] = "*"
+ return {}
+
@router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
async def get_crawl_config_seeds(
cid: UUID,
diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index 6ab720a65f..3e4f2f80a6 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -266,6 +266,7 @@ def main() -> None:
storage_ops,
background_job_ops,
coll_ops,
+ crawl_config_ops,
current_active_user,
)
diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index 7996a67a97..b66ab508a4 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -405,6 +405,8 @@ class CrawlConfigIn(BaseModel):
crawlFilenameTemplate: Optional[str] = None
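+    # Allow this workflow's latest successful crawl to be shared publicly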
+ shareable: bool = False
+
# ============================================================================
class ConfigRevision(BaseMongoModel):
@@ -496,6 +498,8 @@ class CrawlConfigAdditional(BaseModel):
crawlFilenameTemplate: Optional[str] = None
+ shareable: Optional[bool] = False
+
# ============================================================================
class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
@@ -554,6 +558,7 @@ class UpdateCrawlConfig(BaseModel):
browserWindows: Optional[BrowserWindowCount] = None
crawlFilenameTemplate: Optional[str] = None
config: Optional[RawCrawlConfig] = None
+ shareable: Optional[bool] = None
# ============================================================================
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index 576cba0041..a9c802df3a 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -1084,7 +1084,15 @@ async def process_finished_crawls():
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
def init_pages_api(
- app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
+ app,
+ mdb,
+ crawl_ops,
+ org_ops,
+ storage_ops,
+ background_job_ops,
+ coll_ops,
+ crawl_config_ops,
+ user_dep,
) -> PageOps:
"""init pages API"""
# pylint: disable=invalid-name
@@ -1336,6 +1344,47 @@ async def get_search_pages_list(
)
return {"items": pages}
+ @app.get(
+ "/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch",
+ tags=["pages", "crawlconfigs"],
+ response_model=PageOutItemsResponse,
+ )
+ async def get_search_pages_list_shareable_crawl_config(
+ cid: UUID,
+ org: Organization = Depends(org_public),
+ search: Optional[str] = None,
+ url: Optional[str] = None,
+ ts: Optional[datetime] = None,
+ isSeed: Optional[bool] = None,
+ depth: Optional[int] = None,
+ pageSize: int = DEFAULT_PAGE_SIZE,
+ page: int = 1,
+ ):
+ """Retrieve paginated list of pages for last successful crawl of workflow"""
+ crawl_config = await crawl_config_ops.get_crawl_config(
+ cid, org.id, active_only=True
+ )
+ if not crawl_config.shareable:
+ raise HTTPException(status_code=404, detail="crawl_config_not_found")
+
+        last_successful_crawl_out = (
+            await crawl_config_ops.get_last_successful_crawl_out(cid, org)
+        )
+        if not last_successful_crawl_out:
+            raise HTTPException(status_code=404, detail="crawl_not_found")
+
+ pages, _ = await ops.list_pages(
+ crawl_ids=[last_successful_crawl_out.id],
+ search=search,
+ url=url,
+ ts=ts,
+ is_seed=isSeed,
+ depth=depth,
+ org=org,
+ page_size=pageSize,
+ page=page,
+ include_total=False,
+ )
+ return {"items": pages}
+
@app.get(
"/orgs/{oid}/collections/{coll_id}/public/pages",
tags=["pages", "collections"],
diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py
index a2d8871a3e..d89200e311 100644
--- a/backend/test/test_crawlconfigs.py
+++ b/backend/test/test_crawlconfigs.py
@@ -995,3 +995,82 @@ def test_delete_in_use_seed_file(
)
assert r.status_code == 200
assert r.json()["id"] == seed_file_id
+
+
+def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
+ # Verify workflow is not shareable
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+ headers=admin_auth_headers,
+ )
+ assert r.status_code == 200
+ assert r.json()["shareable"] is False
+
+ # Verify public replay.json returns 404 while not shareable
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+ )
+ assert r.status_code == 404
+ assert r.json()["detail"] == "crawl_config_not_found"
+
+ # Verify public pagesSearch endpoint returns 404 while not shareable
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+ )
+ assert r.status_code == 404
+
+ # Mark workflow as shareable
+ r = requests.patch(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
+ headers=admin_auth_headers,
+ json={"shareable": True},
+ )
+ assert r.status_code == 200
+
+ data = r.json()
+ assert data["updated"]
+ assert data["settings_changed"]
+ assert data["metadata_changed"] is False
+
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
+ headers=admin_auth_headers,
+ )
+ assert r.status_code == 200
+ assert r.json()["shareable"]
+
+ # Verify public replay.json returns last successful crawl while shareable
+ r = requests.get(
+ f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
+ )
+ assert r.status_code == 200
+ data = r.json()
+
+ assert data["id"] == admin_crawl_id
+ assert data["oid"] == default_org_id
+ assert data["cid"] == _admin_crawl_cid
+ assert data["type"] == "crawl"
+ assert data["state"] == "complete"
+
+ resources = data["resources"]
+ assert resources
+ assert resources[0]["path"]
+
+ assert len(data["initialPages"]) == 4
+
+ pages_query_url = data["pagesQueryUrl"]
+ assert pages_query_url.endswith(
+ f"/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
+ )
+ assert data["downloadUrl"] is None
+
+ # Verify pages search endpoint is accessible and works
+ r = requests.get(pages_query_url)
+ assert r.status_code == 200
+ data = r.json()
+ assert data["items"]
+ for page in data["items"]:
+ assert page["id"]
+ assert page["oid"] == default_org_id
+ assert page["crawl_id"] == admin_crawl_id
+ assert page["url"]
diff --git a/frontend/src/features/crawl-workflows/templates/shareable-notice.ts b/frontend/src/features/crawl-workflows/templates/shareable-notice.ts
new file mode 100644
index 0000000000..8e2a501f71
--- /dev/null
+++ b/frontend/src/features/crawl-workflows/templates/shareable-notice.ts
@@ -0,0 +1,14 @@
+import { msg } from "@lit/localize";
+import { html } from "lit";
+
+export const ShareableNotice = () =>
+ html`