Shareable workflows (API + frontend) #2783

Open · wants to merge 5 commits into main
15 changes: 12 additions & 3 deletions backend/btrixcloud/basecrawls.py
@@ -162,6 +162,7 @@ async def get_crawl_out(
type_: Optional[str] = None,
skip_resources=False,
headers: Optional[dict] = None,
cid: Optional[UUID] = None,
) -> CrawlOutWithResources:
"""Get crawl data for api output"""
res = await self.get_crawl_raw(crawlid, org, type_)
@@ -183,9 +184,17 @@ async def get_crawl_out(
oid = res.get("oid")
if oid:
origin = get_origin(headers)
res["pagesQueryUrl"] = (
origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
)
# If cid is passed, construct pagesSearch query for public
# shareable workflow
if cid:
res["pagesQueryUrl"] = (
origin
+ f"/api/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch"
)
else:
res["pagesQueryUrl"] = (
origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
)

# this will now disable the downloadUrl in RWP
res["downloadUrl"] = None
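For orientation, the two URL shapes this branch can produce look like the following; a minimal sketch with a hypothetical origin and placeholder IDs:

```python
origin = "https://app.example.com"  # hypothetical deployment origin
oid, cid, crawlid = "<org-id>", "<workflow-id>", "<crawl-id>"  # placeholders

# With cid (public, shareable workflow):
shareable_url = origin + f"/api/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch"

# Without cid (default, per-crawl endpoint):
default_url = origin + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
```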
66 changes: 65 additions & 1 deletion backend/btrixcloud/crawlconfigs.py
@@ -16,7 +16,7 @@
import urllib.parse

import aiohttp
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi import APIRouter, Depends, HTTPException, Query, Request, Response
import pymongo

from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -27,17 +27,20 @@
CrawlConfigOut,
CrawlConfigTags,
CrawlOut,
CrawlOutWithResources,
UpdateCrawlConfig,
Organization,
User,
PaginatedCrawlConfigOutResponse,
PaginatedSeedResponse,
PaginatedConfigRevisionResponse,
SUCCESSFUL_STATES,
FAILED_STATES,
CrawlerChannel,
CrawlerChannels,
StartedResponse,
SuccessResponse,
EmptyResponse,
CrawlConfigAddedResponse,
CrawlConfigSearchValues,
CrawlConfigUpdateResponse,
@@ -334,6 +337,7 @@ async def add_crawl_config(
proxyId=config_in.proxyId,
firstSeed=first_seed,
seedCount=seed_count,
shareable=config_in.shareable,
)

if config_in.runNow:
@@ -545,6 +549,9 @@ async def update_crawl_config(
changed = changed or self.check_attr_changed(
orig_crawl_config, update, "browserWindows"
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "shareable")
)

schedule_changed = self.check_attr_changed(
orig_crawl_config, update, "schedule"
@@ -821,6 +828,30 @@ async def get_running_crawl(self, cid: UUID) -> Optional[CrawlOut]:

return None

async def get_last_successful_crawl_out(
self,
cid: UUID,
org: Organization,
request: Optional[Request] = None,
) -> Optional[CrawlOutWithResources]:
"""Return the last successful crawl out with resources for this config, if any"""
headers = dict(request.headers) if request else None
match_query = {
"cid": cid,
"oid": org.id,
"finished": {"$ne": None},
"state": {"$in": SUCCESSFUL_STATES},
}
last_crawl = await self.crawls.find_one(
match_query, sort=[("finished", pymongo.DESCENDING)]
)
if last_crawl:
return await self.crawl_ops.get_crawl_out(
last_crawl["_id"], org, "crawl", headers=headers, cid=cid
)

return None

async def stats_recompute_last(self, cid: UUID, size: int, inc_crawls: int = 1):
"""recompute stats by incrementing size counter and number of crawls"""
update_query: dict[str, object] = {}
@@ -1475,6 +1506,7 @@ def init_crawl_config_api(

org_crawl_dep = org_ops.org_crawl_dep
org_viewer_dep = org_ops.org_viewer_dep
org_public = org_ops.org_public

@router.get("", response_model=PaginatedCrawlConfigOutResponse)
async def get_crawl_configs(
@@ -1591,6 +1623,38 @@ async def get_all_crawler_proxies(

return ops.get_crawler_proxies()

@app.get(
"/orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
response_model=CrawlOutWithResources,
)
async def get_crawl_config_latest_crawl_public_replay(
request: Request,
response: Response,
cid: UUID,
org: Organization = Depends(org_public),
):
crawl_config = await ops.get_crawl_config(cid, org.id, active_only=True)
if not crawl_config.shareable:
raise HTTPException(status_code=404, detail="crawl_config_not_found")

last_successful_crawl_out = await ops.get_last_successful_crawl_out(
cid, org, request
)

response.headers["Access-Control-Allow-Origin"] = "*"
response.headers["Access-Control-Allow-Headers"] = "*"
return last_successful_crawl_out

@app.options(
"orgs/{oid}/crawlconfigs/{cid}/public/replay.json",
response_model=EmptyResponse,
)
async def get_replay_preflight(response: Response):
response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
response.headers["Access-Control-Allow-Origin"] = "*"
response.headers["Access-Control-Allow-Headers"] = "*"
return {}

@router.get("/{cid}/seeds", response_model=PaginatedSeedResponse)
async def get_crawl_config_seeds(
cid: UUID,
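Once a workflow is marked shareable, a client can fetch the new public replay endpoint without credentials. A minimal sketch using Python's requests, with a hypothetical base URL and placeholder IDs; the response fields shown match those asserted in the test added below:

```python
import requests

API_PREFIX = "https://app.example.com/api"  # hypothetical deployment URL
org_id = "<org-uuid>"            # organization ID (placeholder)
workflow_id = "<workflow-uuid>"  # crawl config (workflow) ID (placeholder)

# Fetch replay data for the last successful crawl of a shareable workflow.
# No auth headers are needed; the route resolves the org via the public dependency.
r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/{workflow_id}/public/replay.json"
)
if r.status_code == 404:
    # Workflow does not exist, is inactive, or is not marked shareable.
    raise SystemExit("workflow is not shareable")

data = r.json()
print(data["state"])          # e.g. "complete"
print(data["pagesQueryUrl"])  # points at the public pagesSearch endpoint
for resource in data.get("resources") or []:
    print(resource["path"])   # WACZ path(s) used for replay
```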
1 change: 1 addition & 0 deletions backend/btrixcloud/main.py
@@ -266,6 +266,7 @@ def main() -> None:
storage_ops,
background_job_ops,
coll_ops,
crawl_config_ops,
current_active_user,
)

5 changes: 5 additions & 0 deletions backend/btrixcloud/models.py
@@ -405,6 +405,8 @@ class CrawlConfigIn(BaseModel):

crawlFilenameTemplate: Optional[str] = None

shareable: bool = False


# ============================================================================
class ConfigRevision(BaseMongoModel):
@@ -496,6 +498,8 @@ class CrawlConfigAdditional(BaseModel):

crawlFilenameTemplate: Optional[str] = None

shareable: Optional[bool] = False


# ============================================================================
class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
@@ -554,6 +558,7 @@ class UpdateCrawlConfig(BaseModel):
browserWindows: Optional[BrowserWindowCount] = None
crawlFilenameTemplate: Optional[str] = None
config: Optional[RawCrawlConfig] = None
shareable: Optional[bool] = None


# ============================================================================
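With the new model fields in place, the shareable flag can be toggled through the existing workflow update endpoint. A minimal sketch with placeholder credentials and IDs, mirroring the PATCH call exercised in the test below:

```python
import requests

API_PREFIX = "https://app.example.com/api"  # hypothetical deployment URL
auth_headers = {"Authorization": "Bearer <access-token>"}  # placeholder credentials
org_id = "<org-uuid>"            # organization ID (placeholder)
workflow_id = "<workflow-uuid>"  # crawl config (workflow) ID (placeholder)

# Mark an existing workflow as shareable; set False to make it private again.
r = requests.patch(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/{workflow_id}/",
    headers=auth_headers,
    json={"shareable": True},
)
r.raise_for_status()
assert r.json()["updated"]
```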
51 changes: 50 additions & 1 deletion backend/btrixcloud/pages.py
@@ -1084,7 +1084,15 @@ async def process_finished_crawls():
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
def init_pages_api(
app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
app,
mdb,
crawl_ops,
org_ops,
storage_ops,
background_job_ops,
coll_ops,
crawl_config_ops,
user_dep,
) -> PageOps:
"""init pages API"""
# pylint: disable=invalid-name
@@ -1336,6 +1344,47 @@ async def get_search_pages_list(
)
return {"items": pages}

@app.get(
"/orgs/{oid}/crawlconfigs/{cid}/public/pagesSearch",
tags=["pages", "crawlconfigs"],
response_model=PageOutItemsResponse,
)
async def get_search_pages_list_shareable_crawl_config(
cid: UUID,
org: Organization = Depends(org_public),
search: Optional[str] = None,
url: Optional[str] = None,
ts: Optional[datetime] = None,
isSeed: Optional[bool] = None,
depth: Optional[int] = None,
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
):
"""Retrieve paginated list of pages for last successful crawl of workflow"""
crawl_config = await crawl_config_ops.get_crawl_config(
cid, org.id, active_only=True
)
if not crawl_config.shareable:
raise HTTPException(status_code=404, detail="crawl_config_not_found")

last_successful_crawl_out = (
await crawl_config_ops.get_last_successful_crawl_out(cid, org)
)
if not last_successful_crawl_out:
raise HTTPException(status_code=404, detail="crawl_not_found")

pages, _ = await ops.list_pages(
crawl_ids=[last_successful_crawl_out.id],
search=search,
url=url,
ts=ts,
is_seed=isSeed,
depth=depth,
org=org,
page_size=pageSize,
page=page,
include_total=False,
)
return {"items": pages}

@app.get(
"/orgs/{oid}/collections/{coll_id}/public/pages",
tags=["pages", "collections"],
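A client can then page through the shareable workflow's pages via the new public pagesSearch endpoint. A minimal sketch with a hypothetical base URL and placeholder IDs; the query parameters follow the endpoint signature above:

```python
import requests

API_PREFIX = "https://app.example.com/api"  # hypothetical deployment URL
org_id = "<org-uuid>"            # organization ID (placeholder)
workflow_id = "<workflow-uuid>"  # crawl config (workflow) ID (placeholder)

# List pages from the last successful crawl of a shareable workflow.
# All filters are optional; names match the endpoint parameters above.
r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/{workflow_id}/public/pagesSearch",
    params={
        "search": "example",  # text search filter
        "isSeed": "true",     # restrict to seed pages
        "pageSize": 25,
        "page": 1,
    },
)
r.raise_for_status()
for page in r.json()["items"]:
    print(page["url"])
```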
79 changes: 79 additions & 0 deletions backend/test/test_crawlconfigs.py
@@ -995,3 +995,82 @@ def test_delete_in_use_seed_file(
)
assert r.status_code == 200
assert r.json()["id"] == seed_file_id


def test_shareable_workflow(admin_auth_headers, default_org_id, admin_crawl_id):
# Verify workflow is not shareable
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
headers=admin_auth_headers,
)
assert r.status_code == 200
assert r.json()["shareable"] is False

# Verify public replay.json returns 404 while not shareable
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
)
assert r.status_code == 404
assert r.json()["detail"] == "crawl_config_not_found"

# Verify public pagesSearch endpoint returns 404 while not shareable
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
)
assert r.status_code == 404

# Mark workflow as shareable
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
headers=admin_auth_headers,
json={"shareable": True},
)
assert r.status_code == 200

data = r.json()
assert data["updated"]
assert data["settings_changed"]
assert data["metadata_changed"] is False

r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}",
headers=admin_auth_headers,
)
assert r.status_code == 200
assert r.json()["shareable"]

# Verify public replay.json returns last successful crawl while shareable
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/replay.json"
)
assert r.status_code == 200
data = r.json()

assert data["id"] == admin_crawl_id
assert data["oid"] == default_org_id
assert data["cid"] == _admin_crawl_cid
assert data["type"] == "crawl"
assert data["state"] == "complete"

resources = data["resources"]
assert resources
assert resources[0]["path"]

assert len(data["initialPages"]) == 4

pages_query_url = data["pagesQueryUrl"]
assert pages_query_url.endswith(
f"/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/public/pagesSearch"
)
assert data["downloadUrl"] is None

# Verify pages search endpoint is accessible and works
r = requests.get(pages_query_url)
assert r.status_code == 200
data = r.json()
assert data["items"]
for page in data["items"]:
assert page["id"]
assert page["oid"] == default_org_id
assert page["crawl_id"] == admin_crawl_id
assert page["url"]
14 changes: 14 additions & 0 deletions frontend/src/features/crawl-workflows/templates/shareable-notice.ts
@@ -0,0 +1,14 @@
import { msg } from "@lit/localize";
import { html } from "lit";

export const ShareableNotice = () =>
html`<btrix-popover
content=${msg(
"The latest crawl from this workflow is publicly accessible to anyone with the link. This can be changed with the Browsertrix API.",
)}
>
<btrix-badge class="part-[base]:min-h-5" variant="warning">
<sl-icon name="info-circle" class="align-icon mr-1"></sl-icon>
${msg("Public")}
</btrix-badge>
</btrix-popover>`;
6 changes: 5 additions & 1 deletion frontend/src/features/crawl-workflows/workflow-list.ts
@@ -19,6 +19,9 @@ import {
query,
queryAssignedElements,
} from "lit/decorators.js";
import { when } from "lit/directives/when.js";

import { ShareableNotice } from "./templates/shareable-notice";

import { BtrixElement } from "@/classes/BtrixElement";
import type { OverflowDropdown } from "@/components/ui/overflow-dropdown";
@@ -250,7 +253,8 @@ export class WorkflowListItem extends BtrixElement {
}}
>
<div class="col">
<div class="detail url truncate">
<div class="detail url items-center truncate">
${when(this.workflow?.shareable, ShareableNotice)}
${this.safeRender(this.renderName)}
</div>
<div class="desc">