Skip to content

Commit 9206027

Browse files
committed
Hook up ReleaseFiles to the cleanup/deletion script
As the `ReleaseFile` is cross-project, and has no index on the newly added date columns, I have duplicated the per-project deletion code to rather run per-organization.
1 parent 0e59468 commit 9206027

File tree

2 files changed

+100
-5
lines changed

2 files changed

+100
-5
lines changed

src/sentry/db/deletion.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@
1111

1212

1313
class BulkDeleteQuery:
14-
def __init__(self, model, project_id=None, dtfield=None, days=None, order_by=None):
14+
def __init__(
15+
self, model, project_id=None, organization_id=None, dtfield=None, days=None, order_by=None
16+
):
1517
self.model = model
1618
self.project_id = int(project_id) if project_id else None
19+
self.organization_id = int(organization_id) if organization_id else None
1720
self.dtfield = dtfield
1821
self.days = int(days) if days is not None else None
1922
self.order_by = order_by
@@ -32,6 +35,8 @@ def execute(self, chunk_size=10000):
3235
)
3336
if self.project_id:
3437
where.append(f"project_id = {self.project_id}")
38+
if self.organization_id:
39+
where.append(f"organization_id = {self.organization_id}")
3540

3641
if where:
3742
where_clause = "where {}".format(" and ".join(where))
@@ -101,6 +106,8 @@ def iterator(self, chunk_size=100, batch_size=100000) -> Generator[tuple[int, ..
101106

102107
if self.project_id:
103108
where.append(("project_id = %s", [self.project_id]))
109+
if self.organization_id:
110+
where.append(("organization_id = %s", [self.organization_id]))
104111

105112
if self.order_by[0] == "-":
106113
direction = "desc"

src/sentry/runner/commands/cleanup.py

Lines changed: 92 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,17 @@ def get_project(value: str) -> int | None:
3333
return None
3434

3535

36+
def get_organization(value: str) -> int | None:
37+
from sentry.models.organization import Organization
38+
39+
try:
40+
if value.isdigit():
41+
return int(value)
42+
return Organization.objects.get(slug=value).id
43+
except Organization.DoesNotExist:
44+
return None
45+
46+
3647
# We need a unique value to indicate when to stop multiprocessing queue
3748
# an identity on an object() isn't guaranteed to work between parent
3849
# and child proc
@@ -104,6 +115,7 @@ def multiprocess_worker(task_queue: _WorkQueue) -> None:
104115
@click.command()
105116
@click.option("--days", default=30, show_default=True, help="Numbers of days to truncate on.")
106117
@click.option("--project", help="Limit truncation to only entries from project.")
118+
@click.option("--organization", help="Limit truncation to only entries from organization.")
107119
@click.option(
108120
"--concurrency",
109121
type=int,
@@ -127,6 +139,7 @@ def multiprocess_worker(task_queue: _WorkQueue) -> None:
127139
def cleanup(
128140
days: int,
129141
project: str | None,
142+
organization: str | None,
130143
concurrency: int,
131144
silent: bool,
132145
model: tuple[str, ...],
@@ -137,9 +150,9 @@ def cleanup(
137150
138151
All data that is older than `--days` will be deleted. The default for
139152
this is 30 days. In the default setting all projects will be truncated
140-
but if you have a specific project you want to limit this to this can be
141-
done with the `--project` flag which accepts a project ID or a string
142-
with the form `org/project` where both are slugs.
153+
but if you have a specific project or organization you want to limit this to,
154+
this can be done with the `--project` or `--organization` flags respectively,
155+
which accepts a project/organization ID or a string with the form `org/project` where both are slugs.
143156
"""
144157
if concurrency < 1:
145158
click.echo("Error: Minimum concurrency is 1", err=True)
@@ -209,10 +222,13 @@ def is_filtered(model: type[Model]) -> bool:
209222
exported_data(is_filtered, silent)
210223

211224
project_id = None
225+
organization_id = None
212226
if SiloMode.get_current_mode() != SiloMode.CONTROL:
213227
if project:
214228
remove_cross_project_models(deletes)
215229
project_id = get_project_id_or_fail(project)
230+
elif organization:
231+
organization_id = get_organization_id_or_fail(organization)
216232
else:
217233
remove_old_nodestore_values(days)
218234

@@ -240,6 +256,34 @@ def is_filtered(model: type[Model]) -> bool:
240256

241257
task_queue.join()
242258

259+
organization_deletion_query, to_delete_by_organization = prepare_deletes_by_organization(
260+
organization, organization_id, is_filtered
261+
)
262+
263+
if organization_deletion_query is not None and len(to_delete_by_organization):
264+
debug_output("Running bulk deletes in DELETES_BY_ORGANIZATION")
265+
for organization_id_for_deletion in RangeQuerySetWrapper(
266+
organization_deletion_query.values_list("id", flat=True),
267+
result_value_getter=lambda item: item,
268+
):
269+
for model_tp, dtfield, order_by in to_delete_by_organization:
270+
debug_output(
271+
f"Removing {model_tp.__name__} for days={days} organization={organization_id_for_deletion}"
272+
)
273+
274+
imp = ".".join((model_tp.__module__, model_tp.__name__))
275+
276+
q = BulkDeleteQuery(
277+
model=model_tp,
278+
dtfield=dtfield,
279+
days=days,
280+
organization_id=organization_id_for_deletion,
281+
order_by=order_by,
282+
)
283+
284+
for chunk in q.iterator(chunk_size=100):
285+
task_queue.put((imp, chunk))
286+
243287
project_deletion_query, to_delete_by_project = prepare_deletes_by_project(
244288
project, project_id, is_filtered
245289
)
@@ -357,6 +401,7 @@ def models_which_use_deletions_code_path() -> list[tuple[type[Model], str, str]]
357401
from sentry.models.artifactbundle import ArtifactBundle
358402
from sentry.models.eventattachment import EventAttachment
359403
from sentry.models.grouprulestatus import GroupRuleStatus
404+
from sentry.models.releasefile import ReleaseFile
360405
from sentry.models.rulefirehistory import RuleFireHistory
361406
from sentry.monitors.models import MonitorCheckIn
362407
from sentry.replays.models import ReplayRecordingSegment
@@ -370,16 +415,19 @@ def models_which_use_deletions_code_path() -> list[tuple[type[Model], str, str]]
370415
(MonitorCheckIn, "date_added", "date_added"),
371416
(GroupRuleStatus, "date_added", "date_added"),
372417
(RuleFireHistory, "date_added", "date_added"),
418+
(ReleaseFile, "date_accessed", "date_accessed"),
373419
]
374420

375421

376422
def remove_cross_project_models(
377-
deletes: list[tuple[type[Model], str, str]]
423+
deletes: list[tuple[type[Model], str, str]],
378424
) -> list[tuple[type[Model], str, str]]:
379425
from sentry.models.artifactbundle import ArtifactBundle
426+
from sentry.models.releasefile import ReleaseFile
380427

381428
# These models span across projects, so let's skip them
382429
deletes.remove((ArtifactBundle, "date_added", "date_added"))
430+
deletes.remove((ReleaseFile, "date_accessed", "date_accessed"))
383431
return deletes
384432

385433

@@ -392,6 +440,15 @@ def get_project_id_or_fail(project: str) -> int:
392440
return project_id
393441

394442

443+
def get_organization_id_or_fail(organization: str) -> int:
444+
click.echo("Bulk NodeStore deletion not available for organization selection", err=True)
445+
organization_id = get_organization(organization)
446+
if organization_id is None:
447+
click.echo("Error: Organization not found", err=True)
448+
raise click.Abort()
449+
return organization_id
450+
451+
395452
def remove_old_nodestore_values(days: int) -> None:
396453
from sentry import nodestore
397454

@@ -486,6 +543,37 @@ def prepare_deletes_by_project(
486543
return project_deletion_query, to_delete_by_project
487544

488545

546+
def prepare_deletes_by_organization(
547+
organization: str | None,
548+
organization_id: int | None,
549+
is_filtered: Callable[[type[Model]], bool],
550+
) -> tuple[QuerySet[Any] | None, list[tuple[Any, str, str]]]:
551+
from sentry.constants import ObjectStatus
552+
from sentry.models.organization import Organization
553+
from sentry.models.releasefile import ReleaseFile
554+
555+
# Deletions that we run per organization. In some cases we can't use an index on just the date
556+
# column, so as an alternative we use `(organization_id, <date_col>)` instead
557+
DELETES_BY_ORGANIZATION = [
558+
(ReleaseFile, "date_accessed", "date_accessed"),
559+
]
560+
organization_deletion_query = None
561+
to_delete_by_organization = []
562+
if SiloMode.get_current_mode() != SiloMode.CONTROL:
563+
debug_output("Preparing DELETES_BY_ORGANIZATION context")
564+
organization_deletion_query = Organization.objects.filter(status=ObjectStatus.ACTIVE)
565+
if organization:
566+
organization_deletion_query = Organization.objects.filter(id=organization_id)
567+
568+
for model_tp_tup in DELETES_BY_ORGANIZATION:
569+
if is_filtered(model_tp_tup[0]):
570+
debug_output(f">> Skipping {model_tp_tup[0].__name__}")
571+
else:
572+
to_delete_by_organization.append(model_tp_tup)
573+
574+
return organization_deletion_query, to_delete_by_organization
575+
576+
489577
def remove_file_blobs(is_filtered: Callable[[type[Model]], bool], silent: bool) -> None:
490578
from sentry.models.file import FileBlob
491579

0 commit comments

Comments
 (0)