Skip to content

Commit 6fd541f

Browse files
Swatinem and andrewshie-sentry
authored and committed
Hook up ReleaseFiles to the cleanup/deletion script (#95870)
Because `ReleaseFile` is cross-project and has no index on the newly added date columns, I have duplicated the per-project deletion code so that it runs per-organization instead. I’m not entirely confident in this, to be honest, and I wonder whether this has any tests?
1 parent 7af8339 commit 6fd541f

File tree

2 files changed

+98
-5
lines changed

2 files changed

+98
-5
lines changed

src/sentry/db/deletion.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@
1111

1212

1313
class BulkDeleteQuery:
14-
def __init__(self, model, project_id=None, dtfield=None, days=None, order_by=None):
14+
def __init__(
15+
self, model, project_id=None, organization_id=None, dtfield=None, days=None, order_by=None
16+
):
1517
self.model = model
1618
self.project_id = int(project_id) if project_id else None
19+
self.organization_id = int(organization_id) if organization_id else None
1720
self.dtfield = dtfield
1821
self.days = int(days) if days is not None else None
1922
self.order_by = order_by
@@ -32,6 +35,8 @@ def execute(self, chunk_size=10000):
3235
)
3336
if self.project_id:
3437
where.append(f"project_id = {self.project_id}")
38+
if self.organization_id:
39+
where.append(f"organization_id = {self.organization_id}")
3540

3641
if where:
3742
where_clause = "where {}".format(" and ".join(where))
@@ -101,6 +106,8 @@ def iterator(self, chunk_size=100, batch_size=100000) -> Generator[tuple[int, ..
101106

102107
if self.project_id:
103108
where.append(("project_id = %s", [self.project_id]))
109+
if self.organization_id:
110+
where.append(("organization_id = %s", [self.organization_id]))
104111

105112
if self.order_by[0] == "-":
106113
direction = "desc"

src/sentry/runner/commands/cleanup.py

Lines changed: 90 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,17 @@ def get_project(value: str) -> int | None:
3333
return None
3434

3535

36+
def get_organization(value: str) -> int | None:
37+
from sentry.models.organization import Organization
38+
39+
try:
40+
if value.isdigit():
41+
return int(value)
42+
return Organization.objects.get(slug=value).id
43+
except Organization.DoesNotExist:
44+
return None
45+
46+
3647
# We need a unique value to indicate when to stop multiprocessing queue
3748
# an identity on an object() isn't guaranteed to work between parent
3849
# and child proc
@@ -104,6 +115,7 @@ def multiprocess_worker(task_queue: _WorkQueue) -> None:
104115
@click.command()
105116
@click.option("--days", default=30, show_default=True, help="Numbers of days to truncate on.")
106117
@click.option("--project", help="Limit truncation to only entries from project.")
118+
@click.option("--organization", help="Limit truncation to only entries from organization.")
107119
@click.option(
108120
"--concurrency",
109121
type=int,
@@ -127,6 +139,7 @@ def multiprocess_worker(task_queue: _WorkQueue) -> None:
127139
def cleanup(
128140
days: int,
129141
project: str | None,
142+
organization: str | None,
130143
concurrency: int,
131144
silent: bool,
132145
model: tuple[str, ...],
@@ -137,9 +150,9 @@ def cleanup(
137150
138151
All data that is older than `--days` will be deleted. The default for
139152
this is 30 days. In the default setting all projects will be truncated
140-
but if you have a specific project you want to limit this to this can be
141-
done with the `--project` flag which accepts a project ID or a string
142-
with the form `org/project` where both are slugs.
153+
but if you have a specific project or organization you want to limit this to,
154+
this can be done with the `--project` or `--organization` flags respectively,
155+
which accepts a project/organization ID or a string with the form `org/project` where both are slugs.
143156
"""
144157
if concurrency < 1:
145158
click.echo("Error: Minimum concurrency is 1", err=True)
@@ -209,10 +222,13 @@ def is_filtered(model: type[Model]) -> bool:
209222
exported_data(is_filtered, silent)
210223

211224
project_id = None
225+
organization_id = None
212226
if SiloMode.get_current_mode() != SiloMode.CONTROL:
213227
if project:
214228
remove_cross_project_models(deletes)
215229
project_id = get_project_id_or_fail(project)
230+
elif organization:
231+
organization_id = get_organization_id_or_fail(organization)
216232
else:
217233
remove_old_nodestore_values(days)
218234

@@ -268,6 +284,36 @@ def is_filtered(model: type[Model]) -> bool:
268284
for chunk in q.iterator(chunk_size=100):
269285
task_queue.put((imp, chunk))
270286

287+
task_queue.join()
288+
289+
organization_deletion_query, to_delete_by_organization = prepare_deletes_by_organization(
290+
organization, organization_id, is_filtered
291+
)
292+
293+
if organization_deletion_query is not None and len(to_delete_by_organization):
294+
debug_output("Running bulk deletes in DELETES_BY_ORGANIZATION")
295+
for organization_id_for_deletion in RangeQuerySetWrapper(
296+
organization_deletion_query.values_list("id", flat=True),
297+
result_value_getter=lambda item: item,
298+
):
299+
for model_tp, dtfield, order_by in to_delete_by_organization:
300+
debug_output(
301+
f"Removing {model_tp.__name__} for days={days} organization={organization_id_for_deletion}"
302+
)
303+
304+
imp = ".".join((model_tp.__module__, model_tp.__name__))
305+
306+
q = BulkDeleteQuery(
307+
model=model_tp,
308+
dtfield=dtfield,
309+
days=days,
310+
organization_id=organization_id_for_deletion,
311+
order_by=order_by,
312+
)
313+
314+
for chunk in q.iterator(chunk_size=100):
315+
task_queue.put((imp, chunk))
316+
271317
task_queue.join()
272318

273319
remove_file_blobs(is_filtered, silent)
@@ -374,7 +420,7 @@ def models_which_use_deletions_code_path() -> list[tuple[type[Model], str, str]]
374420

375421

376422
def remove_cross_project_models(
377-
deletes: list[tuple[type[Model], str, str]]
423+
deletes: list[tuple[type[Model], str, str]],
378424
) -> list[tuple[type[Model], str, str]]:
379425
from sentry.models.artifactbundle import ArtifactBundle
380426

@@ -392,6 +438,15 @@ def get_project_id_or_fail(project: str) -> int:
392438
return project_id
393439

394440

441+
def get_organization_id_or_fail(organization: str) -> int:
    """Resolve ``organization`` to an ID, aborting the command when it is unknown.

    A notice that bulk NodeStore deletion is unavailable for organization
    selection is always written to stderr before the lookup.

    :param organization: Value handed to ``get_organization`` for resolution.
    :returns: The resolved organization ID.
    :raises click.Abort: If ``get_organization`` returns ``None``.
    """
    click.echo("Bulk NodeStore deletion not available for organization selection", err=True)
    resolved_id = get_organization(organization)
    if resolved_id is not None:
        return resolved_id

    click.echo("Error: Organization not found", err=True)
    raise click.Abort()
448+
449+
395450
def remove_old_nodestore_values(days: int) -> None:
396451
from sentry import nodestore
397452

@@ -486,6 +541,37 @@ def prepare_deletes_by_project(
486541
return project_deletion_query, to_delete_by_project
487542

488543

544+
def prepare_deletes_by_organization(
    organization: str | None,
    organization_id: int | None,
    is_filtered: Callable[[type[Model]], bool],
) -> tuple[QuerySet[Any] | None, list[tuple[Any, str, str]]]:
    """Prepare the inputs for the per-organization bulk deletion pass.

    :param organization: The organization selector passed to the command, if
        any. Only its truthiness is used here.
    :param organization_id: The resolved organization ID, used to restrict the
        queryset when ``organization`` is set.
    :param is_filtered: Predicate that returns true for models which must be
        skipped.
    :returns: A pair ``(queryset, deletes)``. ``queryset`` selects the
        organizations to iterate: the single requested organization, or
        otherwise all organizations with ``ObjectStatus.ACTIVE``. ``deletes``
        lists the ``(model, dtfield, order_by)`` tuples that were not filtered
        out. In the CONTROL silo the result is ``(None, [])``.
    """
    from sentry.constants import ObjectStatus
    from sentry.models.organization import Organization
    from sentry.models.releasefile import ReleaseFile

    # Deletions that we run per organization. In some cases we can't use an index on just the date
    # column, so as an alternative we use `(organization_id, <date_col>)` instead
    DELETES_BY_ORGANIZATION = [
        (ReleaseFile, "date_accessed", "date_accessed"),
    ]

    # Nothing is prepared when running in the control silo.
    if SiloMode.get_current_mode() == SiloMode.CONTROL:
        return None, []

    debug_output("Preparing DELETES_BY_ORGANIZATION context")
    if organization:
        org_queryset = Organization.objects.filter(id=organization_id)
    else:
        org_queryset = Organization.objects.filter(status=ObjectStatus.ACTIVE)

    selected_deletes = []
    for delete_spec in DELETES_BY_ORGANIZATION:
        model_cls = delete_spec[0]
        if is_filtered(model_cls):
            debug_output(f">> Skipping {model_cls.__name__}")
            continue
        selected_deletes.append(delete_spec)

    return org_queryset, selected_deletes
573+
574+
489575
def remove_file_blobs(is_filtered: Callable[[type[Model]], bool], silent: bool) -> None:
490576
from sentry.models.file import FileBlob
491577

0 commit comments

Comments
 (0)