Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .github/workflows/cefs-gc.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Periodic garbage collection of unreferenced CEFS images and stale
# .bak/.DELETE_ME_* items. Runs weekly and can also be triggered by hand.
name: CEFS garbage collection

on:
  schedule:
    # 09:00 every Sunday (GitHub evaluates schedules in UTC).
    - cron: '0 9 * * 0'
  workflow_dispatch:
    inputs:
      min-age:
        description: Minimum age for items to be eligible for deletion
        default: '2d'
        required: false
      include-broken:
        description: Include unreferenced broken images
        type: boolean
        default: false
        required: false

jobs:
  gc:
    # Never run from forks.
    if: github.repository == 'compiler-explorer/infra'
    runs-on: ['admin']
    steps:
      - name: Start from a clean directory
        uses: AutoModality/action-clean@v1.1.0
      - uses: actions/checkout@v4
      - name: Set up environment
        run: make ce
      - name: Run CEFS garbage collection
        # Inputs are handed to the shell through environment variables rather
        # than being interpolated into the script text, so a crafted value
        # cannot be parsed as shell syntax by the sudo'd command line.
        # On scheduled runs github.event.inputs is empty, so the fallbacks apply.
        env:
          MIN_AGE: ${{ github.event.inputs.min-age || '2d' }}
          INCLUDE_BROKEN_FLAG: ${{ github.event.inputs.include-broken == 'true' && '--include-broken' || '' }}
        # INCLUDE_BROKEN_FLAG is intentionally unquoted: when empty it must
        # expand to no argument at all.
        run: >-
          sudo bin/ce_install cefs gc
          --force
          --cleanup-bak
          --min-age "$MIN_AGE"
          $INCLUDE_BROKEN_FLAG
120 changes: 120 additions & 0 deletions bin/lib/cefs/gc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@

import datetime
import logging
import shutil
from dataclasses import dataclass, field
from pathlib import Path

from lib.cefs.paths import FileWithAge

# Module-level logger, named after this module.
_LOGGER = logging.getLogger(__name__)


Expand Down Expand Up @@ -107,6 +110,82 @@ def delete_image_with_manifest(image_path: Path) -> ImageDeletionResult:
return ImageDeletionResult(success=True, deleted_size=deleted_size, errors=errors)


@dataclass(frozen=True)
class BakCleanupResult:
    """Outcome of one cleanup pass over .bak and .DELETE_ME_* items."""

    # Number of items deleted.
    deleted_count: int
    # Number of items left alone because they were younger than the minimum age.
    skipped_too_recent: int
    # One message per item that could not be deleted; empty by default.
    errors: list[str] = field(default_factory=list)


def delete_bak_item(item_path: Path) -> str | None:
"""Delete a single .bak or .DELETE_ME_* item (file, symlink, or directory tree).

Returns an error message on failure, or None on success.
"""
try:
if item_path.is_symlink() or item_path.is_file():
item_path.unlink()
elif item_path.is_dir():
shutil.rmtree(item_path)
else:
return f"Unknown file type, cannot delete: {item_path}"
except OSError as e:
return f"Failed to delete {item_path}: {e}"
return None


def cleanup_bak_items(
    items: list[FileWithAge],
    min_age_seconds: float,
    dry_run: bool,
) -> BakCleanupResult:
    """Delete (or, in dry-run mode, report) .bak and .DELETE_ME_* items past a minimum age.

    Args:
        items: Candidate items, each carrying a path and its age in seconds
        min_age_seconds: Items younger than this are skipped and counted as too recent
        dry_run: If True, nothing is deleted; eligible items are only logged and counted

    Returns:
        BakCleanupResult holding the deleted (or would-delete) count, the
        too-recent count, and one error message per failed deletion
    """
    removed = 0
    too_recent = 0
    failures = []

    for candidate in items:
        if candidate.age_seconds < min_age_seconds:
            too_recent += 1
            _LOGGER.info(
                "Skipping recent item (age %s): %s",
                datetime.timedelta(seconds=int(candidate.age_seconds)),
                candidate.path,
            )
            continue

        # Whole-second age, rendered as e.g. "3 days, 4:05:06".
        age_text = str(datetime.timedelta(seconds=int(candidate.age_seconds)))

        if dry_run:
            _LOGGER.info("Would delete: %s (age: %s)", candidate.path, age_text)
            # Dry-run counts what *would* have been deleted.
            removed += 1
            continue

        _LOGGER.info("Deleting: %s (age: %s)", candidate.path, age_text)
        failure = delete_bak_item(candidate.path)
        if not failure:
            removed += 1
            continue
        failures.append(failure)
        _LOGGER.error(failure)

    return BakCleanupResult(
        deleted_count=removed,
        skipped_too_recent=too_recent,
        errors=failures,
    )


def check_if_symlink_references_image(symlink_path: Path, image_stem: str, mount_point: Path) -> bool:
if not symlink_path.is_symlink():
return False
Expand All @@ -126,3 +205,44 @@ def check_if_symlink_references_image(symlink_path: Path, image_stem: str, mount
pass

return False


def find_bak_candidates(image_references: dict[str, list[Path]], current_time: float) -> list[FileWithAge]:
    """Find .bak and .DELETE_ME_* items adjacent to manifest-known NFS paths.

    Uses manifests to check only known NFS paths rather than walking the entire
    NFS tree (which is very slow on EFS).

    Args:
        image_references: Mapping of image stem -> list of expected NFS destination paths
        current_time: Current time as float (time.time()) for age calculation

    Returns:
        List of FileWithAge for found .bak and .DELETE_ME_* items. Siblings that
        do not exist or cannot be inspected are silently left out.
    """
    # Local import: only this function needs it.
    import glob

    candidates: list[FileWithAge] = []

    def record(path: Path) -> None:
        """Append path with its age, or do nothing if it cannot be lstat'ed."""
        try:
            # lstat: age of the entry itself, never of a symlink's target.
            mtime = path.lstat().st_mtime
        except OSError:
            return
        candidates.append(FileWithAge(path, current_time - mtime))

    # Several images may share a destination; probe each path only once.
    all_destinations: set[Path] = set()
    for destinations in image_references.values():
        all_destinations.update(destinations)

    for dest_path in all_destinations:
        # Check for .bak sibling
        record(dest_path.with_name(dest_path.name + ".bak"))

        # Check for .DELETE_ME_* siblings (from deferred cleanup). The destination
        # name is escaped so glob metacharacters in it ('*', '?', '[') are matched
        # literally instead of widening or breaking the pattern.
        pattern = glob.escape(dest_path.name) + ".DELETE_ME_*"
        try:
            for delete_me in dest_path.parent.glob(pattern):
                record(delete_me)
        except OSError:
            # Best effort: a missing or unreadable parent directory yields no candidates.
            pass

    return candidates
47 changes: 42 additions & 5 deletions bin/lib/cli/cefs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import shutil
import subprocess
import sys
import time
import uuid
from pathlib import Path

Expand All @@ -31,7 +32,7 @@
get_installable_current_locations,
)
from lib.cefs.fsck import FSCKResults, run_fsck_validation
from lib.cefs.gc import delete_image_with_manifest, filter_images_by_age
from lib.cefs.gc import cleanup_bak_items, delete_image_with_manifest, filter_images_by_age, find_bak_candidates
from lib.cefs.paths import (
FileWithAge,
get_cefs_mount_path,
Expand Down Expand Up @@ -692,21 +693,54 @@ def consolidate(
raise click.ClickException(f"Failed to consolidate {failed_groups} groups")


# Default value of the `gc` command's --min-age option, as a duration string
# (the option accepts forms such as 1h, 30m, 2d).
GC_DEFAULT_MIN_AGE = "2d"


def _run_bak_cleanup(context: CliContext, state: CEFSState, min_age_seconds: float, force: bool) -> None:
    """Find and remove stale .bak and .DELETE_ME_* items using manifest-known paths.

    Args:
        context: CLI context; installation_context.dry_run selects report-only mode
        state: CEFS state whose image_references supply the destination paths to inspect
        min_age_seconds: Items younger than this are never deleted
        force: If True, skip the interactive confirmation prompt
    """
    current_time = time.time()
    dry_run = context.installation_context.dry_run

    # image_references is keyed by image, each holding a list of destination
    # paths; report the number of distinct paths, since those are what get probed.
    path_count = len({path for paths in state.image_references.values() for path in paths})
    _LOGGER.info("Checking %d manifest-known paths for .bak/.DELETE_ME_* items...", path_count)
    candidates = find_bak_candidates(state.image_references, current_time)

    if not candidates:
        _LOGGER.info("No .bak or .DELETE_ME_* items found.")
        return

    _LOGGER.info("Found %d candidate items", len(candidates))

    # Only prompt when something would really be deleted: not in dry-run,
    # not when forced, and not when every candidate is still too recent.
    if not dry_run and not force:
        eligible = sum(1 for item in candidates if item.age_seconds >= min_age_seconds)
        if eligible > 0 and not click.confirm(f"Delete {eligible} .bak/.DELETE_ME_* items older than the min age?"):
            _LOGGER.info("Backup cleanup cancelled by user.")
            return

    result = cleanup_bak_items(candidates, min_age_seconds, dry_run)
    verb = "Would delete" if dry_run else "Deleted"
    _LOGGER.info("%s %d items, skipped %d (too recent)", verb, result.deleted_count, result.skipped_too_recent)
    # Failures are reported as warnings only; they do not abort the caller.
    for err in result.errors:
        _LOGGER.warning("Error during cleanup: %s", err)


@cefs.command()
@click.pass_obj
@click.option("--force", is_flag=True, help="Skip confirmation prompt")
@click.option(
"--min-age", default=DEFAULT_MIN_AGE, help="Minimum age of images to consider for deletion (e.g., 1h, 30m, 1d)"
)
@click.option("--min-age", default=GC_DEFAULT_MIN_AGE, help="Minimum age for deletion (e.g., 1h, 30m, 2d). Default: 2d")
@click.option("--include-broken", is_flag=True, help="Include unreferenced broken images in garbage collection")
def gc(context: CliContext, force: bool, min_age: str, include_broken: bool):
@click.option("--cleanup-bak", is_flag=True, help="Also remove old .bak and .DELETE_ME_* items from NFS before GC")
def gc(context: CliContext, force: bool, min_age: str, include_broken: bool, cleanup_bak: bool):
"""Garbage collect unreferenced CEFS images using manifests.

Reads manifest files from CEFS images to determine expected symlink locations,
then checks if those symlinks exist and point back to the images.
Images without valid references are marked for deletion.

Images with .yaml.inprogress manifests are NEVER deleted (incomplete operations).

With --cleanup-bak, also removes .bak directories and .DELETE_ME_* items that
accumulate when CEFS replaces compiler directories with symlinks.
"""
_LOGGER.info("Starting CEFS garbage collection using manifest system...")

Expand All @@ -729,6 +763,9 @@ def gc(context: CliContext, force: bool, min_age: str, include_broken: bool):
_LOGGER.info("Scanning CEFS images directory and reading manifests...")
state.scan_cefs_images_with_manifests()

if cleanup_bak:
_run_bak_cleanup(context, state, min_age_seconds, force)

if include_broken:
_LOGGER.info("Checking symlink references (--include-broken: will scan for actual usage of broken images)...")
else:
Expand Down
Loading
Loading