diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 7cf7033..52f9926 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -144,11 +144,12 @@ aci scan --target . --profile full --scope-mode full-repo --report-scope-class r aci emit-sarif --report report.json > aci.sarif # SARIF 2.1.0 for code scanning aci emit-annotations --report report.json # GitHub Actions annotations aci emit-github-summary --report report.json # GitHub markdown summary +aci emit-baseline --report report.json --output ops.toml # accept current findings as a baseline aci validate-report --report report.json # check against the report contract ``` Report-view filters are available on `scan`, `emit-sarif`, `emit-annotations`, -and `emit-github-summary`: +`emit-github-summary`, and `emit-baseline`: ```bash --report-scope-class runtime-source diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md index 3b8e646..4e43b1c 100644 --- a/docs/QUICKSTART.md +++ b/docs/QUICKSTART.md @@ -121,17 +121,30 @@ aci scan --target . --profile full --severity-threshold critical aci scan --target . --profile full --fail-on-new-findings ``` -## 6. Accept or defer findings (operations file) +## 6. Adopt on an existing codebase (baseline) -Create an operations TOML and pass `--operations-file ops.toml` to baseline, -suppress, or waive findings without editing code: +To start tracking only **new** issues on a project that already has findings, +generate a baseline from a scan and pass it back on later runs — no hand-edited +TOML: -```toml -[baseline] -entries = [{ ci_id = "CI-03", target_file = "legacy/util.py", line = 12 }] +```bash +aci scan --target . --profile full --output report.json # 1. scan once +aci emit-baseline --report report.json --output ops.toml # 2. accept today's findings +aci scan --target . --profile full --operations-file ops.toml --fail-on-new-findings + # 3. 
from now on, only NEW findings block +``` +Each baseline entry is keyed by the finding's **fingerprint**, which is stable +across unrelated line shifts, so edits elsewhere in a file do not resurrect a +baselined finding. When you fix a finding, the next scan reports its entry as +`resolved` (a candidate to remove from the baseline). + +You can also hand-author an operations file to **waive** or **suppress** specific +findings without editing code: + +```toml [waiver] -entries = [{ waiver_id = "W1", ci_id = "CI-21", target_file = "app/io.py", line = 42 }] +entries = [{ waiver_id = "W1", fingerprint = "…", owner = "alice", reason = "tracked in JIRA-123", review_condition = "before GA" }] ``` ## 7. Hosted CI integration diff --git a/shared/python/aci_baseline.py b/shared/python/aci_baseline.py new file mode 100644 index 0000000..351387d --- /dev/null +++ b/shared/python/aci_baseline.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Generate an operations-file baseline from a scan report. + +Adopting ACI on an existing codebase means accepting today's findings as +pre-existing so that future scans surface only NEW ones. That baseline used to +be hand-authored TOML; this module derives it from a report instead, turning the +central adoption step into one command (`aci emit-baseline`). + +The emitted TOML round-trips through aci_operations.load_operations_state. Each +entry's identity is the fingerprint (stable across unrelated line shifts), so no +line number is written -- encoding identity by line was the root of three earlier +defects. Output is sorted for a stable, reviewable diff when the baseline is +regenerated. +""" +from __future__ import annotations + +from typing import cast + +# TOML basic-string escapes (TOML v1.0.0 §String). Everything else, including +# printable non-ASCII, is emitted as-is in UTF-8. 
_TOML_SIMPLE_ESCAPES = {
    "\\": "\\\\",
    '"': '\\"',
    "\b": "\\b",
    "\t": "\\t",
    "\n": "\\n",
    "\f": "\\f",
    "\r": "\\r",
}

# Finding keys copied into a baseline entry, in the order they are emitted.
_ENTRY_KEYS = ("fingerprint", "ci_id", "target_file")


def _toml_escape(value: str) -> str:
    """Return *value* escaped for use inside a TOML basic (double-quoted) string.

    Characters with a short escape in ``_TOML_SIMPLE_ESCAPES`` use it; any other
    character below U+0020, and U+007F, is written as ``\\uXXXX``. Everything
    else is passed through unchanged.
    """
    # NOTE(review): lone surrogates (possible in strings decoded from JSON) are
    # passed through; they cannot be encoded as UTF-8 when the file is written.
    out: list[str] = []
    for ch in value:
        simple = _TOML_SIMPLE_ESCAPES.get(ch)
        if simple is not None:
            out.append(simple)
        elif ch < "\x20" or ch == "\x7f":
            out.append(f"\\u{ord(ch):04X}")
        else:
            out.append(ch)
    return "".join(out)


def _entry_fields(finding: dict[str, object]) -> list[tuple[str, str]]:
    """Return the ``(key, value)`` pairs for one baseline entry, in emit order.

    Only ``fingerprint``, ``ci_id`` and ``target_file`` are considered, and only
    when the value is a non-empty string. ``line`` is deliberately never copied.
    An empty list means the finding offers nothing to key an entry on.
    """
    # NOTE(review): a finding without a fingerprint still yields an entry built
    # from ci_id/target_file alone -- confirm the operations loader does not
    # treat such an entry as a broader-than-intended match.
    fields: list[tuple[str, str]] = []
    for key in _ENTRY_KEYS:
        raw = finding.get(key)
        if isinstance(raw, str) and raw:
            fields.append((key, raw))
    return fields


def _entry_sort_key(fields: list[tuple[str, str]]) -> tuple[str, str, str]:
    """Order entries by target_file, then ci_id, then fingerprint."""
    as_map = dict(fields)
    return (
        as_map.get("target_file", ""),
        as_map.get("ci_id", ""),
        as_map.get("fingerprint", ""),
    )


def build_baseline_operations(report: dict[str, object]) -> str:
    """Return operations TOML whose [baseline] accepts every finding in *report*.

    Args:
        report: A report mapping. Its ``"findings"`` value is used when it is a
            list; anything else is treated as "no findings".

    Returns:
        TOML text ending in a newline: a comment header, the ``[baseline]``
        table with one inline-table entry per usable finding (sorted so a
        regenerated baseline diffs cleanly), and -- whenever any finding was
        skipped -- a trailing ``# note:`` comment with the skipped count.
    """
    raw_findings = report.get("findings")
    findings = raw_findings if isinstance(raw_findings, list) else []

    entries: list[list[tuple[str, str]]] = []
    skipped = 0
    for finding in findings:
        fields = (
            _entry_fields(cast(dict[str, object], finding))
            if isinstance(finding, dict)
            else []
        )
        if fields:
            entries.append(fields)
        else:
            # Non-dict items and findings with no usable field are counted so
            # the output can say that something was left out.
            skipped += 1

    entries.sort(key=_entry_sort_key)

    lines = [
        "# ACI baseline -- generated from a scan report by `aci emit-baseline`.",
        "# Each entry accepts a finding as pre-existing; future scans report only",
        "# NEW findings. Identity is the fingerprint (stable across line shifts), so",
        "# no line numbers are stored here. Remove an entry once its finding is",
        "# fixed -- ACI then reports it as resolved on the next scan.",
        "[baseline]",
    ]
    if entries:
        lines.append("entries = [")
        lines.extend(
            "  { "
            + ", ".join(f'{key} = "{_toml_escape(value)}"' for key, value in fields)
            + " },"
            for fields in entries
        )
        lines.append("]")
    else:
        lines.append("entries = []")

    # Emitted on both paths: previously an all-skipped report returned early and
    # silently produced an empty baseline with no explanation.
    if skipped:
        lines.append(f"# note: {skipped} finding(s) had no usable identity and were not baselined.")
    return "\n".join(lines) + "\n"
summarize") _add_report_view_args(github_summary_cmd) + baseline_cmd = sub.add_parser( + "emit-baseline", + help="Generate an operations-file baseline (TOML) from a report JSON, accepting today's findings as pre-existing", + ) + baseline_cmd.add_argument("--report", type=Path, required=True, help="ACI report JSON file to baseline") + baseline_cmd.add_argument( + "--output", + type=Path, + default=None, + help="Write the baseline TOML to this path instead of stdout", + ) + _add_report_view_args(baseline_cmd) + catalog = sub.add_parser( "show-analyzer-catalog", help="Print the bounded external-analyzer catalog known to the common shelf", @@ -436,6 +451,20 @@ def _handle_report_command(args: argparse.Namespace) -> int | None: raise ValueError(f"Report file is not a JSON object: {args.report}") print(build_github_summary_markdown(_project_report_view_from_args(data, args)), end="") return EXIT_OK + if args.command == "emit-baseline": + data = _read_json_file(args.report) + if not isinstance(data, dict): + raise ValueError(f"Report file is not a JSON object: {args.report}") + toml_text = build_baseline_operations(_project_report_view_from_args(data, args)) + output_path = getattr(args, "output", None) + if output_path is not None: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(toml_text, encoding="utf-8") + entry_count = toml_text.count("{ ") + print(f"ACI baseline written to {output_path} ({entry_count} entr{'y' if entry_count == 1 else 'ies'})") + else: + print(toml_text, end="") + return EXIT_OK return None diff --git a/shared/tests/test_aci_report_surface_contracts.py b/shared/tests/test_aci_report_surface_contracts.py index 53b9f50..8f2e8ae 100644 --- a/shared/tests/test_aci_report_surface_contracts.py +++ b/shared/tests/test_aci_report_surface_contracts.py @@ -5,7 +5,9 @@ from types import SimpleNamespace from aci.aci_cli import _handle_report_command +from aci.aci_baseline import build_baseline_operations from 
aci.aci_github_summary import build_github_summary_markdown +from aci.aci_operations import load_operations_state def _sample_report() -> dict[str, object]: @@ -263,3 +265,58 @@ def test_github_summary_carries_the_detection_disclosure_at_the_point_of_use() - no_disclosure = {k: v for k, v in clean_report.items() if k != "detection_disclosure"} rendered = build_github_summary_markdown(no_disclosure) assert "Scope note:" not in rendered + + +def test_emit_baseline_round_trips_through_the_operations_loader(tmp_path: Path, capsys) -> None: + # The generated baseline is only useful if the real loader accepts it and the + # entries identify exactly the report's findings by their stable fingerprint. + report_path = _write_report(tmp_path) + + result = _handle_report_command( + SimpleNamespace(command="emit-baseline", report=report_path, output=None, report_scope_class=[], report_owner_lane=[]) + ) + assert result == 0 + toml_text = capsys.readouterr().out + + ops_path = tmp_path / "ops.toml" + ops_path.write_text(toml_text, encoding="utf-8") + state = load_operations_state(ops_path) + assert {e.fingerprint for e in state.baseline_entries} == {"fp-1", "fp-2", "fp-3"} + # Identity is the fingerprint, never the line: no entry carries a line number. + assert all(e.line is None for e in state.baseline_entries) + + +def test_emit_baseline_escapes_special_characters_and_round_trips(tmp_path: Path) -> None: + # File names and ids can carry TOML metacharacters; the emitter must produce + # TOML the strict loader (tomllib) still parses, not a broken file. 
+ report = { + "findings": [ + {"fingerprint": "fp-x", "ci_id": "CI-21", "target_file": 'weird "name"\\dir/â.py'}, + ] + } + toml_text = build_baseline_operations(report) + ops_path = tmp_path / "ops.toml" + ops_path.write_text(toml_text, encoding="utf-8") + state = load_operations_state(ops_path) # raises if the escaping is wrong + assert state.baseline_entries[0].target_file == 'weird "name"\\dir/â.py' + + +def test_emit_baseline_empty_report_is_a_valid_empty_baseline(tmp_path: Path) -> None: + toml_text = build_baseline_operations({"findings": []}) + ops_path = tmp_path / "ops.toml" + ops_path.write_text(toml_text, encoding="utf-8") + state = load_operations_state(ops_path) + assert state.baseline_entries == () + + +def test_emit_baseline_is_deterministic() -> None: + report = { + "findings": [ + {"fingerprint": "fp-b", "ci_id": "CI-21", "target_file": "b.py"}, + {"fingerprint": "fp-a", "ci_id": "CI-03", "target_file": "a.py"}, + ] + } + first = build_baseline_operations(report) + assert first == build_baseline_operations(report) + # sorted by target_file, so a.py precedes b.py regardless of input order + assert first.index("a.py") < first.index("b.py")