diff --git a/.github/workflows/_ci-gate.yml b/.github/workflows/_ci-gate.yml index d745781..4f6870d 100644 --- a/.github/workflows/_ci-gate.yml +++ b/.github/workflows/_ci-gate.yml @@ -44,7 +44,7 @@ # Filter name convention (caller defines these in `filters:` input): # # nix -> gates `nix_validate` -# markdown -> gates `markdown_lint` and `file_size` +# markdown -> gates `markdown_lint`, `file_size`, and `token_limits` # python -> gates `python_security` # # Callers may include additional filters; this workflow ignores them. To add @@ -85,6 +85,13 @@ on: description: Enable `File Size` (gated on `nix` OR `markdown` filter) type: boolean default: false + token_limits: + description: >- + Enable `Token Limits` (gated on `markdown` filter). Budgets AI-read + docs via tiktoken per .token-limits.yaml; pairs with `file_size`, + which skips token-gated files. No secret required. + type: boolean + default: false python_security: description: Enable `Python Security` (gated on `python` filter) type: boolean @@ -167,6 +174,14 @@ jobs: with: runner_label: ${{ inputs.runner_label }} + token-limits: + name: Token Limits + needs: changes + if: ${{ inputs.token_limits && needs.changes.outputs.markdown == 'true' }} + uses: dryvist/.github/.github/workflows/_token-limits.yml@main + with: + runner_label: ${{ inputs.runner_label }} + python-security: name: Python Security needs: changes @@ -218,7 +233,7 @@ jobs: # ============================================================================ gate: name: Merge Gate - needs: [changes, watchdog, nix-validate, markdown-lint, file-size, python-security] + needs: [changes, watchdog, nix-validate, markdown-lint, file-size, token-limits, python-security] if: ${{ always() && !cancelled() }} runs-on: ${{ inputs.runner_label }} steps: @@ -227,5 +242,5 @@ jobs: with: # `watchdog` is always-success-on-completion; treating it as # allowed-skip lets `alls-green` ignore its result either way. 
- allowed-skips: nix-validate, markdown-lint, file-size, python-security, watchdog + allowed-skips: nix-validate, markdown-lint, file-size, token-limits, python-security, watchdog jobs: ${{ toJSON(needs) }} diff --git a/.github/workflows/_file-size.yml b/.github/workflows/_file-size.yml index b901d62..a5239b1 100644 --- a/.github/workflows/_file-size.yml +++ b/.github/workflows/_file-size.yml @@ -12,6 +12,12 @@ # scan: [.md, .nix] # replaces default scan list # extended: { limit: 32768, files: [AGENTS] } # additive higher limit # exempt: [RUNBOOK] # additive to default [CHANGELOG] +# +# Token partition: when a .token-limits.yaml is present, Markdown (.md) docs are +# token-gated (governed by _token-limits.yml) and dropped from this byte gate's +# scan, so every file is checked by exactly one gate. Repos without a +# .token-limits.yaml are unaffected. (.token-limits.yaml should budget all .md — +# e.g. a '*.md' catch-all — or exclude any it intentionally leaves ungated.) name: _file-size on: @@ -70,6 +76,18 @@ jobs: [ -n "$cfg_exempt" ] && EXEMPT="$EXEMPT$cfg_exempt " fi + # When a token gate is active (.token-limits.yaml present), Markdown + # docs are governed by _token-limits.yml. Drop .md from this byte + # gate's scan so each file is checked by exactly one gate. + if [ -f ".token-limits.yaml" ]; then + new_scan="" + for ext in $DEFAULT_SCAN; do + [ "$ext" = ".md" ] && continue + new_scan="$new_scan $ext" + done + DEFAULT_SCAN="$new_scan" + fi + # Build find name arguments from scan extensions name_args=(); first=true for ext in $DEFAULT_SCAN; do diff --git a/.github/workflows/_token-limits.yml b/.github/workflows/_token-limits.yml new file mode 100644 index 0000000..2da68fc --- /dev/null +++ b/.github/workflows/_token-limits.yml @@ -0,0 +1,58 @@ +# Reusable: Token Limit Check +# +# Token-budgets AI-read docs (the prose files an agent loads for background) +# using the public, open-source `tiktoken` tokenizer — NO API key, NO secret. 
+# Per-repo config in `.token-limits.yaml` (see scripts/check-token-limits.py).
+#
+# Pairs with `_file-size.yml`: when `.token-limits.yaml` exists, the byte gate
+# drops `.md` from its scan and this gate budgets the files matched by the
+# `limits` globs. Repos with no `.token-limits.yaml` get a no-op here and
+# keep the byte gate's original behavior.
+name: _token-limits
+
+on:
+  workflow_call:
+    inputs:
+      runner_label:
+        description: >-
+          GitHub Actions runner label. Defaults to ubuntu-latest. Pass a
+          RunsOn label to opt the calling repo into self-hosted runners.
+        type: string
+        required: false
+        default: ubuntu-latest
+
+permissions: {}
+
+concurrency:
+  group: token-limits-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  check:
+    name: Check
+    runs-on: ${{ inputs.runner_label }}
+    permissions:
+      contents: read
+    steps:
+      - name: Checkout caller repo
+        uses: actions/checkout@v6
+
+      - name: Sparse-checkout the shared token-counter from this repo
+        uses: actions/checkout@v6
+        with:
+          repository: dryvist/.github
+          ref: main
+          path: .gh-shared
+          sparse-checkout: scripts/check-token-limits.py
+          sparse-checkout-cone-mode: false
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Install tiktoken
+        run: pip install --quiet tiktoken pyyaml
+
+      - name: Check token limits
+        run: python3 .gh-shared/scripts/check-token-limits.py
diff --git a/scripts/check-token-limits.py b/scripts/check-token-limits.py
new file mode 100644
index 0000000..af2291a
--- /dev/null
+++ b/scripts/check-token-limits.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""Fail if a token-gated file exceeds its .token-limits.yaml budget.
+
+Counts with the public, offline tiktoken tokenizer (no API key). First matching
+`limits` glob wins (list specific patterns first); `exclude` globs are skipped.
+Pairs with the byte file-size gate, which drops .md when this config is present.
+
+Exit status is 0 when every budgeted file fits (or no `limits` are configured)
+and 1 when any file is over budget; each violation is printed as a GitHub
+Actions `::error` annotation.
+"""
+import fnmatch
+import os
+import sys
+
+import tiktoken
+import yaml
+
+CONFIG = ".token-limits.yaml"
+# Directories pruned from the walk (tool caches and the shared-script checkout).
+SKIP = {".git", "node_modules", "result", ".terraform", ".terragrunt-cache", ".direnv", ".gh-shared"}
+
+
+def load_config(path=CONFIG):
+    """Return `(limits, exclude)` from `path`; missing or malformed sections become empty."""
+    if not os.path.exists(path):
+        return {}, []
+    # `with` closes the handle; explicit UTF-8 avoids locale-dependent decoding.
+    with open(path, encoding="utf-8") as fh:
+        cfg = yaml.safe_load(fh)
+    cfg = cfg if isinstance(cfg, dict) else {}
+    limits = cfg.get("limits")
+    exclude = cfg.get("exclude")
+    return (
+        limits if isinstance(limits, dict) else {},
+        exclude if isinstance(exclude, list) else [],
+    )
+
+
+def is_budget(value):
+    """True for integer budgets only; YAML `true`/`yes` load as bool, which is an int subclass."""
+    return isinstance(value, int) and not isinstance(value, bool)
+
+
+def hit(path, name, pat):
+    """True when glob `pat` matches the repo-relative path or the bare file name."""
+    # Non-string patterns (e.g. a numeric YAML key) can never match a path.
+    if not isinstance(pat, str):
+        return False
+    return fnmatch.fnmatch(path, pat) or fnmatch.fnmatch(name, pat)
+
+
+def main():
+    """Scan the working tree and return the process exit status."""
+    limits, exclude = load_config()
+    if not limits:
+        return 0
+
+    enc = tiktoken.get_encoding("o200k_base")
+    errors = 0
+    for root, dirs, files in os.walk("."):
+        # Prune in place so os.walk does not descend into skipped directories.
+        dirs[:] = [d for d in dirs if d not in SKIP]
+        for name in files:
+            full = os.path.join(root, name)
+            path = os.path.relpath(full, ".")
+            if any(hit(path, name, e) for e in exclude):
+                continue
+            lim = next((v for p, v in limits.items() if is_budget(v) and hit(path, name, p)), None)
+            if lim is None:
+                continue
+            try:
+                with open(full, encoding="utf-8") as fh:
+                    text = fh.read()
+            except (UnicodeDecodeError, OSError):
+                # Best effort: undecodable or unreadable files are skipped rather than failed.
+                continue
+            # Treat special-token look-alikes such as "<|endoftext|>" as plain text;
+            # by default encode() raises ValueError when it meets one.
+            tokens = len(enc.encode(text, disallowed_special=()))
+            if tokens > lim:
+                print(f"::error file={path}::{path} is {tokens} tokens (exceeds {lim})")
+                errors += 1
+    return 1 if errors else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())