diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ac5b6de..1f5e590 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,10 +34,6 @@ jobs: pip install --upgrade pip pip install -r backend/requirements.txt - - name: Download NLTK data - run: | - python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')" - - name: Run tests run: python -m pytest tests/ -v diff --git a/.github/workflows/dependency_security.yml b/.github/workflows/dependency_security.yml index 47b6f31..483a339 100644 --- a/.github/workflows/dependency_security.yml +++ b/.github/workflows/dependency_security.yml @@ -42,6 +42,4 @@ jobs: - name: Run vulnerability audit run: | - # Temporary exception: CVE-2025-14009 has no fixed NLTK release yet. - # Keep this ignored only until upstream publishes a patched version. - pip-audit -r backend/requirements.txt --ignore-vuln CVE-2025-14009 + pip-audit -r backend/requirements.txt diff --git a/.github/workflows/etl_semanal.yml b/.github/workflows/etl_semanal.yml index a441372..8873306 100644 --- a/.github/workflows/etl_semanal.yml +++ b/.github/workflows/etl_semanal.yml @@ -47,10 +47,6 @@ jobs: pip install --upgrade pip pip install -r backend/requirements.txt - - name: Download NLTK data - run: | - python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')" - - name: Run GitHub ETL env: GITHUB_TOKEN: ${{ secrets.GH_PAT }} @@ -145,10 +141,6 @@ jobs: pip install --upgrade pip pip install -r backend/requirements.txt - - name: Download NLTK data - run: | - python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')" - - name: Run Reddit ETL (non-blocking) id: reddit_run env: @@ -237,34 +229,28 @@ jobs: workflow_conclusion: success if_no_artifact_found: warn - - name: Restore previous history (if any) + - name: Detect previous aggregate snapshot + id: previous_history shell: bash run: | - if [ -d "prev_artifacts/datos/history" ]; then - mkdir -p datos/history - rsync -a prev_artifacts/datos/history/ datos/history/ + if [ -d "prev_artifacts/datos/history" ] || [ -d "prev_artifacts/history" ]; then + echo "expect_previous_history=1" >> "$GITHUB_OUTPUT" + else + echo "expect_previous_history=0" >> "$GITHUB_OUTPUT" fi - - name: Materialize source outputs + - name: Restore previous aggregate outputs (if any) shell: bash run: | - mkdir -p datos datos/latest datos/history frontend/assets/data - if [ -d "artifacts/github/datos" ]; then - rsync -a artifacts/github/datos/ datos/ - else - cp -f artifacts/github/*.csv datos/ 2>/dev/null || true - fi - if [ -d "artifacts/stackoverflow/datos" ]; then - rsync -a artifacts/stackoverflow/datos/ datos/ - else - cp -f artifacts/stackoverflow/*.csv datos/ 2>/dev/null || true - fi - if [ -d "artifacts/reddit/datos" ]; then - rsync -a artifacts/reddit/datos/ datos/ - else - cp -f artifacts/reddit/*.csv datos/ 2>/dev/null || true + if [ -d "prev_artifacts" ]; then + python scripts/materialize_etl_artifacts.py --project-root . prev_artifacts fi + - name: Materialize source outputs + shell: bash + run: | + python scripts/materialize_etl_artifacts.py --project-root . artifacts/github artifacts/stackoverflow artifacts/reddit + - name: Verify artifact handoff shell: bash run: | @@ -281,6 +267,10 @@ jobs: echo "::error::Missing required artifact file datos/${required}" missing=1 fi + if [ ! -f "datos/latest/${required}" ]; then + echo "::error::Missing required artifact file datos/latest/${required}" + missing=1 + fi done for optional in \ reddit_sentimiento_frameworks.csv \ @@ -289,6 +279,9 @@ jobs: if [ ! -f "datos/${optional}" ]; then echo "::warning::Optional artifact missing (degraded mode may continue): datos/${optional}" fi + if [ ! -f "datos/latest/${optional}" ]; then + echo "::warning::Optional artifact missing (degraded mode may continue): datos/latest/${optional}" + fi done if [ "$missing" -ne 0 ]; then exit 1 @@ -349,6 +342,12 @@ jobs: - name: Enforce frontend assets policy (strict) run: python scripts/check_frontend_assets.py --mode strict --root . + - name: Enforce bridge integrity gate + run: | + python scripts/check_bridge_integrity.py \ + --project-root . \ + --expect-previous-history "${{ steps.previous_history.outputs.expect_previous_history }}" + - name: Upload aggregate artifacts uses: actions/upload-artifact@v4 with: @@ -396,15 +395,7 @@ jobs: - name: Restore aggregated files into workspace shell: bash run: | - mkdir -p datos datos/latest datos/history datos/metadata frontend/assets/data - cp -f artifact_payload/datos/*.csv datos/ 2>/dev/null || true - cp -f artifact_payload/datos/latest/*.csv datos/latest/ 2>/dev/null || true - cp -f artifact_payload/datos/metadata/*.json datos/metadata/ 2>/dev/null || true - cp -f artifact_payload/frontend/assets/data/*.json frontend/assets/data/ 2>/dev/null || true - cp -f artifact_payload/frontend/assets/data/*.csv frontend/assets/data/ 2>/dev/null || true - if [ -d artifact_payload/datos/history ]; then - rsync -a artifact_payload/datos/history/ datos/history/ - fi + python scripts/materialize_etl_artifacts.py --project-root . artifact_payload - name: Commit updated data id: commit_data diff --git a/backend/reddit_etl.py b/backend/reddit_etl.py index 9063d01..314ef5e 100644 --- a/backend/reddit_etl.py +++ b/backend/reddit_etl.py @@ -10,11 +10,10 @@ import pandas as pd from datetime import datetime import requests -import nltk -from nltk.sentiment import SentimentIntensityAnalyzer import warnings import time import re +from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from config.settings import ( ARCHIVOS_SALIDA, REDDIT_SUBREDDIT, REDDIT_LIMIT, @@ -109,7 +108,6 @@ def _obtener_token_oauth(self): def definir_pasos(self): """Define los pasos ETL de Reddit.""" return [ - ("Preparar recursos NLTK", self._ensure_nltk_resources), ("Autenticacion OAuth", self._obtener_token_oauth), ("Extraccion de posts", self.extraer_posts), ("Sentimiento de frameworks", self.analizar_sentimiento_frameworks), @@ -136,23 +134,6 @@ def validar_configuracion(self): "Se ejecutara en modo degradado (API publica)." ) - def _ensure_nltk_resources(self): - """Asegura que recursos NLTK requeridos esten disponibles en runtime. - - Evita side effects durante import del modulo. - """ - try: - nltk.data.find('sentiment/vader_lexicon') - except LookupError: - self.logger.info("Descargando recurso NLTK: vader_lexicon") - nltk.download('vader_lexicon', quiet=True) - - try: - nltk.data.find('corpora/stopwords') - except LookupError: - self.logger.info("Descargando recurso NLTK: stopwords") - nltk.download('stopwords', quiet=True) - def extraer_posts(self, subreddit_name=REDDIT_SUBREDDIT, limit=REDDIT_LIMIT): """Extrae posts de un subreddit usando la API JSON publica de Reddit. diff --git a/backend/requirements.txt b/backend/requirements.txt index d14d1d0..e3a185e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -3,7 +3,7 @@ requests>=2.31.0,<2.33.0 pandas>=2.2.0,<3.0 numpy>=1.24,<3.0 python-dotenv>=1.0.0,<2.0 -nltk>=3.8.1,<3.10 +vaderSentiment>=3.3.2,<4.0 pandera>=0.22.0,<0.23.0 duckdb>=1.2.2,<2.0 diff --git a/scripts/check_bridge_integrity.py b/scripts/check_bridge_integrity.py new file mode 100644 index 0000000..66c56e4 --- /dev/null +++ b/scripts/check_bridge_integrity.py @@ -0,0 +1,129 @@ +"""Valida que los bridges frontend conserven historial util antes de publicar.""" + +from __future__ import annotations + +import argparse +import json +from pathlib import Path + + +REQUIRED_HISTORY_DATASETS = { + "trend_score", + "github_commits", + "github_correlacion", + "github_lenguajes", + "so_volumen", + "so_aceptacion", + "so_tendencias", + "reddit_temas", + "interseccion", +} + +REQUIRED_HISTORY_BRIDGES = ( + "github_frameworks_history.json", + "github_correlacion_history.json", + "so_volumen_history.json", + "so_aceptacion_history.json", + "so_tendencias_history.json", + "reddit_temas_history.json", + "reddit_interseccion_history.json", +) + + +def _load_json(path: Path) -> dict: + return json.loads(path.read_text(encoding="utf-8")) + + +def _bridge_assets_root(project_root: Path) -> Path: + return project_root / "frontend" / "assets" / "data" + + +def check_bridge_integrity( + project_root: Path | str, + *, + expect_previous_history: bool = False, +) -> dict[str, int | str]: + project_root = Path(project_root) + assets_root = _bridge_assets_root(project_root) + errors: list[str] = [] + + history_index = _load_json(assets_root / "history_index.json") + dataset_names = { + str(item.get("dataset", "")).strip() + for item in history_index.get("datasets", []) + if str(item.get("dataset", "")).strip() + } + missing_datasets = sorted(REQUIRED_HISTORY_DATASETS - dataset_names) + if missing_datasets: + errors.append( + "history_index missing datasets: " + ", ".join(missing_datasets) + ) + + trend_history = _load_json(assets_root / "trend_score_history.json") + snapshot_count = int(trend_history.get("snapshot_count", 0) or 0) + minimum_snapshots = 2 if expect_previous_history else 1 + if snapshot_count < minimum_snapshots: + errors.append( + f"trend_score_history snapshot_count={snapshot_count} < {minimum_snapshots}" + ) + + technology_profiles = _load_json(assets_root / "technology_profiles.json") + if not technology_profiles.get("latest_snapshot_date"): + errors.append("technology_profiles latest_snapshot_date missing") + if int(technology_profiles.get("profile_count", 0) or 0) <= 0: + errors.append("technology_profiles profile_count must be positive") + if expect_previous_history and not technology_profiles.get("previous_snapshot_date"): + errors.append("technology_profiles previous_snapshot_date missing") + + home_highlights = _load_json(assets_root / "home_highlights.json") + highlights = home_highlights.get("highlights", []) + minimum_highlights = 3 if expect_previous_history else 2 + if len(highlights) < minimum_highlights: + errors.append( + f"home_highlights highlights={len(highlights)} < {minimum_highlights}" + ) + + for bridge_name in REQUIRED_HISTORY_BRIDGES: + payload = _load_json(assets_root / bridge_name) + source_mode = str(payload.get("source_mode", "")).strip().lower() + if source_mode in {"", "missing", "none"}: + errors.append(f"{bridge_name} source_mode={source_mode or 'missing'}") + if not payload.get("latest_snapshot_date"): + errors.append(f"{bridge_name} latest_snapshot_date missing") + if expect_previous_history and not payload.get("previous_snapshot_date"): + errors.append(f"{bridge_name} previous_snapshot_date missing") + + if errors: + raise ValueError("; ".join(errors)) + + return { + "status": "ok", + "dataset_count": len(dataset_names), + "trend_snapshot_count": snapshot_count, + "profile_count": int(technology_profiles.get("profile_count", 0) or 0), + "home_highlight_count": len(highlights), + } + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--project-root", default=".") + parser.add_argument( + "--expect-previous-history", + type=int, + default=0, + choices=(0, 1), + help="Exige snapshot previo cuando el workflow ya recupero un aggregate previo.", + ) + args = parser.parse_args() + + summary = check_bridge_integrity( + args.project_root, + expect_previous_history=bool(args.expect_previous_history), + ) + print(json.dumps(summary, ensure_ascii=False)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/materialize_etl_artifacts.py b/scripts/materialize_etl_artifacts.py new file mode 100644 index 0000000..82a273d --- /dev/null +++ b/scripts/materialize_etl_artifacts.py @@ -0,0 +1,116 @@ +"""Materializa artifacts ETL descargados dentro del workspace del proyecto.""" + +from __future__ import annotations + +import argparse +import json +import shutil +from pathlib import Path + + +def _copy_file(source: Path, destination: Path) -> None: + destination.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(source, destination) + + +def _copy_matching_files(source_root: Path, destination_root: Path, suffixes: tuple[str, ...]) -> int: + if not source_root.exists(): + return 0 + + copied = 0 + for file_path in sorted(source_root.rglob("*")): + if not file_path.is_file(): + continue + if file_path.suffix.lower() not in suffixes: + continue + _copy_file(file_path, destination_root / file_path.relative_to(source_root)) + copied += 1 + return copied + + +def _copy_top_level_csvs(source_root: Path, destination_root: Path) -> int: + if not source_root.exists(): + return 0 + + copied = 0 + for csv_path in sorted(source_root.glob("*.csv")): + _copy_file(csv_path, destination_root / csv_path.name) + copied += 1 + return copied + + +def _resolve_data_root(artifact_root: Path) -> Path: + nested = artifact_root / "datos" + return nested if nested.exists() else artifact_root + + +def materialize_artifacts(project_root: Path | str, artifact_roots: list[Path | str]) -> dict[str, int]: + project_root = Path(project_root) + data_root = project_root / "datos" + frontend_assets_root = project_root / "frontend" / "assets" / "data" + + summary = { + "artifact_roots": 0, + "legacy_files": 0, + "latest_files": 0, + "history_files": 0, + "metadata_files": 0, + "frontend_asset_files": 0, + } + + for raw_root in artifact_roots: + artifact_root = Path(raw_root) + if not artifact_root.exists(): + continue + + summary["artifact_roots"] += 1 + source_data_root = _resolve_data_root(artifact_root) + summary["legacy_files"] += _copy_top_level_csvs(source_data_root, data_root) + summary["latest_files"] += _copy_matching_files( + source_data_root / "latest", + data_root / "latest", + suffixes=(".csv",), + ) + summary["history_files"] += _copy_matching_files( + source_data_root / "history", + data_root / "history", + suffixes=(".csv",), + ) + summary["metadata_files"] += _copy_matching_files( + source_data_root / "metadata", + data_root / "metadata", + suffixes=(".json",), + ) + summary["frontend_asset_files"] += _copy_matching_files( + artifact_root / "frontend" / "assets" / "data", + frontend_assets_root, + suffixes=(".csv", ".json"), + ) + + return summary + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--project-root", + default=".", + help="Raiz del proyecto donde se restauran datos/ y frontend/assets/data.", + ) + parser.add_argument( + "artifact_roots", + nargs="+", + help="Directorios raiz descargados por actions/download-artifact.", + ) + args = parser.parse_args() + + summary = materialize_artifacts( + project_root=args.project_root, + artifact_roots=args.artifact_roots, + ) + print(json.dumps(summary, ensure_ascii=False)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_check_bridge_integrity.py b/tests/test_check_bridge_integrity.py new file mode 100644 index 0000000..ba9c787 --- /dev/null +++ b/tests/test_check_bridge_integrity.py @@ -0,0 +1,143 @@ +import json + +import pytest + +from scripts.check_bridge_integrity import check_bridge_integrity + + +def _write_json(path, payload): + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def _write_healthy_bridge_set(root, *, previous_snapshot_date="2026-03-19"): + assets_dir = root / "frontend" / "assets" / "data" + _write_json( + assets_dir / "history_index.json", + { + "dataset_count": 9, + "datasets": [ + {"dataset": "trend_score"}, + {"dataset": "github_commits"}, + {"dataset": "github_correlacion"}, + {"dataset": "github_lenguajes"}, + {"dataset": "so_volumen"}, + {"dataset": "so_aceptacion"}, + {"dataset": "so_tendencias"}, + {"dataset": "reddit_temas"}, + {"dataset": "interseccion"}, + ], + }, + ) + _write_json( + assets_dir / "technology_profiles.json", + { + "latest_snapshot_date": "2026-03-22", + "previous_snapshot_date": previous_snapshot_date, + "profile_count": 22, + }, + ) + _write_json( + assets_dir / "home_highlights.json", + { + "candidate_count": 3, + "highlights": [{}, {}, {}], + }, + ) + _write_json( + assets_dir / "trend_score_history.json", + { + "snapshot_count": 2, + "snapshots": [{"date": "2026-03-19"}, {"date": "2026-03-22"}], + }, + ) + + for bridge_name in ( + "github_frameworks_history.json", + "github_correlacion_history.json", + "so_volumen_history.json", + "so_aceptacion_history.json", + "so_tendencias_history.json", + "reddit_temas_history.json", + "reddit_interseccion_history.json", + ): + _write_json( + assets_dir / bridge_name, + { + "source_mode": "history", + "latest_snapshot_date": "2026-03-22", + "previous_snapshot_date": previous_snapshot_date, + }, + ) + + +def test_bridge_integrity_passes_for_healthy_history(tmp_path): + _write_healthy_bridge_set(tmp_path) + + summary = check_bridge_integrity(tmp_path, expect_previous_history=True) + + assert summary["status"] == "ok" + assert summary["home_highlight_count"] == 3 + + +def test_bridge_integrity_fails_when_history_collapses_and_previous_is_missing(tmp_path): + _write_healthy_bridge_set(tmp_path, previous_snapshot_date=None) + assets_dir = tmp_path / "frontend" / "assets" / "data" + _write_json( + assets_dir / "history_index.json", + { + "dataset_count": 1, + "datasets": [{"dataset": "trend_score"}], + }, + ) + _write_json( + assets_dir / "home_highlights.json", + { + "candidate_count": 2, + "highlights": [{}, {}], + }, + ) + _write_json( + assets_dir / "reddit_interseccion_history.json", + { + "source_mode": "missing", + "latest_snapshot_date": None, + "previous_snapshot_date": None, + }, + ) + + with pytest.raises(ValueError, match="history_index"): + check_bridge_integrity(tmp_path, expect_previous_history=True) + + +def test_bridge_integrity_allows_bootstrap_without_previous_snapshot(tmp_path): + _write_healthy_bridge_set(tmp_path, previous_snapshot_date=None) + assets_dir = tmp_path / "frontend" / "assets" / "data" + _write_json( + assets_dir / "trend_score_history.json", + { + "snapshot_count": 1, + "snapshots": [{"date": "2026-03-22"}], + }, + ) + for bridge_name in ( + "github_frameworks_history.json", + "github_correlacion_history.json", + "so_volumen_history.json", + "so_aceptacion_history.json", + "so_tendencias_history.json", + "reddit_temas_history.json", + "reddit_interseccion_history.json", + ): + _write_json( + assets_dir / bridge_name, + { + "source_mode": "history", + "latest_snapshot_date": "2026-03-22", + "previous_snapshot_date": None, + }, + ) + + summary = check_bridge_integrity(tmp_path, expect_previous_history=False) + + assert summary["status"] == "ok" diff --git a/tests/test_materialize_etl_artifacts.py b/tests/test_materialize_etl_artifacts.py new file mode 100644 index 0000000..3afb00e --- /dev/null +++ b/tests/test_materialize_etl_artifacts.py @@ -0,0 +1,127 @@ +from scripts.materialize_etl_artifacts import materialize_artifacts + + +def test_materialize_artifacts_restores_nested_datos_layout(tmp_path): + artifact_root = tmp_path / "artifact" + workspace_root = tmp_path / "workspace" + + (artifact_root / "datos" / "latest").mkdir(parents=True, exist_ok=True) + ( + artifact_root + / "datos" + / "history" + / "github_lenguajes" + / "year=2026" + / "month=03" + / "day=22" + ).mkdir(parents=True, exist_ok=True) + + (artifact_root / "datos" / "github_lenguajes.csv").write_text( + "lenguaje,count\npython,10\n", + encoding="utf-8", + ) + (artifact_root / "datos" / "latest" / "github_lenguajes.csv").write_text( + "lenguaje,count\npython,12\n", + encoding="utf-8", + ) + ( + artifact_root + / "datos" + / "history" + / "github_lenguajes" + / "year=2026" + / "month=03" + / "day=22" + / "github_lenguajes.csv" + ).write_text("lenguaje,count\npython,12\n", encoding="utf-8") + + summary = materialize_artifacts( + project_root=workspace_root, + artifact_roots=[artifact_root], + ) + + assert summary["legacy_files"] == 1 + assert summary["latest_files"] == 1 + assert summary["history_files"] == 1 + assert (workspace_root / "datos" / "github_lenguajes.csv").exists() + assert (workspace_root / "datos" / "latest" / "github_lenguajes.csv").exists() + assert ( + workspace_root + / "datos" + / "history" + / "github_lenguajes" + / "year=2026" + / "month=03" + / "day=22" + / "github_lenguajes.csv" + ).exists() + + +def test_materialize_artifacts_restores_flattened_layout_and_overlays_in_order(tmp_path): + previous_root = tmp_path / "previous" + current_root = tmp_path / "current" + workspace_root = tmp_path / "workspace" + + (previous_root / "latest").mkdir(parents=True, exist_ok=True) + ( + previous_root + / "history" + / "reddit_temas" + / "year=2026" + / "month=03" + / "day=19" + ).mkdir(parents=True, exist_ok=True) + current_root.mkdir(parents=True, exist_ok=True) + + (previous_root / "reddit_temas_emergentes.csv").write_text( + "tema,menciones\nAI,10\n", + encoding="utf-8", + ) + (previous_root / "latest" / "reddit_temas_emergentes.csv").write_text( + "tema,menciones\nAI,10\n", + encoding="utf-8", + ) + ( + previous_root + / "history" + / "reddit_temas" + / "year=2026" + / "month=03" + / "day=19" + / "reddit_temas_emergentes.csv" + ).write_text("tema,menciones\nAI,10\n", encoding="utf-8") + + (current_root / "reddit_temas_emergentes.csv").write_text( + "tema,menciones\nAI,12\n", + encoding="utf-8", + ) + (current_root / "latest").mkdir(parents=True, exist_ok=True) + (current_root / "latest" / "reddit_temas_emergentes.csv").write_text( + "tema,menciones\nAI,12\n", + encoding="utf-8", + ) + + summary = materialize_artifacts( + project_root=workspace_root, + artifact_roots=[previous_root, current_root], + ) + + assert summary["legacy_files"] == 2 + assert summary["latest_files"] == 2 + assert summary["history_files"] == 1 + assert (workspace_root / "datos" / "reddit_temas_emergentes.csv").read_text( + encoding="utf-8" + ) == "tema,menciones\nAI,12\n" + assert (workspace_root / "datos" / "latest" / "reddit_temas_emergentes.csv").read_text( + encoding="utf-8" + ) == "tema,menciones\nAI,12\n" + assert ( + workspace_root + / "datos" + / "history" + / "reddit_temas" + / "year=2026" + / "month=03" + / "day=19" + / "reddit_temas_emergentes.csv" + ).exists() diff --git a/tests/test_reddit_etl.py b/tests/test_reddit_etl.py index 8c94679..c60e64d 100644 --- a/tests/test_reddit_etl.py +++ b/tests/test_reddit_etl.py @@ -50,12 +50,12 @@ class TestDefinirPasos: def test_returns_five_steps(self, etl): pasos = etl.definir_pasos() - assert len(pasos) == 6 + assert len(pasos) == 5 def test_step_names(self, etl): pasos = etl.definir_pasos() nombres = [n for n, _ in pasos] - assert "Preparar recursos NLTK" in nombres + assert "Preparar recursos NLTK" not in nombres assert "Autenticacion OAuth" in nombres assert "Sentimiento de frameworks" in nombres assert "Temas emergentes" in nombres diff --git a/tests/test_workflow_etl_contract.py b/tests/test_workflow_etl_contract.py index 7b3fb68..44471b7 100644 --- a/tests/test_workflow_etl_contract.py +++ b/tests/test_workflow_etl_contract.py @@ -37,8 +37,8 @@ def test_workflow_artifact_handoff_contract_is_defined(): assert "Download Reddit artifacts" in content assert "Download aggregate artifacts" in content assert "if-no-files-found: error" in content - assert "Missing required artifact file" in content - assert "Optional artifact missing (degraded mode may continue)" in content + assert "python scripts/materialize_etl_artifacts.py" in content + assert "python scripts/check_bridge_integrity.py" in content def test_workflow_publish_gate_and_bridge_asset_paths(): @@ -48,7 +48,7 @@ def test_workflow_publish_gate_and_bridge_asset_paths(): assert "Enforce frontend assets policy (strict)" in content assert "python scripts/check_frontend_assets.py --mode strict --root ." in content assert "frontend/assets/data/*.json" in content - assert "artifact_payload/frontend/assets/data/*.json" in content + assert "python scripts/materialize_etl_artifacts.py --project-root . artifact_payload" in content assert "frontend/assets/data/github_lenguajes.csv" in content assert "frontend/assets/data/so_volumen_preguntas.csv" in content assert "frontend/assets/data/reddit_temas_emergentes.csv" in content @@ -74,3 +74,9 @@ def test_workflow_generates_public_run_manifest_via_sync_assets(): assert "Sync CSVs to frontend assets" in content assert "python backend/sync_assets.py" in content assert "Generate/validate public run manifest" not in content + + +def test_workflow_no_longer_downloads_nltk_data(): + content = _load_workflow_text() + + assert "Download NLTK data" not in content