Sam-24-dev · Sam-24-dev · Mar 24, 2026 · Mar 23, 2026 · Copilot · Mar 23, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -34,10 +34,6 @@ jobs:
           pip install --upgrade pip
           pip install -r backend/requirements.txt
 
-      - name: Download NLTK data
-        run: |
-          python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')"
-
       - name: Run tests
         run: python -m pytest tests/ -v
 

diff --git a/.github/workflows/dependency_security.yml b/.github/workflows/dependency_security.yml
@@ -42,6 +42,4 @@ jobs:
 
       - name: Run vulnerability audit
         run: |
-          # Temporary exception: CVE-2025-14009 has no fixed NLTK release yet.
-          # Keep this ignored only until upstream publishes a patched version.
-          pip-audit -r backend/requirements.txt --ignore-vuln CVE-2025-14009
+          pip-audit -r backend/requirements.txt
diff --git a/.github/workflows/etl_semanal.yml b/.github/workflows/etl_semanal.yml
@@ -47,10 +47,6 @@ jobs:
           pip install --upgrade pip
           pip install -r backend/requirements.txt
 
-      - name: Download NLTK data
-        run: |
-          python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')"
-
       - name: Run GitHub ETL
         env:
           GITHUB_TOKEN: ${{ secrets.GH_PAT }}
@@ -145,10 +141,6 @@ jobs:
           pip install --upgrade pip
           pip install -r backend/requirements.txt
 
-      - name: Download NLTK data
-        run: |
-          python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')"
-
       - name: Run Reddit ETL (non-blocking)
         id: reddit_run
         env:
@@ -237,34 +229,28 @@ jobs:
           workflow_conclusion: success
           if_no_artifact_found: warn
 
-      - name: Restore previous history (if any)
+      - name: Detect previous aggregate snapshot
+        id: previous_history
         shell: bash
         run: |
-          if [ -d "prev_artifacts/datos/history" ]; then
-            mkdir -p datos/history
-            rsync -a prev_artifacts/datos/history/ datos/history/
+          if [ -d "prev_artifacts/datos/history" ] || [ -d "prev_artifacts/history" ]; then
+            echo "expect_previous_history=1" >> "$GITHUB_OUTPUT"
+          else
+            echo "expect_previous_history=0" >> "$GITHUB_OUTPUT"
           fi
 
-      - name: Materialize source outputs
+      - name: Restore previous aggregate outputs (if any)
         shell: bash
         run: |
-          mkdir -p datos datos/latest datos/history frontend/assets/data
-          if [ -d "artifacts/github/datos" ]; then
-            rsync -a artifacts/github/datos/ datos/
-          else
-            cp -f artifacts/github/*.csv datos/ 2>/dev/null || true
-          fi
-          if [ -d "artifacts/stackoverflow/datos" ]; then
-            rsync -a artifacts/stackoverflow/datos/ datos/
-          else
-            cp -f artifacts/stackoverflow/*.csv datos/ 2>/dev/null || true
-          fi
-          if [ -d "artifacts/reddit/datos" ]; then
-            rsync -a artifacts/reddit/datos/ datos/
-          else
-            cp -f artifacts/reddit/*.csv datos/ 2>/dev/null || true
+          if [ -d "prev_artifacts" ]; then
+            python scripts/materialize_etl_artifacts.py --project-root . prev_artifacts
           fi
 
+      - name: Materialize source outputs
+        shell: bash
+        run: |
+          python scripts/materialize_etl_artifacts.py --project-root . artifacts/github artifacts/stackoverflow artifacts/reddit
+
       - name: Verify artifact handoff
         shell: bash
         run: |
@@ -281,6 +267,10 @@ jobs:
               echo "::error::Missing required artifact file datos/${required}"
               missing=1
             fi
+            if [ ! -f "datos/latest/${required}" ]; then
+              echo "::error::Missing required artifact file datos/latest/${required}"
+              missing=1
+            fi
           done
           for optional in \
             reddit_sentimiento_frameworks.csv \
@@ -289,6 +279,9 @@ jobs:
             if [ ! -f "datos/${optional}" ]; then
               echo "::warning::Optional artifact missing (degraded mode may continue): datos/${optional}"
             fi
+            if [ ! -f "datos/latest/${optional}" ]; then
+              echo "::warning::Optional artifact missing (degraded mode may continue): datos/latest/${optional}"
+            fi
           done
           if [ "$missing" -ne 0 ]; then
             exit 1
@@ -349,6 +342,12 @@ jobs:
       - name: Enforce frontend assets policy (strict)
         run: python scripts/check_frontend_assets.py --mode strict --root .
 
+      - name: Enforce bridge integrity gate  # flag falls back to 0 when the detect step set no output
+        run: |
+          python scripts/check_bridge_integrity.py \
+            --project-root . \
+            --expect-previous-history "${{ steps.previous_history.outputs.expect_previous_history || '0' }}"
+
       - name: Upload aggregate artifacts
         uses: actions/upload-artifact@v4
         with:
@@ -396,15 +395,7 @@ jobs:
       - name: Restore aggregated files into workspace
         shell: bash
         run: |
-          mkdir -p datos datos/latest datos/history datos/metadata frontend/assets/data
-          cp -f artifact_payload/datos/*.csv datos/ 2>/dev/null || true
-          cp -f artifact_payload/datos/latest/*.csv datos/latest/ 2>/dev/null || true
-          cp -f artifact_payload/datos/metadata/*.json datos/metadata/ 2>/dev/null || true
-          cp -f artifact_payload/frontend/assets/data/*.json frontend/assets/data/ 2>/dev/null || true
-          cp -f artifact_payload/frontend/assets/data/*.csv frontend/assets/data/ 2>/dev/null || true
-          if [ -d artifact_payload/datos/history ]; then
-            rsync -a artifact_payload/datos/history/ datos/history/
-          fi
+          python scripts/materialize_etl_artifacts.py --project-root . artifact_payload
 
       - name: Commit updated data
         id: commit_data

diff --git a/backend/reddit_etl.py b/backend/reddit_etl.py
@@ -10,11 +10,10 @@
 import pandas as pd
 from datetime import datetime
 import requests
-import nltk
-from nltk.sentiment import SentimentIntensityAnalyzer
 import warnings
 import time
 import re
+from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
 
 from config.settings import (
     ARCHIVOS_SALIDA, REDDIT_SUBREDDIT, REDDIT_LIMIT,
@@ -109,7 +108,6 @@ def _obtener_token_oauth(self):
     def definir_pasos(self):
         """Define los pasos ETL de Reddit."""
         return [
-            ("Preparar recursos NLTK", self._ensure_nltk_resources),
             ("Autenticacion OAuth", self._obtener_token_oauth),
             ("Extraccion de posts", self.extraer_posts),
             ("Sentimiento de frameworks", self.analizar_sentimiento_frameworks),
@@ -136,23 +134,6 @@ def validar_configuracion(self):
                 "Se ejecutara en modo degradado (API publica)."
             )
 
-    def _ensure_nltk_resources(self):
-        """Asegura que recursos NLTK requeridos esten disponibles en runtime.
-
-        Evita side effects durante import del modulo.
-        """
-        try:
-            nltk.data.find('sentiment/vader_lexicon')
-        except LookupError:
-            self.logger.info("Descargando recurso NLTK: vader_lexicon")
-            nltk.download('vader_lexicon', quiet=True)
-
-        try:
-            nltk.data.find('corpora/stopwords')
-        except LookupError:
-            self.logger.info("Descargando recurso NLTK: stopwords")
-            nltk.download('stopwords', quiet=True)
-
     def extraer_posts(self, subreddit_name=REDDIT_SUBREDDIT, limit=REDDIT_LIMIT):
         """Extrae posts de un subreddit usando la API JSON publica de Reddit.
 

diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -3,7 +3,7 @@ requests>=2.31.0,<2.33.0
 pandas>=2.2.0,<3.0
 numpy>=1.24,<3.0
 python-dotenv>=1.0.0,<2.0
-nltk>=3.8.1,<3.10
+vaderSentiment>=3.3.2,<4.0
 pandera>=0.22.0,<0.23.0
 duckdb>=1.2.2,<2.0
 

diff --git a/scripts/check_bridge_integrity.py b/scripts/check_bridge_integrity.py
@@ -0,0 +1,179 @@
+"""Valida que los bridges frontend conserven historial util antes de publicar."""
+
+# NOTE: the module docstring above is reused as the argparse description in
+# main(), so it is runtime text and is intentionally left unchanged.
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+
+# Dataset names that must appear in history_index.json.
+REQUIRED_HISTORY_DATASETS = {
+    "trend_score",
+    "github_commits",
+    "github_correlacion",
+    "github_lenguajes",
+    "so_volumen",
+    "so_aceptacion",
+    "so_tendencias",
+    "reddit_temas",
+    "interseccion",
+}
+
+# Bridge files whose snapshot metadata is checked individually.
+REQUIRED_HISTORY_BRIDGES = (
+    "github_frameworks_history.json",
+    "github_correlacion_history.json",
+    "so_volumen_history.json",
+    "so_aceptacion_history.json",
+    "so_tendencias_history.json",
+    "reddit_temas_history.json",
+    "reddit_interseccion_history.json",
+)
+
+
+def _load_json(path: Path) -> dict:
+    """Read ``path`` as UTF-8 JSON and return its top-level object.
+
+    Raises:
+        OSError: If the file cannot be read (for example, it is missing).
+        ValueError: If the content is not valid JSON, or its root is not a
+            JSON object (callers use ``.get`` on the result).
+    """
+    payload = json.loads(path.read_text(encoding="utf-8"))
+    if not isinstance(payload, dict):
+        raise ValueError(f"{path.name} root must be a JSON object")
+    return payload
+
+
+def _bridge_assets_root(project_root: Path) -> Path:
+    """Return the ``frontend/assets/data`` directory under ``project_root``."""
+    return project_root / "frontend" / "assets" / "data"
+
+
+def check_bridge_integrity(
+    project_root: Path | str,
+    *,
+    expect_previous_history: bool = False,
+) -> dict[str, int | str]:
+    """Validate the frontend bridge JSON files under ``project_root``.
+
+    Failed checks are collected so that one error lists them all; a file that
+    cannot be loaded aborts immediately. With ``expect_previous_history`` the
+    thresholds tighten: two trend snapshots instead of one, three home
+    highlights instead of two, and a ``previous_snapshot_date`` on the
+    technology profiles and on every required bridge.
+
+    Args:
+        project_root: Directory that contains ``frontend/assets/data``.
+        expect_previous_history: Whether a previous snapshot is required.
+
+    Returns:
+        A summary with ``status`` plus the dataset, trend snapshot, profile
+        and home highlight counts.
+
+    Raises:
+        ValueError: If any check fails (messages joined with ``"; "``) or a
+            file holds invalid JSON.
+        OSError: If a required JSON file cannot be read.
+    """
+    project_root = Path(project_root)
+    assets_root = _bridge_assets_root(project_root)
+    errors: list[str] = []
+
+    history_index = _load_json(assets_root / "history_index.json")
+    dataset_names = {
+        str(item.get("dataset", "")).strip()
+        for item in history_index.get("datasets", [])
+        if str(item.get("dataset", "")).strip()
+    }
+    missing_datasets = sorted(REQUIRED_HISTORY_DATASETS - dataset_names)
+    if missing_datasets:
+        errors.append(
+            "history_index missing datasets: " + ", ".join(missing_datasets)
+        )
+
+    trend_history = _load_json(assets_root / "trend_score_history.json")
+    snapshot_count = int(trend_history.get("snapshot_count", 0) or 0)
+    minimum_snapshots = 2 if expect_previous_history else 1
+    if snapshot_count < minimum_snapshots:
+        errors.append(
+            f"trend_score_history snapshot_count={snapshot_count} < {minimum_snapshots}"
+        )
+
+    technology_profiles = _load_json(assets_root / "technology_profiles.json")
+    if not technology_profiles.get("latest_snapshot_date"):
+        errors.append("technology_profiles latest_snapshot_date missing")
+    if int(technology_profiles.get("profile_count", 0) or 0) <= 0:
+        errors.append("technology_profiles profile_count must be positive")
+    if expect_previous_history and not technology_profiles.get("previous_snapshot_date"):
+        errors.append("technology_profiles previous_snapshot_date missing")
+
+    home_highlights = _load_json(assets_root / "home_highlights.json")
+    highlights = home_highlights.get("highlights", [])
+    minimum_highlights = 3 if expect_previous_history else 2
+    if len(highlights) < minimum_highlights:
+        errors.append(
+            f"home_highlights highlights={len(highlights)} < {minimum_highlights}"
+        )
+
+    for bridge_name in REQUIRED_HISTORY_BRIDGES:
+        payload = _load_json(assets_root / bridge_name)
+        source_mode = str(payload.get("source_mode", "")).strip().lower()
+        if source_mode in {"", "missing", "none"}:
+            errors.append(f"{bridge_name} source_mode={source_mode or 'missing'}")
+        if not payload.get("latest_snapshot_date"):
+            errors.append(f"{bridge_name} latest_snapshot_date missing")
+        if expect_previous_history and not payload.get("previous_snapshot_date"):
+            errors.append(f"{bridge_name} previous_snapshot_date missing")
+
+    if errors:
+        raise ValueError("; ".join(errors))
+
+    return {
+        "status": "ok",
+        "dataset_count": len(dataset_names),
+        "trend_snapshot_count": snapshot_count,
+        "profile_count": int(technology_profiles.get("profile_count", 0) or 0),
+        "home_highlight_count": len(highlights),
+    }
+
+
+def main() -> int:
+    """Run the check from the command line.
+
+    Returns:
+        ``0`` after printing the JSON summary, or ``1`` after printing a
+        ``::error::`` line when a check fails or a file cannot be loaded.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--project-root", default=".")
+    parser.add_argument(
+        "--expect-previous-history",
+        type=int,
+        default=0,
+        choices=(0, 1),
+        help="Exige snapshot previo cuando el workflow ya recupero un aggregate previo.",
+    )
+    args = parser.parse_args()
+
+    try:
+        summary = check_bridge_integrity(
+            args.project_root,
+            expect_previous_history=bool(args.expect_previous_history),
+        )
+    except (OSError, ValueError) as exc:
+        # Fail with a one-line workflow annotation instead of a traceback;
+        # the exit status stays non-zero, so the gate still blocks the job.
+        print(f"::error::Bridge integrity gate failed: {exc}")
+        return 1
+    print(json.dumps(summary, ensure_ascii=False))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())