Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@ jobs:
pip install --upgrade pip
pip install -r backend/requirements.txt

- name: Download NLTK data
run: |
python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')"

- name: Run tests
run: python -m pytest tests/ -v

Expand Down
4 changes: 1 addition & 3 deletions .github/workflows/dependency_security.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,4 @@ jobs:

- name: Run vulnerability audit
run: |
# Temporary exception: CVE-2025-14009 has no fixed NLTK release yet.
# Keep this ignored only until upstream publishes a patched version.
pip-audit -r backend/requirements.txt --ignore-vuln CVE-2025-14009
pip-audit -r backend/requirements.txt
65 changes: 28 additions & 37 deletions .github/workflows/etl_semanal.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,6 @@ jobs:
pip install --upgrade pip
pip install -r backend/requirements.txt

- name: Download NLTK data
run: |
python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')"

- name: Run GitHub ETL
env:
GITHUB_TOKEN: ${{ secrets.GH_PAT }}
Expand Down Expand Up @@ -145,10 +141,6 @@ jobs:
pip install --upgrade pip
pip install -r backend/requirements.txt

- name: Download NLTK data
run: |
python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')"

- name: Run Reddit ETL (non-blocking)
id: reddit_run
env:
Expand Down Expand Up @@ -237,34 +229,28 @@ jobs:
workflow_conclusion: success
if_no_artifact_found: warn

- name: Restore previous history (if any)
- name: Detect previous aggregate snapshot
id: previous_history
shell: bash
run: |
if [ -d "prev_artifacts/datos/history" ]; then
mkdir -p datos/history
rsync -a prev_artifacts/datos/history/ datos/history/
if [ -d "prev_artifacts/datos/history" ] || [ -d "prev_artifacts/history" ]; then
echo "expect_previous_history=1" >> "$GITHUB_OUTPUT"
else
echo "expect_previous_history=0" >> "$GITHUB_OUTPUT"
fi

- name: Materialize source outputs
- name: Restore previous aggregate outputs (if any)
shell: bash
run: |
mkdir -p datos datos/latest datos/history frontend/assets/data
if [ -d "artifacts/github/datos" ]; then
rsync -a artifacts/github/datos/ datos/
else
cp -f artifacts/github/*.csv datos/ 2>/dev/null || true
fi
if [ -d "artifacts/stackoverflow/datos" ]; then
rsync -a artifacts/stackoverflow/datos/ datos/
else
cp -f artifacts/stackoverflow/*.csv datos/ 2>/dev/null || true
fi
if [ -d "artifacts/reddit/datos" ]; then
rsync -a artifacts/reddit/datos/ datos/
else
cp -f artifacts/reddit/*.csv datos/ 2>/dev/null || true
if [ -d "prev_artifacts" ]; then
python scripts/materialize_etl_artifacts.py --project-root . prev_artifacts
fi

- name: Materialize source outputs
shell: bash
run: |
python scripts/materialize_etl_artifacts.py --project-root . artifacts/github artifacts/stackoverflow artifacts/reddit

- name: Verify artifact handoff
shell: bash
run: |
Expand All @@ -281,6 +267,10 @@ jobs:
echo "::error::Missing required artifact file datos/${required}"
missing=1
fi
if [ ! -f "datos/latest/${required}" ]; then
echo "::error::Missing required artifact file datos/latest/${required}"
missing=1
fi
done
for optional in \
reddit_sentimiento_frameworks.csv \
Expand All @@ -289,6 +279,9 @@ jobs:
if [ ! -f "datos/${optional}" ]; then
echo "::warning::Optional artifact missing (degraded mode may continue): datos/${optional}"
fi
if [ ! -f "datos/latest/${optional}" ]; then
echo "::warning::Optional artifact missing (degraded mode may continue): datos/latest/${optional}"
fi
done
if [ "$missing" -ne 0 ]; then
exit 1
Expand Down Expand Up @@ -349,6 +342,12 @@ jobs:
- name: Enforce frontend assets policy (strict)
run: python scripts/check_frontend_assets.py --mode strict --root .

- name: Enforce bridge integrity gate
run: |
python scripts/check_bridge_integrity.py \
--project-root . \
--expect-previous-history "${{ steps.previous_history.outputs.expect_previous_history }}"

- name: Upload aggregate artifacts
uses: actions/upload-artifact@v4
with:
Expand Down Expand Up @@ -396,15 +395,7 @@ jobs:
- name: Restore aggregated files into workspace
shell: bash
run: |
mkdir -p datos datos/latest datos/history datos/metadata frontend/assets/data
cp -f artifact_payload/datos/*.csv datos/ 2>/dev/null || true
cp -f artifact_payload/datos/latest/*.csv datos/latest/ 2>/dev/null || true
cp -f artifact_payload/datos/metadata/*.json datos/metadata/ 2>/dev/null || true
cp -f artifact_payload/frontend/assets/data/*.json frontend/assets/data/ 2>/dev/null || true
cp -f artifact_payload/frontend/assets/data/*.csv frontend/assets/data/ 2>/dev/null || true
if [ -d artifact_payload/datos/history ]; then
rsync -a artifact_payload/datos/history/ datos/history/
fi
python scripts/materialize_etl_artifacts.py --project-root . artifact_payload

- name: Commit updated data
id: commit_data
Expand Down
21 changes: 1 addition & 20 deletions backend/reddit_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,10 @@
import pandas as pd
from datetime import datetime
import requests
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import warnings
import time
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from config.settings import (
ARCHIVOS_SALIDA, REDDIT_SUBREDDIT, REDDIT_LIMIT,
Expand Down Expand Up @@ -109,7 +108,6 @@ def _obtener_token_oauth(self):
def definir_pasos(self):
"""Define los pasos ETL de Reddit."""
return [
("Preparar recursos NLTK", self._ensure_nltk_resources),
("Autenticacion OAuth", self._obtener_token_oauth),
("Extraccion de posts", self.extraer_posts),
("Sentimiento de frameworks", self.analizar_sentimiento_frameworks),
Expand All @@ -136,23 +134,6 @@ def validar_configuracion(self):
"Se ejecutara en modo degradado (API publica)."
)

def _ensure_nltk_resources(self):
"""Asegura que recursos NLTK requeridos esten disponibles en runtime.

Evita side effects durante import del modulo.
"""
try:
nltk.data.find('sentiment/vader_lexicon')
except LookupError:
self.logger.info("Descargando recurso NLTK: vader_lexicon")
nltk.download('vader_lexicon', quiet=True)

try:
nltk.data.find('corpora/stopwords')
except LookupError:
self.logger.info("Descargando recurso NLTK: stopwords")
nltk.download('stopwords', quiet=True)

def extraer_posts(self, subreddit_name=REDDIT_SUBREDDIT, limit=REDDIT_LIMIT):
"""Extrae posts de un subreddit usando la API JSON publica de Reddit.

Expand Down
2 changes: 1 addition & 1 deletion backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ requests>=2.31.0,<2.33.0
pandas>=2.2.0,<3.0
numpy>=1.24,<3.0
python-dotenv>=1.0.0,<2.0
nltk>=3.8.1,<3.10
vaderSentiment>=3.3.2,<4.0
pandera>=0.22.0,<0.23.0
duckdb>=1.2.2,<2.0

Expand Down
129 changes: 129 additions & 0 deletions scripts/check_bridge_integrity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""Valida que los bridges frontend conserven historial util antes de publicar."""

from __future__ import annotations

import argparse
import json
from pathlib import Path


# Dataset names that must be listed under "datasets" in history_index.json.
# check_bridge_integrity reports every name from this set that is absent.
REQUIRED_HISTORY_DATASETS = {
"trend_score",
"github_commits",
"github_correlacion",
"github_lenguajes",
"so_volumen",
"so_aceptacion",
"so_tendencias",
"reddit_temas",
"interseccion",
}

# Bridge file names, relative to frontend/assets/data, that
# check_bridge_integrity loads one by one. Each must carry a usable
# "source_mode" and a "latest_snapshot_date" (plus "previous_snapshot_date"
# when previous history is expected).
REQUIRED_HISTORY_BRIDGES = (
"github_frameworks_history.json",
"github_correlacion_history.json",
"so_volumen_history.json",
"so_aceptacion_history.json",
"so_tendencias_history.json",
"reddit_temas_history.json",
"reddit_interseccion_history.json",
)


def _load_json(path: Path) -> dict:
    """Read *path* as UTF-8 JSON and return its top-level object.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded top-level JSON object.

    Raises:
        OSError: If the file cannot be read (for example, it does not exist).
        ValueError: If the content is not valid JSON, or if the top-level
            value is not a JSON object.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    # Callers use ``.get`` on the result straight away, so reject arrays,
    # scalars and null here with a message that names the offending file
    # instead of letting them fail later with an AttributeError.
    if not isinstance(payload, dict):
        raise ValueError(
            f"{path}: expected a JSON object at the top level, "
            f"got {type(payload).__name__}"
        )
    return payload


def _bridge_assets_root(project_root: Path) -> Path:
    """Return the ``frontend/assets/data`` directory under *project_root*."""
    return project_root.joinpath("frontend", "assets", "data")


def check_bridge_integrity(
    project_root: Path | str,
    *,
    expect_previous_history: bool = False,
    minimum_highlights: int | None = None,
) -> dict[str, int | str]:
    """Validate that the frontend bridge files still carry usable history.

    Reads the bridge JSON files under ``<project_root>/frontend/assets/data``
    and collects every problem it finds, so that a single run reports all
    failed checks instead of stopping at the first one.

    Args:
        project_root: Repository root that contains ``frontend/assets/data``.
        expect_previous_history: When true, additionally require evidence of
            an earlier snapshot: at least two trend-score snapshots and a
            ``previous_snapshot_date`` in every checked payload. It also
            raises the default highlight minimum from 2 to 3.
        minimum_highlights: Minimum number of entries required in
            ``home_highlights.json``. ``None`` keeps the built-in default
            (3 when ``expect_previous_history`` is true, otherwise 2).

    Returns:
        A summary dict with the keys ``status``, ``dataset_count``,
        ``trend_snapshot_count``, ``profile_count`` and
        ``home_highlight_count``.

    Raises:
        ValueError: If any check fails; the message joins all problems
            with ``"; "``.

    Errors raised by ``_load_json`` while reading a file (for example when
    a bridge file does not exist) propagate unchanged.
    """
    project_root = Path(project_root)
    assets_root = _bridge_assets_root(project_root)
    errors: list[str] = []

    history_index = _load_json(assets_root / "history_index.json")
    # A null or absent "datasets" counts as empty. Anything that is not a
    # list of objects is reported as an integrity error instead of crashing
    # with TypeError/AttributeError halfway through the gate.
    dataset_entries = history_index.get("datasets") or []
    if not isinstance(dataset_entries, list):
        errors.append("history_index datasets must be a list")
        dataset_entries = []
    dataset_names: set[str] = set()
    malformed_entries = 0
    for entry in dataset_entries:
        if not isinstance(entry, dict):
            malformed_entries += 1
            continue
        name = str(entry.get("dataset", "")).strip()
        if name:
            dataset_names.add(name)
    if malformed_entries:
        errors.append(
            f"history_index datasets has {malformed_entries} non-object entries"
        )
    missing_datasets = sorted(REQUIRED_HISTORY_DATASETS - dataset_names)
    if missing_datasets:
        errors.append(
            "history_index missing datasets: " + ", ".join(missing_datasets)
        )

    trend_history = _load_json(assets_root / "trend_score_history.json")
    snapshot_count = int(trend_history.get("snapshot_count", 0) or 0)
    minimum_snapshots = 2 if expect_previous_history else 1
    if snapshot_count < minimum_snapshots:
        errors.append(
            f"trend_score_history snapshot_count={snapshot_count} < {minimum_snapshots}"
        )

    technology_profiles = _load_json(assets_root / "technology_profiles.json")
    # Parsed once; the same value feeds both the check and the summary.
    profile_count = int(technology_profiles.get("profile_count", 0) or 0)
    if not technology_profiles.get("latest_snapshot_date"):
        errors.append("technology_profiles latest_snapshot_date missing")
    if profile_count <= 0:
        errors.append("technology_profiles profile_count must be positive")
    if expect_previous_history and not technology_profiles.get("previous_snapshot_date"):
        errors.append("technology_profiles previous_snapshot_date missing")

    home_highlights = _load_json(assets_root / "home_highlights.json")
    # Same null/shape handling as "datasets": a null value counts as empty
    # and a non-list value is an integrity error.
    highlights = home_highlights.get("highlights") or []
    if not isinstance(highlights, list):
        errors.append("home_highlights highlights must be a list")
        highlights = []
    # NOTE(review): a reviewer pointed out that the exporter may emit fewer
    # than 3 highlights when upstream summaries are empty, which makes the
    # fixed default brittle. The default is kept for compatibility; callers
    # that accept a degraded payload can lower it via ``minimum_highlights``.
    if minimum_highlights is None:
        minimum_highlights = 3 if expect_previous_history else 2
    if len(highlights) < minimum_highlights:
        errors.append(
            f"home_highlights highlights={len(highlights)} < {minimum_highlights}"
        )

    for bridge_name in REQUIRED_HISTORY_BRIDGES:
        payload = _load_json(assets_root / bridge_name)
        source_mode = str(payload.get("source_mode", "")).strip().lower()
        if source_mode in {"", "missing", "none"}:
            errors.append(f"{bridge_name} source_mode={source_mode or 'missing'}")
        if not payload.get("latest_snapshot_date"):
            errors.append(f"{bridge_name} latest_snapshot_date missing")
        if expect_previous_history and not payload.get("previous_snapshot_date"):
            errors.append(f"{bridge_name} previous_snapshot_date missing")

    if errors:
        raise ValueError("; ".join(errors))

    return {
        "status": "ok",
        "dataset_count": len(dataset_names),
        "trend_snapshot_count": snapshot_count,
        "profile_count": profile_count,
        "home_highlight_count": len(highlights),
    }


def main() -> int:
    """Run the bridge integrity gate from the command line.

    Parses ``--project-root`` and ``--expect-previous-history`` (0 or 1),
    runs ``check_bridge_integrity`` and prints its summary as one JSON line.

    Returns:
        0 when the check passes. A failed check is not caught here, so the
        exception raised by ``check_bridge_integrity`` propagates.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--project-root", default=".")
    cli.add_argument(
        "--expect-previous-history",
        choices=(0, 1),
        default=0,
        type=int,
        help="Exige snapshot previo cuando el workflow ya recupero un aggregate previo.",
    )
    options = cli.parse_args()

    # The flag arrives as 0/1 so a workflow step output can be passed as-is.
    require_previous = bool(options.expect_previous_history)
    report = check_bridge_integrity(
        options.project_root,
        expect_previous_history=require_previous,
    )
    print(json.dumps(report, ensure_ascii=False))
    return 0


if __name__ == "__main__":
raise SystemExit(main())
Loading
Loading