diff --git a/.env.example b/.env.example
index 92e7a35..513c975 100644
--- a/.env.example
+++ b/.env.example
@@ -13,3 +13,14 @@ STACKOVERFLOW_KEY=tu_key_aqui
# Registrar en: https://old.reddit.com/prefs/apps (tipo: script)
REDDIT_CLIENT_ID=tu_client_id_aqui
REDDIT_CLIENT_SECRET=tu_client_secret_aqui
+
+# Estrategia de escritura de datos (dual write)
+# 1 = habilitado, 0 = deshabilitado
+DATA_WRITE_LEGACY_CSV=1
+DATA_WRITE_LATEST_CSV=0
+DATA_WRITE_HISTORY_CSV=0
+EXPORT_HISTORY_BRIDGE_JSON=1
+
+# Trend score engine selector
+# allowed: legacy | duckdb
+TREND_SCORE_ENGINE=legacy
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 399aefe..8e5379e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@ name: CI - Tests
on:
push:
- branches: [main, refactor/standard-structure]
+ branches: [main, feat/backend, feat/frontend]
paths-ignore:
- 'datos/**'
- 'frontend/assets/data/**'
diff --git a/.github/workflows/dependency_security.yml b/.github/workflows/dependency_security.yml
index ad11b16..47b6f31 100644
--- a/.github/workflows/dependency_security.yml
+++ b/.github/workflows/dependency_security.yml
@@ -2,7 +2,7 @@ name: Dependency Security Audit
on:
push:
- branches: [main, refactor/standard-structure]
+ branches: [main, feat/backend]
paths:
- 'backend/requirements.txt'
- '.github/workflows/dependency_security.yml'
@@ -42,4 +42,6 @@ jobs:
- name: Run vulnerability audit
run: |
- pip-audit -r backend/requirements.txt
+ # Temporary exception: CVE-2025-14009 has no fixed NLTK release yet.
+ # Keep this ignored only until upstream publishes a patched version.
+ pip-audit -r backend/requirements.txt --ignore-vuln CVE-2025-14009
diff --git a/.github/workflows/deploy_frontend.yml b/.github/workflows/deploy_frontend.yml
index 9582217..3a2b337 100644
--- a/.github/workflows/deploy_frontend.yml
+++ b/.github/workflows/deploy_frontend.yml
@@ -17,7 +17,7 @@ permissions:
jobs:
deploy:
- if: ${{ github.event_name != 'workflow_run' || github.event.workflow_run.conclusion == 'success' }}
+ if: ${{ github.event_name != 'workflow_run' || (github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'main') }}
runs-on: ubuntu-latest
timeout-minutes: 30
@@ -47,4 +47,4 @@ jobs:
uses: peaceiris/actions-gh-pages@v4
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
- publish_dir: frontend/build/web
\ No newline at end of file
+ publish_dir: frontend/build/web
diff --git a/.github/workflows/etl_semanal.yml b/.github/workflows/etl_semanal.yml
index d662e94..46f31e1 100644
--- a/.github/workflows/etl_semanal.yml
+++ b/.github/workflows/etl_semanal.yml
@@ -2,32 +2,40 @@ name: ETL Weekly Data Refresh
on:
schedule:
- - cron: '0 8 * * 1' # Cada lunes a las 08:00 UTC (03:00 Ecuador)
- workflow_dispatch: # Boton manual en Actions
+ - cron: "0 8 * * 1"  # every Monday at 08:00 UTC
+ workflow_dispatch:  # manual trigger from the Actions tab
permissions:
- contents: write
+ contents: read  # read-only default; job_publish overrides this with contents: write
-# Evitar que dos runs del ETL corran al mismo tiempo
concurrency:
group: etl-pipeline
cancel-in-progress: false
+env:
+ PYTHON_VERSION: "3.11"
+ DATA_WRITE_LEGACY_CSV: "1"  # write flags: 1 = enabled, 0 = disabled
+ DATA_WRITE_LATEST_CSV: "1"
+ DATA_WRITE_HISTORY_CSV: "1"
+ EXPORT_HISTORY_BRIDGE_JSON: "1"  # when "1", the aggregate job fails if the bridge JSON files are missing
+ TREND_SCORE_ENGINE: "duckdb"  # allowed: legacy | duckdb
+
jobs:
- etl:
+ job_github:
+ name: Source - GitHub
runs-on: ubuntu-latest
- timeout-minutes: 45
+ timeout-minutes: 20
steps:
- name: Checkout code
uses: actions/checkout@v4
- - name: Set up Python 3.11
+ - name: Set up Python
uses: actions/setup-python@v5
with:
- python-version: '3.11'
- cache: 'pip'
- cache-dependency-path: 'backend/requirements.txt'
+ python-version: ${{ env.PYTHON_VERSION }}
+ cache: pip
+ cache-dependency-path: backend/requirements.txt
- name: Install dependencies
run: |
@@ -39,19 +47,89 @@ jobs:
python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')"
- name: Run GitHub ETL
- id: github_etl
env:
GITHUB_TOKEN: ${{ secrets.GH_PAT }}
run: python backend/github_etl.py
+ - name: Upload GitHub artifacts
+ if: always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: github-data
+ if-no-files-found: warn
+ path: |
+ datos/github_repos_2025.csv
+ datos/github_lenguajes.csv
+ datos/github_ai_repos_insights.csv
+ datos/github_commits_frameworks.csv
+ datos/github_correlacion.csv
+
+ job_stackoverflow:
+ name: Source - StackOverflow
+ runs-on: ubuntu-latest
+ timeout-minutes: 20
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ env.PYTHON_VERSION }}
+ cache: pip
+ cache-dependency-path: backend/requirements.txt
+
+ - name: Install dependencies
+ run: |
+ pip install --upgrade pip
+ pip install -r backend/requirements.txt
+
- name: Run StackOverflow ETL
- id: so_etl
env:
STACKOVERFLOW_KEY: ${{ secrets.STACKOVERFLOW_KEY }}
run: python backend/stackoverflow_etl.py
- - name: Run Reddit ETL
- id: reddit_etl
+ - name: Upload StackOverflow artifacts
+ if: always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: stackoverflow-data
+ if-no-files-found: warn
+ path: |
+ datos/so_volumen_preguntas.csv
+ datos/so_tasa_aceptacion.csv
+ datos/so_tendencias_mensuales.csv
+
+ job_reddit:
+ name: Source - Reddit
+ runs-on: ubuntu-latest
+ timeout-minutes: 20
+ outputs:
+ status: ${{ steps.reddit_run.outputs.status }}
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ env.PYTHON_VERSION }}
+ cache: pip
+ cache-dependency-path: backend/requirements.txt
+
+ - name: Install dependencies
+ run: |
+ pip install --upgrade pip
+ pip install -r backend/requirements.txt
+
+ - name: Download NLTK data
+ run: |
+ python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')"
+
+ - name: Run Reddit ETL (non-blocking)
+ id: reddit_run
env:
REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }}
REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }}
@@ -62,22 +140,200 @@ jobs:
code=$?
if [ $code -ne 0 ]; then
echo "status=failed" >> "$GITHUB_OUTPUT"
- echo "Reddit ETL falló (no crítico), se continúa con datos previos si existen." >> "$GITHUB_STEP_SUMMARY"
+ echo "Reddit ETL failed (non-critical); aggregate will continue with existing data fallback." >> "$GITHUB_STEP_SUMMARY"
exit 0
fi
echo "status=ok" >> "$GITHUB_OUTPUT"
+ - name: Upload Reddit artifacts
+ if: always()
+ uses: actions/upload-artifact@v4
+ with:
+ name: reddit-data
+ if-no-files-found: warn
+ path: |
+ datos/reddit_sentimiento_frameworks.csv
+ datos/reddit_temas_emergentes.csv
+ datos/interseccion_github_reddit.csv
+
+ job_aggregate:
+ name: Aggregate + Quality Gate
+ runs-on: ubuntu-latest
+ timeout-minutes: 20
+ needs:
+ - job_github
+ - job_stackoverflow
+ - job_reddit
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ env.PYTHON_VERSION }}
+ cache: pip
+ cache-dependency-path: backend/requirements.txt
+
+ - name: Install dependencies
+ run: |
+ pip install --upgrade pip
+ pip install -r backend/requirements.txt
+
+ - name: Download GitHub artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: github-data
+ path: artifacts/github
+
+ - name: Download StackOverflow artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: stackoverflow-data
+ path: artifacts/stackoverflow
+
+ - name: Download Reddit artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: reddit-data
+ path: artifacts/reddit
+
+ - name: Materialize source outputs
+ shell: bash
+ run: |
+ mkdir -p datos frontend/assets/data
+ cp -f artifacts/github/*.csv datos/ 2>/dev/null || true
+ cp -f artifacts/stackoverflow/*.csv datos/ 2>/dev/null || true
+ cp -f artifacts/reddit/*.csv datos/ 2>/dev/null || true
+
+ - name: Verify artifact handoff
+ shell: bash
+ run: |
+ missing=0
+ for required in \
+ github_repos_2025.csv \
+ github_lenguajes.csv \
+ github_commits_frameworks.csv \
+ github_correlacion.csv \
+ so_volumen_preguntas.csv \
+ so_tasa_aceptacion.csv \
+ so_tendencias_mensuales.csv; do
+ if [ ! -f "datos/${required}" ]; then
+ echo "::error::Missing required artifact file datos/${required}"
+ missing=1
+ fi
+ done
+ for optional in \
+ reddit_sentimiento_frameworks.csv \
+ reddit_temas_emergentes.csv \
+ interseccion_github_reddit.csv; do
+ if [ ! -f "datos/${optional}" ]; then
+ echo "::warning::Optional artifact missing (degraded mode may continue): datos/${optional}"
+ fi
+ done
+ if [ "$missing" -ne 0 ]; then
+ exit 1
+ fi
+
- name: Run Trend Score
- id: trend_score
run: python backend/trend_score.py
- name: Sync CSVs to frontend assets
run: python backend/sync_assets.py
- name: Validate CSV contract headers
- id: validate_contract
run: python backend/validate_csv_contract.py
+ - name: Verify aggregate outputs
+ shell: bash
+ run: |
+ missing=0
+ for required in \
+ datos/trend_score.csv \
+ frontend/assets/data/trend_score.csv \
+ frontend/assets/data/github_lenguajes.csv \
+ frontend/assets/data/github_commits_frameworks.csv \
+ frontend/assets/data/github_correlacion.csv \
+ frontend/assets/data/so_volumen_preguntas.csv \
+ frontend/assets/data/so_tasa_aceptacion.csv \
+ frontend/assets/data/so_tendencias_mensuales.csv \
+ frontend/assets/data/reddit_sentimiento_frameworks.csv \
+ frontend/assets/data/reddit_temas_emergentes.csv \
+ frontend/assets/data/interseccion_github_reddit.csv; do
+ if [ ! -f "$required" ]; then
+ echo "::error::Missing aggregate output $required"
+ missing=1
+ fi
+ done
+ if [ "${EXPORT_HISTORY_BRIDGE_JSON}" = "1" ]; then
+ for bridge_file in frontend/assets/data/history_index.json frontend/assets/data/trend_score_history.json; do
+ if [ ! -f "$bridge_file" ]; then
+ echo "::error::Missing bridge output $bridge_file"
+ missing=1
+ fi
+ done
+ fi
+ if [ "$missing" -ne 0 ]; then
+ exit 1
+ fi
+
+ - name: Upload aggregate artifacts
+ uses: actions/upload-artifact@v4
+ with:
+ name: aggregate-data
+ if-no-files-found: error
+ path: |
+ datos/*.csv
+ datos/latest/*.csv
+ datos/history/**/*.csv
+ datos/metadata/*.json
+ frontend/assets/data/*.csv
+ frontend/assets/data/*.json
+
+ - name: ETL aggregate summary
+ if: always()
+ run: |
+ echo "## ETL Aggregate Summary" >> "$GITHUB_STEP_SUMMARY"
+ echo "- GitHub job: ${{ needs.job_github.result }}" >> "$GITHUB_STEP_SUMMARY"
+ echo "- StackOverflow job: ${{ needs.job_stackoverflow.result }}" >> "$GITHUB_STEP_SUMMARY"
+ echo "- Reddit job: ${{ needs.job_reddit.outputs.status || 'ok' }}" >> "$GITHUB_STEP_SUMMARY"
+ echo "- Aggregate status: ${{ job.status }}" >> "$GITHUB_STEP_SUMMARY"
+
+ job_publish:
+ name: Publish Data
+ runs-on: ubuntu-latest
+ timeout-minutes: 10
+ needs:
+ - job_aggregate
+ - job_reddit
+ if: ${{ needs.job_aggregate.result == 'success' }}
+ permissions:
+ contents: write
+
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v4
+
+ - name: Download aggregate artifacts
+ uses: actions/download-artifact@v4
+ with:
+ name: aggregate-data
+ path: artifact_payload
+
+ - name: Restore aggregated files into workspace
+ shell: bash
+ run: |
+ mkdir -p datos datos/latest datos/history datos/metadata frontend/assets/data
+ cp -f artifact_payload/datos/*.csv datos/ 2>/dev/null || true
+ cp -f artifact_payload/datos/latest/*.csv datos/latest/ 2>/dev/null || true
+ cp -f artifact_payload/datos/metadata/*.json datos/metadata/ 2>/dev/null || true
+ cp -f artifact_payload/frontend/assets/data/*.json frontend/assets/data/ 2>/dev/null || true
+ cp -f artifact_payload/frontend/assets/data/*.csv frontend/assets/data/ 2>/dev/null || true
+ if [ -d artifact_payload/datos/history ]; then
+ rsync -a artifact_payload/datos/history/ datos/history/
+ fi
+
- name: Commit updated data
id: commit_data
shell: bash
@@ -88,21 +344,18 @@ jobs:
git add datos/ frontend/assets/data/
if git diff --staged --quiet; then
echo "changed=false" >> "$GITHUB_OUTPUT"
- echo "Sin cambios de datos para commit." >> "$GITHUB_STEP_SUMMARY"
+ echo "No data changes to commit." >> "$GITHUB_STEP_SUMMARY"
exit 0
fi
- git commit -m "data: actualizar CSVs"
+ git commit -m "data: refresh CSV outputs"
git pull --rebase origin "$TARGET_BRANCH"
git push origin "HEAD:$TARGET_BRANCH"
echo "changed=true" >> "$GITHUB_OUTPUT"
- - name: ETL Summary
+ - name: Publish summary
if: always()
run: |
- echo "## Resumen ETL" >> $GITHUB_STEP_SUMMARY
- echo "- GitHub ETL: ${{ steps.github_etl.outcome }}" >> $GITHUB_STEP_SUMMARY
- echo "- StackOverflow ETL: ${{ steps.so_etl.outcome }}" >> $GITHUB_STEP_SUMMARY
- echo "- Reddit ETL: ${{ steps.reddit_etl.outputs.status || 'ok' }}" >> $GITHUB_STEP_SUMMARY
- echo "- Trend Score: ${{ steps.trend_score.outcome }}" >> $GITHUB_STEP_SUMMARY
- echo "- Contrato CSV: ${{ steps.validate_contract.outcome }}" >> $GITHUB_STEP_SUMMARY
- echo "- Commit de datos: ${{ steps.commit_data.outputs.changed || 'false' }}" >> $GITHUB_STEP_SUMMARY
+ echo "## ETL Publish Summary" >> "$GITHUB_STEP_SUMMARY"
+ echo "- Aggregate gate: ${{ needs.job_aggregate.result }}" >> "$GITHUB_STEP_SUMMARY"
+ echo "- Reddit status: ${{ needs.job_reddit.outputs.status || 'ok' }}" >> "$GITHUB_STEP_SUMMARY"
+ echo "- Data committed: ${{ steps.commit_data.outputs.changed || 'false' }}" >> "$GITHUB_STEP_SUMMARY"
diff --git a/.gitignore b/.gitignore
index 806da86..98159d1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,3 +47,8 @@ logs/
# Temporary files
etl_log_full.txt
+
+# Runtime ETL outputs (dual-write paths)
+datos/latest/
+datos/history/
+datos/metadata/
diff --git a/README.md b/README.md
index 2bb54ff..02f2304 100644
--- a/README.md
+++ b/README.md
@@ -1,258 +1,153 @@
-# Tech Trends 2025
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
----
-
-## Project Overview
-
-End-to-end data engineering platform that extracts, transforms, and visualizes technology trends from the three largest developer communities: GitHub, StackOverflow, and Reddit.
-
-| Challenge | Solution | Impact |
-|-----------|----------|--------|
-| Fragmented trend data | Multi-source ETL pipeline | Unified technology ranking |
-| No cross-platform comparison | Composite Trend Score index | Weighted ranking across 3 sources |
-| Manual analysis | Automated pipeline with OOP | Repeatable, testable, maintainable |
-| Raw data, no insights | Interactive Flutter dashboard | Real-time trend visualization |
-
-> **Core Value:** This platform demonstrates a production-grade data pipeline that ingests from 3 APIs, applies NLP sentiment analysis, and produces a composite ranking — the kind of system that powers real technology intelligence products.
-
----
-
-## Pipeline Architecture
-
-```
-┌─────────────┐ ┌─────────────┐ ┌─────────────┐
-│ GitHub │ │StackOverflow│ │ Reddit │
-│ API │ │ API │ │ JSON API │
-└──────┬──────┘ └──────┬──────┘ └──────┬──────┘
- │ │ │
- ▼ ▼ ▼
-┌─────────────────────────────────────────────────────┐
-│ BaseETL (Abstract Class) │
-│ configurar_logging() · guardar_csv() · ejecutar() │
-├─────────────┬─────────────────┬─────────────────────┤
-│ GitHubETL │ StackOverflowETL│ RedditETL │
-│ 4 analyses │ 3 analyses │ 3 analyses + NLP │
-└──────┬──────┘────────┬────────┘──────────┬──────────┘
- │ │ │
- ▼ ▼ ▼
-┌─────────────────────────────────────────────────────┐
-│ datos/ (11 CSVs) │
-│ Validated by validador.py before each save │
-└──────────────────────┬──────────────────────────────┘
- │
- ▼
-┌─────────────────────────────────────────────────────┐
-│ Trend Score Engine │
-│ GitHub 40% + StackOverflow 35% + Reddit 25% │
-│ Min-max normalization · Outer join · Ranking │
-└──────────────────────┬──────────────────────────────┘
- │
- ▼
-┌─────────────────────────────────────────────────────┐
-│ Flutter Web Dashboard │
-│ 4 views · fl_chart · Export ZIP · Responsive │
-└─────────────────────────────────────────────────────┘
+# Technology Trend Analysis Platform
+
+End-to-end data pipeline and dashboard for technology trends across GitHub, StackOverflow, and Reddit.
+
+## Current Status
+
+- Backend refactor implementation is complete for F2-F7.
+- Test suite is green (`133 passed`).
+- Operational cutover is still pending: 4 weekly ETL runs without critical failures.
+
+## What Is Implemented
+
+- Multi-source ETL pipeline (GitHub, StackOverflow, Reddit).
+- Dual write strategy:
+ - `datos/*.csv` (legacy)
+ - `datos/latest/*.csv` (latest)
+ - `datos/history/<dataset>/year=YYYY/month=MM/day=DD/*.csv` (history snapshots)
+- Trend Score engine selector:
+ - `legacy` (pandas)
+ - `duckdb` (SQL engine with equivalence tests)
+- Severity-based quality gate (`critical`, `warning`, `info`) with Pandera support.
+- Data product contract for run and dataset manifests.
+- Frontend bridge JSON assets:
+ - `history_index.json`
+ - `trend_score_history.json`
+- Frontend feature flag for partial cutover to bridge JSON.
+
+## Repository Layout
+
+```text
+backend/
+ base_etl.py
+ trend_score.py
+ trend_score_duckdb.py
+ sync_assets.py
+ export_history_json.py
+ validate_csv_contract.py
+ validador.py
+ config/
+ settings.py
+ csv_contract.py
+ data_product_contract.py
+ schema_contract_utils.py
+ quality/
+ pandera_schemas.py
+ degradation_policy.py
+
+datos/
+ *.csv
+ latest/*.csv
+ history/<dataset>/year=YYYY/month=MM/day=DD/*.csv
+ metadata/
+
+frontend/
+ lib/
+ assets/data/
+
+docs/
+tests/
+.github/workflows/
```
-| Layer | Component | Output |
-|-------|-----------|--------|
-| **Extraction** | 3 API connectors | Raw data from GitHub, SO, Reddit |
-| **Transformation** | BaseETL + 3 children | 11 processed CSVs |
-| **Scoring** | trend_score.py | Unified technology ranking |
-| **Validation** | validador.py + csv_contract.py | Column checks + strict schema/types validation |
-| **Presentation** | Flutter Web | 4 interactive dashboards |
-
----
-
-## Key Metrics & Results
+## Runtime Workflows
-| Metric | Value |
-|--------|-------|
-| **Repositories analyzed** | 1,000 |
-| **StackOverflow questions** | 5 languages + 5 frameworks |
-| **Reddit posts** | 500+ from r/webdev |
-| **Output CSVs** | 11 validated datasets |
-| **Trend Score** | Top technology ranking |
-| **Tests** | 72 passing (pytest) |
-| **Code coverage** | All ETL modules tested |
+### 1) ETL Weekly Data Refresh (`etl_semanal.yml`)
----
+Trigger:
+- Schedule: every Monday at `08:00 UTC`.
+- Manual: `workflow_dispatch`.
-## Dashboard Features
+Flow:
+1. Run source jobs in parallel: GitHub, StackOverflow, Reddit.
+2. Upload source artifacts.
+3. Aggregate job downloads artifacts, runs Trend Score, syncs frontend assets, validates data contract.
+4. Publish job commits refreshed data if aggregate is successful.
-| Page | Visualizations |
-|------|----------------|
-| **Home** | Executive KPIs, global insights, navigation |
-| **GitHub** | Top 10 languages · Framework commits · Stars vs Contributors correlation |
-| **StackOverflow** | Question volume · Acceptance rates · Monthly trends (Python/JS/TS) |
-| **Reddit** | Framework sentiment · Emerging topics · GitHub-Reddit intersection |
+Important behavior:
+- Reddit source is non-blocking in source stage (degraded mode is allowed).
+- Aggregate stage enforces required outputs for frontend and trend artifacts.
-Each dashboard includes **Key Insights** cards and an **Export ZIP** button.
+### 2) CI - Tests (`ci.yml`)
----
+Trigger:
+- Push and pull request checks for Python tests.
-## Tech Stack
+### 3) Dependency Security Audit (`dependency_security.yml`)
-| Layer | Technologies |
-|-------|--------------|
-| **ETL Pipeline** | Python 3.9+, pandas, requests, NLTK |
-| **Architecture** | BaseETL (OOP), custom exceptions, data validation |
-| **Testing** | pytest, unittest.mock (72 tests, API mocking) |
-| **Frontend** | Flutter Web, Dart, fl_chart, google_fonts |
-| **Data Storage** | CSV (11 files, pathlib paths) |
-| **Automation** | Makefile, sync_assets.py, GitHub Actions |
-| **Security** | pip-audit, dependency security workflow |
-| **Deployment** | GitHub Pages |
+Trigger:
+- Dependency file changes and weekly schedule (Monday at `09:00 UTC`).
+- Manual execution supported.
----
+### 4) Frontend Deploy (`deploy_frontend.yml`)
-## Quick Start
+Trigger:
+- Push to `main` affecting frontend/data paths.
+- Successful completion of the ETL workflow on `main`.
+- Manual execution.
-```bash
-# Clone repository
-git clone https://github.com/Sam-24-dev/Technology-trend-analysis-platform.git
-cd Technology-trend-analysis-platform
-
-# Install dependencies
-make install
+## Environment Variables
-# Run full pipeline (ETL + Trend Score)
-make etl
+Create `.env` in repo root:
-# Run tests
-make test
+```env
+GITHUB_TOKEN=your_token
+STACKOVERFLOW_KEY=your_key
+REDDIT_CLIENT_ID=your_client_id
+REDDIT_CLIENT_SECRET=your_client_secret
+
+DATA_WRITE_LEGACY_CSV=1
+DATA_WRITE_LATEST_CSV=0
+DATA_WRITE_HISTORY_CSV=0
+EXPORT_HISTORY_BRIDGE_JSON=1
+TREND_SCORE_ENGINE=legacy
+```
-# Sync CSVs to frontend
-make sync
+Notes:
+- Local defaults keep legacy behavior.
+- The weekly ETL workflow (`etl_semanal.yml`) sets dual write and DuckDB explicitly.
-# Or run everything at once
-make all
-```
+## Local Commands
-### Environment Setup
+```bash
+# backend
+pip install -r backend/requirements.txt
+python -m pytest -q
-Create a `.env` file in the project root:
-```env
-GITHUB_TOKEN=your_github_personal_access_token
-STACKOVERFLOW_KEY=your_so_api_key # optional
-REDDIT_CLIENT_ID=your_reddit_client_id # optional (OAuth)
-REDDIT_CLIENT_SECRET=your_reddit_secret # optional (OAuth)
-```
+# run ETLs
+python backend/github_etl.py
+python backend/stackoverflow_etl.py
+python backend/reddit_etl.py
+python backend/trend_score.py
-### Run Frontend
+# sync assets + bridge
+python backend/sync_assets.py
-```bash
+# frontend
cd frontend
flutter pub get
flutter run -d chrome
```
-> **Note:** Pre-processed data is included in `datos/`. Only run ETL if you need fresh data.
-
----
-
-## Project Structure
-
-```
-Technology-trend-analysis-platform/
-├── backend/ # ETL Pipeline (Python)
-│ ├── config/
-│ │ ├── __init__.py
-│ │ └── settings.py # Centralized config (pathlib, dates)
-│ ├── base_etl.py # Abstract ETL base class (OOP)
-│ ├── github_etl.py # GitHubETL: 4 analysis steps
-│ ├── stackoverflow_etl.py # StackOverflowETL: 3 analysis steps
-│ ├── reddit_etl.py # RedditETL: 3 analysis steps + NLP
-│ ├── trend_score.py # Composite index (3 sources)
-│ ├── validador.py # DataFrame validation before save
-│ ├── exceptions.py # ETLExtractionError, ETLValidationError
-│ ├── sync_assets.py # Copy CSVs to frontend
-│ └── requirements.txt
-├── datos/ # Processed CSVs (11 files)
-├── docs/
-│ └── architecture.md
-├── frontend/ # Flutter Web Dashboard
-│ ├── lib/
-│ │ ├── main.dart
-│ │ ├── screens/ # 5 screens (home, github, so, reddit)
-│ │ ├── models/ # Data models per source
-│ │ ├── services/ # CSV parsing service
-│ │ └── widgets/ # Reusable chart card
-│ ├── assets/
-│ │ ├── data/ # CSVs for visualization
-│ │ └── images/ # Technology logos
-│ └── pubspec.yaml
-├── logs/ # Daily ETL logs
-├── tests/ # pytest suite (72 tests)
-│ ├── conftest.py
-│ ├── test_github_etl.py
-│ ├── test_stackoverflow_etl.py
-│ ├── test_reddit_etl.py
-│ └── test_trend_score.py
-├── .env.example
-├── .gitignore
-├── LICENSE
-├── Makefile # make install/etl/test/sync/all
-├── pyproject.toml # Pylint + pytest config
-└── README.md
-```
-
----
-
-## Scalability & Roadmap
-
-- **Orchestration:** Pipeline structure is compatible with Apache Airflow for scheduled runs
-- **Database:** Migration path to PostgreSQL/BigQuery for data warehousing
-- **Containerization:** Ready for Docker deployment
-- **CI/CD:** GitHub Actions for automated testing and deployment
-- **API Layer:** FastAPI integration for programmatic data access
-
----
-
-## Team
-
-| Member | Role | Responsibility |
-|--------|------|----------------|
-| **Samir Caizapasto** | Lead Developer | GitHub ETL + Dashboard + Architecture |
-| **Andrés Salinas** | Developer | StackOverflow ETL + Dashboard |
-| **Mateo Mayorga** | Developer | Reddit ETL + Dashboard + NLP |
-
----
-
-
-
-### Author
-
-**Samir Caizapasto**
-*Junior Data Engineer & Analyst*
-
-[](https://www.linkedin.com/in/samir-caizapasto/)
-[](https://portafolio-samir-tau.vercel.app/)
-[](https://github.com/Sam-24-dev)
-
-
+## Release Readiness
----
+Release and cutover policy is defined in:
+- `docs/ROADMAP_V2_IMPLEMENTATION_PLAN.md` (sections 19 and 20)
-
+In short:
+- Implementation is done.
+- Production cutover requires operational stability gates.
-⭐ If this project demonstrates useful data engineering practices, please give it a star.
+## License
-
+MIT
diff --git a/backend/base_etl.py b/backend/base_etl.py
index 2912e03..8a47152 100644
--- a/backend/base_etl.py
+++ b/backend/base_etl.py
@@ -9,17 +9,24 @@
class GitHubETL(BaseETL):
def definir_pasos(self):
return [
- ("Extraccion de repos", self.extraer_repos),
- ("Analisis de lenguajes", self.analizar_lenguajes),
+ ("Repository extraction", self.extraer_repos),
+ ("Language analysis", self.analizar_lenguajes),
]
"""
import logging
import sys
-from datetime import datetime
+from datetime import datetime, timezone
from abc import ABC, abstractmethod
from time import perf_counter
from config.settings import LOG_FORMAT, LOG_DATE_FORMAT, LOGS_DIR, ARCHIVOS_SALIDA
+from config.settings import (
+ WRITE_LEGACY_CSV,
+ WRITE_LATEST_CSV,
+ WRITE_HISTORY_CSV,
+ get_latest_output_path,
+ get_history_output_path,
+)
from exceptions import ETLExtractionError, ETLValidationError
from validador import validar_dataframe
@@ -68,7 +75,7 @@ def configurar_logging(self):
self.logger.addHandler(file_handler)
def guardar_csv(self, df, nombre_archivo):
- """Validates and saves a DataFrame to CSV.
+ """Validates and saves a DataFrame to one or more CSV destinations.
Args:
df: DataFrame to save.
@@ -77,17 +84,45 @@ def guardar_csv(self, df, nombre_archivo):
Raises:
ETLValidationError: If the DataFrame is empty.
"""
- ruta = ARCHIVOS_SALIDA.get(nombre_archivo)
- if ruta is None:
+ ruta_legacy = ARCHIVOS_SALIDA.get(nombre_archivo)
+ if ruta_legacy is None:
self.logger.warning("No hay ruta de salida para '%s'", nombre_archivo)
return
validar_dataframe(df, nombre_archivo)
- df.to_csv(ruta, index=False, encoding="utf-8")
+
+ destinos = []
+ if WRITE_LEGACY_CSV:
+ destinos.append(("legacy", ruta_legacy))
+ if WRITE_LATEST_CSV:
+ ruta_latest = get_latest_output_path(nombre_archivo)
+ if ruta_latest is not None:
+ destinos.append(("latest", ruta_latest))
+ if WRITE_HISTORY_CSV:
+ ruta_history = get_history_output_path(nombre_archivo, fecha=datetime.now(timezone.utc))
+ if ruta_history is not None:
+ destinos.append(("history", ruta_history))
+
+ if not destinos:
+ self.logger.warning(
+ "Escritura deshabilitada para '%s' (sin destinos activos por config)",
+ nombre_archivo,
+ )
+ return
+
+ rutas_escritas = set()
+ for salida, ruta in destinos:
+ ruta = ruta.resolve()
+ if ruta in rutas_escritas:
+ continue
+ ruta.parent.mkdir(parents=True, exist_ok=True)
+ df.to_csv(ruta, index=False, encoding="utf-8")
+ rutas_escritas.add(ruta)
+ self._run_summary["files_written"].append(str(ruta))
+ self.logger.info("[WRITE] archivo=%s destino=%s filas=%d", ruta, salida, len(df))
+
filas = len(df)
- self._run_summary["files_written"].append(str(ruta))
self._run_summary["rows_written"] += filas
- self.logger.info("[WRITE] archivo=%s filas=%d", ruta, filas)
@abstractmethod
def definir_pasos(self):
@@ -102,8 +137,8 @@ def definir_pasos(self):
Example:
return [
- ("Extraccion", self.extraer_repos),
- ("Lenguajes", self.analizar_lenguajes),
+ ("Extraction", self.extraer_repos),
+ ("Languages", self.analizar_lenguajes),
]
"""
raise NotImplementedError
diff --git a/backend/config/csv_contract.py b/backend/config/csv_contract.py
index 9bcf569..9234e45 100644
--- a/backend/config/csv_contract.py
+++ b/backend/config/csv_contract.py
@@ -1,12 +1,12 @@
"""
-Contrato de esquemas CSV compartido entre backend y frontend.
+Shared CSV schema contract between backend and frontend.
-Este módulo define las columnas requeridas/criticas para cada salida
-del pipeline ETL. Centralizar este contrato reduce acoplamiento implícito
-y hace explícitas las dependencias de datos entre módulos.
+This module defines required/critical columns for each ETL output.
+Centralizing the contract reduces implicit coupling and makes
+data dependencies explicit across modules.
"""
-CONTRACT_VERSION = "2026.03"
+CONTRACT_VERSION = "2026.04"
CSV_SCHEMA_CONTRACT = {
"github_repos": {
@@ -156,25 +156,30 @@
def get_required_columns(nombre_archivo):
- """Retorna las columnas requeridas para un archivo lógico de salida."""
+ """Returns required columns for a logical output file."""
return CSV_SCHEMA_CONTRACT.get(nombre_archivo, {}).get("required_columns", [])
def get_critical_columns(nombre_archivo):
- """Retorna las columnas críticas para un archivo lógico de salida."""
+ """Returns critical columns for a logical output file."""
return CSV_SCHEMA_CONTRACT.get(nombre_archivo, {}).get("critical_columns", [])
def get_optional_columns(nombre_archivo):
- """Retorna columnas opcionales para un archivo lógico de salida."""
+ """Returns optional columns for a logical output file."""
return CSV_SCHEMA_CONTRACT.get(nombre_archivo, {}).get("optional_columns", [])
def get_column_types(nombre_archivo):
- """Retorna contrato mínimo de tipos por columna para un CSV lógico."""
+ """Returns the minimal column-type contract for a logical CSV."""
return CSV_SCHEMA_CONTRACT.get(nombre_archivo, {}).get("column_types", {})
def get_contract_version():
- """Retorna la versión vigente del contrato de datos CSV."""
+ """Returns the current CSV data contract version."""
return CONTRACT_VERSION
+
+
+def get_logical_dataset_names():
+ """Returns the logical dataset names defined in the CSV contract as a sorted list."""
+ return sorted(CSV_SCHEMA_CONTRACT.keys())
diff --git a/backend/config/data_product_contract.py b/backend/config/data_product_contract.py
new file mode 100644
index 0000000..cbd9c5f
--- /dev/null
+++ b/backend/config/data_product_contract.py
@@ -0,0 +1,254 @@
+"""Data product contract for ETL run manifests.
+
+This module defines the minimal structure and validations for:
+1. Run manifest (execution level)
+2. Dataset manifest (output level)
+
+It stays separate from the CSV contract to enable storage evolution
+(latest/history/metadata) without breaking V1.
+"""
+
+from __future__ import annotations
+
+import re
+from datetime import datetime, timezone
+from typing import Any, Mapping
+
+
+DATA_PRODUCT_CONTRACT_VERSION = "1.0.0"
+
+QUALITY_GATE_STATUSES = {"pass", "pass_with_warnings", "fail"}
+DATASET_QUALITY_STATUSES = {"pass", "warning", "fail"}
+
+RUN_REQUIRED_FIELDS = (
+ "run_id",
+ "generated_at_utc",
+ "git_sha",
+ "branch",
+ "source_window_start_utc",
+ "source_window_end_utc",
+ "quality_gate_status",
+ "datasets",
+)
+
+DATASET_REQUIRED_FIELDS = (
+ "dataset_logical_name",
+ "version_semver",
+ "generated_at_utc",
+ "source_run_id",
+ "schema_hash",
+ "row_count",
+ "quality_status",
+ "latest_path",
+ "history_path",
+)
+
+# SemVer 2.0.0 grammar. Numeric pre-release identifiers must not carry leading
+# zeros ("1.0.0-01" is invalid), and explicit [0-9] classes are used because
+# "\d" also matches non-ASCII digits in str patterns.
+_SEMVER_RE = re.compile(
+    r"^(0|[1-9][0-9]*)\."
+    r"(0|[1-9][0-9]*)\."
+    r"(0|[1-9][0-9]*)"
+    r"(?:-(?:0|[1-9][0-9]*|[0-9]*[A-Za-z-][0-9A-Za-z-]*)"
+    r"(?:\.(?:0|[1-9][0-9]*|[0-9]*[A-Za-z-][0-9A-Za-z-]*))*)?"
+    r"(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?$"
+)
+_HEX64_RE = re.compile(r"^[a-fA-F0-9]{64}$")
+
+
+def _is_non_empty_string(value: Any) -> bool:
+    """Tells whether ``value`` is a ``str`` holding at least one non-whitespace character."""
+    if not isinstance(value, str):
+        return False
+    return value.strip() != ""
+
+
+def get_data_product_contract_version() -> str:
+    """Returns the current data product contract version.
+
+    The value is the module-level ``DATA_PRODUCT_CONTRACT_VERSION`` constant.
+    """
+    return DATA_PRODUCT_CONTRACT_VERSION
+
+
+def utc_now_iso() -> str:
+    """Returns the current UTC time as ISO-8601 text with second precision and a ``Z`` suffix."""
+    current = datetime.now(timezone.utc)
+    return current.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def is_valid_semver(version: Any) -> bool:
+    """Checks ``version`` against the module's SemVer pattern; surrounding whitespace is ignored."""
+    if not _is_non_empty_string(version):
+        return False
+    return bool(_SEMVER_RE.fullmatch(version.strip()))
+
+
+def is_valid_iso_utc(value: Any) -> bool:
+    """Checks that ``value`` is an ISO-8601 datetime string that carries a timezone.
+
+    Any offset is accepted, not only UTC. Every ``Z`` in the text is rewritten
+    to ``+00:00`` before parsing with ``datetime.fromisoformat``.
+    """
+    if _is_non_empty_string(value):
+        candidate = value.strip().replace("Z", "+00:00")
+        try:
+            return datetime.fromisoformat(candidate).tzinfo is not None
+        except ValueError:
+            return False
+    return False
+
+
+def validate_dataset_manifest(dataset_manifest: Mapping[str, Any], expected_run_id: str | None = None) -> list[str]:
+    """Validates minimal structure and rules for a dataset manifest.
+
+    Args:
+        dataset_manifest: Individual dataset manifest.
+        expected_run_id: If provided (truthy), source_run_id must equal it.
+
+    Returns:
+        Error list. An empty list means valid manifest. Malformed values are
+        reported as error entries rather than raised.
+    """
+    errors: list[str] = []
+
+    if not isinstance(dataset_manifest, Mapping):
+        return ["dataset manifest debe ser un objeto (dict/mapping)"]
+
+    for field in DATASET_REQUIRED_FIELDS:
+        if field not in dataset_manifest:
+            errors.append(f"falta campo requerido '{field}'")
+
+    dataset_name = dataset_manifest.get("dataset_logical_name")
+    if "dataset_logical_name" in dataset_manifest and not _is_non_empty_string(dataset_name):
+        errors.append("'dataset_logical_name' debe ser string no vacio")
+
+    version_semver = dataset_manifest.get("version_semver")
+    if "version_semver" in dataset_manifest and not is_valid_semver(version_semver):
+        errors.append("'version_semver' no cumple SemVer")
+
+    generated_at_utc = dataset_manifest.get("generated_at_utc")
+    if "generated_at_utc" in dataset_manifest and not is_valid_iso_utc(generated_at_utc):
+        errors.append("'generated_at_utc' no es ISO-8601 valido con zona horaria")
+
+    source_run_id = dataset_manifest.get("source_run_id")
+    if "source_run_id" in dataset_manifest and not _is_non_empty_string(source_run_id):
+        errors.append("'source_run_id' debe ser string no vacio")
+    if expected_run_id and source_run_id != expected_run_id:
+        errors.append("'source_run_id' no coincide con run_id del manifest principal")
+
+    schema_hash = dataset_manifest.get("schema_hash")
+    if "schema_hash" in dataset_manifest:
+        if not _is_non_empty_string(schema_hash) or _HEX64_RE.fullmatch(schema_hash.strip()) is None:
+            errors.append("'schema_hash' debe ser hash sha256 en hexadecimal (64 chars)")
+
+    row_count = dataset_manifest.get("row_count")
+    if "row_count" in dataset_manifest:
+        # bool is a subclass of int; True/False must not pass as a row count.
+        if isinstance(row_count, bool) or not isinstance(row_count, int):
+            errors.append("'row_count' debe ser integer")
+        elif row_count < 0:
+            errors.append("'row_count' no puede ser negativo")
+
+    quality_status = dataset_manifest.get("quality_status")
+    # Type check first: set membership raises TypeError for unhashable values
+    # (list/dict), which would crash the validator instead of reporting an error.
+    if "quality_status" in dataset_manifest and not (
+        isinstance(quality_status, str) and quality_status in DATASET_QUALITY_STATUSES
+    ):
+        errors.append(f"'quality_status' invalido: {quality_status}")
+
+    latest_path = dataset_manifest.get("latest_path")
+    if "latest_path" in dataset_manifest and not _is_non_empty_string(latest_path):
+        errors.append("'latest_path' debe ser string no vacio")
+
+    history_path = dataset_manifest.get("history_path")
+    if "history_path" in dataset_manifest:
+        # A failed dataset may legitimately have no history snapshot (null).
+        if quality_status == "fail":
+            if history_path is not None and not _is_non_empty_string(history_path):
+                errors.append("'history_path' debe ser null o string no vacio cuando quality_status=fail")
+        elif not _is_non_empty_string(history_path):
+            errors.append("'history_path' debe ser string no vacio")
+
+    return errors
+
+
+def validate_run_manifest(run_manifest: Mapping[str, Any]) -> tuple[bool, list[str]]:
+    """Validates minimal structure and rules for a run manifest.
+
+    Args:
+        run_manifest: Execution-level manifest, including its ``datasets`` list.
+
+    Returns:
+        ``(is_valid, errors)``. ``is_valid`` is True only when ``errors`` is
+        empty. Errors from nested dataset manifests are prefixed with
+        ``datasets[<index>]: ``. Malformed values are reported, not raised.
+    """
+    errors: list[str] = []
+
+    if not isinstance(run_manifest, Mapping):
+        return False, ["run manifest debe ser un objeto (dict/mapping)"]
+
+    for field in RUN_REQUIRED_FIELDS:
+        if field not in run_manifest:
+            errors.append(f"falta campo requerido '{field}'")
+
+    run_id = run_manifest.get("run_id")
+    if "run_id" in run_manifest and not _is_non_empty_string(run_id):
+        errors.append("'run_id' debe ser string no vacio")
+
+    generated_at_utc = run_manifest.get("generated_at_utc")
+    if "generated_at_utc" in run_manifest and not is_valid_iso_utc(generated_at_utc):
+        errors.append("'generated_at_utc' no es ISO-8601 valido con zona horaria")
+
+    for field in ("source_window_start_utc", "source_window_end_utc"):
+        value = run_manifest.get(field)
+        if field in run_manifest and not is_valid_iso_utc(value):
+            errors.append(f"'{field}' no es ISO-8601 valido con zona horaria")
+
+    quality_gate_status = run_manifest.get("quality_gate_status")
+    # Type check first: set membership raises TypeError for unhashable values
+    # (list/dict), which would crash the validator instead of reporting an error.
+    if "quality_gate_status" in run_manifest and not (
+        isinstance(quality_gate_status, str) and quality_gate_status in QUALITY_GATE_STATUSES
+    ):
+        errors.append(f"'quality_gate_status' invalido: {quality_gate_status}")
+
+    for field in ("git_sha", "branch"):
+        value = run_manifest.get(field)
+        if field in run_manifest and not _is_non_empty_string(value):
+            errors.append(f"'{field}' debe ser string no vacio")
+
+    datasets = run_manifest.get("datasets")
+    if "datasets" in run_manifest:
+        if not isinstance(datasets, list):
+            errors.append("'datasets' debe ser lista")
+        elif not datasets:
+            errors.append("'datasets' no puede estar vacio")
+        else:
+            for index, dataset_manifest in enumerate(datasets):
+                dataset_errors = validate_dataset_manifest(
+                    dataset_manifest,
+                    # Only cross-check run ids when the run's own id is usable.
+                    expected_run_id=run_id if _is_non_empty_string(run_id) else None,
+                )
+                errors.extend(f"datasets[{index}]: {message}" for message in dataset_errors)
+
+    return len(errors) == 0, errors
+
+
+def build_dataset_manifest(
+    *,
+    dataset_logical_name: str,
+    version_semver: str,
+    source_run_id: str,
+    schema_hash: str,
+    row_count: int,
+    quality_status: str,
+    latest_path: str,
+    history_path: str | None,
+    generated_at_utc: str | None = None,
+) -> dict[str, Any]:
+    """Assembles a dataset manifest dict from keyword-only fields.
+
+    ``generated_at_utc`` falls back to the current UTC timestamp when it is
+    omitted or empty. No validation is performed here.
+    """
+    timestamp = generated_at_utc if generated_at_utc else utc_now_iso()
+    manifest: dict[str, Any] = {}
+    manifest["dataset_logical_name"] = dataset_logical_name
+    manifest["version_semver"] = version_semver
+    manifest["generated_at_utc"] = timestamp
+    manifest["source_run_id"] = source_run_id
+    manifest["schema_hash"] = schema_hash
+    manifest["row_count"] = row_count
+    manifest["quality_status"] = quality_status
+    manifest["latest_path"] = latest_path
+    manifest["history_path"] = history_path
+    return manifest
+
+
+def build_run_manifest(
+    *,
+    run_id: str,
+    git_sha: str,
+    branch: str,
+    source_window_start_utc: str,
+    source_window_end_utc: str,
+    quality_gate_status: str,
+    datasets: list[dict[str, Any]],
+    generated_at_utc: str | None = None,
+) -> dict[str, Any]:
+    """Assembles a run manifest dict from keyword-only fields.
+
+    ``generated_at_utc`` falls back to the current UTC timestamp when it is
+    omitted or empty. The ``datasets`` list is stored as passed (not copied).
+    """
+    timestamp = generated_at_utc if generated_at_utc else utc_now_iso()
+    manifest: dict[str, Any] = {}
+    manifest["run_id"] = run_id
+    manifest["generated_at_utc"] = timestamp
+    manifest["git_sha"] = git_sha
+    manifest["branch"] = branch
+    manifest["source_window_start_utc"] = source_window_start_utc
+    manifest["source_window_end_utc"] = source_window_end_utc
+    manifest["quality_gate_status"] = quality_gate_status
+    manifest["datasets"] = datasets
+    return manifest
diff --git a/backend/config/schema_contract_utils.py b/backend/config/schema_contract_utils.py
new file mode 100644
index 0000000..7a53dcf
--- /dev/null
+++ b/backend/config/schema_contract_utils.py
@@ -0,0 +1,112 @@
+"""Schema contract utilities for deterministic hashing and versioning policy."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+from typing import Any, Iterable, Mapping
+
+
+SEMVER_MAJOR = "major"
+SEMVER_MINOR = "minor"
+SEMVER_PATCH = "patch"
+VALID_BUMP_LEVELS = (SEMVER_MAJOR, SEMVER_MINOR, SEMVER_PATCH)
+
+_CHANGE_TO_BUMP = {
+ "remove_required_column": SEMVER_MAJOR,
+ "rename_required_column": SEMVER_MAJOR,
+ "change_type_incompatible": SEMVER_MAJOR,
+ "tighten_nullability": SEMVER_MAJOR,
+ "drop_dataset": SEMVER_MAJOR,
+ "change_partition_key_breaking": SEMVER_MAJOR,
+ "add_optional_column": SEMVER_MINOR,
+ "add_required_column_with_default": SEMVER_MINOR,
+ "add_non_breaking_quality_rule": SEMVER_MINOR,
+ "add_partition_field_backward_compatible": SEMVER_MINOR,
+ "add_optional_dataset_metadata": SEMVER_MINOR,
+ "fix_quality_rule_bug": SEMVER_PATCH,
+ "relax_warning_threshold": SEMVER_PATCH,
+ "metadata_only_change": SEMVER_PATCH,
+ "reorder_columns_only": SEMVER_PATCH,
+ "backfill_without_schema_change": SEMVER_PATCH,
+}
+
+_BUMP_PRIORITY = {
+ SEMVER_MAJOR: 3,
+ SEMVER_MINOR: 2,
+ SEMVER_PATCH: 1,
+}
+
+
+def _canonical_type_name(raw_type: Any) -> str:
+    """Maps a raw type label to its canonical name.
+
+    The label is stringified, stripped and lowercased; labels without a known
+    alias are returned in that normalized form. Falsy input yields ``""``.
+    """
+    alias_groups = {
+        "integer": ("int", "int32", "int64", "long"),
+        "number": ("float", "float32", "float64", "double"),
+        "string": ("str", "string"),
+        "boolean": ("bool", "boolean"),
+        "datetime": ("datetime64[ns]", "timestamp"),
+    }
+    label = str(raw_type or "").strip().lower()
+    for canonical, aliases in alias_groups.items():
+        if label in aliases:
+            return canonical
+    return label
+
+
+def canonicalize_schema_columns(columns: Iterable[Mapping[str, Any]]) -> list[dict[str, Any]]:
+    """Normalizes column specs into a deterministic, name-sorted list.
+
+    Each kept entry has a lowercased ``name``, a canonical ``type`` and a
+    boolean ``nullable`` (default True). Columns whose name is empty after
+    stripping are dropped.
+    """
+    canonical_rows: list[dict[str, Any]] = []
+
+    for spec in columns:
+        stripped_name = str(spec.get("name", "")).strip()
+        if stripped_name:
+            canonical_rows.append(
+                {
+                    "name": stripped_name.lower(),
+                    "type": _canonical_type_name(spec.get("type")),
+                    "nullable": bool(spec.get("nullable", True)),
+                }
+            )
+
+    return sorted(canonical_rows, key=lambda row: row["name"])
+
+
+def compute_schema_hash(columns: Iterable[Mapping[str, Any]]) -> str:
+    """Returns the SHA-256 hex digest of the canonicalized schema's compact JSON form."""
+    serialized = json.dumps(
+        canonicalize_schema_columns(columns),
+        sort_keys=True,
+        separators=(",", ":"),
+        ensure_ascii=True,
+    )
+    digest = hashlib.sha256()
+    digest.update(serialized.encode("utf-8"))
+    return digest.hexdigest()
+
+
+def recommend_semver_bump(change_kind: str) -> str:
+    """Looks up the SemVer bump level for one change kind.
+
+    The kind is matched after stripping and lowercasing.
+
+    Raises:
+        ValueError: If the change kind is not in the mapping.
+    """
+    lookup_key = str(change_kind or "").strip().lower()
+    bump_level = _CHANGE_TO_BUMP.get(lookup_key)
+    if bump_level is None:
+        raise ValueError(f"Unknown change kind: {change_kind}")
+    return bump_level
+
+
+def aggregate_semver_bump(change_kinds: Iterable[str]) -> str:
+    """Picks the strongest bump level demanded by any of the given changes.
+
+    An empty input yields the patch level. Every change kind is resolved, so
+    an unknown kind raises ``ValueError`` from ``recommend_semver_bump``.
+    """
+    required_levels = [recommend_semver_bump(kind) for kind in change_kinds]
+    return max(required_levels, key=_BUMP_PRIORITY.__getitem__, default=SEMVER_PATCH)
diff --git a/backend/config/settings.py b/backend/config/settings.py
index b2d6b4e..9096008 100644
--- a/backend/config/settings.py
+++ b/backend/config/settings.py
@@ -6,20 +6,26 @@
"""
import os
from pathlib import Path
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
from dotenv import load_dotenv
-# Rutas del proyecto (cross-platform con pathlib)
+# Project paths (cross-platform with pathlib)
PROYECTO_ROOT = Path(__file__).resolve().parent.parent.parent
BACKEND_DIR = PROYECTO_ROOT / "backend"
DATOS_DIR = PROYECTO_ROOT / "datos"
+DATOS_LATEST_DIR = DATOS_DIR / "latest"
+DATOS_HISTORY_DIR = DATOS_DIR / "history"
+DATOS_METADATA_DIR = DATOS_DIR / "metadata"
FRONTEND_ASSETS_DIR = PROYECTO_ROOT / "frontend" / "assets" / "data"
LOGS_DIR = PROYECTO_ROOT / "logs"
DATOS_DIR.mkdir(exist_ok=True)
+DATOS_LATEST_DIR.mkdir(parents=True, exist_ok=True)
+DATOS_HISTORY_DIR.mkdir(parents=True, exist_ok=True)
+DATOS_METADATA_DIR.mkdir(parents=True, exist_ok=True)
LOGS_DIR.mkdir(exist_ok=True)
-# Variables de entorno
+# Environment variables
env_path = PROYECTO_ROOT / ".env"
load_dotenv(env_path)
@@ -44,7 +50,7 @@
SO_API_KEY = os.getenv("STACKOVERFLOW_KEY")
SO_API_URL = "https://api.stackexchange.com/2.3/search/advanced"
-# Reddit API (OAuth para evitar bloqueo de IP en CI)
+# Reddit API (OAuth to avoid CI datacenter IP blocking)
REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
REDDIT_SUBREDDIT = "webdev"
@@ -57,7 +63,7 @@
"User-Agent": REDDIT_USER_AGENT
}
-# Archivos de salida
+# Output files
ARCHIVOS_SALIDA = {
"github_repos": DATOS_DIR / "github_repos_2025.csv",
"github_lenguajes": DATOS_DIR / "github_lenguajes.csv",
@@ -73,11 +79,44 @@
"trend_score": DATOS_DIR / "trend_score.csv",
}
+# Data write strategy (incremental refactor)
+# - LEGACY: keeps current historical behavior
+# - LATEST: publishes CSVs in datos/latest for sync consumption
+# - HISTORY: stores date-partitioned snapshots (CSV for now)
+WRITE_LEGACY_CSV = os.getenv("DATA_WRITE_LEGACY_CSV", "1") == "1"
+WRITE_LATEST_CSV = os.getenv("DATA_WRITE_LATEST_CSV", "0") == "1"
+WRITE_HISTORY_CSV = os.getenv("DATA_WRITE_HISTORY_CSV", "0") == "1"
+
+
+def get_latest_output_path(nombre_archivo):
+    """Maps a logical output name to its file under datos/latest.
+
+    The file name is taken from the legacy entry in ARCHIVOS_SALIDA; unknown
+    logical names yield None.
+    """
+    legacy_path = ARCHIVOS_SALIDA.get(nombre_archivo)
+    return None if legacy_path is None else DATOS_LATEST_DIR / legacy_path.name
+
+
+def get_history_output_path(nombre_archivo, fecha=None):
+    """Maps a logical output name to its date-partitioned CSV path under datos/history.
+
+    Layout: ``<history>/<nombre_archivo>/year=YYYY/month=MM/day=DD/<legacy file name>``.
+    ``fecha`` defaults to the current UTC time; unknown logical names yield None.
+    """
+    legacy_path = ARCHIVOS_SALIDA.get(nombre_archivo)
+    if legacy_path is None:
+        return None
+
+    reference = fecha if fecha else datetime.now(timezone.utc)
+    year, month, day = reference.strftime("%Y %m %d").split(" ")
+    return DATOS_HISTORY_DIR.joinpath(
+        nombre_archivo,
+        f"year={year}",
+        f"month={month}",
+        f"day={day}",
+        legacy_path.name,
+    )
+
# Logging
LOG_FORMAT = "[%(asctime)s] [%(levelname)s] %(name)s - %(message)s"
LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
-# Resiliencia de red (compartido entre ETLs)
+# Network resilience (shared across ETLs)
REQUEST_TIMEOUT_SECONDS = 10
HTTP_MAX_RETRIES = 3
HTTP_RETRY_BACKOFF_SECONDS = 2
@@ -85,7 +124,7 @@
REQUEST_MEDIUM_DELAY_SECONDS = 0.5
REQUEST_SHORT_DELAY_SECONDS = 0.3
-# Rango de fechas dinamico (ultimos 12 meses)
+# Dynamic date range (last 12 months)
FECHA_FIN = datetime.now()
FECHA_INICIO = FECHA_FIN - timedelta(days=365)
diff --git a/backend/export_history_json.py b/backend/export_history_json.py
new file mode 100644
index 0000000..75dd2d4
--- /dev/null
+++ b/backend/export_history_json.py
@@ -0,0 +1,366 @@
+"""Exports frontend bridge JSON assets from ETL history snapshots."""
+
+from __future__ import annotations
+
+import json
+import logging
+from datetime import datetime, timezone
+from pathlib import Path
+
+import pandas as pd
+
+
+logger = logging.getLogger("export_history_json")
+
+HISTORY_INDEX_FILENAME = "history_index.json"
+TREND_SCORE_HISTORY_FILENAME = "trend_score_history.json"
+
+
+def _utc_now_iso():
+    """Current UTC time as ISO-8601 text with second precision and a ``Z`` suffix."""
+    current = datetime.now(timezone.utc)
+    return current.strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def _to_relative_path(path, project_root):
+    """POSIX form of ``path`` relative to ``project_root``.
+
+    Falls back to the POSIX form of ``path`` itself when it is not located
+    under ``project_root``.
+    """
+    try:
+        relative = path.relative_to(project_root)
+    except ValueError:
+        return path.as_posix()
+    return relative.as_posix()
+
+
+def _safe_int(value, default=0):
+    """Converts ``value`` with ``int()``, returning ``default`` when that fails.
+
+    Handles None/unsupported types (TypeError), unparsable text and NaN
+    (ValueError), and infinite floats (OverflowError).
+    """
+    try:
+        return int(value)
+    except (TypeError, ValueError, OverflowError):
+        return default
+
+
+def _safe_float(value, default=0.0):
+    """Converts ``value`` with ``float()``, returning ``default`` for unusable results.
+
+    ``default`` is returned when conversion raises (TypeError, ValueError,
+    OverflowError) and also when the result is NaN or infinite.
+    """
+    import math  # local import: math is not among this module's top-level imports
+
+    try:
+        number = float(value)
+    except (TypeError, ValueError, OverflowError):
+        return default
+    # Non-finite floats would be written by json.dumps as NaN/Infinity, which
+    # strict JSON parsers reject.
+    return number if math.isfinite(number) else default
+
+
+def _extract_partition_date(parts):
+    """Builds a ``YYYY-MM-DD`` label from partition path segments.
+
+    ``parts`` must have ``year=``, ``month=`` and ``day=`` segments at
+    indexes 1, 2 and 3. Returns None when there are fewer than four segments,
+    a prefix is missing, or the values do not form a real calendar date. The
+    label is always zero-padded so labels sort chronologically as strings.
+    """
+    if len(parts) < 4:
+        return None
+
+    raw_values = []
+    for prefix, segment in zip(("year=", "month=", "day="), parts[1:4]):
+        if not segment.startswith(prefix):
+            return None
+        raw_values.append(segment[len(prefix):])
+
+    try:
+        year, month, day = (int(raw) for raw in raw_values)
+        datetime(year, month, day)  # rejects impossible dates such as month=13
+    except (ValueError, OverflowError):
+        return None
+    return f"{year:04d}-{month:02d}-{day:02d}"
+
+
+def _count_rows(csv_path):
+    """Returns the number of data rows in a CSV, or None when it cannot be read.
+
+    Best-effort by design: any read/parse failure is logged as a warning and
+    reported as None so that one bad file does not abort the export.
+    """
+    try:
+        return len(pd.read_csv(csv_path))
+    except Exception as exc:  # pylint: disable=broad-exception-caught
+        logger.warning("Could not count rows in %s: %s", csv_path, exc)
+        return None
+
+
+def _collect_history_files(project_root):
+    """Scans datos/history for partitioned CSV snapshots, grouped by dataset.
+
+    Only files at least five path segments below the history root whose
+    partition segments parse are kept. The dataset key is the first segment.
+    Each dataset's entries are sorted by (date, path).
+    """
+    history_root = project_root / "datos" / "history"
+    grouped = {}
+    if not history_root.exists():
+        return grouped
+
+    for csv_path in history_root.rglob("*.csv"):
+        segments = csv_path.relative_to(history_root).parts
+        snapshot_date = _extract_partition_date(segments) if len(segments) >= 5 else None
+        if snapshot_date is None:
+            continue
+
+        entry = {
+            "date": snapshot_date,
+            "path": _to_relative_path(csv_path, project_root),
+            "row_count": _count_rows(csv_path),
+        }
+        grouped.setdefault(segments[0], []).append(entry)
+
+    return {
+        dataset: sorted(entries, key=lambda item: (item["date"], item["path"]))
+        for dataset, entries in grouped.items()
+    }
+
+
+def _collect_latest_files(project_root):
+    """Indexes the CSV files directly under datos/latest by file stem.
+
+    NOTE(review): keys here are file stems, while history entries are keyed by
+    the partition directory name. Where the legacy file name differs from the
+    logical name (e.g. ``github_repos`` -> ``github_repos_2025.csv``) the two
+    will not line up in the merged index - confirm whether that is intended.
+    """
+    latest_root = project_root / "datos" / "latest"
+    if not latest_root.exists():
+        return {}
+
+    return {
+        csv_path.stem: {
+            "path": _to_relative_path(csv_path, project_root),
+            "row_count": _count_rows(csv_path),
+        }
+        for csv_path in latest_root.glob("*.csv")
+    }
+
+
+def build_history_index(project_root):
+    """Assembles the history index payload consumed by the frontend bridge.
+
+    One entry is produced per dataset name found in history and/or latest,
+    in alphabetical order, with its latest file info and sorted snapshots.
+    """
+    history_by_dataset = _collect_history_files(project_root)
+    latest_by_dataset = _collect_latest_files(project_root)
+
+    entries = []
+    for name in sorted({*history_by_dataset, *latest_by_dataset}):
+        latest = latest_by_dataset.get(name) or {}
+        snapshot_list = history_by_dataset.get(name, [])
+        entries.append(
+            {
+                "dataset": name,
+                "latest_path": latest.get("path"),
+                "latest_row_count": latest.get("row_count"),
+                "history_snapshot_count": len(snapshot_list),
+                "snapshots": snapshot_list,
+            }
+        )
+
+    return {
+        "generated_at_utc": _utc_now_iso(),
+        "dataset_count": len(entries),
+        "datasets": entries,
+    }
+
+
+def _resolve_trend_snapshot_sources(project_root, history_index):
+    """Lists the trend_score CSVs to load, preferring history over latest.
+
+    History snapshots whose file exists are used. Only when none exist is the
+    latest file used instead, dated by its modification time in UTC. The
+    result is sorted by (date, path).
+    """
+    trend_entry = None
+    for candidate in history_index["datasets"]:
+        if candidate["dataset"] == "trend_score":
+            trend_entry = candidate
+            break
+    if trend_entry is None:
+        return []
+
+    resolved = [
+        {
+            "date": snapshot["date"],
+            "path": snapshot["path"],
+            "source_type": "history",
+        }
+        for snapshot in trend_entry["snapshots"]
+        if (project_root / snapshot["path"]).exists()
+    ]
+
+    latest_rel = trend_entry.get("latest_path")
+    if not resolved and latest_rel:
+        latest_file = project_root / latest_rel
+        if latest_file.exists():
+            modified = datetime.fromtimestamp(latest_file.stat().st_mtime, tz=timezone.utc)
+            resolved.append(
+                {
+                    "date": modified.strftime("%Y-%m-%d"),
+                    "path": latest_rel,
+                    "source_type": "latest",
+                }
+            )
+
+    resolved.sort(key=lambda item: (item["date"], item["path"]))
+    return resolved
+
+
+def _build_trend_snapshot_record(df, date_label, relative_path, source_type):
+    """Summarizes one trend_score snapshot: metadata, row count and its top 10 rows.
+
+    When the frame has no ``ranking`` column, one is derived by ordering rows
+    by ``trend_score`` descending. The input frame is not modified.
+    """
+    frame = df.copy()
+    if "ranking" not in frame.columns:
+        frame = frame.sort_values("trend_score", ascending=False).reset_index(drop=True)
+        frame["ranking"] = range(1, len(frame) + 1)
+
+    leaders = frame.sort_values("ranking", ascending=True).head(10)
+    top_10 = [
+        {
+            "ranking": _safe_int(row.get("ranking"), default=0),
+            "tecnologia": str(row.get("tecnologia", "")),
+            "trend_score": round(_safe_float(row.get("trend_score"), default=0.0), 2),
+            "fuentes": _safe_int(row.get("fuentes"), default=0),
+        }
+        for _, row in leaders.iterrows()
+    ]
+
+    return {
+        "date": date_label,
+        "path": relative_path,
+        "source_type": source_type,
+        "row_count": len(frame),
+        "top_10": top_10,
+    }
+
+
+def _is_valid_trend_snapshot_df(df):
+    """True when the frame has both the ``tecnologia`` and ``trend_score`` columns."""
+    return {"tecnologia", "trend_score"} <= set(df.columns)
+
+
+def _append_trend_snapshot(
+    *,
+    snapshots,
+    snapshots_with_df,
+    dataframe,
+    date_label,
+    relative_path,
+    source_type,
+):
+    """Records one loaded snapshot in both accumulators (mutated in place).
+
+    ``snapshots`` receives the summary record; ``snapshots_with_df`` receives
+    the date label paired with the frame for later series building.
+    """
+    record = _build_trend_snapshot_record(
+        df=dataframe,
+        date_label=date_label,
+        relative_path=relative_path,
+        source_type=source_type,
+    )
+    snapshots.append(record)
+    snapshots_with_df.append({"date": date_label, "dataframe": dataframe})
+
+
+def _build_trend_series(snapshots_with_df):
+    """Pivots snapshots into one time series per technology.
+
+    Each series holds its points sorted by date. Series are ordered by the
+    ranking of their most recent point, then by technology name. Rows with a
+    blank technology are skipped.
+    """
+    points_by_tech = {}
+    for entry in snapshots_with_df:
+        label = entry["date"]
+        frame = entry["dataframe"].copy()
+        if "ranking" not in frame.columns:
+            frame = frame.sort_values("trend_score", ascending=False).reset_index(drop=True)
+            frame["ranking"] = range(1, len(frame) + 1)
+
+        for _, row in frame.iterrows():
+            tech = str(row.get("tecnologia", "")).strip()
+            if tech:
+                points_by_tech.setdefault(tech, []).append(
+                    {
+                        "date": label,
+                        "ranking": _safe_int(row.get("ranking"), default=0),
+                        "trend_score": round(_safe_float(row.get("trend_score"), default=0.0), 2),
+                        "fuentes": _safe_int(row.get("fuentes"), default=0),
+                    }
+                )
+
+    ranked = []
+    for tech, points in points_by_tech.items():
+        timeline = sorted(points, key=lambda item: item["date"])
+        last_ranking = timeline[-1]["ranking"] if timeline else 999999
+        ranked.append((last_ranking, tech, timeline))
+
+    ranked.sort(key=lambda item: (item[0], item[1]))
+    return [{"tecnologia": tech, "points": timeline} for _, tech, timeline in ranked]
+
+
+def build_trend_score_history(project_root, history_index):
+    """Builds trend_score_history payload for frontend bridge use.
+
+    Loads every resolved trend_score source, skipping (with a warning) files
+    that cannot be read or lack the required columns. If nothing usable was
+    loaded, the latest CSV is tried once as a fallback.
+
+    Args:
+        project_root: Project root path; snapshot paths are joined onto it.
+        history_index: Payload produced by ``build_history_index``.
+
+    Returns:
+        Dict with ``generated_at_utc``, ``snapshot_count``, ``snapshots`` and
+        the per-technology ``series``.
+    """
+    sources = _resolve_trend_snapshot_sources(project_root, history_index)
+    snapshots = []
+    snapshots_with_df = []
+
+    for source in sources:
+        csv_path = project_root / source["path"]
+        try:
+            df = pd.read_csv(csv_path)
+        except Exception as exc:  # pylint: disable=broad-exception-caught
+            # Best-effort: one unreadable snapshot must not abort the export.
+            logger.warning("Skipping trend snapshot %s due to read error: %s", csv_path, exc)
+            continue
+
+        if not _is_valid_trend_snapshot_df(df):
+            logger.warning("Skipping trend snapshot %s due to missing required columns", csv_path)
+            continue
+
+        _append_trend_snapshot(
+            snapshots=snapshots,
+            snapshots_with_df=snapshots_with_df,
+            dataframe=df,
+            date_label=source["date"],
+            relative_path=source["path"],
+            source_type=source["source_type"],
+        )
+
+    # If history entries exist but all are corrupted/invalid, fallback to latest snapshot.
+    if not snapshots:
+        trend_entry = next((item for item in history_index["datasets"] if item["dataset"] == "trend_score"), None)
+        latest_path = trend_entry.get("latest_path") if trend_entry else None
+        if latest_path:
+            latest_csv_path = project_root / latest_path
+            if latest_csv_path.exists():
+                try:
+                    latest_df = pd.read_csv(latest_csv_path)
+                    if _is_valid_trend_snapshot_df(latest_df):
+                        # The latest file has no partition date, so its mtime (UTC) is the label.
+                        mtime = datetime.fromtimestamp(latest_csv_path.stat().st_mtime, tz=timezone.utc)
+                        _append_trend_snapshot(
+                            snapshots=snapshots,
+                            snapshots_with_df=snapshots_with_df,
+                            dataframe=latest_df,
+                            date_label=mtime.strftime("%Y-%m-%d"),
+                            relative_path=latest_path,
+                            source_type="latest",
+                        )
+                except Exception as exc:  # pylint: disable=broad-exception-caught
+                    logger.warning("Skipping latest trend snapshot fallback due to read error: %s", exc)
+
+    return {
+        "generated_at_utc": _utc_now_iso(),
+        "snapshot_count": len(snapshots),
+        "snapshots": snapshots,
+        "series": _build_trend_series(snapshots_with_df),
+    }
+
+
+def _write_json(path, payload):
+    """Writes ``payload`` to ``path`` as indented UTF-8 JSON, creating parent directories."""
+    target_dir = path.parent
+    target_dir.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
+    path.write_text(serialized, encoding="utf-8")
+
+
+def export_bridge_assets(project_root):
+    """Generates both bridge JSON files under frontend/assets/data.
+
+    Returns a summary dict with the written paths (as strings), the dataset
+    count and the trend snapshot count.
+    """
+    root = Path(project_root)
+    assets_dir = root / "frontend" / "assets" / "data"
+    assets_dir.mkdir(parents=True, exist_ok=True)
+
+    index_payload = build_history_index(root)
+    trend_payload = build_trend_score_history(root, index_payload)
+
+    index_target = assets_dir / HISTORY_INDEX_FILENAME
+    trend_target = assets_dir / TREND_SCORE_HISTORY_FILENAME
+    for target, payload in ((index_target, index_payload), (trend_target, trend_payload)):
+        _write_json(target, payload)
+
+    return {
+        "files_written": 2,
+        "history_index_path": str(index_target),
+        "trend_score_history_path": str(trend_target),
+        "dataset_count": int(index_payload["dataset_count"]),
+        "trend_snapshot_count": int(trend_payload["snapshot_count"]),
+    }
+
+
+def main():
+    """CLI entry point: configures logging, exports the bridge assets and logs a summary."""
+    logging.basicConfig(level=logging.INFO, format="[%(asctime)s] [%(levelname)s] %(name)s - %(message)s")
+    # The project root is the parent of the directory that contains this script.
+    repo_root = Path(__file__).resolve().parent.parent
+    result = export_bridge_assets(repo_root)
+    files_written = result["files_written"]
+    dataset_count = result["dataset_count"]
+    trend_snapshot_count = result["trend_snapshot_count"]
+    logger.info(
+        "[RUN][SUMMARY] status=success files_written=%d datasets=%d trend_snapshots=%d",
+        files_written,
+        dataset_count,
+        trend_snapshot_count,
+    )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/backend/quality/__init__.py b/backend/quality/__init__.py
new file mode 100644
index 0000000..ba1eb5c
--- /dev/null
+++ b/backend/quality/__init__.py
@@ -0,0 +1 @@
+"""Quality utilities package."""
diff --git a/backend/quality/degradation_policy.py b/backend/quality/degradation_policy.py
new file mode 100644
index 0000000..14ec4d0
--- /dev/null
+++ b/backend/quality/degradation_policy.py
@@ -0,0 +1,81 @@
+"""Trend score degradation policy based on source availability."""
+
+from __future__ import annotations
+
+from typing import Mapping
+
+
+DEFAULT_SOURCE_WEIGHTS = {
+ "github": 0.40,
+ "stackoverflow": 0.35,
+ "reddit": 0.25,
+}
+
+_REQUIRED_SOURCES = ("github", "stackoverflow", "reddit")
+
+
+def _normalize_status(source_status: Mapping[str, bool]) -> dict[str, bool]:
+    """Reduces ``source_status`` to the required sources, coerced to bool (missing -> False)."""
+    normalized: dict[str, bool] = {}
+    for source in _REQUIRED_SOURCES:
+        normalized[source] = bool(source_status.get(source, False))
+    return normalized
+
+
+def _renormalize_weights(default_weights: Mapping[str, float], status: Mapping[str, bool]) -> dict[str, float]:
+    """Rescales the weights of the available sources so they are proportions of their total.
+
+    Returns ``{}`` when no source is available or the available weights sum to
+    zero or less. Each result is rounded to six decimals.
+    """
+    active = {source: float(default_weights[source]) for source, available in status.items() if available}
+    total = sum(active.values())
+    if not active or total <= 0:
+        return {}
+
+    return {source: round(weight / total, 6) for source, weight in active.items()}
+
+
+def evaluate_degradation_policy(
+    source_status: Mapping[str, bool],
+    default_weights: Mapping[str, float] | None = None,
+) -> dict[str, object]:
+    """Decides whether the trend score may be published given source availability.
+
+    - 3 sources available: publish, status ``pass``, default weights.
+    - 2 sources available: publish, status ``pass_with_warnings``, renormalized weights.
+    - fewer: do not publish, status ``fail``, no weights.
+    """
+    weights = default_weights or DEFAULT_SOURCE_WEIGHTS
+    status = _normalize_status(source_status)
+
+    available_sources = [source for source in status if status[source]]
+    missing_sources = [source for source in status if not status[source]]
+    available_count = len(available_sources)
+
+    if available_count == 3:
+        decision = (True, "pass", "default", dict(weights), "all_sources_available")
+    elif available_count == 2:
+        decision = (
+            True,
+            "pass_with_warnings",
+            "renormalized",
+            _renormalize_weights(weights, status),
+            "single_source_missing",
+        )
+    else:
+        decision = (False, "fail", "unavailable", {}, "insufficient_sources")
+
+    publish_allowed, gate_status, weights_mode, effective_weights, reason = decision
+    return {
+        "available_count": available_count,
+        "available_sources": available_sources,
+        "missing_sources": missing_sources,
+        "publish_allowed": publish_allowed,
+        "quality_gate_status": gate_status,
+        "weights_mode": weights_mode,
+        "effective_weights": effective_weights,
+        "reason": reason,
+    }
diff --git a/backend/quality/pandera_schemas.py b/backend/quality/pandera_schemas.py
new file mode 100644
index 0000000..51a4208
--- /dev/null
+++ b/backend/quality/pandera_schemas.py
@@ -0,0 +1,215 @@
+"""Pandera quality checks with severity routing.
+
+This module defines dataset-level Pandera schemas and complementary
+quality rules with explicit severities:
+- critical: candidate to block publication in strict mode
+- warning: publish allowed with quality flag
+- info: observability only
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pandas as pd
+
+SEVERITY_CRITICAL = "critical"
+SEVERITY_WARNING = "warning"
+SEVERITY_INFO = "info"
+VALID_SEVERITIES = {SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO}
+
+try:
+ import pandera as pa
+ from pandera import Check
+ from pandera.errors import SchemaError, SchemaErrors
+
+ PANDERA_AVAILABLE = True
+except Exception: # pylint: disable=broad-exception-caught
+ pa = None
+ Check = None
+ SchemaError = Exception
+ SchemaErrors = Exception
+ PANDERA_AVAILABLE = False
+
+
+def _make_issue(dataset: str, severity: str, rule: str, message: str) -> dict[str, str]:
+    """Builds one quality-issue record; an unrecognized severity is downgraded to info."""
+    if severity not in VALID_SEVERITIES:
+        severity = SEVERITY_INFO
+    return dict(dataset=dataset, severity=severity, rule=rule, message=message)
+
+
+def _build_schema_registry() -> dict[str, Any]:
+    """Builds the Pandera schema for each logical dataset that has one.
+
+    Returns an empty registry when Pandera could not be imported. Schemas are
+    non-strict (extra columns allowed) and do not coerce dtypes. Each check
+    lambda reduces the whole column to a single boolean.
+    """
+    if not PANDERA_AVAILABLE:
+        return {}
+
+    return {
+        # Consolidated ranking output.
+        "trend_score": pa.DataFrameSchema(
+            {
+                "ranking": pa.Column(
+                    pa.Int64,
+                    nullable=False,
+                    checks=[
+                        Check(lambda series: (series >= 1).all(), error="ranking_must_be_positive"),
+                        Check(lambda series: series.is_unique, error="ranking_must_be_unique"),
+                    ],
+                ),
+                "tecnologia": pa.Column(pa.String, nullable=False),
+                "trend_score": pa.Column(
+                    pa.Float64,
+                    nullable=False,
+                    checks=[Check(lambda series: (series >= 0).all(), error="trend_score_non_negative")],
+                ),
+                # Number of sources backing a row; bounded by the three sources.
+                "fuentes": pa.Column(
+                    pa.Int64,
+                    nullable=False,
+                    checks=[
+                        Check(
+                            lambda series: ((series >= 0) & (series <= 3)).all(),
+                            error="fuentes_must_be_in_range_0_3",
+                        )
+                    ],
+                ),
+            },
+            strict=False,
+            coerce=False,
+        ),
+        # StackOverflow yearly question volume per language.
+        "so_volumen": pa.DataFrameSchema(
+            {
+                "lenguaje": pa.Column(pa.String, nullable=False),
+                "preguntas_nuevas_2025": pa.Column(
+                    pa.Int64,
+                    nullable=False,
+                    checks=[Check(lambda series: (series >= 0).all(), error="yearly_volume_non_negative")],
+                ),
+            },
+            strict=False,
+            coerce=False,
+        ),
+    }
+
+
+PANDERA_SCHEMAS = _build_schema_registry()
+
+
+def _parse_schema_errors(dataset: str, exc: Exception) -> list[dict[str, str]]:
+    """Converts a Pandera validation exception into critical issue records.
+
+    One issue is emitted per row of ``exc.failure_cases`` when that attribute
+    is a non-empty DataFrame; otherwise a single issue carries ``str(exc)``.
+    """
+
+    def critical(message: str) -> dict[str, str]:
+        return _make_issue(
+            dataset=dataset,
+            severity=SEVERITY_CRITICAL,
+            rule="pandera_schema",
+            message=message,
+        )
+
+    failure_cases = getattr(exc, "failure_cases", None)
+    if not isinstance(failure_cases, pd.DataFrame) or failure_cases.empty:
+        return [critical(str(exc))]
+
+    issues: list[dict[str, str]] = []
+    for _, row in failure_cases.iterrows():
+        column = row.get("column", "")
+        check = row.get("check", "schema_validation")
+        failure_case = row.get("failure_case", "")
+        issues.append(critical(f"column={column} check={check} failure={failure_case}"))
+    return issues
+
+
+def _run_warning_checks(df: pd.DataFrame, logical_name: str) -> list[dict[str, str]]:
+    """Runs the warning-severity rules that apply to ``logical_name``.
+
+    trend_score: fewer than 10 distinct technologies; rows whose ``fuentes``
+    is 0 (non-numeric values count as 0). so_volumen: every yearly volume is 0.
+    """
+    findings: list[dict[str, str]] = []
+
+    def warn(rule: str, message: str) -> None:
+        findings.append(
+            _make_issue(
+                dataset=logical_name,
+                severity=SEVERITY_WARNING,
+                rule=rule,
+                message=message,
+            )
+        )
+
+    if logical_name == "trend_score":
+        if "tecnologia" in df.columns and df["tecnologia"].nunique(dropna=True) < 10:
+            warn("low_technology_coverage", "fewer than 10 unique technologies in trend score output")
+
+        if "fuentes" in df.columns:
+            sources = pd.to_numeric(df["fuentes"], errors="coerce").fillna(0)
+            zero_source_count = int((sources == 0).sum())
+            if zero_source_count > 0:
+                warn("zero_source_rows", f"{zero_source_count} rows have fuentes=0")
+    elif logical_name == "so_volumen" and "preguntas_nuevas_2025" in df.columns:
+        volumes = pd.to_numeric(df["preguntas_nuevas_2025"], errors="coerce").fillna(0)
+        if not volumes.empty and (volumes == 0).all():
+            warn("all_zero_volume", "all StackOverflow yearly volumes are zero")
+
+    return findings
+
+
+def _run_info_checks(df: pd.DataFrame, logical_name: str) -> list[dict[str, str]]:
+    """Runs the info-severity rules: currently only a count of fully duplicated rows."""
+    duplicate_rows = int(df.duplicated().sum())
+    if duplicate_rows <= 0:
+        return []
+
+    return [
+        _make_issue(
+            dataset=logical_name,
+            severity=SEVERITY_INFO,
+            rule="duplicate_rows_detected",
+            message=f"{duplicate_rows} duplicated rows detected",
+        )
+    ]
+
+
+def run_pandera_quality_checks(df: pd.DataFrame, logical_name: str) -> list[dict[str, str]]:
+    """Collects every quality issue for one dataset, in severity-routing order.
+
+    With Pandera available: schema issues (if a schema is registered), then
+    warning checks, then info checks. Without Pandera: a ``pandera_unavailable``
+    info issue followed by the info checks only.
+    """
+    if not PANDERA_AVAILABLE:
+        skipped_notice = _make_issue(
+            dataset=logical_name,
+            severity=SEVERITY_INFO,
+            rule="pandera_unavailable",
+            message="Pandera is not installed; schema checks were skipped",
+        )
+        return [skipped_notice, *_run_info_checks(df, logical_name)]
+
+    collected: list[dict[str, str]] = []
+    schema = PANDERA_SCHEMAS.get(logical_name)
+    if schema is not None:
+        try:
+            # lazy=True gathers all failures instead of stopping at the first one.
+            schema.validate(df, lazy=True)
+        except (SchemaErrors, SchemaError) as exc:
+            collected.extend(_parse_schema_errors(logical_name, exc))
+
+    collected.extend(_run_warning_checks(df, logical_name))
+    collected.extend(_run_info_checks(df, logical_name))
+    return collected
diff --git a/backend/reddit_etl.py b/backend/reddit_etl.py
index 523bc4f..4c835e8 100644
--- a/backend/reddit_etl.py
+++ b/backend/reddit_etl.py
@@ -229,7 +229,7 @@ def extraer_posts(self, subreddit_name=REDDIT_SUBREDDIT, limit=REDDIT_LIMIT):
self.logger.error(f"Error obteniendo posts: {e}")
if not posts_data:
- # Intentar cargar datos anteriores si existen
+ # Try loading previous data if available
ruta_anterior = ARCHIVOS_SALIDA.get("reddit_sentimiento")
if ruta_anterior and ruta_anterior.exists():
self.logger.warning(f"No se pudo extraer posts de r/{subreddit_name} — usando datos anteriores")
@@ -449,4 +449,4 @@ def main():
if __name__ == "__main__":
- main()
\ No newline at end of file
+ main()
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 4f81a08..d14d1d0 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -4,6 +4,8 @@ pandas>=2.2.0,<3.0
numpy>=1.24,<3.0
python-dotenv>=1.0.0,<2.0
nltk>=3.8.1,<3.10
+pandera>=0.22.0,<0.23.0
+duckdb>=1.2.2,<2.0
# Testing
-pytest>=8.0.0,<9.0
\ No newline at end of file
+pytest>=8.0.0,<9.0
diff --git a/backend/sync_assets.py b/backend/sync_assets.py
index cd16305..3ca30c1 100644
--- a/backend/sync_assets.py
+++ b/backend/sync_assets.py
@@ -4,10 +4,67 @@
Ensures the Flutter Web dashboard always uses the latest
processed data from the ETL pipeline.
"""
+
import logging
+import os
import shutil
from pathlib import Path
+from export_history_json import export_bridge_assets
+
+
+def _is_bridge_export_enabled():
+    """Tells whether the JSON history bridge export should run.
+
+    Reads EXPORT_HISTORY_BRIDGE_JSON (default "1"). Only the value "1" enables
+    the export. Surrounding whitespace is ignored so that a value such as "1 "
+    does not silently disable it.
+
+    Returns:
+        bool: True when the export is enabled.
+    """
+    return os.getenv("EXPORT_HISTORY_BRIDGE_JSON", "1").strip() == "1"
+
+
+def _resolver_origen_csv(proyecto_root):
+    """Maps each CSV file name to the path it should be copied from.
+
+    Files found in ``datos/latest`` take priority; a file name that exists only
+    in ``datos`` falls back to that legacy copy.
+
+    Returns:
+        tuple: (csv_by_name, origen_latest, origen_legacy).
+    """
+    origen_legacy = proyecto_root / "datos"
+    origen_latest = origen_legacy / "latest"
+
+    csv_by_name = {}
+    # Legacy is scanned first so same-named files from latest overwrite it.
+    for carpeta in (origen_legacy, origen_latest):
+        if not carpeta.exists():
+            continue
+        csv_by_name.update({ruta.name: ruta for ruta in carpeta.glob("*.csv")})
+
+    return csv_by_name, origen_latest, origen_legacy
+
+
+def _describe_source_mode(csv_by_name, origen_latest, origen_legacy):
+    """Classifies which folders the resolved CSV files come from.
+
+    Args:
+        csv_by_name: Mapping of file name to the path selected for it.
+        origen_latest: Folder holding the "latest" CSV files.
+        origen_legacy: Folder holding the legacy CSV files.
+
+    Returns:
+        str: "none" when there are no files, "mixed" when files come from both
+        folders, "latest" when some come from latest and none from legacy, and
+        "legacy" otherwise.
+    """
+    if not csv_by_name:
+        return "none"
+
+    # Resolve the two reference folders once instead of once per file.
+    latest_dir = origen_latest.resolve()
+    legacy_dir = origen_legacy.resolve()
+    parent_dirs = {path.parent.resolve() for path in csv_by_name.values()}
+
+    uses_latest = latest_dir in parent_dirs
+    uses_legacy = legacy_dir in parent_dirs
+    if uses_latest and uses_legacy:
+        return "mixed"
+    return "latest" if uses_latest else "legacy"
+
+
+def _resolve_summary_source(source_mode, origen_latest, origen_legacy):
+    """Builds the source label for a given source mode.
+
+    A single-folder mode yields that folder as a string, "mixed" yields a fixed
+    label, and any other mode yields "none".
+    """
+    for modo, carpeta in (("latest", origen_latest), ("legacy", origen_legacy)):
+        if source_mode == modo:
+            return str(carpeta)
+    return "mixed(latest+legacy)" if source_mode == "mixed" else "none"
+
def sincronizar():
"""Copies all CSV files from datos/ to frontend/assets/data/."""
@@ -16,34 +73,60 @@ def sincronizar():
logging.basicConfig(level=logging.INFO, format="[%(asctime)s] [%(levelname)s] %(name)s - %(message)s")
proyecto_root = Path(__file__).resolve().parent.parent
- origen = proyecto_root / "datos"
+ csv_by_name, origen_latest, origen_legacy = _resolver_origen_csv(proyecto_root)
+ source_mode = _describe_source_mode(csv_by_name, origen_latest, origen_legacy)
+ origen = _resolve_summary_source(source_mode, origen_latest, origen_legacy)
destino = proyecto_root / "frontend" / "assets" / "data"
destino.mkdir(parents=True, exist_ok=True)
- logger.info("[RUN][START] origen=%s destino=%s", origen, destino)
+ logger.info("[RUN][START] origen=%s source_mode=%s destino=%s", origen, source_mode, destino)
archivos_copiados = 0
errores = 0
- for csv_file in origen.glob("*.csv"):
+ bridge_files_written = 0
+ bridge_enabled = _is_bridge_export_enabled()
+
+ for csv_name in sorted(csv_by_name):
+ csv_file = csv_by_name[csv_name]
try:
shutil.copy2(csv_file, destino / csv_file.name)
archivos_copiados += 1
- logger.info("[STEP][END] accion=copy archivo=%s estado=success", csv_file.name)
+ logger.info("[STEP][END] accion=copy archivo=%s origen=%s estado=success", csv_file.name, csv_file.parent)
except Exception as exc: # pylint: disable=broad-exception-caught
errores += 1
logger.error("[STEP][END] accion=copy archivo=%s estado=failed error=%s", csv_file.name, exc)
+ if bridge_enabled:
+ try:
+ bridge_summary = export_bridge_assets(proyecto_root)
+ bridge_files_written = int(bridge_summary["files_written"])
+ logger.info(
+ "[STEP][END] action=bridge_export status=success files_written=%d trend_snapshots=%d",
+ bridge_files_written,
+ bridge_summary["trend_snapshot_count"],
+ )
+ except Exception as exc: # pylint: disable=broad-exception-caught
+ errores += 1
+ logger.error("[STEP][END] action=bridge_export status=failed error=%s", exc)
+
logger.info(
- "[RUN][SUMMARY] estado=%s archivos_copiados=%d errores=%d origen=%s destino=%s",
+ "[RUN][SUMMARY] estado=%s archivos_copiados=%d bridge_files=%d bridge_enabled=%s "
+ "errores=%d source_mode=%s origen=%s destino=%s",
"success" if errores == 0 else "partial",
archivos_copiados,
+ bridge_files_written,
+ bridge_enabled,
errores,
+ source_mode,
origen,
destino,
)
return {
"files_copied": archivos_copiados,
+ "bridge_files_written": bridge_files_written,
+ "bridge_export_enabled": bridge_enabled,
"errors": errores,
+ "source_mode": source_mode,
"source": str(origen),
"destination": str(destino),
}
diff --git a/backend/tech_normalization.py b/backend/tech_normalization.py
index 6064314..4325862 100644
--- a/backend/tech_normalization.py
+++ b/backend/tech_normalization.py
@@ -1,6 +1,6 @@
-"""Utilidades compartidas para normalizar nombres de tecnologías.
+"""Shared utilities to normalize technology names.
-Centraliza mapeos usados por ETLs para evitar drift entre módulos.
+Centralizes mappings used by ETLs to avoid cross-module drift.
"""
from __future__ import annotations
@@ -57,7 +57,7 @@
def normalize_technology_name(name: str) -> str:
- """Normaliza nombre a etiqueta legible consistente."""
+ """Normalizes a name into a consistent display label."""
text = str(name or "").strip()
if not text:
return ""
@@ -65,7 +65,7 @@ def normalize_technology_name(name: str) -> str:
def normalize_for_match(name: str) -> str:
- """Normaliza nombre para comparación flexible cross-source."""
+ """Normalizes a name for flexible cross-source matching."""
raw = str(name or "").strip().lower()
if not raw:
return ""
diff --git a/backend/trend_score.py b/backend/trend_score.py
index e394a08..d60c2e3 100644
--- a/backend/trend_score.py
+++ b/backend/trend_score.py
@@ -1,38 +1,28 @@
-"""
-Trend Score Generator - Technology Trend Analysis Platform
+"""Trend Score generator for the Technology Trend Analysis Platform."""
-Combines data from GitHub, StackOverflow, and Reddit to produce
-a unified technology ranking. The composite score uses weighted
-metrics from each source.
-
-Formula:
- Trend Score = (peso_github × github_score) +
- (peso_so × so_score) +
- (peso_reddit × reddit_score)
-
-Author: Samir Caizapasto
-"""
-import pandas as pd
import logging
+import os
from datetime import datetime
-from config.settings import (
- ARCHIVOS_SALIDA,
-)
-from validador import validar_dataframe
+import pandas as pd
+
from base_etl import BaseETL
+from config.settings import ARCHIVOS_SALIDA
from exceptions import ETLExtractionError
from tech_normalization import normalize_technology_name
+from trend_score_duckdb import calcular_trend_score_duckdb
+from validador import validar_dataframe
logger = logging.getLogger("trend_score")
-# Pesos para cada fuente de datos
PESOS = {
"github": 0.40,
"stackoverflow": 0.35,
- "reddit": 0.25
+ "reddit": 0.25,
}
+TREND_ENGINES = {"legacy", "duckdb"}
+
ETIQUETAS_NO_LENGUAJE = {
"sin especificar",
"llms/ai",
@@ -45,26 +35,12 @@
def normalizar_nombre(nombre):
- """Normalizes technology names for cross-source comparison.
-
- Args:
- nombre: Raw technology name from any source.
-
- Returns:
- Normalized lowercase name.
- """
+ """Normalizes technology names for cross-source comparison."""
return normalize_technology_name(nombre)
def normalizar_scores(serie):
- """Normalizes a numeric series to 0-100 scale using min-max.
-
- Args:
- serie: pandas Series with numeric values.
-
- Returns:
- Normalized series (0-100).
- """
+ """Normalizes a numeric series to 0-100 scale using min-max."""
if serie.max() == serie.min():
return pd.Series([50.0] * len(serie), index=serie.index)
@@ -72,94 +48,71 @@ def normalizar_scores(serie):
def cargar_github():
- """Loads and processes GitHub data for scoring.
-
- Returns:
- DataFrame with columns: [tecnologia, github_score]
- """
+ """Loads and processes GitHub data for scoring."""
try:
df_repos = pd.read_csv(ARCHIVOS_SALIDA["github_repos"])
df_repos["language"] = df_repos["language"].fillna("Sin especificar").astype(str).str.strip()
df_repos = df_repos[~df_repos["language"].str.lower().isin(ETIQUETAS_NO_LENGUAJE)]
if df_repos.empty:
- logger.warning("GitHub: sin lenguajes clasificables tras aplicar filtros")
+ logger.warning("GitHub: no classifiable languages after filters")
return pd.DataFrame(columns=["tecnologia", "github_score"])
langs = df_repos["language"].value_counts().head(15).reset_index()
langs.columns = ["tecnologia", "repos_count"]
langs["tecnologia"] = langs["tecnologia"].apply(normalizar_nombre)
langs["github_score"] = normalizar_scores(langs["repos_count"])
- logger.info("GitHub: %d tecnologias cargadas", len(langs))
+ logger.info("GitHub: %d technologies loaded", len(langs))
return langs[["tecnologia", "github_score"]]
except FileNotFoundError:
- logger.warning("No se encontro github_repos_2025.csv")
+ logger.warning("github_repos_2025.csv was not found")
return pd.DataFrame(columns=["tecnologia", "github_score"])
- except (KeyError, ValueError) as e:
- logger.error("Error procesando datos de GitHub: %s", e)
+ except (KeyError, ValueError) as exc:
+ logger.error("Error processing GitHub data: %s", exc)
return pd.DataFrame(columns=["tecnologia", "github_score"])
def cargar_stackoverflow():
- """Loads and processes StackOverflow data for scoring.
-
- Returns:
- DataFrame with columns: [tecnologia, so_score]
- """
+ """Loads and processes StackOverflow data for scoring."""
try:
df_vol = pd.read_csv(ARCHIVOS_SALIDA["so_volumen"])
df_vol["tecnologia"] = df_vol["lenguaje"].apply(normalizar_nombre)
df_vol["so_score"] = normalizar_scores(df_vol["preguntas_nuevas_2025"])
- logger.info("StackOverflow: %d tecnologias cargadas", len(df_vol))
+ logger.info("StackOverflow: %d technologies loaded", len(df_vol))
return df_vol[["tecnologia", "so_score"]]
except FileNotFoundError:
- logger.warning("No se encontro so_volumen_preguntas.csv")
+ logger.warning("so_volumen_preguntas.csv was not found")
return pd.DataFrame(columns=["tecnologia", "so_score"])
- except (KeyError, ValueError) as e:
- logger.error("Error procesando datos de StackOverflow: %s", e)
+ except (KeyError, ValueError) as exc:
+ logger.error("Error processing StackOverflow data: %s", exc)
return pd.DataFrame(columns=["tecnologia", "so_score"])
def cargar_reddit():
- """Loads and processes Reddit data for scoring.
-
- Returns:
- DataFrame with columns: [tecnologia, reddit_score]
- """
+ """Loads and processes Reddit data for scoring."""
try:
df_temas = pd.read_csv(ARCHIVOS_SALIDA["reddit_temas"])
df_temas["tecnologia"] = df_temas["tema"].apply(normalizar_nombre)
df_temas["reddit_score"] = normalizar_scores(df_temas["menciones"])
- logger.info("Reddit: %d tecnologias cargadas", len(df_temas))
+ logger.info("Reddit: %d technologies loaded", len(df_temas))
return df_temas[["tecnologia", "reddit_score"]]
except FileNotFoundError:
- logger.warning("No se encontro reddit_temas_emergentes.csv")
+ logger.warning("reddit_temas_emergentes.csv was not found")
return pd.DataFrame(columns=["tecnologia", "reddit_score"])
- except (KeyError, ValueError) as e:
- logger.error("Error procesando datos de Reddit: %s", e)
+ except (KeyError, ValueError) as exc:
+ logger.error("Error processing Reddit data: %s", exc)
return pd.DataFrame(columns=["tecnologia", "reddit_score"])
-def calcular_trend_score():
- """Calculates the composite Trend Score for all technologies.
-
- Combines normalized scores from GitHub, StackOverflow, and Reddit
- using weighted average. Technologies not found in a source get
- a score of 0 for that source.
-
- Returns:
- DataFrame with columns: [tecnologia, github_score, so_score,
- reddit_score, trend_score, ranking]
- """
- logger.info("Calculando Trend Score compuesto...")
- logger.info("Pesos: GitHub=%s, SO=%s, Reddit=%s", PESOS['github'], PESOS['stackoverflow'], PESOS['reddit'])
-
- # Cargar datos de cada fuente
+def _load_score_sources():
df_github = cargar_github()
df_so = cargar_stackoverflow()
df_reddit = cargar_reddit()
+ return df_github, df_so, df_reddit
- # Combinar todas las tecnologias (outer join)
+
+def _build_legacy_trend_score(df_github, df_so, df_reddit):
+ """Builds Trend Score with the legacy pandas merge strategy."""
df_combined = pd.DataFrame({"tecnologia": []})
if not df_github.empty:
@@ -170,49 +123,100 @@ def calcular_trend_score():
df_combined = pd.merge(df_combined, df_reddit, on="tecnologia", how="outer")
if df_combined.empty:
- logger.error("No hay datos de ninguna fuente para calcular Trend Score")
+ logger.error("No data from any source to calculate Trend Score")
return pd.DataFrame()
- # Rellenar NaN con 0 (tecnologia no encontrada en esa fuente)
for col in ["github_score", "so_score", "reddit_score"]:
if col not in df_combined.columns:
df_combined[col] = 0.0
else:
df_combined[col] = df_combined[col].fillna(0.0)
- # Calcular score compuesto
df_combined["trend_score"] = (
- PESOS["github"] * df_combined["github_score"] +
- PESOS["stackoverflow"] * df_combined["so_score"] +
- PESOS["reddit"] * df_combined["reddit_score"]
+ PESOS["github"] * df_combined["github_score"]
+ + PESOS["stackoverflow"] * df_combined["so_score"]
+ + PESOS["reddit"] * df_combined["reddit_score"]
).round(2)
- # Ordenar por trend_score y agregar ranking
df_combined = df_combined.sort_values("trend_score", ascending=False).reset_index(drop=True)
df_combined["ranking"] = range(1, len(df_combined) + 1)
- # Contar en cuantas fuentes aparece cada tecnologia
df_combined["fuentes"] = (
- (df_combined["github_score"] > 0).astype(int) +
- (df_combined["so_score"] > 0).astype(int) +
- (df_combined["reddit_score"] > 0).astype(int)
+ (df_combined["github_score"] > 0).astype(int)
+ + (df_combined["so_score"] > 0).astype(int)
+ + (df_combined["reddit_score"] > 0).astype(int)
)
- # Log del ranking
- logger.info("\nTrend Score - Top Tecnologias (%d total):", len(df_combined))
- logger.info("%3s %-20s %8s %8s %8s %8s %8s", "#", "Tecnologia", "GitHub", "SO", "Reddit", "Score", "Fuentes")
+ return df_combined[
+ ["ranking", "tecnologia", "github_score", "so_score", "reddit_score", "trend_score", "fuentes"]
+ ]
+
+
+def calculate_trend_score_legacy(df_github, df_so, df_reddit):
+    """Public helper to compute trend score with the legacy engine.
+
+    Thin wrapper: forwards the three per-source score frames unchanged to
+    ``_build_legacy_trend_score`` and returns its result.
+    """
+    return _build_legacy_trend_score(df_github, df_so, df_reddit)
+
+
+def resolve_trend_engine(engine=None):
+    """Picks the Trend Score engine name.
+
+    A truthy ``engine`` argument wins; otherwise TREND_SCORE_ENGINE is read
+    from the environment (default "legacy"). The value is trimmed and
+    lower-cased, and anything outside TREND_ENGINES falls back to "legacy"
+    after a warning is logged.
+    """
+    requested = engine if engine else os.getenv("TREND_SCORE_ENGINE", "legacy")
+    candidate = str(requested).strip().lower()
+    if candidate in TREND_ENGINES:
+        return candidate
+
+    logger.warning("Unknown trend engine '%s'. Falling back to 'legacy'.", candidate)
+    return "legacy"
+
+
+def _log_ranking_preview(df_combined):
+ logger.info("\nTrend Score - Top Technologies (%d total):", len(df_combined))
+ logger.info("%3s %-20s %8s %8s %8s %8s %8s", "#", "Technology", "GitHub", "SO", "Reddit", "Score", "Sources")
logger.info("-" * 75)
for _, row in df_combined.head(15).iterrows():
logger.info(
"#%2d %-20s %7.1f %7.1f %7.1f %7.1f %5d/3",
- row['ranking'], row['tecnologia'],
- row['github_score'], row['so_score'],
- row['reddit_score'], row['trend_score'],
- int(row['fuentes'])
+ row["ranking"],
+ row["tecnologia"],
+ row["github_score"],
+ row["so_score"],
+ row["reddit_score"],
+ row["trend_score"],
+ int(row["fuentes"]),
)
- return df_combined
+
+def calcular_trend_score(engine=None):
+    """Calculates the composite Trend Score for all technologies.
+
+    Loads the per-source scores, resolves the engine (explicit argument or
+    environment) and computes the ranking. If the DuckDB engine raises, the
+    error is logged and the legacy engine is used instead.
+
+    Returns:
+        The ranking DataFrame, or an empty DataFrame when no source has data.
+    """
+    logger.info("Calculating composite Trend Score...")
+    logger.info("Weights: GitHub=%s, SO=%s, Reddit=%s", PESOS["github"], PESOS["stackoverflow"], PESOS["reddit"])
+
+    df_github, df_so, df_reddit = _load_score_sources()
+    fuentes = (df_github, df_so, df_reddit)
+    if all(fuente.empty for fuente in fuentes):
+        logger.error("No data from any source to calculate Trend Score")
+        return pd.DataFrame()
+
+    engine_name = resolve_trend_engine(engine)
+    logger.info("Trend engine selected: %s", engine_name)
+
+    usar_legacy = engine_name != "duckdb"
+    if not usar_legacy:
+        try:
+            df_result = calcular_trend_score_duckdb(
+                df_github=df_github,
+                df_so=df_so,
+                df_reddit=df_reddit,
+                pesos=PESOS,
+            )
+        except Exception as exc:  # pylint: disable=broad-exception-caught
+            logger.error("DuckDB engine failed (%s). Falling back to legacy engine.", exc)
+            usar_legacy = True
+
+    if usar_legacy:
+        df_result = _build_legacy_trend_score(df_github, df_so, df_reddit)
+
+    # The ranking preview is only logged when there is something to show.
+    if not df_result.empty:
+        _log_ranking_preview(df_result)
+    return df_result
def main():
@@ -222,53 +226,55 @@ def main():
class TrendScoreETL(BaseETL):
- """Adaptador ETL para Trend Score con el contrato de BaseETL.
-
- Mantiene el comportamiento existente sin sobreingeniería: un único paso
- que calcula, valida y guarda el CSV de trend score.
- """
+ """ETL adapter for Trend Score under the BaseETL contract."""
def __init__(self):
super().__init__("trend_score")
def definir_pasos(self):
- return [("Calcular Trend Score", self._calcular_y_guardar)]
+ return [("Calculate Trend Score", self._calcular_y_guardar)]
def _calcular_y_guardar(self):
self.logger.info("Trend Score Generator - Technology Trend Analysis Platform")
- self.logger.info("Fecha: %s", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
+ self.logger.info("Execution date: %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
try:
df_trend = calcular_trend_score()
if df_trend.empty:
raise ETLExtractionError(
- "No se pudo generar Trend Score (sin datos de ninguna fuente)",
+ "Trend Score could not be generated (no data from any source)",
critical=True,
)
columnas_salida = [
- "ranking", "tecnologia", "github_score",
- "so_score", "reddit_score", "trend_score", "fuentes"
+ "ranking",
+ "tecnologia",
+ "github_score",
+ "so_score",
+ "reddit_score",
+ "trend_score",
+ "fuentes",
]
df_salida = df_trend[columnas_salida]
- # Se mantiene validación explícita por contrato + guardado uniforme
validar_dataframe(df_salida, "trend_score")
self.guardar_csv(df_salida, "trend_score")
top3 = df_salida.head(3)
- self.logger.info("\nTop 3 tecnologias trending:")
+ self.logger.info("\nTop 3 trending technologies:")
for _, row in top3.iterrows():
self.logger.info(
" #%d. %s (Score: %s)",
- int(row['ranking']), row['tecnologia'], row['trend_score']
+ int(row["ranking"]),
+ row["tecnologia"],
+ row["trend_score"],
)
- self.logger.info("Trend Score completado")
+ self.logger.info("Trend Score completed")
except ETLExtractionError:
raise
- except Exception as e: # pylint: disable=broad-exception-caught
- raise ETLExtractionError(f"Error fatal en Trend Score: {e}", critical=True) from e
+ except Exception as exc: # pylint: disable=broad-exception-caught
+ raise ETLExtractionError(f"Fatal error in Trend Score: {exc}", critical=True) from exc
if __name__ == "__main__":
diff --git a/backend/trend_score_duckdb.py b/backend/trend_score_duckdb.py
new file mode 100644
index 0000000..209891d
--- /dev/null
+++ b/backend/trend_score_duckdb.py
@@ -0,0 +1,96 @@
+"""DuckDB engine for Trend Score computation."""
+
+from __future__ import annotations
+
+import pandas as pd
+
+try:
+ import duckdb
+except Exception: # pylint: disable=broad-exception-caught
+ duckdb = None
+
+
+def _score_frame_for_duckdb(df, score_column):
+    """Returns the ``tecnologia`` + score columns of ``df`` ready to register.
+
+    An empty source is replaced by a placeholder whose score column is
+    explicitly float64, so DuckDB receives a numeric column instead of having
+    to infer a type from an empty object-dtype column.
+    """
+    if df.empty:
+        return pd.DataFrame(
+            {
+                "tecnologia": pd.Series(dtype="object"),
+                score_column: pd.Series(dtype="float64"),
+            }
+        )
+    return df[["tecnologia", score_column]].copy()
+
+
+def calcular_trend_score_duckdb(df_github, df_so, df_reddit, pesos):
+    """Computes Trend Score using DuckDB SQL over in-memory DataFrames.
+
+    Args:
+        df_github: Frame with ``tecnologia`` and ``github_score`` (may be empty).
+        df_so: Frame with ``tecnologia`` and ``so_score`` (may be empty).
+        df_reddit: Frame with ``tecnologia`` and ``reddit_score`` (may be empty).
+        pesos: Mapping with the numeric weights under the keys ``github``,
+            ``stackoverflow`` and ``reddit``.
+
+    Returns:
+        DataFrame with the columns ranking, tecnologia, github_score, so_score,
+        reddit_score, trend_score and fuentes, ordered by ranking.
+
+    Raises:
+        RuntimeError: If the ``duckdb`` package could not be imported.
+    """
+    if duckdb is None:
+        raise RuntimeError("DuckDB engine is unavailable. Install 'duckdb' to use this engine.")
+
+    github_scores = _score_frame_for_duckdb(df_github, "github_score")
+    so_scores = _score_frame_for_duckdb(df_so, "so_score")
+    reddit_scores = _score_frame_for_duckdb(df_reddit, "reddit_score")
+
+    # Weights are bound as query parameters, in the order the placeholders
+    # appear in the SQL below, rather than being pasted into the SQL text.
+    weights = [
+        float(pesos["github"]),
+        float(pesos["stackoverflow"]),
+        float(pesos["reddit"]),
+    ]
+
+    query = """
+        WITH merged AS (
+            SELECT
+                COALESCE(g.tecnologia, s.tecnologia, r.tecnologia) AS tecnologia,
+                COALESCE(g.github_score, 0.0) AS github_score,
+                COALESCE(s.so_score, 0.0) AS so_score,
+                COALESCE(r.reddit_score, 0.0) AS reddit_score
+            FROM github_scores g
+            FULL OUTER JOIN so_scores s
+                ON g.tecnologia = s.tecnologia
+            FULL OUTER JOIN reddit_scores r
+                ON COALESCE(g.tecnologia, s.tecnologia) = r.tecnologia
+        ),
+        scored AS (
+            SELECT
+                tecnologia,
+                github_score,
+                so_score,
+                reddit_score,
+                ROUND((
+                    ? * github_score +
+                    ? * so_score +
+                    ? * reddit_score
+                ), 2) AS trend_score,
+                (
+                    CASE WHEN github_score > 0 THEN 1 ELSE 0 END +
+                    CASE WHEN so_score > 0 THEN 1 ELSE 0 END +
+                    CASE WHEN reddit_score > 0 THEN 1 ELSE 0 END
+                ) AS fuentes
+            FROM merged
+        ),
+        ranked AS (
+            SELECT
+                ROW_NUMBER() OVER (ORDER BY trend_score DESC, tecnologia ASC) AS ranking,
+                tecnologia,
+                github_score,
+                so_score,
+                reddit_score,
+                trend_score,
+                fuentes
+            FROM scored
+        )
+        SELECT
+            ranking,
+            tecnologia,
+            github_score,
+            so_score,
+            reddit_score,
+            trend_score,
+            fuentes
+        FROM ranked
+        ORDER BY ranking
+    """
+
+    connection = duckdb.connect(database=":memory:")
+    try:
+        connection.register("github_scores", github_scores)
+        connection.register("so_scores", so_scores)
+        connection.register("reddit_scores", reddit_scores)
+        return connection.execute(query, weights).df()
+    finally:
+        # Always release the in-memory database, even when the query fails.
+        connection.close()
diff --git a/backend/validador.py b/backend/validador.py
index 33acb0d..0eb16d2 100644
--- a/backend/validador.py
+++ b/backend/validador.py
@@ -2,14 +2,20 @@
Data validation utilities for the ETL pipeline.
Provides reusable functions to validate DataFrames before
-saving them to CSV: empty checks, column verification,
-and null detection.
+saving them to CSV, including severity-aware quality checks.
"""
+
import logging
import pandas as pd
from exceptions import ETLValidationError
from config.csv_contract import get_required_columns, get_critical_columns, get_column_types
+from quality.pandera_schemas import (
+ run_pandera_quality_checks,
+ SEVERITY_CRITICAL,
+ SEVERITY_WARNING,
+ SEVERITY_INFO,
+)
logger = logging.getLogger("validador")
@@ -47,7 +53,79 @@ def _ok(value):
return True
-def validar_dataframe(df, nombre_archivo, strict=False, validate_types=False):
+def _empty_quality_report():
+    """Creates a fresh quality report: zeroed severity counters and no issues."""
+    report = dict.fromkeys(("critical", "warning", "info"), 0)
+    report["issues"] = []
+    return report
+
+
+def _normalize_quality_issue(nombre_archivo, issue):
+    """Coerces a raw issue mapping into a four-field record of strings.
+
+    Missing or falsy fields get defaults: the dataset falls back to
+    ``nombre_archivo``, and an absent or unrecognized severity becomes
+    SEVERITY_INFO.
+    """
+    known_severities = {SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO}
+
+    dataset = str(issue.get("dataset") or nombre_archivo)
+    severity = str(issue.get("severity") or SEVERITY_INFO).lower()
+    return {
+        "dataset": dataset,
+        "severity": severity if severity in known_severities else SEVERITY_INFO,
+        "rule": str(issue.get("rule") or "unspecified_rule"),
+        "message": str(issue.get("message") or "unspecified quality issue"),
+    }
+
+
+def _apply_quality_issues(nombre_archivo, issues, pandera_warn_only):
+    """Logs each quality issue by severity and tallies them into a report.
+
+    Returns:
+        dict: Report with per-severity counters and the normalized issues.
+
+    Raises:
+        ETLValidationError: If at least one critical issue was found and
+            ``pandera_warn_only`` is False.
+    """
+    log_routes = {
+        SEVERITY_CRITICAL: (logger.error, "[QUALITY][CRITICAL] dataset=%s rule=%s message=%s"),
+        SEVERITY_WARNING: (logger.warning, "[QUALITY][WARNING] dataset=%s rule=%s message=%s"),
+    }
+    # Any severity that is neither critical nor warning is logged as info.
+    default_route = (logger.info, "[QUALITY][INFO] dataset=%s rule=%s message=%s")
+
+    report = _empty_quality_report()
+    for raw_issue in issues:
+        entry = _normalize_quality_issue(nombre_archivo, raw_issue)
+        report[entry["severity"]] += 1
+        report["issues"].append(entry)
+
+        emit, template = log_routes.get(entry["severity"], default_route)
+        emit(template, entry["dataset"], entry["rule"], entry["message"])
+
+    must_block = report["critical"] > 0 and not pandera_warn_only
+    if must_block:
+        raise ETLValidationError(
+            f"'{nombre_archivo}' quality gate failed with {report['critical']} critical issue(s)"
+        )
+
+    return report
+
+
+def validar_dataframe(
+ df,
+ nombre_archivo,
+ strict=False,
+ validate_types=False,
+ enable_pandera=False,
+ pandera_warn_only=True,
+ return_quality_report=False,
+):
"""Validates a DataFrame before saving.
Checks:
@@ -60,17 +138,20 @@ def validar_dataframe(df, nombre_archivo, strict=False, validate_types=False):
nombre_archivo: Key from ARCHIVOS_SALIDA (e.g. 'github_repos').
strict: If True, raises ETLValidationError on schema violations.
validate_types: If True, applies minimal type checks defined in the contract.
+ enable_pandera: If True, executes Pandera-based quality checks.
+ pandera_warn_only: If True, Pandera critical issues are routed as warnings (no block).
+ return_quality_report: If True, returns quality report instead of bool.
Raises:
ETLValidationError: If the DataFrame is empty.
"""
- # 1. Verificar que no esta vacio
+ # 1. Verify DataFrame is not empty
if df.empty:
raise ETLValidationError(f"DataFrame '{nombre_archivo}' esta vacio, no se puede guardar")
logger.info("Validando '%s': %d filas, %d columnas", nombre_archivo, len(df), len(df.columns))
- # 2. Verificar columnas esperadas
+ # 2. Verify expected columns
esperadas = get_required_columns(nombre_archivo)
if esperadas:
faltantes = [col for col in esperadas if col not in df.columns]
@@ -81,7 +162,7 @@ def validar_dataframe(df, nombre_archivo, strict=False, validate_types=False):
f"'{nombre_archivo}' no cumple schema requerido, faltan columnas: {faltantes}"
)
- # 3. Verificar nulos en columnas criticas
+ # 3. Verify nulls in critical columns
criticas = get_critical_columns(nombre_archivo)
for col in criticas:
if col not in df.columns:
@@ -103,7 +184,7 @@ def validar_dataframe(df, nombre_archivo, strict=False, validate_types=False):
f"'{nombre_archivo}' no cumple schema critico: columna '{col}' con nulos"
)
- # 4. Verificar tipos mínimos (opcional)
+ # 4. Verify minimal types (optional)
if validate_types:
type_map = get_column_types(nombre_archivo)
for col, expected_type in type_map.items():
@@ -122,4 +203,16 @@ def validar_dataframe(df, nombre_archivo, strict=False, validate_types=False):
f"'{nombre_archivo}' no cumple tipo esperado en '{col}': {expected_type}"
)
+ quality_report = _empty_quality_report()
+ if enable_pandera:
+ quality_issues = run_pandera_quality_checks(df, nombre_archivo)
+ quality_report = _apply_quality_issues(
+ nombre_archivo=nombre_archivo,
+ issues=quality_issues,
+ pandera_warn_only=pandera_warn_only,
+ )
+
+ if return_quality_report:
+ return quality_report
+
return True
diff --git a/backend/validate_csv_contract.py b/backend/validate_csv_contract.py
index 461bf1e..56c1af6 100644
--- a/backend/validate_csv_contract.py
+++ b/backend/validate_csv_contract.py
@@ -1,11 +1,11 @@
-"""Valida headers CSV contra el contrato backend/frontend.
+"""Validates CSV outputs against the shared backend/frontend contract.
-Se utiliza en CI/ETL para detectar cambios incompatibles de esquema
-antes de publicar datos al frontend.
+Used in CI/ETL to detect incompatible schema changes
+before publishing data to the frontend.
"""
-import sys
import logging
+import sys
from pathlib import Path
import pandas as pd
@@ -19,39 +19,73 @@
logger = logging.getLogger("validate_csv_contract")
-def validate_contract(strict=True):
- """Valida archivos CSV existentes contra columnas requeridas del contrato.
+def validate_contract(strict=True, enable_pandera=True, pandera_warn_only=True):
+ """Validates CSV files and routes quality issues by severity.
+
+ Args:
+ strict: Enforces required schema and type checks.
+ enable_pandera: Enables/disables the Pandera quality stage.
+ pandera_warn_only: Routes Pandera critical issues as warnings when True.
- Retorna:
- tuple(bool, list[str]): (ok_global, mensajes)
+ Returns:
+ tuple(bool, list[str]): (overall_ok, messages)
"""
mode = "strict" if strict else "warn-only"
- messages = [f"Validando contrato CSV v{get_contract_version()} (modo={mode})..."]
+ pandera_mode = "warn-only" if pandera_warn_only else "strict"
+ messages = [
+ f"Validating CSV contract v{get_contract_version()} "
+ f"(mode={mode}, pandera_enabled={enable_pandera}, pandera_mode={pandera_mode})"
+ ]
ok = True
- for logical_name, schema in CSV_SCHEMA_CONTRACT.items():
+ for logical_name in CSV_SCHEMA_CONTRACT:
csv_path = Path(ARCHIVOS_SALIDA[logical_name])
if not csv_path.exists():
- messages.append(f"[WARN] {logical_name}: archivo no existe ({csv_path.name})")
+ messages.append(f"[WARN] {logical_name}: file not found ({csv_path.name})")
if strict:
ok = False
continue
try:
df = pd.read_csv(csv_path)
- validar_dataframe(
- df,
- logical_name,
+ quality_report = validar_dataframe(
+ df=df,
+ nombre_archivo=logical_name,
strict=strict,
validate_types=True,
+ enable_pandera=enable_pandera,
+ pandera_warn_only=pandera_warn_only,
+ return_quality_report=True,
)
- messages.append(f"[OK] {logical_name}: contrato válido")
+
+ critical = int(quality_report["critical"])
+ warning = int(quality_report["warning"])
+ info = int(quality_report["info"])
+
+ if critical > 0:
+ if pandera_warn_only:
+ messages.append(
+ f"[WARN] {logical_name}: quality critical={critical} routed by warn-only mode"
+ )
+ else:
+ messages.append(
+ f"[ERROR] {logical_name}: quality gate failed (critical={critical})"
+ )
+ ok = False
+ continue
+
+ if warning > 0:
+ messages.append(f"[WARN] {logical_name}: quality warnings={warning}")
+ if info > 0:
+ messages.append(f"[INFO] {logical_name}: quality info={info}")
+
+ messages.append(f"[OK] {logical_name}: contract valid")
except ETLValidationError as exc:
messages.append(f"[ERROR] {logical_name}: {exc}")
ok = False
except Exception as exc: # pylint: disable=broad-exception-caught
- messages.append(f"[ERROR] {logical_name}: no se pudo validar ({exc})")
+ messages.append(f"[ERROR] {logical_name}: validation execution failed ({exc})")
ok = False
return ok, messages
@@ -60,7 +94,15 @@ def validate_contract(strict=True):
def main():
logging.basicConfig(level=logging.INFO, format="[%(asctime)s] [%(levelname)s] %(name)s - %(message)s")
strict = "--no-strict" not in sys.argv
- ok, messages = validate_contract(strict=strict)
+ enable_pandera = "--skip-pandera" not in sys.argv
+ pandera_warn_only = "--pandera-strict" not in sys.argv
+
+ ok, messages = validate_contract(
+ strict=strict,
+ enable_pandera=enable_pandera,
+ pandera_warn_only=pandera_warn_only,
+ )
+
for msg in messages:
if msg.startswith("[ERROR]"):
logger.error(msg)
@@ -70,10 +112,10 @@ def main():
logger.info(msg)
if not ok:
- logger.error("[RUN][SUMMARY] estado=failed")
+ logger.error("[RUN][SUMMARY] status=failed")
sys.exit(1)
- logger.info("[RUN][SUMMARY] estado=success")
+ logger.info("[RUN][SUMMARY] status=success")
if __name__ == "__main__":
diff --git a/docs/ROADMAP_V2_IMPLEMENTATION_PLAN.md b/docs/ROADMAP_V2_IMPLEMENTATION_PLAN.md
index 994df3c..9be14e0 100644
--- a/docs/ROADMAP_V2_IMPLEMENTATION_PLAN.md
+++ b/docs/ROADMAP_V2_IMPLEMENTATION_PLAN.md
@@ -1,37 +1,37 @@
-# ROADMAP V2 FINAL - Technology Trend Analysis Platform
+# ROADMAP V2 FINAL - Technology Trend Analysis Platform
-## 1) Summary
+## 1) Resumen
-This is the final, decision-complete plan for V2.
-Goal: migrate from V1 CSV-only pipeline to a serverless data stack V2 without breaking current frontend behavior.
+Este es el plan final, **decision-complete**, para la V2.
+Objetivo: migrar desde el pipeline V1 (CSV-only) a un serverless data stack V2 sin romper el comportamiento actual del frontend.
-Primary outcomes:
-1. Data Product Contract V2 with run and dataset metadata.
-2. Dual write (latest + history) with controlled transition.
-3. Quality gate with severity levels.
-4. Trend Score V1 vs V2 numeric equivalence tests.
-5. Parallel CI pipeline with artifacts and conditional publish.
-6. Frontend bridge with JSON history while keeping CSV compatibility.
+Resultados principales:
+1. Data Product Contract V2 con metadata de run y de dataset.
+2. Dual write (latest + history) con transicion controlada.
+3. Quality gate con niveles de severidad.
+4. Pruebas de equivalencia numerica V1 vs V2 para Trend Score.
+5. Pipeline CI paralelo con artifacts y publicacion condicional.
+6. Frontend bridge con JSON historico manteniendo compatibilidad CSV.
-## 2) Scope
+## 2) Alcance
-In scope now (V2 core): F2-F7
+En alcance ahora (core V2): F2-F7
- Contract V2
- Dual write
- Pandera quality gate
- DuckDB trend score engine
-- GitHub Actions parallel jobs with artifacts
+- GitHub Actions con jobs paralelos y artifacts
- Frontend bridge
-- Cutover governance
+- Gobernanza de cutover
-Out of scope now:
-- Advanced forecasting productionization
-- Advanced topic modeling productionization
-- External BI platform integration
+Fuera de alcance ahora:
+- Productivizacion avanzada de forecasting
+- Productivizacion avanzada de topic modeling
+- Integracion con plataformas BI externas
-These move to V2.1 or post-V2.
+Estos items pasan a V2.1 o post-V2.
-## 3) Current baseline (verified)
+## 3) Baseline actual (verificado)
Backend:
- ETLs: GitHub, StackOverflow, Reddit
@@ -40,28 +40,28 @@ Backend:
- CSV contract validator: `backend/validate_csv_contract.py`
Frontend:
-- Flutter dashboards read CSV from `frontend/assets/data/`
+- Dashboards Flutter leen CSV desde `frontend/assets/data/`
- Loader: `frontend/lib/services/csv_service.dart`
CI/CD:
-- Weekly ETL workflow exists and works
-- Current flow is mostly sequential for ETL processing
+- Workflow ETL semanal paralelo por fuente + aggregate + publish
+- Verificaciones de artifacts y outputs frontend activas en CI
-## 4) Branch strategy and governance
+## 4) Estrategia de ramas y gobernanza
-Branches:
-- Backend work branch: `feat/backend`
-- Frontend work branch: `feat/frontend`
+Ramas:
+- Rama de trabajo backend: `feat/backend`
+- Rama de trabajo frontend: `feat/frontend`
-Default merge policy:
-- `squash merge` unless explicit reason to preserve detailed commit graph.
+Politica de merge por defecto:
+- `squash merge`, salvo razon explicita para preservar el grafo detallado de commits.
-Sync policy before each backend PR:
+Politica de sincronizacion antes de cada PR de backend:
1. `git fetch --all --prune`
2. `git switch main && git pull --ff-only origin main`
3. `git switch feat/backend && git merge --ff-only main`
-If exact-commit alignment is required and FF does not apply:
+Si se requiere alineacion exacta de commit y no aplica fast-forward:
- `git reset --hard main`
- `git push --force-with-lease`
@@ -76,7 +76,7 @@ If exact-commit alignment is required and FF does not apply:
- `source_window_start_utc`
- `source_window_end_utc`
- `quality_gate_status` (`pass`, `pass_with_warnings`, `fail`)
-- `datasets` (array of dataset manifests)
+- `datasets` (array de dataset manifests)
### 5.2 Dataset-level metadata (required)
@@ -90,31 +90,33 @@ If exact-commit alignment is required and FF does not apply:
- `latest_path`
- `history_path`
-### 5.3 SemVer rules for datasets
+### 5.3 Reglas SemVer para datasets
-- MAJOR: breaking schema change (remove/rename required column, incompatible type change)
-- MINOR: backward-compatible additions (optional columns, non-breaking checks)
-- PATCH: internal fixes with no schema contract break
+- MAJOR: cambio breaking de schema (eliminar/renombrar columna requerida, cambio de tipo incompatible)
+- MINOR: adiciones backward-compatible (columnas opcionales, checks no-breaking)
+- PATCH: correcciones internas sin romper el contrato de schema
-## 6) Storage layout (fixed now)
+## 6) Layout de almacenamiento (estado implementado)
Latest outputs:
- `datos/latest/*.csv`
-- `datos/latest/history_index.json`
-- `datos/latest/trend_score_history.json`
History outputs:
-- `datos/history//year=YYYY/month=MM/day=DD/part-0000.parquet`
+- `datos/history/<dataset>/year=YYYY/month=MM/day=DD/*.csv`
Metadata outputs:
- `datos/metadata/run_manifest.json`
- `datos/metadata/runs/.json`
-Examples:
-- `datos/history/trend_score/year=2026/month=02/day=22/part-0000.parquet`
-- `datos/history/so_volumen/year=2026/month=02/day=22/part-0000.parquet`
+Frontend bridge outputs:
+- `frontend/assets/data/history_index.json`
+- `frontend/assets/data/trend_score_history.json`
-## 7) V1 -> V2 compatibility matrix (core)
+Ejemplos:
+- `datos/history/trend_score/year=2026/month=02/day=22/trend_score.csv`
+- `datos/history/so_volumen/year=2026/month=02/day=22/so_volumen_preguntas.csv`
+
+## 7) Matriz de compatibilidad V1 -> V2 (core)
- `datos/trend_score.csv` -> `datos/latest/trend_score.csv` + `datos/history/trend_score/...`
- `datos/so_volumen_preguntas.csv` -> `datos/latest/so_volumen_preguntas.csv` + `datos/history/so_volumen/...`
@@ -122,308 +124,385 @@ Examples:
- `datos/reddit_temas_emergentes.csv` -> `datos/latest/reddit_temas_emergentes.csv` + `datos/history/reddit_temas/...`
- `datos/github_lenguajes.csv` -> `datos/latest/github_lenguajes.csv` + `datos/history/github_lenguajes/...`
-Frontend cutover rule:
-- CSV stays until bridge JSON passes 4 consecutive weekly runs without critical failures.
+Regla de cutover frontend:
+- CSV se mantiene hasta que el bridge JSON pase 4 corridas semanales consecutivas sin fallos `critical`.
-## 8) Quality model (Pandera + severity)
+## 8) Modelo de calidad (Pandera + severity)
-Severity and actions:
-- `critical`: fail pipeline, no publish
-- `warning`: publish with warning flag
-- `info`: publish, observability only
+Severidad y acciones:
+- `critical`: falla pipeline, no publica
+- `warning`: publica con warning flag
+- `info`: publica, solo observabilidad
-Minimum required rules:
-1. Required columns present (critical)
-2. Critical types valid (critical)
-3. Critical columns no nulls (critical)
-4. `trend_score >= 0` (critical)
-5. Ranking uniqueness (critical)
-6. Core dataset row_count > 0 (warning)
-7. Freshness threshold exceeded (warning)
-8. Distribution drift soft breach (warning)
-9. Optional fields missing (info)
-10. Minor cardinality variation (info)
+Reglas minimas obligatorias:
+1. Required columns presentes (`critical`)
+2. Critical types validos (`critical`)
+3. No nulos en critical columns (`critical`)
+4. `trend_score >= 0` (`critical`)
+5. Unicidad de ranking (`critical`)
+6. `row_count > 0` en datasets core (`warning`)
+7. Freshness fuera de umbral (`warning`)
+8. Distribution drift suave (`warning`)
+9. Optional fields faltantes (`info`)
+10. Variacion menor de cardinalidad (`info`)
-## 9) Trend score equivalence V1 vs V2
+## 9) Equivalencia de Trend Score V1 vs V2
-Acceptance thresholds:
-- Absolute score difference per shared technology: `<= 0.01`
+Umbrales de aceptacion:
+- Diferencia absoluta por tecnologia compartida: `<= 0.01`
- Top-10 overlap: `>= 90%`
-- Ranking delta: `<= 1` for at least 90% of shared technologies
-- Tie handling allowed when score delta is `<= 0.01`
+- Delta de ranking: `<= 1` para al menos 90% de tecnologias compartidas
+- Empates permitidos cuando delta de score `<= 0.01`
-## 10) Source failure degradation policy
+## 10) Politica de degradacion ante fallo de fuentes
-- 3/3 sources available: publish, normal weights
-- 2/3 sources available: renormalize available weights, publish with warning
-- 1/3 source available: do not publish new latest, mark fail
-- 0/3 available: fail run
+- 3/3 fuentes disponibles: publica, weights normales
+- 2/3 fuentes disponibles: renormaliza weights disponibles, publica con warning
+- 1/3 fuente disponible: no publica nuevo latest, marca fail
+- 0/3 fuentes disponibles: fail run
-## 11) CI/CD V2 architecture (artifacts)
+## 11) Arquitectura CI/CD V2 (artifacts)
-Main workflow: `.github/workflows/etl_semanal.yml`
+Workflow principal: `.github/workflows/etl_semanal.yml`
Jobs:
1. `job_github`
2. `job_stackoverflow`
3. `job_reddit`
-4. `job_aggregate` (downloads artifacts, computes trend, runs quality gate, writes manifest)
-5. `job_publish` (conditional on quality gate)
+4. `job_aggregate` (descarga artifacts, calcula trend, corre quality gate, escribe manifest)
+5. `job_publish` (condicional por quality gate)
-Publish condition:
-- only if quality status is `pass` or `pass_with_warnings`
+Condicion de publicacion:
+- solo si quality status es `pass` o `pass_with_warnings`
-## 12) Runtime and cost budgets (GitHub Actions)
+## 12) Presupuesto runtime y costo (GitHub Actions)
-Per-run limits:
-- Source job timeout: 20 min each
-- Aggregate timeout: 15 min
-- Publish timeout: 10 min
-- Total run budget: 60 min
+Limites por run:
+- Timeout por source job: 20 min cada uno
+- Timeout aggregate: 15 min
+- Timeout publish: 10 min
+- Presupuesto total por run: 60 min
-Artifact budget:
-- Warning at 75 MB total
-- Critical at 100 MB total
+Presupuesto de artifacts:
+- Warning en 75 MB total
+- Critical en 100 MB total
-Alerting thresholds:
+Umbrales de alerta:
- Warning: runtime > 45 min
- Critical: runtime > 60 min
-## 13) Reproducibility
+## 13) Reproducibilidad
-- Python lock file for deterministic installs
-- Flutter lock file committed
-- Deterministic seed for transforms where applicable
-- Baseline fixtures for V1 equivalence tests
-- Historical replay by `run_id` supported through manifest metadata
+- Python lock file para instalaciones deterministicas
+- Flutter lock file commiteado
+- Seed deterministica para transformaciones donde aplique
+- Baseline fixtures V1 para pruebas de equivalencia
+- Replay historico por `run_id` soportado via manifest metadata
-## 14) Retention and lifecycle
+## 14) Retencion y ciclo de vida
-Core aggregated datasets:
-- Daily: 180 days
-- Monthly compacted: 5 years
+Datasets core agregados:
+- Diario: 180 dias
+- Mensual compactado: 5 anios
-Heavy raw-like datasets:
-- Daily: 90 days
-- Monthly compacted: 24 months
+Datasets pesados tipo raw:
+- Diario: 90 dias
+- Mensual compactado: 24 meses
-Compaction:
-- Monthly parquet compaction
-- Integrity validation after compaction (row_count, schema_hash, checksums)
+Compactacion:
+- Compactacion parquet mensual
+- Validacion de integridad post-compactacion (`row_count`, `schema_hash`, checksums)
-## 15) Security and compliance in CI
+## 15) Seguridad y compliance en CI
-- Least-privilege workflow permissions
-- `contents: write` only where publish is needed
-- Secrets required:
+- Workflow permissions con minimo privilegio
+- `contents: write` solo donde la publicacion lo requiera
+- Secrets requeridos:
- `GH_PAT`
- `STACKOVERFLOW_KEY`
- `REDDIT_CLIENT_ID`
- `REDDIT_CLIENT_SECRET`
-- Secret masking required
-- No sensitive payloads in logs/artifacts
-- Preflight secret checks before extraction
+- Secret masking obligatorio
+- No exponer payloads sensibles en logs/artifacts
+- Preflight checks de secretos antes de extraer datos
-## 16) PR plan (F2-F7, PR-ready)
+## 16) Plan de PRs (F2-F7, PR-ready)
### PR-01 (F2) - Contract V2 foundation
-Goal:
-- Introduce V2 contract and manifest model.
+Objetivo:
+- Introducir contrato V2 y modelo de manifest.
-Files:
-- `backend/config/data_product_contract_v2.py` (new)
+Archivos:
+- `backend/config/data_product_contract.py` (new)
- `backend/config/csv_contract.py`
- `docs/data_contract.md`
Checks:
-- contract tests pass
-- schema validation tests pass
+- contract tests en verde
+- schema validation tests en verde
Merge criteria:
-- no regressions in current tests
+- sin regresiones en la suite actual
Rollback:
- revert PR
### PR-02 (F3) - Dual write infrastructure
-Goal:
-- Add latest/history writing path while preserving existing CSV behavior.
+Objetivo:
+- Agregar latest/history write path preservando el comportamiento CSV existente.
-Files:
+Archivos:
- `backend/base_etl.py`
- `backend/config/settings.py`
- `backend/sync_assets.py`
-- tests for write behavior
+- tests de write behavior
Checks:
-- write tests pass
-- current ETL tests pass
+- write tests en verde
+- ETL tests actuales en verde
Rollback:
-- disable history writes via config flag
+- desactivar history writes con config flag
### PR-03 (F5) - Quality gate warn-only
-Goal:
-- Add Pandera validation with severity routing.
+Objetivo:
+- Agregar validacion Pandera con enrutamiento por severidad.
-Files:
+Archivos:
- `backend/validador.py`
- `backend/validate_csv_contract.py`
- `backend/quality/pandera_schemas.py` (new)
-- tests for severity handling
+- tests de manejo de severidad
Checks:
-- quality tests pass
-- warning path does not block publish
+- quality tests en verde
+- warning path no bloquea publish
Rollback:
-- bypass Pandera stage
+- bypass de etapa Pandera
### PR-04 (F4) - DuckDB trend engine + equivalence tests
-Goal:
-- Move trend calculation to DuckDB while proving equivalence.
+Objetivo:
+- Mover calculo de trend a DuckDB demostrando equivalencia.
-Files:
+Archivos:
- `backend/trend_score.py`
-- `backend/trend_score_v2_duckdb.py` (new)
+- `backend/trend_score_duckdb.py` (new)
- `tests/test_trend_equivalence_v1_v2.py` (new)
Checks:
-- equivalence thresholds satisfied
+- equivalence thresholds cumplidos
Rollback:
-- switch to previous trend engine path
+- volver a ruta de trend engine anterior
### PR-05 (F6) - Parallel workflow with artifacts
-Goal:
-- Split source jobs and aggregate with artifacts.
+Objetivo:
+- Separar los source jobs y consolidar sus resultados en el job aggregate mediante artifacts.
-Files:
+Archivos:
- `.github/workflows/etl_semanal.yml`
Checks:
-- manual workflow run succeeds
-- artifact handoff valid
+- manual workflow run exitoso
+- artifact handoff valido
Rollback:
-- restore sequential workflow version
+- restaurar version secuencial del workflow
### PR-06 (F7) - Frontend bridge assets
-Goal:
-- Produce JSON history bridge assets while keeping CSV.
+Objetivo:
+- Producir bridge JSON historico manteniendo CSV.
-Files:
+Archivos:
- `backend/export_history_json.py` (new)
- `backend/sync_assets.py`
-- generated files under `frontend/assets/data/`
+- archivos generados en `frontend/assets/data/`
Checks:
-- bridge files generated
-- frontend can load existing CSV unchanged
+- bridge files generados
+- frontend sigue cargando CSV sin cambios
Rollback:
-- disable bridge export
+- desactivar bridge export
### PR-07 (F7) - Frontend partial cutover
-Goal:
-- Consume bridge JSON via feature flag.
+Objetivo:
+- Consumir bridge JSON por feature flag.
-Files:
+Archivos:
- `frontend/lib/services/csv_service.dart`
- `frontend/lib/config/feature_flags.dart` (new)
-- minimal temporal view wiring
+- wiring minimo de vista temporal
Checks:
-- smoke load for CSV and JSON paths
-- no regressions in existing dashboards
+- smoke load para path CSV y JSON
+- sin regresiones en dashboards actuales
Rollback:
- feature flag off
-## 17) DoD by phase (F2-F7)
+## 17) DoD por fase (F2-F7)
F2:
-- Deliverables: V2 contract + manifest schema
+- Deliverables: contrato V2 + manifest schema
- Tests: contract schema tests
-- Acceptance: manifest valid in sample run
-- Rollback: PR revert
+- Acceptance: manifest valido en sample run
+- Rollback: revert PR
+- Estado: DONE
+- Evidencia:
+ - `pytest -q tests/test_data_product_contract.py tests/test_csv_contract.py` -> 15 passed
+ - sample run manifest validado con `validate_run_manifest` -> `manifest_valid=True`, `errors=0`
F3:
- Deliverables: dual write latest/history
- Tests: write path + idempotency tests
-- Acceptance: expected files created in fixed layout
-- Rollback: disable history flag
+- Acceptance: archivos esperados creados en el layout fijo
+- Rollback: desactivar history flag
+- Estado: DONE
+- Evidencia:
+ - `pytest -q tests/test_base_etl.py tests/test_sync_assets.py` -> 16 passed
+ - prueba de idempotencia de escritura en script aislado -> `legacy_exists=True`, `latest_exists=True`, `history_exists=True`, `history_file_count=1`
+ - validacion de acceptance en run real -> `acceptance_paths_ok=True` para `datos/`, `datos/latest/`, `datos/history/...`
+ - validacion de rollback por flag -> `rollback_history_mtime_unchanged=True` con `DATA_WRITE_HISTORY_CSV=0`
F4:
- Deliverables: DuckDB trend engine
- Tests: equivalence suite
-- Acceptance: all thresholds pass
-- Rollback: switch back to V1 engine
+- Acceptance: todos los umbrales en verde
+- Rollback: volver a V1 engine
+- Estado: DONE
+- Evidencia:
+ - `pytest -q tests/test_trend_score.py tests/test_trend_equivalence_v1_v2.py` -> 20 passed
+ - validacion de umbrales de equivalencia -> `max_abs_diff=0.0000`, `top10_overlap=1.00`, `pct_rank_delta_le_1=1.00`
+ - verificacion de rollback por engine -> `duckdb_exit=0`, `legacy_exit=0` con `TREND_SCORE_ENGINE=duckdb|legacy`
F5:
- Deliverables: severity quality gate
-- Tests: critical/warning/info routing
-- Acceptance: critical blocks publish, warning allows publish-with-flag
-- Rollback: bypass new gate
+- Tests: enrutamiento `critical`/`warning`/`info`
+- Acceptance: critical bloquea publish, warning permite publish-with-flag
+- Rollback: bypass de gate nuevo
+- Estado: DONE
+- Evidencia:
+ - `pytest -q tests/test_validador.py tests/test_validate_csv_contract.py` -> 12 passed
+ - tests especificos de severidad (`warning` no bloquea y `critical` bloquea en strict) -> 4 passed
+ - `python backend/validate_csv_contract.py --no-strict` -> `status=success` con warning enrutado
+ - `python backend/validate_csv_contract.py --pandera-strict` -> `status=success` con warnings no bloqueantes en dataset actual
+ - `python backend/validate_csv_contract.py --no-strict --skip-pandera` -> `status=success` (bypass operativo)
F6:
-- Deliverables: parallel CI with artifacts
+- Deliverables: CI paralelo con artifacts
- Tests: workflow dry run + artifact contract
-- Acceptance: successful end-to-end run
-- Rollback: sequential workflow restore
+- Acceptance: corrida end-to-end exitosa
+- Rollback: restaurar workflow secuencial
+- Estado: DONE
+- Evidencia:
+ - workflow actualizado con jobs paralelos (`job_github`, `job_stackoverflow`, `job_reddit`) y agregador con `needs`
+ - contrato de artifacts validado en `tests/test_workflow_etl_contract.py` -> 4 passed
+ - suite F6/F7 backend (`pytest -q tests/test_workflow_etl_contract.py tests/test_sync_assets.py tests/test_export_history_json.py`) -> 13 passed
+ - dry run local del flujo aggregate (`python backend/sync_assets.py` + `python backend/export_history_json.py`) -> `status=success` en ambos comandos
+ - gate de rollback preservado: se puede volver al workflow secuencial restaurando `.github/workflows/etl_semanal.yml`
F7:
-- Deliverables: bridge JSON + frontend flag cutover
+- Deliverables: bridge JSON + cutover parcial frontend
- Tests: frontend smoke path
-- Acceptance: 4 weekly runs stable before CSV retirement decision
-- Rollback: flag off and CSV-only fallback
-
-## 18) Test scenarios (mandatory)
-
-1. Manifest schema: valid and invalid samples
-2. SemVer bump correctness on representative changes
-3. Deterministic schema_hash stability
-4. Dual write idempotent behavior by run_id
-5. Quality gate severity actions
-6. V1 vs V2 trend equivalence thresholds
-7. Degradation matrix (3/3, 2/3, 1/3, 0/3 sources)
-8. Artifact corruption or missing artifact handling
-9. Frontend bridge fallback behavior
-10. Rollback verification per PR
-
-## 19) Release and tags
-
-Recommended release checkpoints:
+- Acceptance: 4 corridas semanales estables antes de retiro de CSV
+- Rollback: flag off y fallback CSV-only
+- Estado: DONE (implementacion) / OPERATIVO EN CURSO (estabilidad semanal)
+- Evidencia:
+ - bridge JSON export activo con `backend/export_history_json.py` y `backend/sync_assets.py`
+ - assets bridge generados en `frontend/assets/data/history_index.json` y `frontend/assets/data/trend_score_history.json`
+ - cutover parcial implementado por feature flag en `frontend/lib/config/feature_flags.dart`
+ - consumo bridge con fallback CSV implementado en `frontend/lib/services/csv_service.dart`
+ - wiring de UI temporal aplicado en `frontend/lib/screens/home_screen.dart`
+ - smoke build frontend (`flutter build web --debug`) -> success
+ - criterio de 4 corridas semanales se valida en ejecucion real de workflow tras push (no bloquea la implementacion del PR)
+
+## 18) Escenarios de prueba (obligatorios)
+
+Estado general: `DONE`
+
+- [x] 1) Manifest schema: muestras validas e invalidas
+ - Evidencia: `tests/test_data_product_contract.py`
+- [x] 2) Correctitud de SemVer bump en cambios representativos
+ - Evidencia: `tests/test_schema_contract_utils.py` (matriz representativa de cambios -> bump esperado)
+- [x] 3) Estabilidad deterministica de `schema_hash`
+ - Evidencia: `tests/test_schema_contract_utils.py` (`compute_schema_hash` deterministico y sensible a cambios semanticos)
+- [x] 4) Idempotencia de dual write por `run_id`
+ - Evidencia: `tests/test_base_etl.py`, `tests/test_sync_assets.py`
+- [x] 5) Acciones del quality gate por severidad
+ - Evidencia: `tests/test_validador.py`, `tests/test_validate_csv_contract.py`
+- [x] 6) Umbrales de equivalencia trend V1 vs V2
+ - Evidencia: `tests/test_trend_equivalence_v1_v2.py`
+- [x] 7) Matriz de degradacion (3/3, 2/3, 1/3, 0/3 fuentes)
+ - Evidencia: `tests/test_degradation_policy.py`
+- [x] 8) Manejo de artifact corrupto o faltante
+ - Evidencia: `tests/test_workflow_etl_contract.py`, `tests/test_export_history_json.py`
+- [x] 9) Comportamiento de fallback del frontend bridge
+ - Evidencia: `tests/test_frontend_bridge_contract.py`, `tests/test_export_history_json.py`
+- [x] 10) Verificacion de rollback por PR
+ - Evidencia: `tests/test_trend_score.py`, `tests/test_sync_assets.py`, `tests/test_workflow_etl_contract.py`
+
+Resultado de verificacion:
+- `pytest -q` -> `133 passed`
+
+## 19) Releases y tags
+
+Estado: `READY FOR EXECUTION` (sin bloqueo tecnico; pendiente gate operativo semanal)
+
+Checkpoints recomendados:
- `v2.0.0-rc1`: F2 + F3
- `v2.0.0-rc2`: F5 + F4
- `v2.0.0-rc3`: F6
-- `v2.0.0`: F7 stable and cutover-ready
+- `v2.0.0`: F7 estable y cutover-ready
- `v2.1.0`: advanced analytics
-Cutover complete criteria:
-- 4 consecutive weekly runs without critical quality failures
-- SLO targets met
-- trend equivalence stable
-- frontend bridge stable under flag-on
+Criterios de cutover completo:
+- 4 corridas semanales consecutivas sin fallos `critical`
+- SLOs cumplidos
+- equivalencia trend estable
+- frontend bridge estable con flag on
+
+Estado actual de criterios:
+- suite tecnica en verde (`pytest -q` -> `133 passed`)
+- equivalencia trend validada (F4 en verde)
+- bridge frontend implementado con fallback y feature flag
+- pendiente operativo: 4 corridas semanales reales + verificacion SLO
+
+Procedimiento de ejecucion de tags (cuando se autorice):
+1. Validar rama `feat/backend` actualizada con `main`.
+2. Ejecutar tests y smoke checks.
+3. Crear tag anotado del checkpoint objetivo.
+4. Publicar tag remoto.
+5. Registrar notas de release.
## 20) Decision timeline tags
-- Adopt now:
+Estado: `DECISIONES CERRADAS`
+
+- Adoptar ahora:
- Contract V2
- Dual write
- Pandera severity
- DuckDB equivalence
- CI artifacts
- Frontend bridge
+ - estado: implementado en `feat/backend`
-- Adopt in V2.1:
- - forecasting and advanced NLP
+- Adoptar en V2.1:
+ - forecasting y NLP avanzado
+ - estado: pendiente (no bloquea release V2.0.0)
- Post-V2:
- - external BI and non-GitHub long-term storage
+ - BI externo y almacenamiento long-term fuera de GitHub
+ - estado: backlog estrategico
+
+Regla de control de alcance:
+- toda mejora nueva no critica entra a V2.1 o Post-V2
+- solo fixes de estabilidad/regresion entran antes de `v2.0.0`
-## 21) Final assumptions
+## 21) Supuestos finales
-1. Serverless architecture remains mandatory.
-2. This document is the execution source of truth for backend V2 in `feat/backend`.
-3. No open decision should be left to implementers outside this plan.
+1. La arquitectura serverless se mantiene como restriccion principal.
+2. Este documento es la fuente de verdad de ejecucion para backend V2 en `feat/backend`.
+3. No se dejan decisiones abiertas fuera de este plan.
diff --git a/docs/architecture.md b/docs/architecture.md
index 67a7bae..2947b6f 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -1,109 +1,96 @@
-# Architecture -- Technology Trend Analysis Platform
+# Arquitectura del Proyecto
-## System Overview
+## Resumen
-Plataforma multi-fuente que extrae, transforma y visualiza datos de tendencias tecnologicas
-desde tres comunidades de desarrolladores: GitHub, StackOverflow y Reddit.
+La plataforma procesa tendencias de tecnologia desde tres fuentes (GitHub, StackOverflow, Reddit),
+calcula un Trend Score compuesto, valida calidad de datos y publica activos para frontend.
-## Data Flow
+## Flujo de Datos
-```
- .env (API Keys)
- |
- +--------------+--------------+
- v v v
- GitHub API StackOverflow API Reddit API
- (REST) (REST) (OAuth/JSON)
- | | |
- v v v
- github_etl.py so_etl.py reddit_etl.py
- | | |
- +--------------+--------------+
- v
- datos/ (CSV)
- Fuente de Verdad
- |
- sync_assets.py
- |
- v
- frontend/assets/data/
- |
- v
- Flutter Web Dashboard
- (fl_chart)
-```
-
-## Data Schema
-
-### GitHub
-
-| Archivo | Columnas | Descripcion |
-|---------|----------|-------------|
-| github_repos_2025.csv | repo_name, language, stars, forks, created_at, description | Top 1000 repos creados en 2025 |
-| github_lenguajes.csv | lenguaje, repos_count, porcentaje | Top 10 lenguajes por cantidad de repos |
-| github_commits_frameworks.csv | framework, repo, commits_2025, ranking | Actividad de commits en frameworks frontend |
-| github_correlacion.csv | repo_name, stars, contributors, language | Correlacion Stars vs Contributors |
-
-### StackOverflow
-
-| Archivo | Columnas | Descripcion |
-|---------|----------|-------------|
-| so_volumen_preguntas.csv | lenguaje, preguntas_nuevas_2025 | Volumen de preguntas por lenguaje |
-| so_tasa_aceptacion.csv | tecnologia, total_preguntas, respuestas_aceptadas, tasa_aceptacion_pct | Tasa de respuestas aceptadas por framework |
-| so_tendencias_mensuales.csv | mes, python, javascript, typescript | Tendencias mensuales de preguntas |
-
-### Reddit
-
-| Archivo | Columnas | Descripcion |
-|---------|----------|-------------|
-| reddit_sentimiento_frameworks.csv | framework, total_menciones, positivos, neutros, negativos, % positivo, % neutro, % negativo | Analisis de sentimiento para frameworks backend |
-| reddit_temas_emergentes.csv | tema, menciones | Temas emergentes en r/webdev |
-| interseccion_github_reddit.csv | tecnologia, tipo, ranking_github, ranking_reddit, diferencia | Comparacion de rankings entre plataformas |
-
-### Trend Score
-
-| Archivo | Columnas | Descripcion |
-|---------|----------|-------------|
-| trend_score.csv | ranking, tecnologia, github_score, so_score, reddit_score, trend_score, fuentes | Indice compuesto ponderado (GitHub 40% + SO 35% + Reddit 25%) |
-
-## Frontend Architecture
-
-```
-Flutter Web Dashboard
- HomeScreen - KPIs globales, insights
- GithubDashboard - 3 graficos (barras, donut, scatter)
- SODashboard - 3 graficos (barras, stacked, lineas)
- RedditDashboard - 3 graficos (divergentes, barras, rankings)
-
-Cada dashboard incluye:
- - Carga de CSV via CsvService
- - Graficos interactivos (fl_chart)
- - Key Insights
- - Exportar ZIP
-```
-
-## Deployment
-
-### Local
-```bash
-# Backend
-make install
-make etl
-
-# Frontend
-cd frontend
-flutter pub get
-flutter run -d chrome
-```
+```text
+GitHub ETL -------\
+StackOverflow ETL --> datos/*.csv --> Trend Score --> sync_assets --> frontend/assets/data/*
+Reddit ETL -------/
-### GitHub Pages
-```bash
-cd frontend
-flutter build web --base-href "/Technology-trend-analysis-platform/"
+Adicional:
+- dual write opcional a datos/latest y datos/history
+- export de bridge JSON para historico de trend
```
-### Automatizacion (GitHub Actions)
-- Cron: cada lunes a las 08:00 UTC (03:00 Ecuador)
-- Ejecuta el pipeline ETL completo
-- Sincroniza CSVs al frontend
-- Rebuild y deploy de Flutter Web
+## Componentes Backend
+
+- `backend/base_etl.py`
+ - clase base para ejecucion, logging y escritura.
+- `backend/config/settings.py`
+ - rutas, flags de escritura y configuracion global.
+- `backend/trend_score.py`
+ - motor principal de Trend Score con selector de engine.
+- `backend/trend_score_duckdb.py`
+ - engine DuckDB para calculo SQL.
+- `backend/validador.py`
+ - validacion de schema y quality report por severidad.
+- `backend/quality/pandera_schemas.py`
+ - reglas `critical/warning/info` con Pandera.
+- `backend/quality/degradation_policy.py`
+ - politica de degradacion por disponibilidad de fuentes.
+- `backend/validate_csv_contract.py`
+ - contrato CSV para compatibilidad backend/frontend.
+- `backend/config/data_product_contract.py`
+ - contrato de run manifest y dataset manifest.
+- `backend/config/schema_contract_utils.py`
+ - `schema_hash` deterministico y reglas SemVer bump.
+- `backend/sync_assets.py`
+ - sincroniza CSV a frontend con prioridad por archivo (`latest` -> fallback `legacy`).
+- `backend/export_history_json.py`
+ - genera `history_index.json` y `trend_score_history.json`.
+
+## Estrategia de Escritura
+
+Control por variables de entorno:
+
+- `DATA_WRITE_LEGACY_CSV`
+- `DATA_WRITE_LATEST_CSV`
+- `DATA_WRITE_HISTORY_CSV`
+
+Rutas:
+
+- Legacy: `datos/*.csv`
+- Latest: `datos/latest/*.csv`
+- History: `datos/history/<dataset>/year=YYYY/month=MM/day=DD/*.csv`
+- Metadata: `datos/metadata/`
+
+## Conexion con Frontend
+
+El frontend consume:
+
+- CSV tradicionales en `frontend/assets/data/*.csv`
+- Bridge JSON opcional:
+ - `frontend/assets/data/history_index.json`
+ - `frontend/assets/data/trend_score_history.json`
+
+Feature flag:
+
+- `frontend/lib/config/feature_flags.dart`
+- `USE_HISTORY_BRIDGE_JSON=false` por defecto.
+
+Esto permite cutover parcial sin romper dashboards existentes.
+
+## GitHub Actions
+
+Workflows activos:
+
+1. `etl_semanal.yml`
+ - lunes `08:00 UTC` + manual.
+ - jobs paralelos por fuente + aggregate + publish.
+2. `ci.yml`
+ - tests en `main`, `feat/backend`, `feat/frontend`.
+3. `dependency_security.yml`
+ - auditoria de dependencias (push/PR/schedule/manual).
+4. `deploy_frontend.yml`
+ - deploy de Flutter Web en `main` o tras ETL exitoso.
+
+## Estado de Backend V2
+
+- Implementacion tecnica: completada.
+- Gate operativo pendiente para cutover final:
+ - 4 corridas semanales consecutivas sin fallos `critical`.
diff --git a/docs/coding_style.md b/docs/coding_style.md
new file mode 100644
index 0000000..7761994
--- /dev/null
+++ b/docs/coding_style.md
@@ -0,0 +1,62 @@
+# Estandar de Estilo del Repositorio
+
+Este documento define reglas para mantener consistencia tecnica y colaboracion.
+
+## 1) Idioma por Area
+
+- `README.md`: ingles.
+- `docs/`: espanol (terminos tecnicos pueden quedar en ingles).
+- `backend/`:
+ - comentarios y docstrings en ingles.
+ - nombres ETL de negocio existentes pueden mantenerse en espanol.
+ - modulos tecnicos y utilidades compartidas en ingles.
+
+## 2) Naming y Estructura
+
+- usar nombres profesionales y estables.
+- evitar nombres temporales en codigo (`fase`, `pr-xx`, `tmp`, etc).
+- mantener coherencia con el estilo del modulo antes de renombrar APIs.
+- evitar sobre-comentarios; comentar solo cuando agrega contexto real.
+- no usar emojis en codigo backend.
+
+## 3) Reglas de Implementacion
+
+- cambios incrementales y compatibles con comportamiento actual.
+- no romper contratos publicos sin requerimiento funcional explicito.
+- separar logica de negocio y utilidades tecnicas.
+- toda capa nueva debe incluir tests.
+
+## 4) Flujo de Ramas
+
+- `main`: rama estable.
+- `feat/backend`: cambios backend.
+- `feat/frontend`: cambios frontend.
+
+Antes de PR:
+1. actualizar rama con `main`.
+2. correr tests relevantes.
+3. verificar build/smoke cuando aplique.
+
+## 5) Commits
+
+- mensajes en ingles, claros y breves.
+- evitar titulos con terminologia interna del plan (`f2`, `pr03`, etc).
+- un commit debe agrupar cambios coherentes.
+
+## 6) Validacion Minima antes de Push
+
+- `pytest -q`
+- smoke ETL si se toca pipeline.
+- smoke frontend si se toca integracion de assets.
+- confirmar que cambios no relacionados no se incluyan por error.
+
+## 7) Politica de Artefactos Generados
+
+- no commitear salidas runtime (`datos/latest`, `datos/history`, `datos/metadata`) salvo decision explicita.
+- commitear codigo, tests y documentacion.
+
+## 8) Definicion de Listo
+
+- tests en verde.
+- sin regresiones de contrato de datos.
+- comportamiento de rollback definido para cambios de riesgo.
diff --git a/docs/data_contract.md b/docs/data_contract.md
index 84dd5c9..480996d 100644
--- a/docs/data_contract.md
+++ b/docs/data_contract.md
@@ -1,85 +1,113 @@
-# Contrato de datos CSV (Backend ↔ Frontend)
+# Contrato de Datos (Backend <-> Frontend)
-Este documento formaliza el contrato de columnas entre el pipeline ETL (`backend/`) y el dashboard Flutter (`frontend/`).
+Este documento define los contratos activos para salidas de datos y metadata.
-## Fuente de verdad
-
-El contrato ejecutable vive en:
+## 1) Contrato CSV
+Fuente de verdad:
- `backend/config/csv_contract.py`
-Versión actual del contrato:
+Objetivo:
+- mantener compatibilidad entre salidas backend y consumo frontend.
+
+Validacion:
+
+```bash
+python backend/validate_csv_contract.py
+```
+
+Modos relevantes:
+
+```bash
+python backend/validate_csv_contract.py --no-strict
+python backend/validate_csv_contract.py --pandera-strict
+python backend/validate_csv_contract.py --no-strict --skip-pandera
+```
+
+## 2) Contrato de Producto de Datos (Manifest)
+
+Fuente de verdad:
+- `backend/config/data_product_contract.py`
+
+Incluye:
+- run manifest
+- dataset manifest
-- `CONTRACT_VERSION = 2026.03`
+### 2.1 Campos obligatorios de run manifest
-El validador consume ese contrato para verificar columnas requeridas y columnas críticas.
+- `run_id`
+- `generated_at_utc`
+- `git_sha`
+- `branch`
+- `source_window_start_utc`
+- `source_window_end_utc`
+- `quality_gate_status` (`pass`, `pass_with_warnings`, `fail`)
+- `datasets`
-Además, el pipeline ETL semanal ejecuta validación de headers con:
+### 2.2 Campos obligatorios de dataset manifest
-- `python backend/validate_csv_contract.py`
+- `dataset_logical_name`
+- `version_semver`
+- `generated_at_utc`
+- `source_run_id`
+- `schema_hash`
+- `row_count`
+- `quality_status` (`pass`, `warning`, `fail`)
+- `latest_path`
+- `history_path`
-Si faltan columnas requeridas o no se cumplen tipos mínimos, el workflow falla antes de publicar cambios de datos.
+## 3) Reglas de Validacion
-Modo opcional no estricto (solo advertencias):
+- fechas en formato ISO-8601 con zona horaria.
+- `version_semver` debe ser una version SemVer valida.
+- `schema_hash` debe ser SHA-256 hexadecimal de 64 caracteres.
+- `row_count` debe ser entero >= 0.
+- `source_run_id` debe coincidir con `run_id`.
+- `history_path` puede ser `null` cuando `quality_status=fail`.
-- `python backend/validate_csv_contract.py --no-strict`
+## 4) Utilidades de Schema y Versionado
-## Reglas del contrato
+Fuente de verdad:
+- `backend/config/schema_contract_utils.py`
-1. **required_columns**: deben existir para considerar que el CSV cumple contrato.
-2. **critical_columns**: no deberían contener nulos; en modo estricto, fallan la validación.
-3. **column_types**: define tipos mínimos esperados por columna (`string`, `integer`, `number`, `datetime`, `string_or_integer`).
-4. **optional_columns**: columnas permitidas (compatibilidad y métricas adicionales), pero no obligatorias.
+Funciones:
+- `compute_schema_hash(...)`
+- `recommend_semver_bump(...)`
+- `aggregate_semver_bump(...)`
-## Tipos mínimos por archivo (resumen)
+Politica SemVer implementada:
+- `major`: cambio breaking (remove/rename required column, tipo incompatible, etc).
+- `minor`: cambios backward-compatible (columna opcional, regla no breaking, etc).
+- `patch`: cambios internos sin romper contrato.
-- `github_repos.csv`
- - `repo_name:string`, `language:string`, `stars:integer`, `forks:integer`, `created_at:datetime`
-- `github_lenguajes.csv`
- - `lenguaje:string`, `repos_count:integer`, `porcentaje:number`
-- `github_ai_repos_insights.csv`
- - `total_repos_analizados:integer`, `repos_ai_detectados:integer`, `porcentaje_ai:number`, `mes_pico_ai:string`, `repos_mes_pico_ai:integer`, `top_keywords_ai:string`, `top_repos_ai:string`
-- `github_commits_frameworks.csv`
- - `framework:string`, `repo:string`, `commits_2025:integer`, `ranking:integer`
-- `github_correlacion.csv`
- - `repo_name:string`, `stars:integer`, `contributors:integer`, `language:string`
-- `so_volumen_preguntas.csv`
- - `lenguaje:string`, `preguntas_nuevas_2025:integer`
-- `so_tasa_aceptacion.csv`
- - `tecnologia:string`, `total_preguntas:integer`, `respuestas_aceptadas:integer`, `tasa_aceptacion_pct:number`
-- `so_tendencias_mensuales.csv`
- - `mes:string`, `python:integer`, `javascript:integer`, `typescript:integer`
-- `reddit_sentimiento_frameworks.csv`
- - `framework:string`, `total_menciones:integer`, `positivos:integer`, `neutros:integer`, `negativos:integer`
- - opcionales: `% positivo:number`, `% neutro:number`, `% negativo:number`
-- `reddit_temas_emergentes.csv`
- - `tema:string`, `menciones:integer`
-- `interseccion_github_reddit.csv`
- - `tecnologia:string`, `tipo:string`, `ranking_github:integer`, `ranking_reddit:string_or_integer`
-- `trend_score.csv`
- - `ranking:integer`, `tecnologia:string`, `github_score:number`, `so_score:number`, `reddit_score:number`, `trend_score:number`, `fuentes:integer`
+## 5) Estrategia de Escritura
-## Archivos clave consumidos por frontend
+Control por flags:
+- `DATA_WRITE_LEGACY_CSV`
+- `DATA_WRITE_LATEST_CSV`
+- `DATA_WRITE_HISTORY_CSV`
-- `github_lenguajes.csv`
- - requeridas: `lenguaje`, `repos_count`, `porcentaje`
-- `so_volumen_preguntas.csv`
- - requeridas: `lenguaje`, `preguntas_nuevas_2025`
-- `so_tasa_aceptacion.csv`
- - requeridas: `tecnologia`, `total_preguntas`, `respuestas_aceptadas`, `tasa_aceptacion_pct`
-- `reddit_temas_emergentes.csv`
- - requeridas: `tema`, `menciones`
-- `trend_score.csv`
- - requeridas: `ranking`, `tecnologia`, `github_score`, `so_score`, `reddit_score`, `trend_score`, `fuentes`
+Rutas:
+- Legacy: `datos/*.csv`
+- Latest: `datos/latest/*.csv`
+- History: `datos/history/<dataset>/year=YYYY/month=MM/day=DD/*.csv`
-## Compatibilidad de `reddit_sentimiento_frameworks.csv`
+## 6) Bridge Frontend
-El backend mantiene como requeridas:
+Fuente de verdad:
+- `backend/export_history_json.py`
-- `framework`, `total_menciones`, `positivos`, `neutros`, `negativos`
+Activos generados:
+- `frontend/assets/data/history_index.json`
+- `frontend/assets/data/trend_score_history.json`
-Y como opcionales para visualización:
+Comportamiento:
+- si el historial esta incompleto o corrupto, se usa fallback a `latest` para trend.
-- `% positivo`, `% neutro`, `% negativo`
+## 7) Recomendacion Operativa
-Esto evita acoplamiento implícito y deja explícita la coexistencia de métricas absolutas y porcentuales.
+Antes de publicar cambios de contrato:
+1. actualizar contrato en backend.
+2. agregar o ajustar tests.
+3. ejecutar `pytest -q`.
+4. validar que frontend sigue consumiendo sin regresiones.
diff --git a/docs/dependency_policy.md b/docs/dependency_policy.md
index abffc9e..4478e22 100644
--- a/docs/dependency_policy.md
+++ b/docs/dependency_policy.md
@@ -1,47 +1,61 @@
-# Política mínima de dependencias y seguridad
+# Politica de Dependencias y Seguridad
-Esta política reduce riesgo técnico y mejora reproducibilidad para el backend.
+Esta politica define criterios minimos para mantener estabilidad y seguridad en backend.
## Objetivos
-- Mantener rangos de versiones controlados en `backend/requirements.txt`.
-- Detectar vulnerabilidades conocidas de forma continua.
-- Definir una cadencia mínima de actualización.
+- controlar versiones en `backend/requirements.txt`.
+- detectar vulnerabilidades conocidas de forma continua.
+- mantener reproducibilidad en CI y local.
-## Reglas de versionado
+## Reglas de Versionado
-1. Evitar rangos abiertos en major (`<3.0` para todo) cuando no sea necesario.
-2. Usar límites superiores por compatibilidad real del proyecto.
-3. Mantener `pytest` en major estable (`>=8,<9`).
+1. usar rangos compatibles con limite superior.
+2. evitar upgrades de major sin validacion de regresion.
+3. mantener dependencias de test en major estable (`pytest >=8,<9`).
-## Auditoría de seguridad
+## Dependencias Core Actuales
-- Local:
- - `make security`
-- CI:
- - Workflow: `Dependency Security Audit`
- - Se ejecuta en:
- - cambios de `backend/requirements.txt`
- - `pull_request` hacia `main`
- - semanalmente (lunes)
- - manualmente (`workflow_dispatch`)
+- `pandas`
+- `requests`
+- `nltk`
+- `pandera`
+- `duckdb`
+- `python-dotenv`
-Si se detectan CVEs, el job falla y se debe corregir antes de mergear a `main`.
+## Auditoria de Seguridad
-## Política mínima de actualización
+Workflow: `.github/workflows/dependency_security.yml`
-- **Mensual**: revisar updates menores/patch de librerías.
-- **Trimestral**: revisar nuevos majors y plan de adopción.
-- **Inmediato**: parchear CVEs con severidad alta/crítica.
+Se ejecuta en:
+- push a `main` y `feat/backend` con cambios en `backend/requirements.txt` o en `.github/workflows/dependency_security.yml`.
+- pull request a `main` con cambios en dependencias.
+- schedule semanal: lunes `09:00 UTC`.
+- manual (`workflow_dispatch`).
-## Flujo recomendado
+Herramienta:
+- `pip-audit`
-1. Crear rama de actualización.
-2. Ajustar `backend/requirements.txt` con cambios mínimos.
-3. Ejecutar:
- - `python -m pytest tests/ -q`
- - `make security`
-4. Abrir PR con resumen:
- - librerías cambiadas
- - motivo (bugfix/CVE/compatibilidad)
- - evidencia de tests y auditoría.
+Si hay vulnerabilidades, el job falla y no se recomienda merge a `main`.
+
+Excepcion temporal vigente:
+- `CVE-2025-14009` (`nltk 3.9.2`) esta ignorado en CI porque aun no existe una version corregida publicada.
+- La excepcion debe revisarse semanalmente y removerse apenas exista fix oficial de `nltk`.
+
+## Flujo Recomendado de Actualizacion
+
+1. crear rama de trabajo.
+2. cambiar dependencias minimas necesarias.
+3. ejecutar:
+ - `python -m pytest -q`
+ - pipeline de seguridad.
+4. documentar en PR:
+ - librerias cambiadas
+ - motivo
+ - evidencia de tests
+
+## Cadencia Recomendada
+
+- mensual: revision de actualizaciones patch/minor.
+- trimestral: analisis de majors.
+- inmediato: CVEs de severidad alta/critica.
diff --git a/frontend/lib/config/feature_flags.dart b/frontend/lib/config/feature_flags.dart
new file mode 100644
index 0000000..aee0534
--- /dev/null
+++ b/frontend/lib/config/feature_flags.dart
@@ -0,0 +1,11 @@
+class FeatureFlags {
+ FeatureFlags._();
+
+ /// Partial cutover flag for the historical bridge JSON assets, resolved at
+ /// compile time from the `USE_HISTORY_BRIDGE_JSON` environment declaration.
+ /// Defaults to `false`, which preserves the current CSV-only behavior.
+ static const bool useHistoryBridgeJson = bool.fromEnvironment(
+ 'USE_HISTORY_BRIDGE_JSON',
+ defaultValue: false,
+ );
+}
diff --git a/frontend/lib/screens/home_screen.dart b/frontend/lib/screens/home_screen.dart
index f2b9e6a..683e414 100644
--- a/frontend/lib/screens/home_screen.dart
+++ b/frontend/lib/screens/home_screen.dart
@@ -1,5 +1,7 @@
import 'package:flutter/material.dart';
import 'package:font_awesome_flutter/font_awesome_flutter.dart';
+import '../config/feature_flags.dart';
+import '../services/csv_service.dart';
class HomeScreen extends StatelessWidget {
const HomeScreen({super.key});
@@ -22,7 +24,7 @@ class HomeScreen extends StatelessWidget {
style: TextStyle(fontSize: 18, color: Colors.grey),
),
const SizedBox(height: 40),
-
+
// KPIs principales con iconos oficiales
Wrap(
spacing: 24,
@@ -103,16 +105,20 @@ class HomeScreen extends StatelessWidget {
],
),
),
-
+
const SizedBox(height: 48),
-
+
+ const _TrendTemporalBridgeCard(),
+
+ const SizedBox(height: 32),
+
// Seccion Sobre el Dashboard
const Text(
'Sobre el Dashboard',
style: TextStyle(fontSize: 24, fontWeight: FontWeight.bold),
),
const SizedBox(height: 20),
-
+
Wrap(
spacing: 24,
runSpacing: 24,
@@ -120,36 +126,54 @@ class HomeScreen extends StatelessWidget {
_buildInfoCardFA(
icon: FontAwesomeIcons.github,
title: 'GitHub Data',
- description: 'Análisis de repositorios, lenguajes más populares y correlación entre stars y contribuidores',
+ description:
+ 'Análisis de repositorios, lenguajes más populares y correlación entre stars y contribuidores',
color: Colors.blue,
),
_buildInfoCardFA(
icon: FontAwesomeIcons.stackOverflow,
title: 'StackOverflow Data',
- description: 'Madurez de tecnologías y evolución del interés en frameworks a lo largo del año',
+ description:
+ 'Madurez de tecnologías y evolución del interés en frameworks a lo largo del año',
color: const Color(0xFFF48024),
),
_buildInfoCardFA(
icon: FontAwesomeIcons.reddit,
title: 'Reddit Data',
- description: 'Sentimiento de la comunidad sobre frameworks backend y temas de discusión frecuentes',
+ description:
+ 'Sentimiento de la comunidad sobre frameworks backend y temas de discusión frecuentes',
color: const Color(0xFFFF4500),
),
],
),
-
+
const SizedBox(height: 48),
-
+
// Integrantes
const Text(
'Integrantes del Equipo',
style: TextStyle(fontSize: 24, fontWeight: FontWeight.bold),
),
const SizedBox(height: 16),
- _buildTeamMemberFA('Samir Caizapasto', 'GitHub ETL & Dashboard', FontAwesomeIcons.github, Colors.blue),
- _buildTeamMemberFA('Andrés Salinas', 'StackOverflow ETL & Dashboard', FontAwesomeIcons.stackOverflow, const Color(0xFFF48024)),
- _buildTeamMemberFA('Mateo Mayorga', 'Reddit ETL & Dashboard', FontAwesomeIcons.reddit, const Color(0xFFFF4500)),
-
+ _buildTeamMemberFA(
+ 'Samir Caizapasto',
+ 'GitHub ETL & Dashboard',
+ FontAwesomeIcons.github,
+ Colors.blue,
+ ),
+ _buildTeamMemberFA(
+ 'Andrés Salinas',
+ 'StackOverflow ETL & Dashboard',
+ FontAwesomeIcons.stackOverflow,
+ const Color(0xFFF48024),
+ ),
+ _buildTeamMemberFA(
+ 'Mateo Mayorga',
+ 'Reddit ETL & Dashboard',
+ FontAwesomeIcons.reddit,
+ const Color(0xFFFF4500),
+ ),
+
const SizedBox(height: 48),
],
),
@@ -218,9 +242,7 @@ class HomeScreen extends StatelessWidget {
decoration: BoxDecoration(
color: Colors.white,
borderRadius: BorderRadius.circular(12),
- border: Border(
- top: BorderSide(color: color, width: 4),
- ),
+ border: Border(top: BorderSide(color: color, width: 4)),
boxShadow: [
BoxShadow(
color: Colors.black.withOpacity(0.05),
@@ -259,7 +281,12 @@ class HomeScreen extends StatelessWidget {
);
}
- Widget _buildTeamMemberFA(String name, String role, IconData icon, Color color) {
+ Widget _buildTeamMemberFA(
+ String name,
+ String role,
+ IconData icon,
+ Color color,
+ ) {
return Padding(
padding: const EdgeInsets.symmetric(vertical: 8),
child: Row(
@@ -282,7 +309,12 @@ class HomeScreen extends StatelessWidget {
}
// Widget con imagen de logo oficial
- Widget _buildImageInsight(String imagePath, String title, String description, Color accentColor) {
+ Widget _buildImageInsight(
+ String imagePath,
+ String title,
+ String description,
+ Color accentColor,
+ ) {
return Container(
padding: const EdgeInsets.all(16),
decoration: BoxDecoration(
@@ -303,15 +335,10 @@ class HomeScreen extends StatelessWidget {
Container(
width: 48,
height: 48,
- decoration: BoxDecoration(
- borderRadius: BorderRadius.circular(10),
- ),
+ decoration: BoxDecoration(borderRadius: BorderRadius.circular(10)),
child: ClipRRect(
borderRadius: BorderRadius.circular(8),
- child: Image.asset(
- imagePath,
- fit: BoxFit.contain,
- ),
+ child: Image.asset(imagePath, fit: BoxFit.contain),
),
),
const SizedBox(width: 16),
@@ -343,3 +370,171 @@ class HomeScreen extends StatelessWidget {
);
}
}
+
+class _TrendTemporalBridgeCard extends StatefulWidget {
+ const _TrendTemporalBridgeCard(); // No fields: the widget takes no configuration.
+
+ @override
+ State<_TrendTemporalBridgeCard> createState() =>
+ _TrendTemporalBridgeCardState();
+}
+
+class _TrendTemporalBridgeCardState extends State<_TrendTemporalBridgeCard> {
+ late final Future