diff --git a/.env.example b/.env.example index 92e7a35..513c975 100644 --- a/.env.example +++ b/.env.example @@ -13,3 +13,14 @@ STACKOVERFLOW_KEY=tu_key_aqui # Registrar en: https://old.reddit.com/prefs/apps (tipo: script) REDDIT_CLIENT_ID=tu_client_id_aqui REDDIT_CLIENT_SECRET=tu_client_secret_aqui + +# Estrategia de escritura de datos (dual write) +# 1 = habilitado, 0 = deshabilitado +DATA_WRITE_LEGACY_CSV=1 +DATA_WRITE_LATEST_CSV=0 +DATA_WRITE_HISTORY_CSV=0 +EXPORT_HISTORY_BRIDGE_JSON=1 + +# Trend score engine selector +# allowed: legacy | duckdb +TREND_SCORE_ENGINE=legacy diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 399aefe..8e5379e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,7 @@ name: CI - Tests on: push: - branches: [main, refactor/standard-structure] + branches: [main, feat/backend, feat/frontend] paths-ignore: - 'datos/**' - 'frontend/assets/data/**' diff --git a/.github/workflows/dependency_security.yml b/.github/workflows/dependency_security.yml index ad11b16..47b6f31 100644 --- a/.github/workflows/dependency_security.yml +++ b/.github/workflows/dependency_security.yml @@ -2,7 +2,7 @@ name: Dependency Security Audit on: push: - branches: [main, refactor/standard-structure] + branches: [main, feat/backend] paths: - 'backend/requirements.txt' - '.github/workflows/dependency_security.yml' @@ -42,4 +42,6 @@ jobs: - name: Run vulnerability audit run: | - pip-audit -r backend/requirements.txt + # Temporary exception: CVE-2025-14009 has no fixed NLTK release yet. + # Keep this ignored only until upstream publishes a patched version. 
+ pip-audit -r backend/requirements.txt --ignore-vuln CVE-2025-14009 diff --git a/.github/workflows/deploy_frontend.yml b/.github/workflows/deploy_frontend.yml index 9582217..3a2b337 100644 --- a/.github/workflows/deploy_frontend.yml +++ b/.github/workflows/deploy_frontend.yml @@ -17,7 +17,7 @@ permissions: jobs: deploy: - if: ${{ github.event_name != 'workflow_run' || github.event.workflow_run.conclusion == 'success' }} + if: ${{ github.event_name != 'workflow_run' || (github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.head_branch == 'main') }} runs-on: ubuntu-latest timeout-minutes: 30 @@ -47,4 +47,4 @@ jobs: uses: peaceiris/actions-gh-pages@v4 with: github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: frontend/build/web \ No newline at end of file + publish_dir: frontend/build/web diff --git a/.github/workflows/etl_semanal.yml b/.github/workflows/etl_semanal.yml index d662e94..46f31e1 100644 --- a/.github/workflows/etl_semanal.yml +++ b/.github/workflows/etl_semanal.yml @@ -2,32 +2,40 @@ name: ETL Weekly Data Refresh on: schedule: - - cron: '0 8 * * 1' # Cada lunes a las 08:00 UTC (03:00 Ecuador) - workflow_dispatch: # Boton manual en Actions + - cron: "0 8 * * 1" + workflow_dispatch: permissions: - contents: write + contents: read -# Evitar que dos runs del ETL corran al mismo tiempo concurrency: group: etl-pipeline cancel-in-progress: false +env: + PYTHON_VERSION: "3.11" + DATA_WRITE_LEGACY_CSV: "1" + DATA_WRITE_LATEST_CSV: "1" + DATA_WRITE_HISTORY_CSV: "1" + EXPORT_HISTORY_BRIDGE_JSON: "1" + TREND_SCORE_ENGINE: "duckdb" + jobs: - etl: + job_github: + name: Source - GitHub runs-on: ubuntu-latest - timeout-minutes: 45 + timeout-minutes: 20 steps: - name: Checkout code uses: actions/checkout@v4 - - name: Set up Python 3.11 + - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.11' - cache: 'pip' - cache-dependency-path: 'backend/requirements.txt' + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + 
cache-dependency-path: backend/requirements.txt - name: Install dependencies run: | @@ -39,19 +47,89 @@ jobs: python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')" - name: Run GitHub ETL - id: github_etl env: GITHUB_TOKEN: ${{ secrets.GH_PAT }} run: python backend/github_etl.py + - name: Upload GitHub artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: github-data + if-no-files-found: warn + path: | + datos/github_repos_2025.csv + datos/github_lenguajes.csv + datos/github_ai_repos_insights.csv + datos/github_commits_frameworks.csv + datos/github_correlacion.csv + + job_stackoverflow: + name: Source - StackOverflow + runs-on: ubuntu-latest + timeout-minutes: 20 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + cache-dependency-path: backend/requirements.txt + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -r backend/requirements.txt + - name: Run StackOverflow ETL - id: so_etl env: STACKOVERFLOW_KEY: ${{ secrets.STACKOVERFLOW_KEY }} run: python backend/stackoverflow_etl.py - - name: Run Reddit ETL - id: reddit_etl + - name: Upload StackOverflow artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: stackoverflow-data + if-no-files-found: warn + path: | + datos/so_volumen_preguntas.csv + datos/so_tasa_aceptacion.csv + datos/so_tendencias_mensuales.csv + + job_reddit: + name: Source - Reddit + runs-on: ubuntu-latest + timeout-minutes: 20 + outputs: + status: ${{ steps.reddit_run.outputs.status }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + cache-dependency-path: backend/requirements.txt + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -r 
backend/requirements.txt + + - name: Download NLTK data + run: | + python -c "import nltk; nltk.download('vader_lexicon'); nltk.download('stopwords')" + + - name: Run Reddit ETL (non-blocking) + id: reddit_run env: REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }} REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }} @@ -62,22 +140,200 @@ jobs: code=$? if [ $code -ne 0 ]; then echo "status=failed" >> "$GITHUB_OUTPUT" - echo "Reddit ETL falló (no crítico), se continúa con datos previos si existen." >> "$GITHUB_STEP_SUMMARY" + echo "Reddit ETL failed (non-critical); aggregate will continue with existing data fallback." >> "$GITHUB_STEP_SUMMARY" exit 0 fi echo "status=ok" >> "$GITHUB_OUTPUT" + - name: Upload Reddit artifacts + if: always() + uses: actions/upload-artifact@v4 + with: + name: reddit-data + if-no-files-found: warn + path: | + datos/reddit_sentimiento_frameworks.csv + datos/reddit_temas_emergentes.csv + datos/interseccion_github_reddit.csv + + job_aggregate: + name: Aggregate + Quality Gate + runs-on: ubuntu-latest + timeout-minutes: 20 + needs: + - job_github + - job_stackoverflow + - job_reddit + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: pip + cache-dependency-path: backend/requirements.txt + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install -r backend/requirements.txt + + - name: Download GitHub artifacts + uses: actions/download-artifact@v4 + with: + name: github-data + path: artifacts/github + + - name: Download StackOverflow artifacts + uses: actions/download-artifact@v4 + with: + name: stackoverflow-data + path: artifacts/stackoverflow + + - name: Download Reddit artifacts + uses: actions/download-artifact@v4 + with: + name: reddit-data + path: artifacts/reddit + + - name: Materialize source outputs + shell: bash + run: | + mkdir -p datos frontend/assets/data + cp -f 
artifacts/github/*.csv datos/ 2>/dev/null || true + cp -f artifacts/stackoverflow/*.csv datos/ 2>/dev/null || true + cp -f artifacts/reddit/*.csv datos/ 2>/dev/null || true + + - name: Verify artifact handoff + shell: bash + run: | + missing=0 + for required in \ + github_repos_2025.csv \ + github_lenguajes.csv \ + github_commits_frameworks.csv \ + github_correlacion.csv \ + so_volumen_preguntas.csv \ + so_tasa_aceptacion.csv \ + so_tendencias_mensuales.csv; do + if [ ! -f "datos/${required}" ]; then + echo "::error::Missing required artifact file datos/${required}" + missing=1 + fi + done + for optional in \ + reddit_sentimiento_frameworks.csv \ + reddit_temas_emergentes.csv \ + interseccion_github_reddit.csv; do + if [ ! -f "datos/${optional}" ]; then + echo "::warning::Optional artifact missing (degraded mode may continue): datos/${optional}" + fi + done + if [ "$missing" -ne 0 ]; then + exit 1 + fi + - name: Run Trend Score - id: trend_score run: python backend/trend_score.py - name: Sync CSVs to frontend assets run: python backend/sync_assets.py - name: Validate CSV contract headers - id: validate_contract run: python backend/validate_csv_contract.py + - name: Verify aggregate outputs + shell: bash + run: | + missing=0 + for required in \ + datos/trend_score.csv \ + frontend/assets/data/trend_score.csv \ + frontend/assets/data/github_lenguajes.csv \ + frontend/assets/data/github_commits_frameworks.csv \ + frontend/assets/data/github_correlacion.csv \ + frontend/assets/data/so_volumen_preguntas.csv \ + frontend/assets/data/so_tasa_aceptacion.csv \ + frontend/assets/data/so_tendencias_mensuales.csv \ + frontend/assets/data/reddit_sentimiento_frameworks.csv \ + frontend/assets/data/reddit_temas_emergentes.csv \ + frontend/assets/data/interseccion_github_reddit.csv; do + if [ ! 
-f "$required" ]; then + echo "::error::Missing aggregate output $required" + missing=1 + fi + done + if [ "${EXPORT_HISTORY_BRIDGE_JSON}" = "1" ]; then + for bridge_file in frontend/assets/data/history_index.json frontend/assets/data/trend_score_history.json; do + if [ ! -f "$bridge_file" ]; then + echo "::error::Missing bridge output $bridge_file" + missing=1 + fi + done + fi + if [ "$missing" -ne 0 ]; then + exit 1 + fi + + - name: Upload aggregate artifacts + uses: actions/upload-artifact@v4 + with: + name: aggregate-data + if-no-files-found: error + path: | + datos/*.csv + datos/latest/*.csv + datos/history/**/*.csv + datos/metadata/*.json + frontend/assets/data/*.csv + frontend/assets/data/*.json + + - name: ETL aggregate summary + if: always() + run: | + echo "## ETL Aggregate Summary" >> "$GITHUB_STEP_SUMMARY" + echo "- GitHub job: ${{ needs.job_github.result }}" >> "$GITHUB_STEP_SUMMARY" + echo "- StackOverflow job: ${{ needs.job_stackoverflow.result }}" >> "$GITHUB_STEP_SUMMARY" + echo "- Reddit job: ${{ needs.job_reddit.outputs.status || 'ok' }}" >> "$GITHUB_STEP_SUMMARY" + echo "- Aggregate status: ${{ job.status }}" >> "$GITHUB_STEP_SUMMARY" + + job_publish: + name: Publish Data + runs-on: ubuntu-latest + timeout-minutes: 10 + needs: + - job_aggregate + - job_reddit + if: ${{ needs.job_aggregate.result == 'success' }} + permissions: + contents: write + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download aggregate artifacts + uses: actions/download-artifact@v4 + with: + name: aggregate-data + path: artifact_payload + + - name: Restore aggregated files into workspace + shell: bash + run: | + mkdir -p datos datos/latest datos/history datos/metadata frontend/assets/data + cp -f artifact_payload/datos/*.csv datos/ 2>/dev/null || true + cp -f artifact_payload/datos/latest/*.csv datos/latest/ 2>/dev/null || true + cp -f artifact_payload/datos/metadata/*.json datos/metadata/ 2>/dev/null || true + cp -f 
artifact_payload/frontend/assets/data/*.json frontend/assets/data/ 2>/dev/null || true + cp -f artifact_payload/frontend/assets/data/*.csv frontend/assets/data/ 2>/dev/null || true + if [ -d artifact_payload/datos/history ]; then + rsync -a artifact_payload/datos/history/ datos/history/ + fi + - name: Commit updated data id: commit_data shell: bash @@ -88,21 +344,18 @@ jobs: git add datos/ frontend/assets/data/ if git diff --staged --quiet; then echo "changed=false" >> "$GITHUB_OUTPUT" - echo "Sin cambios de datos para commit." >> "$GITHUB_STEP_SUMMARY" + echo "No data changes to commit." >> "$GITHUB_STEP_SUMMARY" exit 0 fi - git commit -m "data: actualizar CSVs" + git commit -m "data: refresh CSV outputs" git pull --rebase origin "$TARGET_BRANCH" git push origin "HEAD:$TARGET_BRANCH" echo "changed=true" >> "$GITHUB_OUTPUT" - - name: ETL Summary + - name: Publish summary if: always() run: | - echo "## Resumen ETL" >> $GITHUB_STEP_SUMMARY - echo "- GitHub ETL: ${{ steps.github_etl.outcome }}" >> $GITHUB_STEP_SUMMARY - echo "- StackOverflow ETL: ${{ steps.so_etl.outcome }}" >> $GITHUB_STEP_SUMMARY - echo "- Reddit ETL: ${{ steps.reddit_etl.outputs.status || 'ok' }}" >> $GITHUB_STEP_SUMMARY - echo "- Trend Score: ${{ steps.trend_score.outcome }}" >> $GITHUB_STEP_SUMMARY - echo "- Contrato CSV: ${{ steps.validate_contract.outcome }}" >> $GITHUB_STEP_SUMMARY - echo "- Commit de datos: ${{ steps.commit_data.outputs.changed || 'false' }}" >> $GITHUB_STEP_SUMMARY + echo "## ETL Publish Summary" >> "$GITHUB_STEP_SUMMARY" + echo "- Aggregate gate: ${{ needs.job_aggregate.result }}" >> "$GITHUB_STEP_SUMMARY" + echo "- Reddit status: ${{ needs.job_reddit.outputs.status || 'ok' }}" >> "$GITHUB_STEP_SUMMARY" + echo "- Data committed: ${{ steps.commit_data.outputs.changed || 'false' }}" >> "$GITHUB_STEP_SUMMARY" diff --git a/.gitignore b/.gitignore index 806da86..98159d1 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,8 @@ logs/ # Temporary files etl_log_full.txt + +# 
Runtime ETL outputs (dual-write paths) +datos/latest/ +datos/history/ +datos/metadata/ diff --git a/README.md b/README.md index 2bb54ff..02f2304 100644 --- a/README.md +++ b/README.md @@ -1,258 +1,153 @@ -# Tech Trends 2025 - -
- -![Data Engineer](https://img.shields.io/badge/Role-Data_Engineer-orange?style=for-the-badge&logo=apache-spark&logoColor=white) -![Python](https://img.shields.io/badge/Python-3.9+-3776AB?style=for-the-badge&logo=python&logoColor=white) -![Flutter](https://img.shields.io/badge/Flutter-3.38-02569B?style=for-the-badge&logo=flutter&logoColor=white) -![Status](https://img.shields.io/badge/Status-Completed-success?style=for-the-badge) -![Tests](https://img.shields.io/badge/Tests-72_Passed-brightgreen?style=for-the-badge&logo=pytest&logoColor=white) -![CI](https://github.com/Sam-24-dev/Technology-trend-analysis-platform/actions/workflows/ci.yml/badge.svg) - -
- - - - - -
- ---- - -## Project Overview - -End-to-end data engineering platform that extracts, transforms, and visualizes technology trends from the three largest developer communities: GitHub, StackOverflow, and Reddit. - -| Challenge | Solution | Impact | -|-----------|----------|--------| -| Fragmented trend data | Multi-source ETL pipeline | Unified technology ranking | -| No cross-platform comparison | Composite Trend Score index | Weighted ranking across 3 sources | -| Manual analysis | Automated pipeline with OOP | Repeatable, testable, maintainable | -| Raw data, no insights | Interactive Flutter dashboard | Real-time trend visualization | - -> **Core Value:** This platform demonstrates a production-grade data pipeline that ingests from 3 APIs, applies NLP sentiment analysis, and produces a composite ranking — the kind of system that powers real technology intelligence products. - ---- - -## Pipeline Architecture - -``` -┌─────────────┐ ┌─────────────┐ ┌─────────────┐ -│ GitHub │ │StackOverflow│ │ Reddit │ -│ API │ │ API │ │ JSON API │ -└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ - │ │ │ - ▼ ▼ ▼ -┌─────────────────────────────────────────────────────┐ -│ BaseETL (Abstract Class) │ -│ configurar_logging() · guardar_csv() · ejecutar() │ -├─────────────┬─────────────────┬─────────────────────┤ -│ GitHubETL │ StackOverflowETL│ RedditETL │ -│ 4 analyses │ 3 analyses │ 3 analyses + NLP │ -└──────┬──────┘────────┬────────┘──────────┬──────────┘ - │ │ │ - ▼ ▼ ▼ -┌─────────────────────────────────────────────────────┐ -│ datos/ (11 CSVs) │ -│ Validated by validador.py before each save │ -└──────────────────────┬──────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────┐ -│ Trend Score Engine │ -│ GitHub 40% + StackOverflow 35% + Reddit 25% │ -│ Min-max normalization · Outer join · Ranking │ -└──────────────────────┬──────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────┐ -│ Flutter Web Dashboard 
│ -│ 4 views · fl_chart · Export ZIP · Responsive │ -└─────────────────────────────────────────────────────┘ +# Technology Trend Analysis Platform + +End-to-end data pipeline and dashboard for technology trends across GitHub, StackOverflow, and Reddit. + +## Current Status + +- Backend refactor implementation is complete for F2-F7. +- Test suite is green (`133 passed`). +- Operational cutover is still pending: 4 weekly ETL runs without critical failures. + +## What Is Implemented + +- Multi-source ETL pipeline (GitHub, StackOverflow, Reddit). +- Dual write strategy: + - `datos/*.csv` (legacy) + - `datos/latest/*.csv` (latest) + - `datos/history//year=YYYY/month=MM/day=DD/*.csv` (history snapshots) +- Trend Score engine selector: + - `legacy` (pandas) + - `duckdb` (SQL engine with equivalence tests) +- Severity-based quality gate (`critical`, `warning`, `info`) with Pandera support. +- Data product contract for run and dataset manifests. +- Frontend bridge JSON assets: + - `history_index.json` + - `trend_score_history.json` +- Frontend feature flag for partial cutover to bridge JSON. 
+ +## Repository Layout + +```text +backend/ + base_etl.py + trend_score.py + trend_score_duckdb.py + sync_assets.py + export_history_json.py + validate_csv_contract.py + validador.py + config/ + settings.py + csv_contract.py + data_product_contract.py + schema_contract_utils.py + quality/ + pandera_schemas.py + degradation_policy.py + +datos/ + *.csv + latest/*.csv + history//year=YYYY/month=MM/day=DD/*.csv + metadata/ + +frontend/ + lib/ + assets/data/ + +docs/ +tests/ +.github/workflows/ ``` -| Layer | Component | Output | -|-------|-----------|--------| -| **Extraction** | 3 API connectors | Raw data from GitHub, SO, Reddit | -| **Transformation** | BaseETL + 3 children | 11 processed CSVs | -| **Scoring** | trend_score.py | Unified technology ranking | -| **Validation** | validador.py + csv_contract.py | Column checks + strict schema/types validation | -| **Presentation** | Flutter Web | 4 interactive dashboards | - ---- - -## Key Metrics & Results +## Runtime Workflows -| Metric | Value | -|--------|-------| -| **Repositories analyzed** | 1,000 | -| **StackOverflow questions** | 5 languages + 5 frameworks | -| **Reddit posts** | 500+ from r/webdev | -| **Output CSVs** | 11 validated datasets | -| **Trend Score** | Top technology ranking | -| **Tests** | 72 passing (pytest) | -| **Code coverage** | All ETL modules tested | +### 1) ETL Weekly Data Refresh (`etl_semanal.yml`) ---- +Trigger: +- Schedule: every Monday at `08:00 UTC`. +- Manual: `workflow_dispatch`. -## Dashboard Features +Flow: +1. Run source jobs in parallel: GitHub, StackOverflow, Reddit. +2. Upload source artifacts. +3. Aggregate job downloads artifacts, runs Trend Score, syncs frontend assets, validates data contract. +4. Publish job commits refreshed data if aggregate is successful. 
-| Page | Visualizations | -|------|----------------| -| **Home** | Executive KPIs, global insights, navigation | -| **GitHub** | Top 10 languages · Framework commits · Stars vs Contributors correlation | -| **StackOverflow** | Question volume · Acceptance rates · Monthly trends (Python/JS/TS) | -| **Reddit** | Framework sentiment · Emerging topics · GitHub-Reddit intersection | +Important behavior: +- Reddit source is non-blocking in source stage (degraded mode is allowed). +- Aggregate stage enforces required outputs for frontend and trend artifacts. -Each dashboard includes **Key Insights** cards and an **Export ZIP** button. +### 2) CI - Tests (`ci.yml`) ---- +Trigger: +- Push and pull request checks for Python tests. -## Tech Stack +### 3) Dependency Security Audit (`dependency_security.yml`) -| Layer | Technologies | -|-------|--------------| -| **ETL Pipeline** | Python 3.9+, pandas, requests, NLTK | -| **Architecture** | BaseETL (OOP), custom exceptions, data validation | -| **Testing** | pytest, unittest.mock (72 tests, API mocking) | -| **Frontend** | Flutter Web, Dart, fl_chart, google_fonts | -| **Data Storage** | CSV (11 files, pathlib paths) | -| **Automation** | Makefile, sync_assets.py, GitHub Actions | -| **Security** | pip-audit, dependency security workflow | -| **Deployment** | GitHub Pages | +Trigger: +- Dependency file changes and weekly schedule (Monday at `09:00 UTC`). +- Manual execution supported. ---- +### 4) Frontend Deploy (`deploy_frontend.yml`) -## Quick Start +Trigger: +- Push to `main` affecting frontend/data paths. +- Successful completion of ETL workflow. +- Manual execution. 
-```bash -# Clone repository -git clone https://github.com/Sam-24-dev/Technology-trend-analysis-platform.git -cd Technology-trend-analysis-platform - -# Install dependencies -make install +## Environment Variables -# Run full pipeline (ETL + Trend Score) -make etl +Create `.env` in repo root: -# Run tests -make test +```env +GITHUB_TOKEN=your_token +STACKOVERFLOW_KEY=your_key +REDDIT_CLIENT_ID=your_client_id +REDDIT_CLIENT_SECRET=your_client_secret + +DATA_WRITE_LEGACY_CSV=1 +DATA_WRITE_LATEST_CSV=0 +DATA_WRITE_HISTORY_CSV=0 +EXPORT_HISTORY_BRIDGE_JSON=1 +TREND_SCORE_ENGINE=legacy +``` -# Sync CSVs to frontend -make sync +Notes: +- Local defaults keep legacy behavior. +- CI workflow sets dual write and DuckDB explicitly for weekly runs. -# Or run everything at once -make all -``` +## Local Commands -### Environment Setup +```bash +# backend +pip install -r backend/requirements.txt +python -m pytest -q -Create a `.env` file in the project root: -```env -GITHUB_TOKEN=your_github_personal_access_token -STACKOVERFLOW_KEY=your_so_api_key # optional -REDDIT_CLIENT_ID=your_reddit_client_id # optional (OAuth) -REDDIT_CLIENT_SECRET=your_reddit_secret # optional (OAuth) -``` +# run ETLs +python backend/github_etl.py +python backend/stackoverflow_etl.py +python backend/reddit_etl.py +python backend/trend_score.py -### Run Frontend +# sync assets + bridge +python backend/sync_assets.py -```bash +# frontend cd frontend flutter pub get flutter run -d chrome ``` -> **Note:** Pre-processed data is included in `datos/`. Only run ETL if you need fresh data. 
- ---- - -## Project Structure - -``` -Technology-trend-analysis-platform/ -├── backend/ # ETL Pipeline (Python) -│ ├── config/ -│ │ ├── __init__.py -│ │ └── settings.py # Centralized config (pathlib, dates) -│ ├── base_etl.py # Abstract ETL base class (OOP) -│ ├── github_etl.py # GitHubETL: 4 analysis steps -│ ├── stackoverflow_etl.py # StackOverflowETL: 3 analysis steps -│ ├── reddit_etl.py # RedditETL: 3 analysis steps + NLP -│ ├── trend_score.py # Composite index (3 sources) -│ ├── validador.py # DataFrame validation before save -│ ├── exceptions.py # ETLExtractionError, ETLValidationError -│ ├── sync_assets.py # Copy CSVs to frontend -│ └── requirements.txt -├── datos/ # Processed CSVs (11 files) -├── docs/ -│ └── architecture.md -├── frontend/ # Flutter Web Dashboard -│ ├── lib/ -│ │ ├── main.dart -│ │ ├── screens/ # 5 screens (home, github, so, reddit) -│ │ ├── models/ # Data models per source -│ │ ├── services/ # CSV parsing service -│ │ └── widgets/ # Reusable chart card -│ ├── assets/ -│ │ ├── data/ # CSVs for visualization -│ │ └── images/ # Technology logos -│ └── pubspec.yaml -├── logs/ # Daily ETL logs -├── tests/ # pytest suite (72 tests) -│ ├── conftest.py -│ ├── test_github_etl.py -│ ├── test_stackoverflow_etl.py -│ ├── test_reddit_etl.py -│ └── test_trend_score.py -├── .env.example -├── .gitignore -├── LICENSE -├── Makefile # make install/etl/test/sync/all -├── pyproject.toml # Pylint + pytest config -└── README.md -``` - ---- - -## Scalability & Roadmap - -- **Orchestration:** Pipeline structure is compatible with Apache Airflow for scheduled runs -- **Database:** Migration path to PostgreSQL/BigQuery for data warehousing -- **Containerization:** Ready for Docker deployment -- **CI/CD:** GitHub Actions for automated testing and deployment -- **API Layer:** FastAPI integration for programmatic data access - ---- - -## Team - -| Member | Role | Responsibility | -|--------|------|----------------| -| **Samir Caizapasto** | Lead Developer | GitHub 
ETL + Dashboard + Architecture | -| **Andrés Salinas** | Developer | StackOverflow ETL + Dashboard | -| **Mateo Mayorga** | Developer | Reddit ETL + Dashboard + NLP | - ---- - -
- -### Author - -**Samir Caizapasto** -*Junior Data Engineer & Analyst* - -[![](https://img.shields.io/badge/LinkedIn-Connect-0077B5?style=for-the-badge&logo=linkedin)](https://www.linkedin.com/in/samir-caizapasto/) -[![](https://img.shields.io/badge/Portfolio-Visit-00d4ff?style=for-the-badge&logo=vercel)](https://portafolio-samir-tau.vercel.app/) -[![](https://img.shields.io/badge/GitHub-Follow-181717?style=for-the-badge&logo=github)](https://github.com/Sam-24-dev) - -
+## Release Readiness ---- +Release and cutover policy is defined in: +- `docs/ROADMAP_V2_IMPLEMENTATION_PLAN.md` (sections 19 and 20) -
+In short: +- Implementation is done. +- Production cutover requires operational stability gates. -⭐ If this project demonstrates useful data engineering practices, please give it a star. +## License -
+MIT diff --git a/backend/base_etl.py b/backend/base_etl.py index 2912e03..8a47152 100644 --- a/backend/base_etl.py +++ b/backend/base_etl.py @@ -9,17 +9,24 @@ class GitHubETL(BaseETL): def definir_pasos(self): return [ - ("Extraccion de repos", self.extraer_repos), - ("Analisis de lenguajes", self.analizar_lenguajes), + ("Repository extraction", self.extraer_repos), + ("Language analysis", self.analizar_lenguajes), ] """ import logging import sys -from datetime import datetime +from datetime import datetime, timezone from abc import ABC, abstractmethod from time import perf_counter from config.settings import LOG_FORMAT, LOG_DATE_FORMAT, LOGS_DIR, ARCHIVOS_SALIDA +from config.settings import ( + WRITE_LEGACY_CSV, + WRITE_LATEST_CSV, + WRITE_HISTORY_CSV, + get_latest_output_path, + get_history_output_path, +) from exceptions import ETLExtractionError, ETLValidationError from validador import validar_dataframe @@ -68,7 +75,7 @@ def configurar_logging(self): self.logger.addHandler(file_handler) def guardar_csv(self, df, nombre_archivo): - """Validates and saves a DataFrame to CSV. + """Validates and saves a DataFrame to one or more CSV destinations. Args: df: DataFrame to save. @@ -77,17 +84,45 @@ def guardar_csv(self, df, nombre_archivo): Raises: ETLValidationError: If the DataFrame is empty. 
""" - ruta = ARCHIVOS_SALIDA.get(nombre_archivo) - if ruta is None: + ruta_legacy = ARCHIVOS_SALIDA.get(nombre_archivo) + if ruta_legacy is None: self.logger.warning("No hay ruta de salida para '%s'", nombre_archivo) return validar_dataframe(df, nombre_archivo) - df.to_csv(ruta, index=False, encoding="utf-8") + + destinos = [] + if WRITE_LEGACY_CSV: + destinos.append(("legacy", ruta_legacy)) + if WRITE_LATEST_CSV: + ruta_latest = get_latest_output_path(nombre_archivo) + if ruta_latest is not None: + destinos.append(("latest", ruta_latest)) + if WRITE_HISTORY_CSV: + ruta_history = get_history_output_path(nombre_archivo, fecha=datetime.now(timezone.utc)) + if ruta_history is not None: + destinos.append(("history", ruta_history)) + + if not destinos: + self.logger.warning( + "Escritura deshabilitada para '%s' (sin destinos activos por config)", + nombre_archivo, + ) + return + + rutas_escritas = set() + for salida, ruta in destinos: + ruta = ruta.resolve() + if ruta in rutas_escritas: + continue + ruta.parent.mkdir(parents=True, exist_ok=True) + df.to_csv(ruta, index=False, encoding="utf-8") + rutas_escritas.add(ruta) + self._run_summary["files_written"].append(str(ruta)) + self.logger.info("[WRITE] archivo=%s destino=%s filas=%d", ruta, salida, len(df)) + filas = len(df) - self._run_summary["files_written"].append(str(ruta)) self._run_summary["rows_written"] += filas - self.logger.info("[WRITE] archivo=%s filas=%d", ruta, filas) @abstractmethod def definir_pasos(self): @@ -102,8 +137,8 @@ def definir_pasos(self): Example: return [ - ("Extraccion", self.extraer_repos), - ("Lenguajes", self.analizar_lenguajes), + ("Extraction", self.extraer_repos), + ("Languages", self.analizar_lenguajes), ] """ raise NotImplementedError diff --git a/backend/config/csv_contract.py b/backend/config/csv_contract.py index 9bcf569..9234e45 100644 --- a/backend/config/csv_contract.py +++ b/backend/config/csv_contract.py @@ -1,12 +1,12 @@ """ -Contrato de esquemas CSV compartido entre 
backend y frontend. +Shared CSV schema contract between backend and frontend. -Este módulo define las columnas requeridas/criticas para cada salida -del pipeline ETL. Centralizar este contrato reduce acoplamiento implícito -y hace explícitas las dependencias de datos entre módulos. +This module defines required/critical columns for each ETL output. +Centralizing the contract reduces implicit coupling and makes +data dependencies explicit across modules. """ -CONTRACT_VERSION = "2026.03" +CONTRACT_VERSION = "2026.04" CSV_SCHEMA_CONTRACT = { "github_repos": { @@ -156,25 +156,30 @@ def get_required_columns(nombre_archivo): - """Retorna las columnas requeridas para un archivo lógico de salida.""" + """Returns required columns for a logical output file.""" return CSV_SCHEMA_CONTRACT.get(nombre_archivo, {}).get("required_columns", []) def get_critical_columns(nombre_archivo): - """Retorna las columnas críticas para un archivo lógico de salida.""" + """Returns critical columns for a logical output file.""" return CSV_SCHEMA_CONTRACT.get(nombre_archivo, {}).get("critical_columns", []) def get_optional_columns(nombre_archivo): - """Retorna columnas opcionales para un archivo lógico de salida.""" + """Returns optional columns for a logical output file.""" return CSV_SCHEMA_CONTRACT.get(nombre_archivo, {}).get("optional_columns", []) def get_column_types(nombre_archivo): - """Retorna contrato mínimo de tipos por columna para un CSV lógico.""" + """Returns the minimal column-type contract for a logical CSV.""" return CSV_SCHEMA_CONTRACT.get(nombre_archivo, {}).get("column_types", {}) def get_contract_version(): - """Retorna la versión vigente del contrato de datos CSV.""" + """Returns the current CSV data contract version.""" return CONTRACT_VERSION + + +def get_logical_dataset_names(): + """Returns logical dataset names available in the CSV contract.""" + return sorted(CSV_SCHEMA_CONTRACT.keys()) diff --git a/backend/config/data_product_contract.py 
b/backend/config/data_product_contract.py new file mode 100644 index 0000000..cbd9c5f --- /dev/null +++ b/backend/config/data_product_contract.py @@ -0,0 +1,254 @@ +"""Data product contract for ETL run manifests. + +This module defines the minimal structure and validations for: +1. Run manifest (execution level) +2. Dataset manifest (output level) + +It stays separate from the CSV contract to enable storage evolution +(latest/history/metadata) without breaking V1. +""" + +from __future__ import annotations + +import re +from datetime import datetime, timezone +from typing import Any, Mapping + + +DATA_PRODUCT_CONTRACT_VERSION = "1.0.0" + +QUALITY_GATE_STATUSES = {"pass", "pass_with_warnings", "fail"} +DATASET_QUALITY_STATUSES = {"pass", "warning", "fail"} + +RUN_REQUIRED_FIELDS = ( + "run_id", + "generated_at_utc", + "git_sha", + "branch", + "source_window_start_utc", + "source_window_end_utc", + "quality_gate_status", + "datasets", +) + +DATASET_REQUIRED_FIELDS = ( + "dataset_logical_name", + "version_semver", + "generated_at_utc", + "source_run_id", + "schema_hash", + "row_count", + "quality_status", + "latest_path", + "history_path", +) + +_SEMVER_RE = re.compile( + r"^(0|[1-9]\d*)\." + r"(0|[1-9]\d*)\." + r"(0|[1-9]\d*)" + r"(?:-[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?" 
+ r"(?:\+[0-9A-Za-z-]+(?:\.[0-9A-Za-z-]+)*)?$" +) +_HEX64_RE = re.compile(r"^[a-fA-F0-9]{64}$") + + +def _is_non_empty_string(value: Any) -> bool: + return isinstance(value, str) and bool(value.strip()) + + +def get_data_product_contract_version() -> str: + """Returns the current data product contract version.""" + return DATA_PRODUCT_CONTRACT_VERSION + + +def utc_now_iso() -> str: + """Returns UTC datetime in ISO-8601 format with Z suffix.""" + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def is_valid_semver(version: Any) -> bool: + """Validates semantic versioning (SemVer 2.0.0).""" + return _is_non_empty_string(version) and _SEMVER_RE.fullmatch(version.strip()) is not None + + +def is_valid_iso_utc(value: Any) -> bool: + """Validates ISO-8601 datetime with timezone.""" + if not _is_non_empty_string(value): + return False + + text = value.strip().replace("Z", "+00:00") + try: + parsed = datetime.fromisoformat(text) + except ValueError: + return False + + return parsed.tzinfo is not None + + +def validate_dataset_manifest(dataset_manifest: Mapping[str, Any], expected_run_id: str | None = None) -> list[str]: + """Validates minimal structure and rules for a dataset manifest. + + Args: + dataset_manifest: Individual dataset manifest. + expected_run_id: If provided, validates source_run_id == expected_run_id. + + Returns: + Error list. An empty list means valid manifest. 
+ """ + errors: list[str] = [] + + if not isinstance(dataset_manifest, Mapping): + return ["dataset manifest debe ser un objeto (dict/mapping)"] + + for field in DATASET_REQUIRED_FIELDS: + if field not in dataset_manifest: + errors.append(f"falta campo requerido '{field}'") + + dataset_name = dataset_manifest.get("dataset_logical_name") + if "dataset_logical_name" in dataset_manifest and not _is_non_empty_string(dataset_name): + errors.append("'dataset_logical_name' debe ser string no vacio") + + version_semver = dataset_manifest.get("version_semver") + if "version_semver" in dataset_manifest and not is_valid_semver(version_semver): + errors.append("'version_semver' no cumple SemVer") + + generated_at_utc = dataset_manifest.get("generated_at_utc") + if "generated_at_utc" in dataset_manifest and not is_valid_iso_utc(generated_at_utc): + errors.append("'generated_at_utc' no es ISO-8601 valido con zona horaria") + + source_run_id = dataset_manifest.get("source_run_id") + if "source_run_id" in dataset_manifest and not _is_non_empty_string(source_run_id): + errors.append("'source_run_id' debe ser string no vacio") + if expected_run_id and source_run_id != expected_run_id: + errors.append("'source_run_id' no coincide con run_id del manifest principal") + + schema_hash = dataset_manifest.get("schema_hash") + if "schema_hash" in dataset_manifest: + if not _is_non_empty_string(schema_hash) or _HEX64_RE.fullmatch(schema_hash.strip()) is None: + errors.append("'schema_hash' debe ser hash sha256 en hexadecimal (64 chars)") + + row_count = dataset_manifest.get("row_count") + if "row_count" in dataset_manifest: + if not isinstance(row_count, int): + errors.append("'row_count' debe ser integer") + elif row_count < 0: + errors.append("'row_count' no puede ser negativo") + + quality_status = dataset_manifest.get("quality_status") + if "quality_status" in dataset_manifest and quality_status not in DATASET_QUALITY_STATUSES: + errors.append(f"'quality_status' invalido: 
{quality_status}") + + latest_path = dataset_manifest.get("latest_path") + if "latest_path" in dataset_manifest and not _is_non_empty_string(latest_path): + errors.append("'latest_path' debe ser string no vacio") + + history_path = dataset_manifest.get("history_path") + if "history_path" in dataset_manifest: + if quality_status == "fail": + if history_path is not None and not _is_non_empty_string(history_path): + errors.append("'history_path' debe ser null o string no vacio cuando quality_status=fail") + elif not _is_non_empty_string(history_path): + errors.append("'history_path' debe ser string no vacio") + + return errors + + +def validate_run_manifest(run_manifest: Mapping[str, Any]) -> tuple[bool, list[str]]: + """Validates minimal structure and rules for a run manifest.""" + errors: list[str] = [] + + if not isinstance(run_manifest, Mapping): + return False, ["run manifest debe ser un objeto (dict/mapping)"] + + for field in RUN_REQUIRED_FIELDS: + if field not in run_manifest: + errors.append(f"falta campo requerido '{field}'") + + run_id = run_manifest.get("run_id") + if "run_id" in run_manifest and not _is_non_empty_string(run_id): + errors.append("'run_id' debe ser string no vacio") + + generated_at_utc = run_manifest.get("generated_at_utc") + if "generated_at_utc" in run_manifest and not is_valid_iso_utc(generated_at_utc): + errors.append("'generated_at_utc' no es ISO-8601 valido con zona horaria") + + for field in ("source_window_start_utc", "source_window_end_utc"): + value = run_manifest.get(field) + if field in run_manifest and not is_valid_iso_utc(value): + errors.append(f"'{field}' no es ISO-8601 valido con zona horaria") + + quality_gate_status = run_manifest.get("quality_gate_status") + if "quality_gate_status" in run_manifest and quality_gate_status not in QUALITY_GATE_STATUSES: + errors.append(f"'quality_gate_status' invalido: {quality_gate_status}") + + for field in ("git_sha", "branch"): + value = run_manifest.get(field) + if field in 
run_manifest and not _is_non_empty_string(value): + errors.append(f"'{field}' debe ser string no vacio") + + datasets = run_manifest.get("datasets") + if "datasets" in run_manifest: + if not isinstance(datasets, list): + errors.append("'datasets' debe ser lista") + elif not datasets: + errors.append("'datasets' no puede estar vacio") + else: + for index, dataset_manifest in enumerate(datasets): + dataset_errors = validate_dataset_manifest( + dataset_manifest, + expected_run_id=run_id if _is_non_empty_string(run_id) else None, + ) + errors.extend(f"datasets[{index}]: {message}" for message in dataset_errors) + + return len(errors) == 0, errors + + +def build_dataset_manifest( + *, + dataset_logical_name: str, + version_semver: str, + source_run_id: str, + schema_hash: str, + row_count: int, + quality_status: str, + latest_path: str, + history_path: str | None, + generated_at_utc: str | None = None, +) -> dict[str, Any]: + """Builds a dataset manifest with standard fields.""" + return { + "dataset_logical_name": dataset_logical_name, + "version_semver": version_semver, + "generated_at_utc": generated_at_utc or utc_now_iso(), + "source_run_id": source_run_id, + "schema_hash": schema_hash, + "row_count": row_count, + "quality_status": quality_status, + "latest_path": latest_path, + "history_path": history_path, + } + + +def build_run_manifest( + *, + run_id: str, + git_sha: str, + branch: str, + source_window_start_utc: str, + source_window_end_utc: str, + quality_gate_status: str, + datasets: list[dict[str, Any]], + generated_at_utc: str | None = None, +) -> dict[str, Any]: + """Builds a run manifest with standard fields.""" + return { + "run_id": run_id, + "generated_at_utc": generated_at_utc or utc_now_iso(), + "git_sha": git_sha, + "branch": branch, + "source_window_start_utc": source_window_start_utc, + "source_window_end_utc": source_window_end_utc, + "quality_gate_status": quality_gate_status, + "datasets": datasets, + } diff --git 
a/backend/config/schema_contract_utils.py b/backend/config/schema_contract_utils.py new file mode 100644 index 0000000..7a53dcf --- /dev/null +++ b/backend/config/schema_contract_utils.py @@ -0,0 +1,112 @@ +"""Schema contract utilities for deterministic hashing and versioning policy.""" + +from __future__ import annotations + +import hashlib +import json +from typing import Any, Iterable, Mapping + + +SEMVER_MAJOR = "major" +SEMVER_MINOR = "minor" +SEMVER_PATCH = "patch" +VALID_BUMP_LEVELS = (SEMVER_MAJOR, SEMVER_MINOR, SEMVER_PATCH) + +_CHANGE_TO_BUMP = { + "remove_required_column": SEMVER_MAJOR, + "rename_required_column": SEMVER_MAJOR, + "change_type_incompatible": SEMVER_MAJOR, + "tighten_nullability": SEMVER_MAJOR, + "drop_dataset": SEMVER_MAJOR, + "change_partition_key_breaking": SEMVER_MAJOR, + "add_optional_column": SEMVER_MINOR, + "add_required_column_with_default": SEMVER_MINOR, + "add_non_breaking_quality_rule": SEMVER_MINOR, + "add_partition_field_backward_compatible": SEMVER_MINOR, + "add_optional_dataset_metadata": SEMVER_MINOR, + "fix_quality_rule_bug": SEMVER_PATCH, + "relax_warning_threshold": SEMVER_PATCH, + "metadata_only_change": SEMVER_PATCH, + "reorder_columns_only": SEMVER_PATCH, + "backfill_without_schema_change": SEMVER_PATCH, +} + +_BUMP_PRIORITY = { + SEMVER_MAJOR: 3, + SEMVER_MINOR: 2, + SEMVER_PATCH: 1, +} + + +def _canonical_type_name(raw_type: Any) -> str: + text = str(raw_type or "").strip().lower() + aliases = { + "int": "integer", + "int32": "integer", + "int64": "integer", + "long": "integer", + "float": "number", + "float32": "number", + "float64": "number", + "double": "number", + "str": "string", + "string": "string", + "bool": "boolean", + "boolean": "boolean", + "datetime64[ns]": "datetime", + "timestamp": "datetime", + } + return aliases.get(text, text) + + +def canonicalize_schema_columns(columns: Iterable[Mapping[str, Any]]) -> list[dict[str, Any]]: + """Returns a deterministic canonical schema representation.""" + 
normalized: list[dict[str, Any]] = [] + + for column in columns: + name = str(column.get("name", "")).strip() + if not name: + continue + + nullable_value = column.get("nullable", True) + nullable = bool(nullable_value) + normalized.append( + { + "name": name.lower(), + "type": _canonical_type_name(column.get("type")), + "nullable": nullable, + } + ) + + normalized.sort(key=lambda item: item["name"]) + return normalized + + +def compute_schema_hash(columns: Iterable[Mapping[str, Any]]) -> str: + """Computes deterministic SHA-256 hash for a canonicalized schema.""" + canonical = canonicalize_schema_columns(columns) + payload = json.dumps(canonical, sort_keys=True, separators=(",", ":"), ensure_ascii=True) + return hashlib.sha256(payload.encode("utf-8")).hexdigest() + + +def recommend_semver_bump(change_kind: str) -> str: + """Maps a schema/data-contract change kind to SemVer bump level.""" + normalized = str(change_kind or "").strip().lower() + if normalized not in _CHANGE_TO_BUMP: + raise ValueError(f"Unknown change kind: {change_kind}") + return _CHANGE_TO_BUMP[normalized] + + +def aggregate_semver_bump(change_kinds: Iterable[str]) -> str: + """Returns the highest-priority bump required by a list of changes.""" + selected_level = SEMVER_PATCH + selected_priority = _BUMP_PRIORITY[selected_level] + + for change_kind in change_kinds: + level = recommend_semver_bump(change_kind) + priority = _BUMP_PRIORITY[level] + if priority > selected_priority: + selected_level = level + selected_priority = priority + + return selected_level diff --git a/backend/config/settings.py b/backend/config/settings.py index b2d6b4e..9096008 100644 --- a/backend/config/settings.py +++ b/backend/config/settings.py @@ -6,20 +6,26 @@ """ import os from pathlib import Path -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone from dotenv import load_dotenv -# Rutas del proyecto (cross-platform con pathlib) +# Project paths (cross-platform with pathlib) 
PROYECTO_ROOT = Path(__file__).resolve().parent.parent.parent BACKEND_DIR = PROYECTO_ROOT / "backend" DATOS_DIR = PROYECTO_ROOT / "datos" +DATOS_LATEST_DIR = DATOS_DIR / "latest" +DATOS_HISTORY_DIR = DATOS_DIR / "history" +DATOS_METADATA_DIR = DATOS_DIR / "metadata" FRONTEND_ASSETS_DIR = PROYECTO_ROOT / "frontend" / "assets" / "data" LOGS_DIR = PROYECTO_ROOT / "logs" DATOS_DIR.mkdir(exist_ok=True) +DATOS_LATEST_DIR.mkdir(parents=True, exist_ok=True) +DATOS_HISTORY_DIR.mkdir(parents=True, exist_ok=True) +DATOS_METADATA_DIR.mkdir(parents=True, exist_ok=True) LOGS_DIR.mkdir(exist_ok=True) -# Variables de entorno +# Environment variables env_path = PROYECTO_ROOT / ".env" load_dotenv(env_path) @@ -44,7 +50,7 @@ SO_API_KEY = os.getenv("STACKOVERFLOW_KEY") SO_API_URL = "https://api.stackexchange.com/2.3/search/advanced" -# Reddit API (OAuth para evitar bloqueo de IP en CI) +# Reddit API (OAuth to avoid CI datacenter IP blocking) REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID") REDDIT_CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET") REDDIT_SUBREDDIT = "webdev" @@ -57,7 +63,7 @@ "User-Agent": REDDIT_USER_AGENT } -# Archivos de salida +# Output files ARCHIVOS_SALIDA = { "github_repos": DATOS_DIR / "github_repos_2025.csv", "github_lenguajes": DATOS_DIR / "github_lenguajes.csv", @@ -73,11 +79,44 @@ "trend_score": DATOS_DIR / "trend_score.csv", } +# Data write strategy (incremental refactor) +# - LEGACY: keeps current historical behavior +# - LATEST: publishes CSVs in datos/latest for sync consumption +# - HISTORY: stores date-partitioned snapshots (CSV for now) +WRITE_LEGACY_CSV = os.getenv("DATA_WRITE_LEGACY_CSV", "1") == "1" +WRITE_LATEST_CSV = os.getenv("DATA_WRITE_LATEST_CSV", "0") == "1" +WRITE_HISTORY_CSV = os.getenv("DATA_WRITE_HISTORY_CSV", "0") == "1" + + +def get_latest_output_path(nombre_archivo): + """Returns the datos/latest path for a logical output file.""" + ruta_legacy = ARCHIVOS_SALIDA.get(nombre_archivo) + if ruta_legacy is None: + return None + return 
DATOS_LATEST_DIR / ruta_legacy.name + + +def get_history_output_path(nombre_archivo, fecha=None): + """Returns a date-partitioned path for CSV history.""" + ruta_legacy = ARCHIVOS_SALIDA.get(nombre_archivo) + if ruta_legacy is None: + return None + + fecha_ref = fecha or datetime.now(timezone.utc) + particion = ( + DATOS_HISTORY_DIR + / nombre_archivo + / f"year={fecha_ref.strftime('%Y')}" + / f"month={fecha_ref.strftime('%m')}" + / f"day={fecha_ref.strftime('%d')}" + ) + return particion / ruta_legacy.name + # Logging LOG_FORMAT = "[%(asctime)s] [%(levelname)s] %(name)s - %(message)s" LOG_DATE_FORMAT = "%Y-%m-%d %H:%M:%S" -# Resiliencia de red (compartido entre ETLs) +# Network resilience (shared across ETLs) REQUEST_TIMEOUT_SECONDS = 10 HTTP_MAX_RETRIES = 3 HTTP_RETRY_BACKOFF_SECONDS = 2 @@ -85,7 +124,7 @@ REQUEST_MEDIUM_DELAY_SECONDS = 0.5 REQUEST_SHORT_DELAY_SECONDS = 0.3 -# Rango de fechas dinamico (ultimos 12 meses) +# Dynamic date range (last 12 months) FECHA_FIN = datetime.now() FECHA_INICIO = FECHA_FIN - timedelta(days=365) diff --git a/backend/export_history_json.py b/backend/export_history_json.py new file mode 100644 index 0000000..75dd2d4 --- /dev/null +++ b/backend/export_history_json.py @@ -0,0 +1,366 @@ +"""Exports frontend bridge JSON assets from ETL history snapshots.""" + +from __future__ import annotations + +import json +import logging +from datetime import datetime, timezone +from pathlib import Path + +import pandas as pd + + +logger = logging.getLogger("export_history_json") + +HISTORY_INDEX_FILENAME = "history_index.json" +TREND_SCORE_HISTORY_FILENAME = "trend_score_history.json" + + +def _utc_now_iso(): + return datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z") + + +def _to_relative_path(path, project_root): + try: + return path.relative_to(project_root).as_posix() + except ValueError: + return path.as_posix() + + +def _safe_int(value, default=0): + try: + return int(value) + except (TypeError, 
ValueError): + return default + + +def _safe_float(value, default=0.0): + try: + return float(value) + except (TypeError, ValueError): + return default + + +def _extract_partition_date(parts): + if len(parts) < 4: + return None + year_part, month_part, day_part = parts[1], parts[2], parts[3] + if not (year_part.startswith("year=") and month_part.startswith("month=") and day_part.startswith("day=")): + return None + year = year_part.split("=", maxsplit=1)[1] + month = month_part.split("=", maxsplit=1)[1] + day = day_part.split("=", maxsplit=1)[1] + return f"{year}-{month}-{day}" + + +def _count_rows(csv_path): + try: + return len(pd.read_csv(csv_path)) + except Exception: # pylint: disable=broad-exception-caught + return None + + +def _collect_history_files(project_root): + history_root = project_root / "datos" / "history" + if not history_root.exists(): + return {} + + datasets = {} + for csv_path in history_root.rglob("*.csv"): + rel_parts = csv_path.relative_to(history_root).parts + if len(rel_parts) < 5: + continue + + dataset = rel_parts[0] + snapshot_date = _extract_partition_date(rel_parts) + if snapshot_date is None: + continue + + datasets.setdefault(dataset, []) + datasets[dataset].append( + { + "date": snapshot_date, + "path": _to_relative_path(csv_path, project_root), + "row_count": _count_rows(csv_path), + } + ) + + for dataset in datasets: + datasets[dataset] = sorted( + datasets[dataset], + key=lambda item: (item["date"], item["path"]), + ) + return datasets + + +def _collect_latest_files(project_root): + latest_root = project_root / "datos" / "latest" + if not latest_root.exists(): + return {} + + latest_files = {} + for csv_path in latest_root.glob("*.csv"): + dataset = csv_path.stem + latest_files[dataset] = { + "path": _to_relative_path(csv_path, project_root), + "row_count": _count_rows(csv_path), + } + return latest_files + + +def build_history_index(project_root): + """Builds history index metadata for frontend bridge use.""" + history_files = 
_collect_history_files(project_root) + latest_files = _collect_latest_files(project_root) + datasets = [] + + for dataset_name in sorted(set(history_files.keys()) | set(latest_files.keys())): + latest_info = latest_files.get(dataset_name) + snapshots = history_files.get(dataset_name, []) + datasets.append( + { + "dataset": dataset_name, + "latest_path": latest_info["path"] if latest_info else None, + "latest_row_count": latest_info["row_count"] if latest_info else None, + "history_snapshot_count": len(snapshots), + "snapshots": snapshots, + } + ) + + return { + "generated_at_utc": _utc_now_iso(), + "dataset_count": len(datasets), + "datasets": datasets, + } + + +def _resolve_trend_snapshot_sources(project_root, history_index): + trend_entry = next((item for item in history_index["datasets"] if item["dataset"] == "trend_score"), None) + if trend_entry is None: + return [] + + sources = [] + for snapshot in trend_entry["snapshots"]: + csv_path = project_root / snapshot["path"] + if csv_path.exists(): + sources.append( + { + "date": snapshot["date"], + "path": snapshot["path"], + "source_type": "history", + } + ) + + if not sources and trend_entry.get("latest_path"): + latest_path = project_root / trend_entry["latest_path"] + if latest_path.exists(): + mtime = datetime.fromtimestamp(latest_path.stat().st_mtime, tz=timezone.utc) + sources.append( + { + "date": mtime.strftime("%Y-%m-%d"), + "path": trend_entry["latest_path"], + "source_type": "latest", + } + ) + + return sorted(sources, key=lambda item: (item["date"], item["path"])) + + +def _build_trend_snapshot_record(df, date_label, relative_path, source_type): + working = df.copy() + if "ranking" not in working.columns: + working = working.sort_values("trend_score", ascending=False).reset_index(drop=True) + working["ranking"] = range(1, len(working) + 1) + + top_10 = [] + for _, row in working.sort_values("ranking", ascending=True).head(10).iterrows(): + top_10.append( + { + "ranking": _safe_int(row.get("ranking"), 
default=0), + "tecnologia": str(row.get("tecnologia", "")), + "trend_score": round(_safe_float(row.get("trend_score"), default=0.0), 2), + "fuentes": _safe_int(row.get("fuentes"), default=0), + } + ) + + return { + "date": date_label, + "path": relative_path, + "source_type": source_type, + "row_count": len(working), + "top_10": top_10, + } + + +def _is_valid_trend_snapshot_df(df): + required_columns = {"tecnologia", "trend_score"} + return required_columns.issubset(df.columns) + + +def _append_trend_snapshot( + *, + snapshots, + snapshots_with_df, + dataframe, + date_label, + relative_path, + source_type, +): + snapshots.append( + _build_trend_snapshot_record( + df=dataframe, + date_label=date_label, + relative_path=relative_path, + source_type=source_type, + ) + ) + snapshots_with_df.append( + { + "date": date_label, + "dataframe": dataframe, + } + ) + + +def _build_trend_series(snapshots_with_df): + series_map = {} + for snapshot in snapshots_with_df: + date_label = snapshot["date"] + df = snapshot["dataframe"] + working = df.copy() + if "ranking" not in working.columns: + working = working.sort_values("trend_score", ascending=False).reset_index(drop=True) + working["ranking"] = range(1, len(working) + 1) + + for _, row in working.iterrows(): + tech = str(row.get("tecnologia", "")).strip() + if not tech: + continue + series_map.setdefault(tech, []) + series_map[tech].append( + { + "date": date_label, + "ranking": _safe_int(row.get("ranking"), default=0), + "trend_score": round(_safe_float(row.get("trend_score"), default=0.0), 2), + "fuentes": _safe_int(row.get("fuentes"), default=0), + } + ) + + series = [] + for tech, points in series_map.items(): + sorted_points = sorted(points, key=lambda item: item["date"]) + latest_ranking = sorted_points[-1]["ranking"] if sorted_points else 999999 + series.append( + { + "tecnologia": tech, + "points": sorted_points, + "_latest_ranking": latest_ranking, + } + ) + + series = sorted(series, key=lambda item: 
(item["_latest_ranking"], item["tecnologia"])) + for item in series: + item.pop("_latest_ranking", None) + return series + + +def build_trend_score_history(project_root, history_index): + """Builds trend_score_history payload for frontend bridge use.""" + sources = _resolve_trend_snapshot_sources(project_root, history_index) + snapshots = [] + snapshots_with_df = [] + + for source in sources: + csv_path = project_root / source["path"] + try: + df = pd.read_csv(csv_path) + except Exception as exc: # pylint: disable=broad-exception-caught + logger.warning("Skipping trend snapshot %s due to read error: %s", csv_path, exc) + continue + + if not _is_valid_trend_snapshot_df(df): + logger.warning("Skipping trend snapshot %s due to missing required columns", csv_path) + continue + + _append_trend_snapshot( + snapshots=snapshots, + snapshots_with_df=snapshots_with_df, + dataframe=df, + date_label=source["date"], + relative_path=source["path"], + source_type=source["source_type"], + ) + + # If history entries exist but all are corrupted/invalid, fallback to latest snapshot. 
+ if not snapshots: + trend_entry = next((item for item in history_index["datasets"] if item["dataset"] == "trend_score"), None) + latest_path = trend_entry.get("latest_path") if trend_entry else None + if latest_path: + latest_csv_path = project_root / latest_path + if latest_csv_path.exists(): + try: + latest_df = pd.read_csv(latest_csv_path) + if _is_valid_trend_snapshot_df(latest_df): + mtime = datetime.fromtimestamp(latest_csv_path.stat().st_mtime, tz=timezone.utc) + _append_trend_snapshot( + snapshots=snapshots, + snapshots_with_df=snapshots_with_df, + dataframe=latest_df, + date_label=mtime.strftime("%Y-%m-%d"), + relative_path=latest_path, + source_type="latest", + ) + except Exception as exc: # pylint: disable=broad-exception-caught + logger.warning("Skipping latest trend snapshot fallback due to read error: %s", exc) + + return { + "generated_at_utc": _utc_now_iso(), + "snapshot_count": len(snapshots), + "snapshots": snapshots, + "series": _build_trend_series(snapshots_with_df), + } + + +def _write_json(path, payload): + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def export_bridge_assets(project_root): + """Exports bridge JSON files for frontend historical access.""" + project_root = Path(project_root) + output_dir = project_root / "frontend" / "assets" / "data" + output_dir.mkdir(parents=True, exist_ok=True) + + history_index_payload = build_history_index(project_root) + trend_history_payload = build_trend_score_history(project_root, history_index_payload) + + history_index_path = output_dir / HISTORY_INDEX_FILENAME + trend_history_path = output_dir / TREND_SCORE_HISTORY_FILENAME + _write_json(history_index_path, history_index_payload) + _write_json(trend_history_path, trend_history_payload) + + summary = { + "files_written": 2, + "history_index_path": str(history_index_path), + "trend_score_history_path": str(trend_history_path), + "dataset_count": 
int(history_index_payload["dataset_count"]), + "trend_snapshot_count": int(trend_history_payload["snapshot_count"]), + } + return summary + + +def main(): + logging.basicConfig(level=logging.INFO, format="[%(asctime)s] [%(levelname)s] %(name)s - %(message)s") + project_root = Path(__file__).resolve().parent.parent + summary = export_bridge_assets(project_root) + logger.info( + "[RUN][SUMMARY] status=success files_written=%d datasets=%d trend_snapshots=%d", + summary["files_written"], + summary["dataset_count"], + summary["trend_snapshot_count"], + ) + + +if __name__ == "__main__": + main() diff --git a/backend/quality/__init__.py b/backend/quality/__init__.py new file mode 100644 index 0000000..ba1eb5c --- /dev/null +++ b/backend/quality/__init__.py @@ -0,0 +1 @@ +"""Quality utilities package.""" diff --git a/backend/quality/degradation_policy.py b/backend/quality/degradation_policy.py new file mode 100644 index 0000000..14ec4d0 --- /dev/null +++ b/backend/quality/degradation_policy.py @@ -0,0 +1,81 @@ +"""Trend score degradation policy based on source availability.""" + +from __future__ import annotations + +from typing import Mapping + + +DEFAULT_SOURCE_WEIGHTS = { + "github": 0.40, + "stackoverflow": 0.35, + "reddit": 0.25, +} + +_REQUIRED_SOURCES = ("github", "stackoverflow", "reddit") + + +def _normalize_status(source_status: Mapping[str, bool]) -> dict[str, bool]: + return {source: bool(source_status.get(source, False)) for source in _REQUIRED_SOURCES} + + +def _renormalize_weights(default_weights: Mapping[str, float], status: Mapping[str, bool]) -> dict[str, float]: + active_sources = [source for source, available in status.items() if available] + if not active_sources: + return {} + + total = sum(float(default_weights[source]) for source in active_sources) + if total <= 0: + return {} + + return { + source: round(float(default_weights[source]) / total, 6) + for source in active_sources + } + + +def evaluate_degradation_policy( + source_status: Mapping[str, 
bool], + default_weights: Mapping[str, float] | None = None, +) -> dict[str, object]: + """Evaluates publication decision and weights for source availability.""" + weights = default_weights or DEFAULT_SOURCE_WEIGHTS + status = _normalize_status(source_status) + + available_sources = [source for source, available in status.items() if available] + missing_sources = [source for source, available in status.items() if not available] + available_count = len(available_sources) + + if available_count == 3: + return { + "available_count": 3, + "available_sources": available_sources, + "missing_sources": missing_sources, + "publish_allowed": True, + "quality_gate_status": "pass", + "weights_mode": "default", + "effective_weights": dict(weights), + "reason": "all_sources_available", + } + + if available_count == 2: + return { + "available_count": 2, + "available_sources": available_sources, + "missing_sources": missing_sources, + "publish_allowed": True, + "quality_gate_status": "pass_with_warnings", + "weights_mode": "renormalized", + "effective_weights": _renormalize_weights(weights, status), + "reason": "single_source_missing", + } + + return { + "available_count": available_count, + "available_sources": available_sources, + "missing_sources": missing_sources, + "publish_allowed": False, + "quality_gate_status": "fail", + "weights_mode": "unavailable", + "effective_weights": {}, + "reason": "insufficient_sources", + } diff --git a/backend/quality/pandera_schemas.py b/backend/quality/pandera_schemas.py new file mode 100644 index 0000000..51a4208 --- /dev/null +++ b/backend/quality/pandera_schemas.py @@ -0,0 +1,215 @@ +"""Pandera quality checks with severity routing. 
+ +This module defines dataset-level Pandera schemas and complementary +quality rules with explicit severities: +- critical: candidate to block publication in strict mode +- warning: publish allowed with quality flag +- info: observability only +""" + +from __future__ import annotations + +from typing import Any + +import pandas as pd + +SEVERITY_CRITICAL = "critical" +SEVERITY_WARNING = "warning" +SEVERITY_INFO = "info" +VALID_SEVERITIES = {SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO} + +try: + import pandera as pa + from pandera import Check + from pandera.errors import SchemaError, SchemaErrors + + PANDERA_AVAILABLE = True +except Exception: # pylint: disable=broad-exception-caught + pa = None + Check = None + SchemaError = Exception + SchemaErrors = Exception + PANDERA_AVAILABLE = False + + +def _make_issue(dataset: str, severity: str, rule: str, message: str) -> dict[str, str]: + safe_severity = severity if severity in VALID_SEVERITIES else SEVERITY_INFO + return { + "dataset": dataset, + "severity": safe_severity, + "rule": rule, + "message": message, + } + + +def _build_schema_registry() -> dict[str, Any]: + if not PANDERA_AVAILABLE: + return {} + + return { + "trend_score": pa.DataFrameSchema( + { + "ranking": pa.Column( + pa.Int64, + nullable=False, + checks=[ + Check(lambda series: (series >= 1).all(), error="ranking_must_be_positive"), + Check(lambda series: series.is_unique, error="ranking_must_be_unique"), + ], + ), + "tecnologia": pa.Column(pa.String, nullable=False), + "trend_score": pa.Column( + pa.Float64, + nullable=False, + checks=[Check(lambda series: (series >= 0).all(), error="trend_score_non_negative")], + ), + "fuentes": pa.Column( + pa.Int64, + nullable=False, + checks=[ + Check( + lambda series: ((series >= 0) & (series <= 3)).all(), + error="fuentes_must_be_in_range_0_3", + ) + ], + ), + }, + strict=False, + coerce=False, + ), + "so_volumen": pa.DataFrameSchema( + { + "lenguaje": pa.Column(pa.String, nullable=False), + 
"preguntas_nuevas_2025": pa.Column( + pa.Int64, + nullable=False, + checks=[Check(lambda series: (series >= 0).all(), error="yearly_volume_non_negative")], + ), + }, + strict=False, + coerce=False, + ), + } + + +PANDERA_SCHEMAS = _build_schema_registry() + + +def _parse_schema_errors(dataset: str, exc: Exception) -> list[dict[str, str]]: + issues: list[dict[str, str]] = [] + failure_cases = getattr(exc, "failure_cases", None) + + if isinstance(failure_cases, pd.DataFrame) and not failure_cases.empty: + for _, row in failure_cases.iterrows(): + column = row.get("column", "") + check = row.get("check", "schema_validation") + failure_case = row.get("failure_case", "") + message = f"column={column} check={check} failure={failure_case}" + issues.append( + _make_issue( + dataset=dataset, + severity=SEVERITY_CRITICAL, + rule="pandera_schema", + message=message, + ) + ) + return issues + + issues.append( + _make_issue( + dataset=dataset, + severity=SEVERITY_CRITICAL, + rule="pandera_schema", + message=str(exc), + ) + ) + return issues + + +def _run_warning_checks(df: pd.DataFrame, logical_name: str) -> list[dict[str, str]]: + issues: list[dict[str, str]] = [] + + if logical_name == "trend_score" and "tecnologia" in df.columns: + if df["tecnologia"].nunique(dropna=True) < 10: + issues.append( + _make_issue( + dataset=logical_name, + severity=SEVERITY_WARNING, + rule="low_technology_coverage", + message="fewer than 10 unique technologies in trend score output", + ) + ) + + if logical_name == "trend_score" and "fuentes" in df.columns: + numeric_fuentes = pd.to_numeric(df["fuentes"], errors="coerce").fillna(0) + zero_source_count = int((numeric_fuentes == 0).sum()) + if zero_source_count > 0: + issues.append( + _make_issue( + dataset=logical_name, + severity=SEVERITY_WARNING, + rule="zero_source_rows", + message=f"{zero_source_count} rows have fuentes=0", + ) + ) + + if logical_name == "so_volumen" and "preguntas_nuevas_2025" in df.columns: + numeric = 
pd.to_numeric(df["preguntas_nuevas_2025"], errors="coerce").fillna(0) + if not numeric.empty and (numeric == 0).all(): + issues.append( + _make_issue( + dataset=logical_name, + severity=SEVERITY_WARNING, + rule="all_zero_volume", + message="all StackOverflow yearly volumes are zero", + ) + ) + + return issues + + +def _run_info_checks(df: pd.DataFrame, logical_name: str) -> list[dict[str, str]]: + issues: list[dict[str, str]] = [] + + duplicate_rows = int(df.duplicated().sum()) + if duplicate_rows > 0: + issues.append( + _make_issue( + dataset=logical_name, + severity=SEVERITY_INFO, + rule="duplicate_rows_detected", + message=f"{duplicate_rows} duplicated rows detected", + ) + ) + + return issues + + +def run_pandera_quality_checks(df: pd.DataFrame, logical_name: str) -> list[dict[str, str]]: + """Runs Pandera schema validation and severity checks for one dataset.""" + issues: list[dict[str, str]] = [] + + if not PANDERA_AVAILABLE: + issues.append( + _make_issue( + dataset=logical_name, + severity=SEVERITY_INFO, + rule="pandera_unavailable", + message="Pandera is not installed; schema checks were skipped", + ) + ) + issues.extend(_run_info_checks(df, logical_name)) + return issues + + schema = PANDERA_SCHEMAS.get(logical_name) + if schema is not None: + try: + schema.validate(df, lazy=True) + except SchemaErrors as exc: + issues.extend(_parse_schema_errors(logical_name, exc)) + except SchemaError as exc: + issues.extend(_parse_schema_errors(logical_name, exc)) + + issues.extend(_run_warning_checks(df, logical_name)) + issues.extend(_run_info_checks(df, logical_name)) + return issues diff --git a/backend/reddit_etl.py b/backend/reddit_etl.py index 523bc4f..4c835e8 100644 --- a/backend/reddit_etl.py +++ b/backend/reddit_etl.py @@ -229,7 +229,7 @@ def extraer_posts(self, subreddit_name=REDDIT_SUBREDDIT, limit=REDDIT_LIMIT): self.logger.error(f"Error obteniendo posts: {e}") if not posts_data: - # Intentar cargar datos anteriores si existen + # Try loading previous data 
if available ruta_anterior = ARCHIVOS_SALIDA.get("reddit_sentimiento") if ruta_anterior and ruta_anterior.exists(): self.logger.warning(f"No se pudo extraer posts de r/{subreddit_name} — usando datos anteriores") @@ -449,4 +449,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/backend/requirements.txt b/backend/requirements.txt index 4f81a08..d14d1d0 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -4,6 +4,8 @@ pandas>=2.2.0,<3.0 numpy>=1.24,<3.0 python-dotenv>=1.0.0,<2.0 nltk>=3.8.1,<3.10 +pandera>=0.22.0,<0.23.0 +duckdb>=1.2.2,<2.0 # Testing -pytest>=8.0.0,<9.0 \ No newline at end of file +pytest>=8.0.0,<9.0 diff --git a/backend/sync_assets.py b/backend/sync_assets.py index cd16305..3ca30c1 100644 --- a/backend/sync_assets.py +++ b/backend/sync_assets.py @@ -4,10 +4,67 @@ Ensures the Flutter Web dashboard always uses the latest processed data from the ETL pipeline. """ + import logging +import os import shutil from pathlib import Path +from export_history_json import export_bridge_assets + + +def _is_bridge_export_enabled(): + return os.getenv("EXPORT_HISTORY_BRIDGE_JSON", "1") == "1" + + +def _resolver_origen_csv(proyecto_root): + """Resolves CSV source strategy, prioritizing latest per file with legacy fallback.""" + origen_latest = proyecto_root / "datos" / "latest" + origen_legacy = proyecto_root / "datos" + csv_by_name = {} + + if origen_legacy.exists(): + for csv_file in origen_legacy.glob("*.csv"): + csv_by_name[csv_file.name] = csv_file + + if origen_latest.exists(): + for csv_file in origen_latest.glob("*.csv"): + csv_by_name[csv_file.name] = csv_file + + return csv_by_name, origen_latest, origen_legacy + + +def _describe_source_mode(csv_by_name, origen_latest, origen_legacy): + if not csv_by_name: + return "none" + + latest_used = { + name + for name, path in csv_by_name.items() + if path.parent.resolve() == origen_latest.resolve() + } + legacy_used = { + name + for name, path 
in csv_by_name.items() + if path.parent.resolve() == origen_legacy.resolve() + } + + if latest_used and legacy_used: + return "mixed" + if latest_used: + return "latest" + return "legacy" + + +def _resolve_summary_source(source_mode, origen_latest, origen_legacy): + if source_mode == "latest": + return str(origen_latest) + if source_mode == "legacy": + return str(origen_legacy) + if source_mode == "mixed": + return "mixed(latest+legacy)" + return "none" + def sincronizar(): """Copies all CSV files from datos/ to frontend/assets/data/.""" @@ -16,34 +73,60 @@ def sincronizar(): logging.basicConfig(level=logging.INFO, format="[%(asctime)s] [%(levelname)s] %(name)s - %(message)s") proyecto_root = Path(__file__).resolve().parent.parent - origen = proyecto_root / "datos" + csv_by_name, origen_latest, origen_legacy = _resolver_origen_csv(proyecto_root) + source_mode = _describe_source_mode(csv_by_name, origen_latest, origen_legacy) + origen = _resolve_summary_source(source_mode, origen_latest, origen_legacy) destino = proyecto_root / "frontend" / "assets" / "data" destino.mkdir(parents=True, exist_ok=True) - logger.info("[RUN][START] origen=%s destino=%s", origen, destino) + logger.info("[RUN][START] origen=%s source_mode=%s destino=%s", origen, source_mode, destino) archivos_copiados = 0 errores = 0 - for csv_file in origen.glob("*.csv"): + bridge_files_written = 0 + bridge_enabled = _is_bridge_export_enabled() + + for csv_name in sorted(csv_by_name): + csv_file = csv_by_name[csv_name] try: shutil.copy2(csv_file, destino / csv_file.name) archivos_copiados += 1 - logger.info("[STEP][END] accion=copy archivo=%s estado=success", csv_file.name) + logger.info("[STEP][END] accion=copy archivo=%s origen=%s estado=success", csv_file.name, csv_file.parent) except Exception as exc: # pylint: disable=broad-exception-caught errores += 1 logger.error("[STEP][END] accion=copy archivo=%s estado=failed error=%s", csv_file.name, exc) + if bridge_enabled: + try: + bridge_summary = 
export_bridge_assets(proyecto_root) + bridge_files_written = int(bridge_summary["files_written"]) + logger.info( + "[STEP][END] action=bridge_export status=success files_written=%d trend_snapshots=%d", + bridge_files_written, + bridge_summary["trend_snapshot_count"], + ) + except Exception as exc: # pylint: disable=broad-exception-caught + errores += 1 + logger.error("[STEP][END] action=bridge_export status=failed error=%s", exc) + logger.info( - "[RUN][SUMMARY] estado=%s archivos_copiados=%d errores=%d origen=%s destino=%s", + "[RUN][SUMMARY] estado=%s archivos_copiados=%d bridge_files=%d bridge_enabled=%s " + "errores=%d source_mode=%s origen=%s destino=%s", "success" if errores == 0 else "partial", archivos_copiados, + bridge_files_written, + bridge_enabled, errores, + source_mode, origen, destino, ) return { "files_copied": archivos_copiados, + "bridge_files_written": bridge_files_written, + "bridge_export_enabled": bridge_enabled, "errors": errores, + "source_mode": source_mode, "source": str(origen), "destination": str(destino), } diff --git a/backend/tech_normalization.py b/backend/tech_normalization.py index 6064314..4325862 100644 --- a/backend/tech_normalization.py +++ b/backend/tech_normalization.py @@ -1,6 +1,6 @@ -"""Utilidades compartidas para normalizar nombres de tecnologías. +"""Shared utilities to normalize technology names. -Centraliza mapeos usados por ETLs para evitar drift entre módulos. +Centralizes mappings used by ETLs to avoid cross-module drift. 
""" from __future__ import annotations @@ -57,7 +57,7 @@ def normalize_technology_name(name: str) -> str: - """Normaliza nombre a etiqueta legible consistente.""" + """Normalizes a name into a consistent display label.""" text = str(name or "").strip() if not text: return "" @@ -65,7 +65,7 @@ def normalize_technology_name(name: str) -> str: def normalize_for_match(name: str) -> str: - """Normaliza nombre para comparación flexible cross-source.""" + """Normalizes a name for flexible cross-source matching.""" raw = str(name or "").strip().lower() if not raw: return "" diff --git a/backend/trend_score.py b/backend/trend_score.py index e394a08..d60c2e3 100644 --- a/backend/trend_score.py +++ b/backend/trend_score.py @@ -1,38 +1,28 @@ -""" -Trend Score Generator - Technology Trend Analysis Platform +"""Trend Score generator for the Technology Trend Analysis Platform.""" -Combines data from GitHub, StackOverflow, and Reddit to produce -a unified technology ranking. The composite score uses weighted -metrics from each source. 
- -Formula: - Trend Score = (peso_github × github_score) + - (peso_so × so_score) + - (peso_reddit × reddit_score) - -Author: Samir Caizapasto -""" -import pandas as pd import logging +import os from datetime import datetime -from config.settings import ( - ARCHIVOS_SALIDA, -) -from validador import validar_dataframe +import pandas as pd + from base_etl import BaseETL +from config.settings import ARCHIVOS_SALIDA from exceptions import ETLExtractionError from tech_normalization import normalize_technology_name +from trend_score_duckdb import calcular_trend_score_duckdb +from validador import validar_dataframe logger = logging.getLogger("trend_score") -# Pesos para cada fuente de datos PESOS = { "github": 0.40, "stackoverflow": 0.35, - "reddit": 0.25 + "reddit": 0.25, } +TREND_ENGINES = {"legacy", "duckdb"} + ETIQUETAS_NO_LENGUAJE = { "sin especificar", "llms/ai", @@ -45,26 +35,12 @@ def normalizar_nombre(nombre): - """Normalizes technology names for cross-source comparison. - - Args: - nombre: Raw technology name from any source. - - Returns: - Normalized lowercase name. - """ + """Normalizes technology names for cross-source comparison.""" return normalize_technology_name(nombre) def normalizar_scores(serie): - """Normalizes a numeric series to 0-100 scale using min-max. - - Args: - serie: pandas Series with numeric values. - - Returns: - Normalized series (0-100). - """ + """Normalizes a numeric series to 0-100 scale using min-max.""" if serie.max() == serie.min(): return pd.Series([50.0] * len(serie), index=serie.index) @@ -72,94 +48,71 @@ def normalizar_scores(serie): def cargar_github(): - """Loads and processes GitHub data for scoring. 
- - Returns: - DataFrame with columns: [tecnologia, github_score] - """ + """Loads and processes GitHub data for scoring.""" try: df_repos = pd.read_csv(ARCHIVOS_SALIDA["github_repos"]) df_repos["language"] = df_repos["language"].fillna("Sin especificar").astype(str).str.strip() df_repos = df_repos[~df_repos["language"].str.lower().isin(ETIQUETAS_NO_LENGUAJE)] if df_repos.empty: - logger.warning("GitHub: sin lenguajes clasificables tras aplicar filtros") + logger.warning("GitHub: no classifiable languages after filters") return pd.DataFrame(columns=["tecnologia", "github_score"]) langs = df_repos["language"].value_counts().head(15).reset_index() langs.columns = ["tecnologia", "repos_count"] langs["tecnologia"] = langs["tecnologia"].apply(normalizar_nombre) langs["github_score"] = normalizar_scores(langs["repos_count"]) - logger.info("GitHub: %d tecnologias cargadas", len(langs)) + logger.info("GitHub: %d technologies loaded", len(langs)) return langs[["tecnologia", "github_score"]] except FileNotFoundError: - logger.warning("No se encontro github_repos_2025.csv") + logger.warning("github_repos_2025.csv was not found") return pd.DataFrame(columns=["tecnologia", "github_score"]) - except (KeyError, ValueError) as e: - logger.error("Error procesando datos de GitHub: %s", e) + except (KeyError, ValueError) as exc: + logger.error("Error processing GitHub data: %s", exc) return pd.DataFrame(columns=["tecnologia", "github_score"]) def cargar_stackoverflow(): - """Loads and processes StackOverflow data for scoring. 
- - Returns: - DataFrame with columns: [tecnologia, so_score] - """ + """Loads and processes StackOverflow data for scoring.""" try: df_vol = pd.read_csv(ARCHIVOS_SALIDA["so_volumen"]) df_vol["tecnologia"] = df_vol["lenguaje"].apply(normalizar_nombre) df_vol["so_score"] = normalizar_scores(df_vol["preguntas_nuevas_2025"]) - logger.info("StackOverflow: %d tecnologias cargadas", len(df_vol)) + logger.info("StackOverflow: %d technologies loaded", len(df_vol)) return df_vol[["tecnologia", "so_score"]] except FileNotFoundError: - logger.warning("No se encontro so_volumen_preguntas.csv") + logger.warning("so_volumen_preguntas.csv was not found") return pd.DataFrame(columns=["tecnologia", "so_score"]) - except (KeyError, ValueError) as e: - logger.error("Error procesando datos de StackOverflow: %s", e) + except (KeyError, ValueError) as exc: + logger.error("Error processing StackOverflow data: %s", exc) return pd.DataFrame(columns=["tecnologia", "so_score"]) def cargar_reddit(): - """Loads and processes Reddit data for scoring. 
- - Returns: - DataFrame with columns: [tecnologia, reddit_score] - """ + """Loads and processes Reddit data for scoring.""" try: df_temas = pd.read_csv(ARCHIVOS_SALIDA["reddit_temas"]) df_temas["tecnologia"] = df_temas["tema"].apply(normalizar_nombre) df_temas["reddit_score"] = normalizar_scores(df_temas["menciones"]) - logger.info("Reddit: %d tecnologias cargadas", len(df_temas)) + logger.info("Reddit: %d technologies loaded", len(df_temas)) return df_temas[["tecnologia", "reddit_score"]] except FileNotFoundError: - logger.warning("No se encontro reddit_temas_emergentes.csv") + logger.warning("reddit_temas_emergentes.csv was not found") return pd.DataFrame(columns=["tecnologia", "reddit_score"]) - except (KeyError, ValueError) as e: - logger.error("Error procesando datos de Reddit: %s", e) + except (KeyError, ValueError) as exc: + logger.error("Error processing Reddit data: %s", exc) return pd.DataFrame(columns=["tecnologia", "reddit_score"]) -def calcular_trend_score(): - """Calculates the composite Trend Score for all technologies. - - Combines normalized scores from GitHub, StackOverflow, and Reddit - using weighted average. Technologies not found in a source get - a score of 0 for that source. 
- - Returns: - DataFrame with columns: [tecnologia, github_score, so_score, - reddit_score, trend_score, ranking] - """ - logger.info("Calculando Trend Score compuesto...") - logger.info("Pesos: GitHub=%s, SO=%s, Reddit=%s", PESOS['github'], PESOS['stackoverflow'], PESOS['reddit']) - - # Cargar datos de cada fuente +def _load_score_sources(): df_github = cargar_github() df_so = cargar_stackoverflow() df_reddit = cargar_reddit() + return df_github, df_so, df_reddit - # Combinar todas las tecnologias (outer join) + +def _build_legacy_trend_score(df_github, df_so, df_reddit): + """Builds Trend Score with the legacy pandas merge strategy.""" df_combined = pd.DataFrame({"tecnologia": []}) if not df_github.empty: @@ -170,49 +123,100 @@ def calcular_trend_score(): df_combined = pd.merge(df_combined, df_reddit, on="tecnologia", how="outer") if df_combined.empty: - logger.error("No hay datos de ninguna fuente para calcular Trend Score") + logger.error("No data from any source to calculate Trend Score") return pd.DataFrame() - # Rellenar NaN con 0 (tecnologia no encontrada en esa fuente) for col in ["github_score", "so_score", "reddit_score"]: if col not in df_combined.columns: df_combined[col] = 0.0 else: df_combined[col] = df_combined[col].fillna(0.0) - # Calcular score compuesto df_combined["trend_score"] = ( - PESOS["github"] * df_combined["github_score"] + - PESOS["stackoverflow"] * df_combined["so_score"] + - PESOS["reddit"] * df_combined["reddit_score"] + PESOS["github"] * df_combined["github_score"] + + PESOS["stackoverflow"] * df_combined["so_score"] + + PESOS["reddit"] * df_combined["reddit_score"] ).round(2) - # Ordenar por trend_score y agregar ranking df_combined = df_combined.sort_values("trend_score", ascending=False).reset_index(drop=True) df_combined["ranking"] = range(1, len(df_combined) + 1) - # Contar en cuantas fuentes aparece cada tecnologia df_combined["fuentes"] = ( - (df_combined["github_score"] > 0).astype(int) + - (df_combined["so_score"] > 
0).astype(int) + - (df_combined["reddit_score"] > 0).astype(int) + (df_combined["github_score"] > 0).astype(int) + + (df_combined["so_score"] > 0).astype(int) + + (df_combined["reddit_score"] > 0).astype(int) ) - # Log del ranking - logger.info("\nTrend Score - Top Tecnologias (%d total):", len(df_combined)) - logger.info("%3s %-20s %8s %8s %8s %8s %8s", "#", "Tecnologia", "GitHub", "SO", "Reddit", "Score", "Fuentes") + return df_combined[ + ["ranking", "tecnologia", "github_score", "so_score", "reddit_score", "trend_score", "fuentes"] + ] + + +def calculate_trend_score_legacy(df_github, df_so, df_reddit): + """Public helper to compute trend score with the legacy engine.""" + return _build_legacy_trend_score(df_github, df_so, df_reddit) + + +def resolve_trend_engine(engine=None): + """Resolves the Trend Score engine from explicit input or environment.""" + resolved = str(engine or os.getenv("TREND_SCORE_ENGINE", "legacy")).strip().lower() + if resolved not in TREND_ENGINES: + logger.warning("Unknown trend engine '%s'. 
Falling back to 'legacy'.", resolved) + return "legacy" + return resolved + + +def _log_ranking_preview(df_combined): + logger.info("\nTrend Score - Top Technologies (%d total):", len(df_combined)) + logger.info("%3s %-20s %8s %8s %8s %8s %8s", "#", "Technology", "GitHub", "SO", "Reddit", "Score", "Sources") logger.info("-" * 75) for _, row in df_combined.head(15).iterrows(): logger.info( "#%2d %-20s %7.1f %7.1f %7.1f %7.1f %5d/3", - row['ranking'], row['tecnologia'], - row['github_score'], row['so_score'], - row['reddit_score'], row['trend_score'], - int(row['fuentes']) + row["ranking"], + row["tecnologia"], + row["github_score"], + row["so_score"], + row["reddit_score"], + row["trend_score"], + int(row["fuentes"]), ) - return df_combined + +def calcular_trend_score(engine=None): + """Calculates the composite Trend Score for all technologies.""" + logger.info("Calculating composite Trend Score...") + logger.info("Weights: GitHub=%s, SO=%s, Reddit=%s", PESOS["github"], PESOS["stackoverflow"], PESOS["reddit"]) + + df_github, df_so, df_reddit = _load_score_sources() + + if df_github.empty and df_so.empty and df_reddit.empty: + logger.error("No data from any source to calculate Trend Score") + return pd.DataFrame() + + engine_name = resolve_trend_engine(engine) + logger.info("Trend engine selected: %s", engine_name) + + if engine_name == "duckdb": + try: + df_result = calcular_trend_score_duckdb( + df_github=df_github, + df_so=df_so, + df_reddit=df_reddit, + pesos=PESOS, + ) + except Exception as exc: # pylint: disable=broad-exception-caught + logger.error("DuckDB engine failed (%s). 
Falling back to legacy engine.", exc) + df_result = _build_legacy_trend_score(df_github, df_so, df_reddit) + else: + df_result = _build_legacy_trend_score(df_github, df_so, df_reddit) + + if df_result.empty: + return df_result + + _log_ranking_preview(df_result) + return df_result def main(): @@ -222,53 +226,55 @@ def main(): class TrendScoreETL(BaseETL): - """Adaptador ETL para Trend Score con el contrato de BaseETL. - - Mantiene el comportamiento existente sin sobreingeniería: un único paso - que calcula, valida y guarda el CSV de trend score. - """ + """ETL adapter for Trend Score under the BaseETL contract.""" def __init__(self): super().__init__("trend_score") def definir_pasos(self): - return [("Calcular Trend Score", self._calcular_y_guardar)] + return [("Calculate Trend Score", self._calcular_y_guardar)] def _calcular_y_guardar(self): self.logger.info("Trend Score Generator - Technology Trend Analysis Platform") - self.logger.info("Fecha: %s", datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + self.logger.info("Execution date: %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S")) try: df_trend = calcular_trend_score() if df_trend.empty: raise ETLExtractionError( - "No se pudo generar Trend Score (sin datos de ninguna fuente)", + "Trend Score could not be generated (no data from any source)", critical=True, ) columnas_salida = [ - "ranking", "tecnologia", "github_score", - "so_score", "reddit_score", "trend_score", "fuentes" + "ranking", + "tecnologia", + "github_score", + "so_score", + "reddit_score", + "trend_score", + "fuentes", ] df_salida = df_trend[columnas_salida] - # Se mantiene validación explícita por contrato + guardado uniforme validar_dataframe(df_salida, "trend_score") self.guardar_csv(df_salida, "trend_score") top3 = df_salida.head(3) - self.logger.info("\nTop 3 tecnologias trending:") + self.logger.info("\nTop 3 trending technologies:") for _, row in top3.iterrows(): self.logger.info( " #%d. 
%s (Score: %s)", - int(row['ranking']), row['tecnologia'], row['trend_score'] + int(row["ranking"]), + row["tecnologia"], + row["trend_score"], ) - self.logger.info("Trend Score completado") + self.logger.info("Trend Score completed") except ETLExtractionError: raise - except Exception as e: # pylint: disable=broad-exception-caught - raise ETLExtractionError(f"Error fatal en Trend Score: {e}", critical=True) from e + except Exception as exc: # pylint: disable=broad-exception-caught + raise ETLExtractionError(f"Fatal error in Trend Score: {exc}", critical=True) from exc if __name__ == "__main__": diff --git a/backend/trend_score_duckdb.py b/backend/trend_score_duckdb.py new file mode 100644 index 0000000..209891d --- /dev/null +++ b/backend/trend_score_duckdb.py @@ -0,0 +1,96 @@ +"""DuckDB engine for Trend Score computation.""" + +from __future__ import annotations + +import pandas as pd + +try: + import duckdb +except Exception: # pylint: disable=broad-exception-caught + duckdb = None + + +def calcular_trend_score_duckdb(df_github, df_so, df_reddit, pesos): + """Computes Trend Score using DuckDB SQL over in-memory DataFrames.""" + if duckdb is None: + raise RuntimeError("DuckDB engine is unavailable. 
Install 'duckdb' to use this engine.") + + github_scores = ( + df_github[["tecnologia", "github_score"]].copy() + if not df_github.empty + else pd.DataFrame(columns=["tecnologia", "github_score"]) + ) + so_scores = ( + df_so[["tecnologia", "so_score"]].copy() + if not df_so.empty + else pd.DataFrame(columns=["tecnologia", "so_score"]) + ) + reddit_scores = ( + df_reddit[["tecnologia", "reddit_score"]].copy() + if not df_reddit.empty + else pd.DataFrame(columns=["tecnologia", "reddit_score"]) + ) + + connection = duckdb.connect(database=":memory:") + try: + connection.register("github_scores", github_scores) + connection.register("so_scores", so_scores) + connection.register("reddit_scores", reddit_scores) + + query = f""" + WITH merged AS ( + SELECT + COALESCE(g.tecnologia, s.tecnologia, r.tecnologia) AS tecnologia, + COALESCE(g.github_score, 0.0) AS github_score, + COALESCE(s.so_score, 0.0) AS so_score, + COALESCE(r.reddit_score, 0.0) AS reddit_score + FROM github_scores g + FULL OUTER JOIN so_scores s + ON g.tecnologia = s.tecnologia + FULL OUTER JOIN reddit_scores r + ON COALESCE(g.tecnologia, s.tecnologia) = r.tecnologia + ), + scored AS ( + SELECT + tecnologia, + github_score, + so_score, + reddit_score, + ROUND(( + {pesos['github']} * github_score + + {pesos['stackoverflow']} * so_score + + {pesos['reddit']} * reddit_score + ), 2) AS trend_score, + ( + CASE WHEN github_score > 0 THEN 1 ELSE 0 END + + CASE WHEN so_score > 0 THEN 1 ELSE 0 END + + CASE WHEN reddit_score > 0 THEN 1 ELSE 0 END + ) AS fuentes + FROM merged + ), + ranked AS ( + SELECT + ROW_NUMBER() OVER (ORDER BY trend_score DESC, tecnologia ASC) AS ranking, + tecnologia, + github_score, + so_score, + reddit_score, + trend_score, + fuentes + FROM scored + ) + SELECT + ranking, + tecnologia, + github_score, + so_score, + reddit_score, + trend_score, + fuentes + FROM ranked + ORDER BY ranking + """ + + return connection.execute(query).df() + finally: + connection.close() diff --git 
a/backend/validador.py b/backend/validador.py index 33acb0d..0eb16d2 100644 --- a/backend/validador.py +++ b/backend/validador.py @@ -2,14 +2,20 @@ Data validation utilities for the ETL pipeline. Provides reusable functions to validate DataFrames before -saving them to CSV: empty checks, column verification, -and null detection. +saving them to CSV, including severity-aware quality checks. """ + import logging import pandas as pd from exceptions import ETLValidationError from config.csv_contract import get_required_columns, get_critical_columns, get_column_types +from quality.pandera_schemas import ( + run_pandera_quality_checks, + SEVERITY_CRITICAL, + SEVERITY_WARNING, + SEVERITY_INFO, +) logger = logging.getLogger("validador") @@ -47,7 +53,79 @@ def _ok(value): return True -def validar_dataframe(df, nombre_archivo, strict=False, validate_types=False): +def _empty_quality_report(): + return { + "critical": 0, + "warning": 0, + "info": 0, + "issues": [], + } + + +def _normalize_quality_issue(nombre_archivo, issue): + dataset = str(issue.get("dataset") or nombre_archivo) + severity = str(issue.get("severity") or SEVERITY_INFO).lower() + if severity not in {SEVERITY_CRITICAL, SEVERITY_WARNING, SEVERITY_INFO}: + severity = SEVERITY_INFO + rule = str(issue.get("rule") or "unspecified_rule") + message = str(issue.get("message") or "unspecified quality issue") + + return { + "dataset": dataset, + "severity": severity, + "rule": rule, + "message": message, + } + + +def _apply_quality_issues(nombre_archivo, issues, pandera_warn_only): + report = _empty_quality_report() + + for issue in issues: + normalized = _normalize_quality_issue(nombre_archivo, issue) + severity = normalized["severity"] + report[severity] += 1 + report["issues"].append(normalized) + + if severity == SEVERITY_CRITICAL: + logger.error( + "[QUALITY][CRITICAL] dataset=%s rule=%s message=%s", + normalized["dataset"], + normalized["rule"], + normalized["message"], + ) + elif severity == SEVERITY_WARNING: + 
logger.warning( + "[QUALITY][WARNING] dataset=%s rule=%s message=%s", + normalized["dataset"], + normalized["rule"], + normalized["message"], + ) + else: + logger.info( + "[QUALITY][INFO] dataset=%s rule=%s message=%s", + normalized["dataset"], + normalized["rule"], + normalized["message"], + ) + + if report["critical"] > 0 and not pandera_warn_only: + raise ETLValidationError( + f"'{nombre_archivo}' quality gate failed with {report['critical']} critical issue(s)" + ) + + return report + + +def validar_dataframe( + df, + nombre_archivo, + strict=False, + validate_types=False, + enable_pandera=False, + pandera_warn_only=True, + return_quality_report=False, +): """Validates a DataFrame before saving. Checks: @@ -60,17 +138,20 @@ def validar_dataframe(df, nombre_archivo, strict=False, validate_types=False): nombre_archivo: Key from ARCHIVOS_SALIDA (e.g. 'github_repos'). strict: If True, raises ETLValidationError on schema violations. validate_types: If True, applies minimal type checks defined in the contract. + enable_pandera: If True, executes Pandera-based quality checks. + pandera_warn_only: If True, Pandera critical issues are routed as warnings (no block). + return_quality_report: If True, returns quality report instead of bool. Raises: ETLValidationError: If the DataFrame is empty. """ - # 1. Verificar que no esta vacio + # 1. Verify DataFrame is not empty if df.empty: raise ETLValidationError(f"DataFrame '{nombre_archivo}' esta vacio, no se puede guardar") logger.info("Validando '%s': %d filas, %d columnas", nombre_archivo, len(df), len(df.columns)) - # 2. Verificar columnas esperadas + # 2. Verify expected columns esperadas = get_required_columns(nombre_archivo) if esperadas: faltantes = [col for col in esperadas if col not in df.columns] @@ -81,7 +162,7 @@ def validar_dataframe(df, nombre_archivo, strict=False, validate_types=False): f"'{nombre_archivo}' no cumple schema requerido, faltan columnas: {faltantes}" ) - # 3. 
Verificar nulos en columnas criticas + # 3. Verify nulls in critical columns criticas = get_critical_columns(nombre_archivo) for col in criticas: if col not in df.columns: @@ -103,7 +184,7 @@ def validar_dataframe(df, nombre_archivo, strict=False, validate_types=False): f"'{nombre_archivo}' no cumple schema critico: columna '{col}' con nulos" ) - # 4. Verificar tipos mínimos (opcional) + # 4. Verify minimal types (optional) if validate_types: type_map = get_column_types(nombre_archivo) for col, expected_type in type_map.items(): @@ -122,4 +203,16 @@ def validar_dataframe(df, nombre_archivo, strict=False, validate_types=False): f"'{nombre_archivo}' no cumple tipo esperado en '{col}': {expected_type}" ) + quality_report = _empty_quality_report() + if enable_pandera: + quality_issues = run_pandera_quality_checks(df, nombre_archivo) + quality_report = _apply_quality_issues( + nombre_archivo=nombre_archivo, + issues=quality_issues, + pandera_warn_only=pandera_warn_only, + ) + + if return_quality_report: + return quality_report + return True diff --git a/backend/validate_csv_contract.py b/backend/validate_csv_contract.py index 461bf1e..56c1af6 100644 --- a/backend/validate_csv_contract.py +++ b/backend/validate_csv_contract.py @@ -1,11 +1,11 @@ -"""Valida headers CSV contra el contrato backend/frontend. +"""Validates CSV outputs against the shared backend/frontend contract. -Se utiliza en CI/ETL para detectar cambios incompatibles de esquema -antes de publicar datos al frontend. +Used in CI/ETL to detect incompatible schema changes +before publishing data to the frontend. """ -import sys import logging +import sys from pathlib import Path import pandas as pd @@ -19,39 +19,73 @@ logger = logging.getLogger("validate_csv_contract") -def validate_contract(strict=True): - """Valida archivos CSV existentes contra columnas requeridas del contrato. 
+def validate_contract(strict=True, enable_pandera=True, pandera_warn_only=True): + """Validates CSV files and routes quality issues by severity. + + Args: + strict: Enforces required schema and type checks. + enable_pandera: Enables/disables the Pandera quality stage. + pandera_warn_only: Routes Pandera critical issues as warnings when True. - Retorna: - tuple(bool, list[str]): (ok_global, mensajes) + Returns: + tuple(bool, list[str]): (overall_ok, messages) """ mode = "strict" if strict else "warn-only" - messages = [f"Validando contrato CSV v{get_contract_version()} (modo={mode})..."] + pandera_mode = "warn-only" if pandera_warn_only else "strict" + messages = [ + f"Validating CSV contract v{get_contract_version()} " + f"(mode={mode}, pandera_enabled={enable_pandera}, pandera_mode={pandera_mode})" + ] ok = True - for logical_name, schema in CSV_SCHEMA_CONTRACT.items(): + for logical_name in CSV_SCHEMA_CONTRACT: csv_path = Path(ARCHIVOS_SALIDA[logical_name]) if not csv_path.exists(): - messages.append(f"[WARN] {logical_name}: archivo no existe ({csv_path.name})") + messages.append(f"[WARN] {logical_name}: file not found ({csv_path.name})") if strict: ok = False continue try: df = pd.read_csv(csv_path) - validar_dataframe( - df, - logical_name, + quality_report = validar_dataframe( + df=df, + nombre_archivo=logical_name, strict=strict, validate_types=True, + enable_pandera=enable_pandera, + pandera_warn_only=pandera_warn_only, + return_quality_report=True, ) - messages.append(f"[OK] {logical_name}: contrato válido") + + critical = int(quality_report["critical"]) + warning = int(quality_report["warning"]) + info = int(quality_report["info"]) + + if critical > 0: + if pandera_warn_only: + messages.append( + f"[WARN] {logical_name}: quality critical={critical} routed by warn-only mode" + ) + else: + messages.append( + f"[ERROR] {logical_name}: quality gate failed (critical={critical})" + ) + ok = False + continue + + if warning > 0: + messages.append(f"[WARN] 
{logical_name}: quality warnings={warning}") + if info > 0: + messages.append(f"[INFO] {logical_name}: quality info={info}") + + messages.append(f"[OK] {logical_name}: contract valid") except ETLValidationError as exc: messages.append(f"[ERROR] {logical_name}: {exc}") ok = False except Exception as exc: # pylint: disable=broad-exception-caught - messages.append(f"[ERROR] {logical_name}: no se pudo validar ({exc})") + messages.append(f"[ERROR] {logical_name}: validation execution failed ({exc})") ok = False return ok, messages @@ -60,7 +94,15 @@ def validate_contract(strict=True): def main(): logging.basicConfig(level=logging.INFO, format="[%(asctime)s] [%(levelname)s] %(name)s - %(message)s") strict = "--no-strict" not in sys.argv - ok, messages = validate_contract(strict=strict) + enable_pandera = "--skip-pandera" not in sys.argv + pandera_warn_only = "--pandera-strict" not in sys.argv + + ok, messages = validate_contract( + strict=strict, + enable_pandera=enable_pandera, + pandera_warn_only=pandera_warn_only, + ) + for msg in messages: if msg.startswith("[ERROR]"): logger.error(msg) @@ -70,10 +112,10 @@ def main(): logger.info(msg) if not ok: - logger.error("[RUN][SUMMARY] estado=failed") + logger.error("[RUN][SUMMARY] status=failed") sys.exit(1) - logger.info("[RUN][SUMMARY] estado=success") + logger.info("[RUN][SUMMARY] status=success") if __name__ == "__main__": diff --git a/docs/ROADMAP_V2_IMPLEMENTATION_PLAN.md b/docs/ROADMAP_V2_IMPLEMENTATION_PLAN.md index 994df3c..9be14e0 100644 --- a/docs/ROADMAP_V2_IMPLEMENTATION_PLAN.md +++ b/docs/ROADMAP_V2_IMPLEMENTATION_PLAN.md @@ -1,37 +1,37 @@ -# ROADMAP V2 FINAL - Technology Trend Analysis Platform +# ROADMAP V2 FINAL - Technology Trend Analysis Platform -## 1) Summary +## 1) Resumen -This is the final, decision-complete plan for V2. -Goal: migrate from V1 CSV-only pipeline to a serverless data stack V2 without breaking current frontend behavior. +Este es el plan final, **decision-complete**, para la V2. 
+Objetivo: migrar desde el pipeline V1 (CSV-only) a un serverless data stack V2 sin romper el comportamiento actual del frontend. -Primary outcomes: -1. Data Product Contract V2 with run and dataset metadata. -2. Dual write (latest + history) with controlled transition. -3. Quality gate with severity levels. -4. Trend Score V1 vs V2 numeric equivalence tests. -5. Parallel CI pipeline with artifacts and conditional publish. -6. Frontend bridge with JSON history while keeping CSV compatibility. +Resultados principales: +1. Data Product Contract V2 con metadata de run y de dataset. +2. Dual write (latest + history) con transicion controlada. +3. Quality gate con niveles de severidad. +4. Pruebas de equivalencia numerica V1 vs V2 para Trend Score. +5. Pipeline CI paralelo con artifacts y publicacion condicional. +6. Frontend bridge con JSON historico manteniendo compatibilidad CSV. -## 2) Scope +## 2) Alcance -In scope now (V2 core): F2-F7 +En alcance ahora (core V2): F2-F7 - Contract V2 - Dual write - Pandera quality gate - DuckDB trend score engine -- GitHub Actions parallel jobs with artifacts +- GitHub Actions con jobs paralelos y artifacts - Frontend bridge -- Cutover governance +- Gobernanza de cutover -Out of scope now: -- Advanced forecasting productionization -- Advanced topic modeling productionization -- External BI platform integration +Fuera de alcance ahora: +- Productivizacion avanzada de forecasting +- Productivizacion avanzada de topic modeling +- Integracion con plataformas BI externas -These move to V2.1 or post-V2. +Eso pasa a V2.1 o post-V2. 
-## 3) Current baseline (verified) +## 3) Baseline actual (verificado) Backend: - ETLs: GitHub, StackOverflow, Reddit @@ -40,28 +40,28 @@ Backend: - CSV contract validator: `backend/validate_csv_contract.py` Frontend: -- Flutter dashboards read CSV from `frontend/assets/data/` +- Dashboards Flutter leen CSV desde `frontend/assets/data/` - Loader: `frontend/lib/services/csv_service.dart` CI/CD: -- Weekly ETL workflow exists and works -- Current flow is mostly sequential for ETL processing +- Workflow ETL semanal paralelo por fuente + aggregate + publish +- Verificaciones de artifacts y outputs frontend activas en CI -## 4) Branch strategy and governance +## 4) Estrategia de ramas y gobernanza -Branches: -- Backend work branch: `feat/backend` -- Frontend work branch: `feat/frontend` +Ramas: +- Rama de trabajo backend: `feat/backend` +- Rama de trabajo frontend: `feat/frontend` -Default merge policy: -- `squash merge` unless explicit reason to preserve detailed commit graph. +Politica de merge por defecto: +- `squash merge`, salvo razon explicita para preservar el grafo detallado de commits. -Sync policy before each backend PR: +Politica de sincronizacion antes de cada PR de backend: 1. `git fetch --all --prune` 2. `git switch main && git pull --ff-only origin main` 3. 
`git switch feat/backend && git merge --ff-only main` -If exact-commit alignment is required and FF does not apply: +Si se requiere alineacion exacta de commit y no aplica fast-forward: - `git reset --hard main` - `git push --force-with-lease` @@ -76,7 +76,7 @@ If exact-commit alignment is required and FF does not apply: - `source_window_start_utc` - `source_window_end_utc` - `quality_gate_status` (`pass`, `pass_with_warnings`, `fail`) -- `datasets` (array of dataset manifests) +- `datasets` (array de dataset manifests) ### 5.2 Dataset-level metadata (required) @@ -90,31 +90,33 @@ If exact-commit alignment is required and FF does not apply: - `latest_path` - `history_path` -### 5.3 SemVer rules for datasets +### 5.3 Reglas SemVer para datasets -- MAJOR: breaking schema change (remove/rename required column, incompatible type change) -- MINOR: backward-compatible additions (optional columns, non-breaking checks) -- PATCH: internal fixes with no schema contract break +- MAJOR: cambio breaking de schema (eliminar/renombrar columna requerida, cambio de tipo incompatible) +- MINOR: adiciones backward-compatible (columnas opcionales, checks no-breaking) +- PATCH: correcciones internas sin romper el contrato de schema -## 6) Storage layout (fixed now) +## 6) Layout de almacenamiento (estado implementado) Latest outputs: - `datos/latest/*.csv` -- `datos/latest/history_index.json` -- `datos/latest/trend_score_history.json` History outputs: -- `datos/history/<dataset>/year=YYYY/month=MM/day=DD/part-0000.parquet` +- `datos/history/<dataset>/year=YYYY/month=MM/day=DD/*.csv` Metadata outputs: - `datos/metadata/run_manifest.json` - `datos/metadata/runs/<run_id>.json` -Examples: -- `datos/history/trend_score/year=2026/month=02/day=22/part-0000.parquet` -- `datos/history/so_volumen/year=2026/month=02/day=22/part-0000.parquet` +Frontend bridge outputs: +- `frontend/assets/data/history_index.json` +- `frontend/assets/data/trend_score_history.json` -## 7) V1 -> V2 compatibility matrix (core) +Ejemplos: +-
`datos/history/trend_score/year=2026/month=02/day=22/trend_score.csv` +- `datos/history/so_volumen/year=2026/month=02/day=22/so_volumen_preguntas.csv` + +## 7) Matriz de compatibilidad V1 -> V2 (core) - `datos/trend_score.csv` -> `datos/latest/trend_score.csv` + `datos/history/trend_score/...` - `datos/so_volumen_preguntas.csv` -> `datos/latest/so_volumen_preguntas.csv` + `datos/history/so_volumen/...` @@ -122,308 +124,385 @@ Examples: - `datos/reddit_temas_emergentes.csv` -> `datos/latest/reddit_temas_emergentes.csv` + `datos/history/reddit_temas/...` - `datos/github_lenguajes.csv` -> `datos/latest/github_lenguajes.csv` + `datos/history/github_lenguajes/...` -Frontend cutover rule: -- CSV stays until bridge JSON passes 4 consecutive weekly runs without critical failures. +Regla de cutover frontend: +- CSV se mantiene hasta que el bridge JSON pase 4 corridas semanales consecutivas sin fallos `critical`. -## 8) Quality model (Pandera + severity) +## 8) Modelo de calidad (Pandera + severity) -Severity and actions: -- `critical`: fail pipeline, no publish -- `warning`: publish with warning flag -- `info`: publish, observability only +Severidad y acciones: +- `critical`: falla pipeline, no publica +- `warning`: publica con warning flag +- `info`: publica, solo observabilidad -Minimum required rules: -1. Required columns present (critical) -2. Critical types valid (critical) -3. Critical columns no nulls (critical) -4. `trend_score >= 0` (critical) -5. Ranking uniqueness (critical) -6. Core dataset row_count > 0 (warning) -7. Freshness threshold exceeded (warning) -8. Distribution drift soft breach (warning) -9. Optional fields missing (info) -10. Minor cardinality variation (info) +Reglas minimas obligatorias: +1. Required columns presentes (`critical`) +2. Critical types validos (`critical`) +3. No nulos en critical columns (`critical`) +4. `trend_score >= 0` (`critical`) +5. Unicidad de ranking (`critical`) +6. `row_count > 0` en datasets core (`warning`) +7. 
Freshness fuera de umbral (`warning`) +8. Distribution drift suave (`warning`) +9. Optional fields faltantes (`info`) +10. Variacion menor de cardinalidad (`info`) -## 9) Trend score equivalence V1 vs V2 +## 9) Equivalencia de Trend Score V1 vs V2 -Acceptance thresholds: -- Absolute score difference per shared technology: `<= 0.01` +Umbrales de aceptacion: +- Diferencia absoluta por tecnologia compartida: `<= 0.01` - Top-10 overlap: `>= 90%` -- Ranking delta: `<= 1` for at least 90% of shared technologies -- Tie handling allowed when score delta is `<= 0.01` +- Delta de ranking: `<= 1` para al menos 90% de tecnologias compartidas +- Empates permitidos cuando delta de score `<= 0.01` -## 10) Source failure degradation policy +## 10) Politica de degradacion ante fallo de fuentes -- 3/3 sources available: publish, normal weights -- 2/3 sources available: renormalize available weights, publish with warning -- 1/3 source available: do not publish new latest, mark fail -- 0/3 available: fail run +- 3/3 fuentes disponibles: publica, weights normales +- 2/3 fuentes disponibles: renormaliza weights disponibles, publica con warning +- 1/3 fuente disponible: no publica nuevo latest, marca fail +- 0/3 fuentes disponibles: fail run -## 11) CI/CD V2 architecture (artifacts) +## 11) Arquitectura CI/CD V2 (artifacts) -Main workflow: `.github/workflows/etl_semanal.yml` +Workflow principal: `.github/workflows/etl_semanal.yml` Jobs: 1. `job_github` 2. `job_stackoverflow` 3. `job_reddit` -4. `job_aggregate` (downloads artifacts, computes trend, runs quality gate, writes manifest) -5. `job_publish` (conditional on quality gate) +4. `job_aggregate` (descarga artifacts, calcula trend, corre quality gate, escribe manifest) +5. 
`job_publish` (condicional por quality gate) -Publish condition: -- only if quality status is `pass` or `pass_with_warnings` +Condicion de publicacion: +- solo si quality status es `pass` o `pass_with_warnings` -## 12) Runtime and cost budgets (GitHub Actions) +## 12) Presupuesto runtime y costo (GitHub Actions) -Per-run limits: -- Source job timeout: 20 min each -- Aggregate timeout: 15 min -- Publish timeout: 10 min -- Total run budget: 60 min +Limites por run: +- Timeout por source job: 20 min cada uno +- Timeout aggregate: 15 min +- Timeout publish: 10 min +- Presupuesto total por run: 60 min -Artifact budget: -- Warning at 75 MB total -- Critical at 100 MB total +Presupuesto de artifacts: +- Warning en 75 MB total +- Critical en 100 MB total -Alerting thresholds: +Umbrales de alerta: - Warning: runtime > 45 min - Critical: runtime > 60 min -## 13) Reproducibility +## 13) Reproducibilidad -- Python lock file for deterministic installs -- Flutter lock file committed -- Deterministic seed for transforms where applicable -- Baseline fixtures for V1 equivalence tests -- Historical replay by `run_id` supported through manifest metadata +- Python lock file para instalaciones deterministicas +- Flutter lock file commiteado +- Seed deterministica para transformaciones donde aplique +- Baseline fixtures V1 para pruebas de equivalencia +- Replay historico por `run_id` soportado via manifest metadata -## 14) Retention and lifecycle +## 14) Retencion y ciclo de vida -Core aggregated datasets: -- Daily: 180 days -- Monthly compacted: 5 years +Datasets core agregados: +- Diario: 180 dias +- Mensual compactado: 5 anios -Heavy raw-like datasets: -- Daily: 90 days -- Monthly compacted: 24 months +Datasets pesados tipo raw: +- Diario: 90 dias +- Mensual compactado: 24 meses -Compaction: -- Monthly parquet compaction -- Integrity validation after compaction (row_count, schema_hash, checksums) +Compactacion: +- Compactacion parquet mensual +- Validacion de integridad 
post-compactacion (`row_count`, `schema_hash`, checksums) -## 15) Security and compliance in CI +## 15) Seguridad y compliance en CI -- Least-privilege workflow permissions -- `contents: write` only where publish is needed -- Secrets required: +- Workflow permissions con minimo privilegio +- `contents: write` solo donde la publicacion lo requiera +- Secrets requeridos: - `GH_PAT` - `STACKOVERFLOW_KEY` - `REDDIT_CLIENT_ID` - `REDDIT_CLIENT_SECRET` -- Secret masking required -- No sensitive payloads in logs/artifacts -- Preflight secret checks before extraction +- Secret masking obligatorio +- No exponer payloads sensibles en logs/artifacts +- Preflight checks de secretos antes de extraer datos -## 16) PR plan (F2-F7, PR-ready) +## 16) Plan de PRs (F2-F7, PR-ready) ### PR-01 (F2) - Contract V2 foundation -Goal: -- Introduce V2 contract and manifest model. +Objetivo: +- Introducir contrato V2 y modelo de manifest. -Files: -- `backend/config/data_product_contract_v2.py` (new) +Archivos: +- `backend/config/data_product_contract.py` (new) - `backend/config/csv_contract.py` - `docs/data_contract.md` Checks: -- contract tests pass -- schema validation tests pass +- contract tests en verde +- schema validation tests en verde Merge criteria: -- no regressions in current tests +- sin regresiones en la suite actual Rollback: - revert PR ### PR-02 (F3) - Dual write infrastructure -Goal: -- Add latest/history writing path while preserving existing CSV behavior. +Objetivo: +- Agregar latest/history write path preservando el comportamiento CSV existente.
-Files: +Archivos: - `backend/base_etl.py` - `backend/config/settings.py` - `backend/sync_assets.py` -- tests for write behavior +- tests de write behavior Checks: -- write tests pass -- current ETL tests pass +- write tests en verde +- ETL tests actuales en verde Rollback: -- disable history writes via config flag +- desactivar history writes con config flag ### PR-03 (F5) - Quality gate warn-only -Goal: -- Add Pandera validation with severity routing. +Objetivo: +- Agregar validacion Pandera con enrutamiento por severidad. -Files: +Archivos: - `backend/validador.py` - `backend/validate_csv_contract.py` - `backend/quality/pandera_schemas.py` (new) -- tests for severity handling +- tests de manejo de severidad Checks: -- quality tests pass -- warning path does not block publish +- quality tests en verde +- warning path no bloquea publish Rollback: -- bypass Pandera stage +- bypass de etapa Pandera ### PR-04 (F4) - DuckDB trend engine + equivalence tests -Goal: -- Move trend calculation to DuckDB while proving equivalence. +Objetivo: +- Mover calculo de trend a DuckDB demostrando equivalencia. -Files: +Archivos: - `backend/trend_score.py` -- `backend/trend_score_v2_duckdb.py` (new) +- `backend/trend_score_duckdb.py` (new) - `tests/test_trend_equivalence_v1_v2.py` (new) Checks: -- equivalence thresholds satisfied +- equivalence thresholds cumplidos Rollback: -- switch to previous trend engine path +- volver a ruta de trend engine anterior ### PR-05 (F6) - Parallel workflow with artifacts -Goal: -- Split source jobs and aggregate with artifacts. +Objetivo: +- Separar source jobs y consolidar sus resultados en el job aggregate via artifacts.
-Files: +Archivos: - `.github/workflows/etl_semanal.yml` Checks: -- manual workflow run succeeds -- artifact handoff valid +- manual workflow run exitoso +- artifact handoff valido Rollback: -- restore sequential workflow version +- restaurar version secuencial del workflow ### PR-06 (F7) - Frontend bridge assets -Goal: -- Produce JSON history bridge assets while keeping CSV. +Objetivo: +- Producir bridge JSON historico manteniendo CSV. -Files: +Archivos: - `backend/export_history_json.py` (new) - `backend/sync_assets.py` -- generated files under `frontend/assets/data/` +- archivos generados en `frontend/assets/data/` Checks: -- bridge files generated -- frontend can load existing CSV unchanged +- bridge files generados +- frontend sigue cargando CSV sin cambios Rollback: -- disable bridge export +- desactivar bridge export ### PR-07 (F7) - Frontend partial cutover -Goal: -- Consume bridge JSON via feature flag. +Objetivo: +- Consumir bridge JSON por feature flag. -Files: +Archivos: - `frontend/lib/services/csv_service.dart` - `frontend/lib/config/feature_flags.dart` (new) -- minimal temporal view wiring +- wiring minimo de vista temporal Checks: -- smoke load for CSV and JSON paths -- no regressions in existing dashboards +- smoke load para path CSV y JSON +- sin regresiones en dashboards actuales Rollback: - feature flag off -## 17) DoD by phase (F2-F7) +## 17) DoD por fase (F2-F7) F2: -- Deliverables: V2 contract + manifest schema +- Deliverables: contrato V2 + manifest schema - Tests: contract schema tests -- Acceptance: manifest valid in sample run -- Rollback: PR revert +- Acceptance: manifest valido en sample run +- Rollback: revert PR +- Estado: DONE +- Evidencia: + - `pytest -q tests/test_data_product_contract.py tests/test_csv_contract.py` -> 15 passed + - sample run manifest validado con `validate_run_manifest` -> `manifest_valid=True`, `errors=0` F3: - Deliverables: dual write latest/history - Tests: write path + idempotency tests -- Acceptance: 
expected files created in fixed layout -- Rollback: disable history flag +- Acceptance: archivos esperados creados en el layout fijo +- Rollback: desactivar history flag +- Estado: DONE +- Evidencia: + - `pytest -q tests/test_base_etl.py tests/test_sync_assets.py` -> 16 passed + - prueba de idempotencia de escritura en script aislado -> `legacy_exists=True`, `latest_exists=True`, `history_exists=True`, `history_file_count=1` + - validacion de acceptance en run real -> `acceptance_paths_ok=True` para `datos/`, `datos/latest/`, `datos/history/...` + - validacion de rollback por flag -> `rollback_history_mtime_unchanged=True` con `DATA_WRITE_HISTORY_CSV=0` F4: - Deliverables: DuckDB trend engine - Tests: equivalence suite -- Acceptance: all thresholds pass -- Rollback: switch back to V1 engine +- Acceptance: todos los umbrales en verde +- Rollback: volver a V1 engine +- Estado: DONE +- Evidencia: + - `pytest -q tests/test_trend_score.py tests/test_trend_equivalence_v1_v2.py` -> 20 passed + - validacion de umbrales de equivalencia -> `max_abs_diff=0.0000`, `top10_overlap=1.00`, `pct_rank_delta_le_1=1.00` + - verificacion de rollback por engine -> `duckdb_exit=0`, `legacy_exit=0` con `TREND_SCORE_ENGINE=duckdb|legacy` F5: - Deliverables: severity quality gate -- Tests: critical/warning/info routing -- Acceptance: critical blocks publish, warning allows publish-with-flag -- Rollback: bypass new gate +- Tests: enrutamiento `critical`/`warning`/`info` +- Acceptance: critical bloquea publish, warning permite publish-with-flag +- Rollback: bypass de gate nuevo +- Estado: DONE +- Evidencia: + - `pytest -q tests/test_validador.py tests/test_validate_csv_contract.py` -> 12 passed + - tests especificos de severidad (`warning` no bloquea y `critical` bloquea en strict) -> 4 passed + - `python backend/validate_csv_contract.py --no-strict` -> `status=success` con warning routeado + - `python backend/validate_csv_contract.py --pandera-strict` -> `status=success` con warnings no 
bloqueantes en dataset actual + - `python backend/validate_csv_contract.py --no-strict --skip-pandera` -> `status=success` (bypass operativo) F6: -- Deliverables: parallel CI with artifacts +- Deliverables: CI paralelo con artifacts - Tests: workflow dry run + artifact contract -- Acceptance: successful end-to-end run -- Rollback: sequential workflow restore +- Acceptance: corrida end-to-end exitosa +- Rollback: restaurar workflow secuencial +- Estado: DONE +- Evidencia: + - workflow actualizado con jobs paralelos (`job_github`, `job_stackoverflow`, `job_reddit`) y agregador con `needs` + - contrato de artifacts validado en `tests/test_workflow_etl_contract.py` -> 4 passed + - suite F6/F7 backend (`pytest -q tests/test_workflow_etl_contract.py tests/test_sync_assets.py tests/test_export_history_json.py`) -> 13 passed + - dry run local del flujo aggregate (`python backend/sync_assets.py` + `python backend/export_history_json.py`) -> `status=success` en ambos comandos + - gate de rollback preservado: se puede volver al workflow secuencial restaurando `.github/workflows/etl_semanal.yml` F7: -- Deliverables: bridge JSON + frontend flag cutover +- Deliverables: bridge JSON + cutover parcial frontend - Tests: frontend smoke path -- Acceptance: 4 weekly runs stable before CSV retirement decision -- Rollback: flag off and CSV-only fallback - -## 18) Test scenarios (mandatory) - -1. Manifest schema: valid and invalid samples -2. SemVer bump correctness on representative changes -3. Deterministic schema_hash stability -4. Dual write idempotent behavior by run_id -5. Quality gate severity actions -6. V1 vs V2 trend equivalence thresholds -7. Degradation matrix (3/3, 2/3, 1/3, 0/3 sources) -8. Artifact corruption or missing artifact handling -9. Frontend bridge fallback behavior -10. 
Rollback verification per PR - -## 19) Release and tags - -Recommended release checkpoints: +- Acceptance: 4 corridas semanales estables antes de retiro de CSV +- Rollback: flag off y fallback CSV-only +- Estado: DONE (implementacion) / OPERATIVO EN CURSO (estabilidad semanal) +- Evidencia: + - bridge JSON export activo con `backend/export_history_json.py` y `backend/sync_assets.py` + - assets bridge generados en `frontend/assets/data/history_index.json` y `frontend/assets/data/trend_score_history.json` + - cutover parcial implementado por feature flag en `frontend/lib/config/feature_flags.dart` + - consumo bridge con fallback CSV implementado en `frontend/lib/services/csv_service.dart` + - wiring de UI temporal aplicado en `frontend/lib/screens/home_screen.dart` + - smoke build frontend (`flutter build web --debug`) -> success + - criterio de 4 corridas semanales se valida en ejecucion real de workflow tras push (no bloquea la implementacion del PR) + +## 18) Escenarios de prueba (obligatorios) + +Estado general: `DONE` + +- [x] 1) Manifest schema: muestras validas e invalidas + - Evidencia: `tests/test_data_product_contract.py` +- [x] 2) Correctitud de SemVer bump en cambios representativos + - Evidencia: `tests/test_schema_contract_utils.py` (matriz representativa de cambios -> bump esperado) +- [x] 3) Estabilidad deterministica de `schema_hash` + - Evidencia: `tests/test_schema_contract_utils.py` (`compute_schema_hash` deterministico y sensible a cambios semanticos) +- [x] 4) Idempotencia de dual write por `run_id` + - Evidencia: `tests/test_base_etl.py`, `tests/test_sync_assets.py` +- [x] 5) Acciones del quality gate por severidad + - Evidencia: `tests/test_validador.py`, `tests/test_validate_csv_contract.py` +- [x] 6) Umbrales de equivalencia trend V1 vs V2 + - Evidencia: `tests/test_trend_equivalence_v1_v2.py` +- [x] 7) Matriz de degradacion (3/3, 2/3, 1/3, 0/3 fuentes) + - Evidencia: `tests/test_degradation_policy.py` +- [x] 8) Manejo de artifact corrupto o 
faltante + - Evidencia: `tests/test_workflow_etl_contract.py`, `tests/test_export_history_json.py` +- [x] 9) Comportamiento de fallback del frontend bridge + - Evidencia: `tests/test_frontend_bridge_contract.py`, `tests/test_export_history_json.py` +- [x] 10) Verificacion de rollback por PR + - Evidencia: `tests/test_trend_score.py`, `tests/test_sync_assets.py`, `tests/test_workflow_etl_contract.py` + +Resultado de verificacion: +- `pytest -q` -> `133 passed` + +## 19) Releases y tags + +Estado: `READY FOR EXECUTION` (sin bloqueo tecnico; pendiente gate operativo semanal) + +Checkpoints recomendados: - `v2.0.0-rc1`: F2 + F3 - `v2.0.0-rc2`: F5 + F4 - `v2.0.0-rc3`: F6 -- `v2.0.0`: F7 stable and cutover-ready +- `v2.0.0`: F7 estable y cutover-ready - `v2.1.0`: advanced analytics -Cutover complete criteria: -- 4 consecutive weekly runs without critical quality failures -- SLO targets met -- trend equivalence stable -- frontend bridge stable under flag-on +Criterios de cutover completo: +- 4 corridas semanales consecutivas sin fallos `critical` +- SLOs cumplidos +- equivalencia trend estable +- frontend bridge estable con flag on + +Estado actual de criterios: +- suite tecnica en verde (`pytest -q` -> `133 passed`) +- equivalencia trend validada (F4 en verde) +- bridge frontend implementado con fallback y feature flag +- pendiente operativo: 4 corridas semanales reales + verificacion SLO + +Procedimiento de ejecucion de tags (cuando se autorice): +1. Validar rama `feat/backend` actualizada con `main`. +2. Ejecutar tests y smoke checks. +3. Crear tag anotado del checkpoint objetivo. +4. Publicar tag remoto. +5. Registrar notas de release. 
## 20) Decision timeline tags -- Adopt now: +Estado: `DECISIONES CERRADAS` + +- Adoptar ahora: - Contract V2 - Dual write - Pandera severity - DuckDB equivalence - CI artifacts - Frontend bridge + - estado: implementado en `feat/backend` -- Adopt in V2.1: - - forecasting and advanced NLP +- Adoptar en V2.1: + - forecasting y NLP avanzado + - estado: pendiente (no bloquea release V2.0.0) - Post-V2: - - external BI and non-GitHub long-term storage + - BI externo y almacenamiento long-term fuera de GitHub + - estado: backlog estrategico + +Regla de control de alcance: +- toda mejora nueva no critica entra a V2.1 o Post-V2 +- solo fixes de estabilidad/regresion entran antes de `v2.0.0` -## 21) Final assumptions +## 21) Supuestos finales -1. Serverless architecture remains mandatory. -2. This document is the execution source of truth for backend V2 in `feat/backend`. -3. No open decision should be left to implementers outside this plan. +1. La arquitectura serverless se mantiene como restriccion principal. +2. Este documento es la fuente de verdad de ejecucion para backend V2 en `feat/backend`. +3. No se dejan decisiones abiertas fuera de este plan. diff --git a/docs/architecture.md b/docs/architecture.md index 67a7bae..2947b6f 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,109 +1,96 @@ -# Architecture -- Technology Trend Analysis Platform +# Arquitectura del Proyecto -## System Overview +## Resumen -Plataforma multi-fuente que extrae, transforma y visualiza datos de tendencias tecnologicas -desde tres comunidades de desarrolladores: GitHub, StackOverflow y Reddit. +La plataforma procesa tendencias de tecnologia desde tres fuentes (GitHub, StackOverflow, Reddit), +calcula un Trend Score compuesto, valida calidad de datos y publica activos para frontend. 
-## Data Flow +## Flujo de Datos -``` - .env (API Keys) - | - +--------------+--------------+ - v v v - GitHub API StackOverflow API Reddit API - (REST) (REST) (OAuth/JSON) - | | | - v v v - github_etl.py so_etl.py reddit_etl.py - | | | - +--------------+--------------+ - v - datos/ (CSV) - Fuente de Verdad - | - sync_assets.py - | - v - frontend/assets/data/ - | - v - Flutter Web Dashboard - (fl_chart) -``` - -## Data Schema - -### GitHub - -| Archivo | Columnas | Descripcion | -|---------|----------|-------------| -| github_repos_2025.csv | repo_name, language, stars, forks, created_at, description | Top 1000 repos creados en 2025 | -| github_lenguajes.csv | lenguaje, repos_count, porcentaje | Top 10 lenguajes por cantidad de repos | -| github_commits_frameworks.csv | framework, repo, commits_2025, ranking | Actividad de commits en frameworks frontend | -| github_correlacion.csv | repo_name, stars, contributors, language | Correlacion Stars vs Contributors | - -### StackOverflow - -| Archivo | Columnas | Descripcion | -|---------|----------|-------------| -| so_volumen_preguntas.csv | lenguaje, preguntas_nuevas_2025 | Volumen de preguntas por lenguaje | -| so_tasa_aceptacion.csv | tecnologia, total_preguntas, respuestas_aceptadas, tasa_aceptacion_pct | Tasa de respuestas aceptadas por framework | -| so_tendencias_mensuales.csv | mes, python, javascript, typescript | Tendencias mensuales de preguntas | - -### Reddit - -| Archivo | Columnas | Descripcion | -|---------|----------|-------------| -| reddit_sentimiento_frameworks.csv | framework, total_menciones, positivos, neutros, negativos, % positivo, % neutro, % negativo | Analisis de sentimiento para frameworks backend | -| reddit_temas_emergentes.csv | tema, menciones | Temas emergentes en r/webdev | -| interseccion_github_reddit.csv | tecnologia, tipo, ranking_github, ranking_reddit, diferencia | Comparacion de rankings entre plataformas | - -### Trend Score - -| Archivo | Columnas | Descripcion | 
-|---------|----------|-------------| -| trend_score.csv | ranking, tecnologia, github_score, so_score, reddit_score, trend_score, fuentes | Indice compuesto ponderado (GitHub 40% + SO 35% + Reddit 25%) | - -## Frontend Architecture - -``` -Flutter Web Dashboard - HomeScreen - KPIs globales, insights - GithubDashboard - 3 graficos (barras, donut, scatter) - SODashboard - 3 graficos (barras, stacked, lineas) - RedditDashboard - 3 graficos (divergentes, barras, rankings) - -Cada dashboard incluye: - - Carga de CSV via CsvService - - Graficos interactivos (fl_chart) - - Key Insights - - Exportar ZIP -``` - -## Deployment - -### Local -```bash -# Backend -make install -make etl - -# Frontend -cd frontend -flutter pub get -flutter run -d chrome -``` +```text +GitHub ETL -------\ +StackOverflow ETL --> datos/*.csv --> Trend Score --> sync_assets --> frontend/assets/data/* +Reddit ETL -------/ -### GitHub Pages -```bash -cd frontend -flutter build web --base-href "/Technology-trend-analysis-platform/" +Adicional: +- dual write opcional a datos/latest y datos/history +- export de bridge JSON para historico de trend ``` -### Automatizacion (GitHub Actions) -- Cron: cada lunes a las 08:00 UTC (03:00 Ecuador) -- Ejecuta el pipeline ETL completo -- Sincroniza CSVs al frontend -- Rebuild y deploy de Flutter Web +## Componentes Backend + +- `backend/base_etl.py` + - clase base para ejecucion, logging y escritura. +- `backend/config/settings.py` + - rutas, flags de escritura y configuracion global. +- `backend/trend_score.py` + - motor principal de Trend Score con selector de engine. +- `backend/trend_score_duckdb.py` + - engine DuckDB para calculo SQL. +- `backend/validador.py` + - validacion de schema y quality report por severidad. +- `backend/quality/pandera_schemas.py` + - reglas `critical/warning/info` con Pandera. +- `backend/quality/degradation_policy.py` + - politica de degradacion por disponibilidad de fuentes. 
+- `backend/validate_csv_contract.py` + - contrato CSV para compatibilidad backend/frontend. +- `backend/config/data_product_contract.py` + - contrato de run manifest y dataset manifest. +- `backend/config/schema_contract_utils.py` + - `schema_hash` deterministico y reglas SemVer bump. +- `backend/sync_assets.py` + - sincroniza CSV a frontend con prioridad por archivo (`latest` -> fallback `legacy`). +- `backend/export_history_json.py` + - genera `history_index.json` y `trend_score_history.json`. + +## Estrategia de Escritura + +Control por variables de entorno: + +- `DATA_WRITE_LEGACY_CSV` +- `DATA_WRITE_LATEST_CSV` +- `DATA_WRITE_HISTORY_CSV` + +Rutas: + +- Legacy: `datos/*.csv` +- Latest: `datos/latest/*.csv` +- History: `datos/history/<dataset>/year=YYYY/month=MM/day=DD/*.csv` +- Metadata: `datos/metadata/` + +## Conexion con Frontend + +El frontend consume: + +- CSV tradicionales en `frontend/assets/data/*.csv` +- Bridge JSON opcional: + - `frontend/assets/data/history_index.json` + - `frontend/assets/data/trend_score_history.json` + +Feature flag: + +- `frontend/lib/config/feature_flags.dart` +- `USE_HISTORY_BRIDGE_JSON=false` por defecto. + +Esto permite cutover parcial sin romper dashboards existentes. + +## GitHub Actions + +Workflows activos: + +1. `etl_semanal.yml` + - lunes `08:00 UTC` + manual. + - jobs paralelos por fuente + aggregate + publish. +2. `ci.yml` + - tests en `main`, `feat/backend`, `feat/frontend`. +3. `dependency_security.yml` + - auditoria de dependencias (push/PR/schedule/manual). +4. `deploy_frontend.yml` + - deploy de Flutter Web en `main` o tras ETL exitoso. + +## Estado de Backend V2 + +- Implementacion tecnica: completada. +- Gate operativo pendiente para cutover final: + - 4 corridas semanales consecutivas sin fallos `critical`.
diff --git a/docs/coding_style.md b/docs/coding_style.md new file mode 100644 index 0000000..7761994 --- /dev/null +++ b/docs/coding_style.md @@ -0,0 +1,62 @@ +# Estandar de Estilo del Repositorio + +Este documento define reglas para mantener consistencia tecnica y colaboracion. + +## 1) Idioma por Area + +- `README.md`: ingles. +- `docs/`: espanol (terminos tecnicos pueden quedar en ingles). +- `backend/`: + - comentarios y docstrings en ingles. + - nombres ETL de negocio existentes pueden mantenerse en espanol. + - modulos tecnicos y utilidades compartidas en ingles. + +## 2) Naming y Estructura + +- usar nombres profesionales y estables. +- evitar nombres temporales en codigo (`fase`, `pr-xx`, `tmp`, etc). +- mantener coherencia con el estilo del modulo antes de renombrar APIs. +- evitar sobre-comentarios; comentar solo cuando agrega contexto real. +- no usar emojis en codigo backend. + +## 3) Reglas de Implementacion + +- cambios incrementales y compatibles con comportamiento actual. +- no romper contratos publicos sin requerimiento funcional explicito. +- separar logica de negocio y utilidades tecnicas. +- toda capa nueva debe incluir tests. + +## 4) Flujo de Ramas + +- `main`: rama estable. +- `feat/backend`: cambios backend. +- `feat/frontend`: cambios frontend. + +Antes de PR: +1. actualizar rama con `main`. +2. correr tests relevantes. +3. verificar build/smoke cuando aplique. + +## 5) Commits + +- mensajes en ingles, claros y breves. +- evitar titulos con terminologia interna del plan (`f2`, `pr03`, etc). +- un commit debe agrupar cambios coherentes. + +## 6) Validacion Minima antes de Push + +- `pytest -q` +- smoke ETL si se toca pipeline. +- smoke frontend si se toca integracion de assets. +- confirmar que cambios no relacionados no se incluyan por error. + +## 7) Politica de Artefactos Generados + +- no commitear salidas runtime (`datos/latest`, `datos/history`, `datos/metadata`) salvo decision explicita. +- commitear codigo, tests y documentacion. 
+ +## 8) Definicion de Listo + +- tests en verde. +- sin regresiones de contrato de datos. +- comportamiento de rollback definido para cambios de riesgo. diff --git a/docs/data_contract.md b/docs/data_contract.md index 84dd5c9..480996d 100644 --- a/docs/data_contract.md +++ b/docs/data_contract.md @@ -1,85 +1,113 @@ -# Contrato de datos CSV (Backend ↔ Frontend) +# Contrato de Datos (Backend <-> Frontend) -Este documento formaliza el contrato de columnas entre el pipeline ETL (`backend/`) y el dashboard Flutter (`frontend/`). +Este documento define los contratos activos para salidas de datos y metadata. -## Fuente de verdad - -El contrato ejecutable vive en: +## 1) Contrato CSV +Fuente de verdad: - `backend/config/csv_contract.py` -Versión actual del contrato: +Objetivo: +- mantener compatibilidad entre salidas backend y consumo frontend. + +Validacion: + +```bash +python backend/validate_csv_contract.py +``` + +Modos relevantes: + +```bash +python backend/validate_csv_contract.py --no-strict +python backend/validate_csv_contract.py --pandera-strict +python backend/validate_csv_contract.py --no-strict --skip-pandera +``` + +## 2) Contrato de Producto de Datos (Manifest) + +Fuente de verdad: +- `backend/config/data_product_contract.py` + +Incluye: +- run manifest +- dataset manifest -- `CONTRACT_VERSION = 2026.03` +### 2.1 Campos obligatorios de run manifest -El validador consume ese contrato para verificar columnas requeridas y columnas críticas. 
+- `run_id` +- `generated_at_utc` +- `git_sha` +- `branch` +- `source_window_start_utc` +- `source_window_end_utc` +- `quality_gate_status` (`pass`, `pass_with_warnings`, `fail`) +- `datasets` -Además, el pipeline ETL semanal ejecuta validación de headers con: +### 2.2 Campos obligatorios de dataset manifest -- `python backend/validate_csv_contract.py` +- `dataset_logical_name` +- `version_semver` +- `generated_at_utc` +- `source_run_id` +- `schema_hash` +- `row_count` +- `quality_status` (`pass`, `warning`, `fail`) +- `latest_path` +- `history_path` -Si faltan columnas requeridas o no se cumplen tipos mínimos, el workflow falla antes de publicar cambios de datos. +## 3) Reglas de Validacion -Modo opcional no estricto (solo advertencias): +- fechas en formato ISO-8601 con zona horaria. +- `version_semver` valida SemVer. +- `schema_hash` debe ser SHA-256 hexadecimal de 64 caracteres. +- `row_count` debe ser entero >= 0. +- `source_run_id` debe coincidir con `run_id`. +- `history_path` puede ser `null` cuando `quality_status=fail`. -- `python backend/validate_csv_contract.py --no-strict` +## 4) Utilidades de Schema y Versionado -## Reglas del contrato +Fuente de verdad: +- `backend/config/schema_contract_utils.py` -1. **required_columns**: deben existir para considerar que el CSV cumple contrato. -2. **critical_columns**: no deberían contener nulos; en modo estricto, fallan la validación. -3. **column_types**: define tipos mínimos esperados por columna (`string`, `integer`, `number`, `datetime`, `string_or_integer`). -4. **optional_columns**: columnas permitidas (compatibilidad y métricas adicionales), pero no obligatorias. +Funciones: +- `compute_schema_hash(...)` +- `recommend_semver_bump(...)` +- `aggregate_semver_bump(...)` -## Tipos mínimos por archivo (resumen) +Politica SemVer implementada: +- `major`: cambio breaking (remove/rename required column, tipo incompatible, etc). +- `minor`: cambios backward-compatible (columna opcional, regla no breaking, etc). 
+- `patch`: cambios internos sin romper contrato. -- `github_repos.csv` - - `repo_name:string`, `language:string`, `stars:integer`, `forks:integer`, `created_at:datetime` -- `github_lenguajes.csv` - - `lenguaje:string`, `repos_count:integer`, `porcentaje:number` -- `github_ai_repos_insights.csv` - - `total_repos_analizados:integer`, `repos_ai_detectados:integer`, `porcentaje_ai:number`, `mes_pico_ai:string`, `repos_mes_pico_ai:integer`, `top_keywords_ai:string`, `top_repos_ai:string` -- `github_commits_frameworks.csv` - - `framework:string`, `repo:string`, `commits_2025:integer`, `ranking:integer` -- `github_correlacion.csv` - - `repo_name:string`, `stars:integer`, `contributors:integer`, `language:string` -- `so_volumen_preguntas.csv` - - `lenguaje:string`, `preguntas_nuevas_2025:integer` -- `so_tasa_aceptacion.csv` - - `tecnologia:string`, `total_preguntas:integer`, `respuestas_aceptadas:integer`, `tasa_aceptacion_pct:number` -- `so_tendencias_mensuales.csv` - - `mes:string`, `python:integer`, `javascript:integer`, `typescript:integer` -- `reddit_sentimiento_frameworks.csv` - - `framework:string`, `total_menciones:integer`, `positivos:integer`, `neutros:integer`, `negativos:integer` - - opcionales: `% positivo:number`, `% neutro:number`, `% negativo:number` -- `reddit_temas_emergentes.csv` - - `tema:string`, `menciones:integer` -- `interseccion_github_reddit.csv` - - `tecnologia:string`, `tipo:string`, `ranking_github:integer`, `ranking_reddit:string_or_integer` -- `trend_score.csv` - - `ranking:integer`, `tecnologia:string`, `github_score:number`, `so_score:number`, `reddit_score:number`, `trend_score:number`, `fuentes:integer` +## 5) Estrategia de Escritura -## Archivos clave consumidos por frontend +Control por flags: +- `DATA_WRITE_LEGACY_CSV` +- `DATA_WRITE_LATEST_CSV` +- `DATA_WRITE_HISTORY_CSV` -- `github_lenguajes.csv` - - requeridas: `lenguaje`, `repos_count`, `porcentaje` -- `so_volumen_preguntas.csv` - - requeridas: `lenguaje`, `preguntas_nuevas_2025` 
-- `so_tasa_aceptacion.csv` - - requeridas: `tecnologia`, `total_preguntas`, `respuestas_aceptadas`, `tasa_aceptacion_pct` -- `reddit_temas_emergentes.csv` - - requeridas: `tema`, `menciones` -- `trend_score.csv` - - requeridas: `ranking`, `tecnologia`, `github_score`, `so_score`, `reddit_score`, `trend_score`, `fuentes` +Rutas: +- Legacy: `datos/*.csv` +- Latest: `datos/latest/*.csv` +- History: `datos/history/<dataset_logical_name>/year=YYYY/month=MM/day=DD/*.csv` -## Compatibilidad de `reddit_sentimiento_frameworks.csv` +## 6) Bridge Frontend -El backend mantiene como requeridas: +Fuente de verdad: +- `backend/export_history_json.py` -- `framework`, `total_menciones`, `positivos`, `neutros`, `negativos` +Activos generados: +- `frontend/assets/data/history_index.json` +- `frontend/assets/data/trend_score_history.json` -Y como opcionales para visualización: +Comportamiento: +- si el historial esta incompleto o corrupto, se usa fallback a `latest` para trend. -- `% positivo`, `% neutro`, `% negativo` +## 7) Recomendacion Operativa -Esto evita acoplamiento implícito y deja explícita la coexistencia de métricas absolutas y porcentuales. +Antes de publicar cambios de contrato: +1. actualizar contrato en backend. +2. agregar o ajustar tests. +3. ejecutar `pytest -q`. +4. validar que frontend sigue consumiendo sin regresiones. diff --git a/docs/dependency_policy.md b/docs/dependency_policy.md index abffc9e..4478e22 100644 --- a/docs/dependency_policy.md +++ b/docs/dependency_policy.md @@ -1,47 +1,61 @@ -# Política mínima de dependencias y seguridad +# Politica de Dependencias y Seguridad -Esta política reduce riesgo técnico y mejora reproducibilidad para el backend. +Esta politica define criterios minimos para mantener estabilidad y seguridad en backend. ## Objetivos -- Mantener rangos de versiones controlados en `backend/requirements.txt`. -- Detectar vulnerabilidades conocidas de forma continua. -- Definir una cadencia mínima de actualización. 
+- controlar versiones en `backend/requirements.txt`. +- detectar vulnerabilidades conocidas de forma continua. +- mantener reproducibilidad en CI y local. -## Reglas de versionado +## Reglas de Versionado -1. Evitar rangos abiertos en major (`<3.0` para todo) cuando no sea necesario. -2. Usar límites superiores por compatibilidad real del proyecto. -3. Mantener `pytest` en major estable (`>=8,<9`). +1. usar rangos compatibles con limite superior. +2. evitar upgrades de major sin validacion de regresion. +3. mantener dependencias de test en major estable (`pytest >=8,<9`). -## Auditoría de seguridad +## Dependencias Core Actuales -- Local: - - `make security` -- CI: - - Workflow: `Dependency Security Audit` - - Se ejecuta en: - - cambios de `backend/requirements.txt` - - `pull_request` hacia `main` - - semanalmente (lunes) - - manualmente (`workflow_dispatch`) +- `pandas` +- `requests` +- `nltk` +- `pandera` +- `duckdb` +- `python-dotenv` -Si se detectan CVEs, el job falla y se debe corregir antes de mergear a `main`. +## Auditoria de Seguridad -## Política mínima de actualización +Workflow: `.github/workflows/dependency_security.yml` -- **Mensual**: revisar updates menores/patch de librerías. -- **Trimestral**: revisar nuevos majors y plan de adopción. -- **Inmediato**: parchear CVEs con severidad alta/crítica. +Se ejecuta en: +- push a `main` y `feat/backend` con cambios en `backend/requirements.txt`. +- pull request a `main` con cambios en dependencias. +- schedule semanal: lunes `09:00 UTC`. +- manual (`workflow_dispatch`). -## Flujo recomendado +Herramienta: +- `pip-audit` -1. Crear rama de actualización. -2. Ajustar `backend/requirements.txt` con cambios mínimos. -3. Ejecutar: - - `python -m pytest tests/ -q` - - `make security` -4. Abrir PR con resumen: - - librerías cambiadas - - motivo (bugfix/CVE/compatibilidad) - - evidencia de tests y auditoría. +Si hay vulnerabilidades, el job falla y no se recomienda merge a `main`. 
+ +Excepcion temporal vigente: +- `CVE-2025-14009` (`nltk 3.9.2`) esta ignorado en CI porque no existe version parche publicada. +- La excepcion debe revisarse semanalmente y removerse apenas exista fix oficial de `nltk`. + +## Flujo Recomendado de Actualizacion + +1. crear rama de trabajo. +2. cambiar dependencias minimas necesarias. +3. ejecutar: + - `python -m pytest -q` + - pipeline de seguridad. +4. documentar en PR: + - librerias cambiadas + - motivo + - evidencia de tests + +## Cadencia Recomendada + +- mensual: patch/minor review. +- trimestral: analisis de majors. +- inmediato: CVEs de severidad alta/critica. diff --git a/frontend/lib/config/feature_flags.dart b/frontend/lib/config/feature_flags.dart new file mode 100644 index 0000000..aee0534 --- /dev/null +++ b/frontend/lib/config/feature_flags.dart @@ -0,0 +1,11 @@ +class FeatureFlags { + FeatureFlags._(); + + /// Partial cutover flag for historical bridge JSON assets. + /// + /// Default is disabled to preserve current CSV-only behavior. 
+ static const bool useHistoryBridgeJson = bool.fromEnvironment( + 'USE_HISTORY_BRIDGE_JSON', + defaultValue: false, + ); +} diff --git a/frontend/lib/screens/home_screen.dart b/frontend/lib/screens/home_screen.dart index f2b9e6a..683e414 100644 --- a/frontend/lib/screens/home_screen.dart +++ b/frontend/lib/screens/home_screen.dart @@ -1,5 +1,7 @@ import 'package:flutter/material.dart'; import 'package:font_awesome_flutter/font_awesome_flutter.dart'; +import '../config/feature_flags.dart'; +import '../services/csv_service.dart'; class HomeScreen extends StatelessWidget { const HomeScreen({super.key}); @@ -22,7 +24,7 @@ class HomeScreen extends StatelessWidget { style: TextStyle(fontSize: 18, color: Colors.grey), ), const SizedBox(height: 40), - + // KPIs principales con iconos oficiales Wrap( spacing: 24, @@ -103,16 +105,20 @@ class HomeScreen extends StatelessWidget { ], ), ), - + const SizedBox(height: 48), - + + const _TrendTemporalBridgeCard(), + + const SizedBox(height: 32), + // Seccion Sobre el Dashboard const Text( 'Sobre el Dashboard', style: TextStyle(fontSize: 24, fontWeight: FontWeight.bold), ), const SizedBox(height: 20), - + Wrap( spacing: 24, runSpacing: 24, @@ -120,36 +126,54 @@ class HomeScreen extends StatelessWidget { _buildInfoCardFA( icon: FontAwesomeIcons.github, title: 'GitHub Data', - description: 'Análisis de repositorios, lenguajes más populares y correlación entre stars y contribuidores', + description: + 'Análisis de repositorios, lenguajes más populares y correlación entre stars y contribuidores', color: Colors.blue, ), _buildInfoCardFA( icon: FontAwesomeIcons.stackOverflow, title: 'StackOverflow Data', - description: 'Madurez de tecnologías y evolución del interés en frameworks a lo largo del año', + description: + 'Madurez de tecnologías y evolución del interés en frameworks a lo largo del año', color: const Color(0xFFF48024), ), _buildInfoCardFA( icon: FontAwesomeIcons.reddit, title: 'Reddit Data', - description: 'Sentimiento de la 
comunidad sobre frameworks backend y temas de discusión frecuentes', + description: + 'Sentimiento de la comunidad sobre frameworks backend y temas de discusión frecuentes', color: const Color(0xFFFF4500), ), ], ), - + const SizedBox(height: 48), - + // Integrantes const Text( 'Integrantes del Equipo', style: TextStyle(fontSize: 24, fontWeight: FontWeight.bold), ), const SizedBox(height: 16), - _buildTeamMemberFA('Samir Caizapasto', 'GitHub ETL & Dashboard', FontAwesomeIcons.github, Colors.blue), - _buildTeamMemberFA('Andrés Salinas', 'StackOverflow ETL & Dashboard', FontAwesomeIcons.stackOverflow, const Color(0xFFF48024)), - _buildTeamMemberFA('Mateo Mayorga', 'Reddit ETL & Dashboard', FontAwesomeIcons.reddit, const Color(0xFFFF4500)), - + _buildTeamMemberFA( + 'Samir Caizapasto', + 'GitHub ETL & Dashboard', + FontAwesomeIcons.github, + Colors.blue, + ), + _buildTeamMemberFA( + 'Andrés Salinas', + 'StackOverflow ETL & Dashboard', + FontAwesomeIcons.stackOverflow, + const Color(0xFFF48024), + ), + _buildTeamMemberFA( + 'Mateo Mayorga', + 'Reddit ETL & Dashboard', + FontAwesomeIcons.reddit, + const Color(0xFFFF4500), + ), + const SizedBox(height: 48), ], ), @@ -218,9 +242,7 @@ class HomeScreen extends StatelessWidget { decoration: BoxDecoration( color: Colors.white, borderRadius: BorderRadius.circular(12), - border: Border( - top: BorderSide(color: color, width: 4), - ), + border: Border(top: BorderSide(color: color, width: 4)), boxShadow: [ BoxShadow( color: Colors.black.withOpacity(0.05), @@ -259,7 +281,12 @@ class HomeScreen extends StatelessWidget { ); } - Widget _buildTeamMemberFA(String name, String role, IconData icon, Color color) { + Widget _buildTeamMemberFA( + String name, + String role, + IconData icon, + Color color, + ) { return Padding( padding: const EdgeInsets.symmetric(vertical: 8), child: Row( @@ -282,7 +309,12 @@ class HomeScreen extends StatelessWidget { } // Widget con imagen de logo oficial - Widget _buildImageInsight(String imagePath, String 
title, String description, Color accentColor) { + Widget _buildImageInsight( + String imagePath, + String title, + String description, + Color accentColor, + ) { return Container( padding: const EdgeInsets.all(16), decoration: BoxDecoration( @@ -303,15 +335,10 @@ class HomeScreen extends StatelessWidget { Container( width: 48, height: 48, - decoration: BoxDecoration( - borderRadius: BorderRadius.circular(10), - ), + decoration: BoxDecoration(borderRadius: BorderRadius.circular(10)), child: ClipRRect( borderRadius: BorderRadius.circular(8), - child: Image.asset( - imagePath, - fit: BoxFit.contain, - ), + child: Image.asset(imagePath, fit: BoxFit.contain), ), ), const SizedBox(width: 16), @@ -343,3 +370,171 @@ class HomeScreen extends StatelessWidget { ); } } + +class _TrendTemporalBridgeCard extends StatefulWidget { + const _TrendTemporalBridgeCard(); + + @override + State<_TrendTemporalBridgeCard> createState() => + _TrendTemporalBridgeCardState(); +} + +class _TrendTemporalBridgeCardState extends State<_TrendTemporalBridgeCard> { + late final Future> _futureTemporalData; + + @override + void initState() { + super.initState(); + _futureTemporalData = CsvService.loadTrendTemporalView(topN: 5); + } + + String _buildSourceLabel(String source) { + switch (source) { + case 'bridge_json': + return 'Bridge JSON'; + case 'csv_fallback': + return 'CSV fallback'; + default: + return 'CSV'; + } + } + + @override + Widget build(BuildContext context) { + return Container( + width: double.infinity, + padding: const EdgeInsets.all(24), + decoration: BoxDecoration( + color: Colors.white, + borderRadius: BorderRadius.circular(16), + border: Border.all(color: const Color(0xFFE5E7EB), width: 1), + boxShadow: [ + BoxShadow( + color: Colors.black.withOpacity(0.04), + blurRadius: 8, + offset: const Offset(0, 2), + ), + ], + ), + child: FutureBuilder>( + future: _futureTemporalData, + builder: (context, snapshot) { + if (snapshot.connectionState == ConnectionState.waiting) { + return 
const SizedBox( + height: 120, + child: Center(child: CircularProgressIndicator()), + ); + } + + if (snapshot.hasError) { + return Text( + 'No se pudo cargar la vista temporal: ${snapshot.error}', + style: const TextStyle(color: Colors.red), + ); + } + + final data = snapshot.data ?? const {}; + final source = data['source']?.toString() ?? 'csv'; + final snapshotCount = (data['snapshotCount'] as num?)?.toInt() ?? 0; + final items = (data['items'] as List?) ?? const []; + + return Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Row( + children: [ + const Icon(Icons.timeline, color: Color(0xFF1F2937)), + const SizedBox(width: 8), + const Text( + 'Vista temporal de Trend Score', + style: TextStyle( + fontSize: 20, + fontWeight: FontWeight.bold, + color: Color(0xFF1F2937), + ), + ), + const SizedBox(width: 12), + Container( + padding: const EdgeInsets.symmetric( + horizontal: 10, + vertical: 4, + ), + decoration: BoxDecoration( + color: const Color(0xFFEFF6FF), + borderRadius: BorderRadius.circular(999), + ), + child: Text( + '${_buildSourceLabel(source)} • snapshots: $snapshotCount', + style: const TextStyle( + fontSize: 12, + fontWeight: FontWeight.w600, + color: Color(0xFF1D4ED8), + ), + ), + ), + ], + ), + const SizedBox(height: 8), + Text( + FeatureFlags.useHistoryBridgeJson + ? 'Modo bridge activo por feature flag' + : 'Modo bridge desactivado por feature flag (CSV por defecto)', + style: const TextStyle(fontSize: 13, color: Color(0xFF6B7280)), + ), + const SizedBox(height: 14), + if (items.isEmpty) + const Text( + 'No hay datos temporales disponibles.', + style: TextStyle(color: Color(0xFF6B7280)), + ) + else + Wrap( + spacing: 10, + runSpacing: 10, + children: [ + for (final dynamic raw in items) + () { + final row = raw is Map ? raw : const {}; + final rank = row['ranking']?.toString() ?? '-'; + final tech = row['tecnologia']?.toString() ?? 'N/A'; + final score = row['trend_score']?.toString() ?? 
'0.0'; + final fuentes = row['fuentes']?.toString() ?? '0'; + return Container( + width: 220, + padding: const EdgeInsets.all(12), + decoration: BoxDecoration( + color: const Color(0xFFF9FAFB), + borderRadius: BorderRadius.circular(10), + border: Border.all(color: const Color(0xFFE5E7EB)), + ), + child: Column( + crossAxisAlignment: CrossAxisAlignment.start, + children: [ + Text( + '#$rank $tech', + style: const TextStyle( + fontSize: 14, + fontWeight: FontWeight.bold, + ), + ), + const SizedBox(height: 4), + Text( + 'Score: $score • Fuentes: $fuentes', + style: const TextStyle( + fontSize: 12, + color: Color(0xFF6B7280), + ), + ), + ], + ), + ); + }(), + ], + ), + ], + ); + }, + ), + ); + } +} diff --git a/frontend/lib/services/csv_service.dart b/frontend/lib/services/csv_service.dart index a9281a1..40975ff 100644 --- a/frontend/lib/services/csv_service.dart +++ b/frontend/lib/services/csv_service.dart @@ -1,10 +1,12 @@ -import 'dart:convert' show utf8; +import 'dart:convert' show jsonDecode, utf8; import 'package:csv/csv.dart'; import 'package:flutter/foundation.dart' show kIsWeb; import 'package:flutter/services.dart' show rootBundle; import 'package:http/http.dart' as http; +import '../config/feature_flags.dart'; + /// Servicio central de carga de CSVs. /// /// Estrategia de carga (en orden): @@ -47,17 +49,21 @@ class CsvService { /// Parser manual de CSV a lista de mapas. 
static List> _parseCsvManual(String raw) { - final normalized = - raw.replaceAll('\r\n', '\n').replaceAll('\r', '\n').trim(); + final normalized = raw + .replaceAll('\r\n', '\n') + .replaceAll('\r', '\n') + .trim(); if (normalized.isEmpty) return []; - final lines = - normalized.split('\n').where((l) => l.trim().isNotEmpty).toList(); + final lines = normalized + .split('\n') + .where((l) => l.trim().isNotEmpty) + .toList(); if (lines.length < 2) return []; - final headers = _splitCsvLine(lines.first) - .map((h) => h.replaceFirst('\ufeff', '').trim()) - .toList(); + final headers = _splitCsvLine( + lines.first, + ).map((h) => h.replaceFirst('\ufeff', '').trim()).toList(); return [ for (final line in lines.skip(1)) @@ -89,15 +95,19 @@ class CsvService { // Log de diagnóstico: primeros 150 chars y longitud final preview = rawData.length > 150 ? rawData.substring(0, 150) : rawData; - print('[CsvService] _parseCsvToMap: ${rawData.length} chars, ' - 'first 20 code units: ${rawData.codeUnits.take(20).toList()}'); + print( + '[CsvService] _parseCsvToMap: ${rawData.length} chars, ' + 'first 20 code units: ${rawData.codeUnits.take(20).toList()}', + ); print('[CsvService] _parseCsvToMap preview: $preview'); // ── PRIMARY: Parser manual (funciona en todas las plataformas) ── final manual = _parseCsvManual(rawData); if (manual.isNotEmpty) { - print('[CsvService] _parseCsvToMap: manual parser OK ' - '(${manual.length} filas, headers: ${manual.first.keys.toList()})'); + print( + '[CsvService] _parseCsvToMap: manual parser OK ' + '(${manual.length} filas, headers: ${manual.first.keys.toList()})', + ); return manual; } print('[CsvService] _parseCsvToMap: manual parser devolvió vacío'); @@ -105,8 +115,10 @@ class CsvService { // ── FALLBACK: CsvToListConverter ── try { final csvData = const CsvToListConverter().convert(rawData); - print('[CsvService] _parseCsvToMap: CsvToListConverter ' - 'rows=${csvData.length}'); + print( + '[CsvService] _parseCsvToMap: CsvToListConverter ' + 
'rows=${csvData.length}', + ); if (csvData.length < 2) return []; final headers = csvData[0] @@ -132,11 +144,12 @@ class CsvService { static Future>?> _tryHttp(String url) async { try { final uri = Uri.parse(url); - final response = - await http.get(uri).timeout(const Duration(seconds: 15)); + final response = await http.get(uri).timeout(const Duration(seconds: 15)); - print('[CsvService] HTTP ${response.statusCode} ← $url ' - '(${response.bodyBytes.length} bytes)'); + print( + '[CsvService] HTTP ${response.statusCode} ← $url ' + '(${response.bodyBytes.length} bytes)', + ); if (response.statusCode != 200) { throw Exception('HTTP ${response.statusCode} en $url'); @@ -194,10 +207,7 @@ class CsvService { // ── 1) HTTP con rutas relativas (web) ── if (kIsWeb) { - final urls = [ - 'assets/assets/data/$fileName', - 'assets/data/$fileName', - ]; + final urls = ['assets/assets/data/$fileName', 'assets/data/$fileName']; for (final url in urls) { try { final result = await _tryHttp(url); @@ -218,13 +228,17 @@ class CsvService { final raw = await rootBundle.loadString(path); final parsed = _parseCsvToMap(raw); if (parsed.isNotEmpty) { - print('[CsvService] OK via AssetBundle → $path ' - '(${parsed.length} filas)'); + print( + '[CsvService] OK via AssetBundle → $path ' + '(${parsed.length} filas)', + ); return parsed; } // Si parsed está vacío, registrar como error (antes se perdía) - errors.add('AssetBundle $path: parseo devolvió vacío ' - '(${raw.length} chars cargados)'); + errors.add( + 'AssetBundle $path: parseo devolvió vacío ' + '(${raw.length} chars cargados)', + ); } catch (e) { errors.add('AssetBundle $path: $e'); print('[CsvService] AssetBundle fallo en $path → $e'); @@ -237,4 +251,166 @@ class CsvService { print('[CsvService] FALLO total para $fileName'); throw Exception(errorMsg); } + + /// Carga un JSON asset y lo retorna como mapa. 
+ static Future> loadJsonAsMap(String assetPath) async { + final fileName = assetPath.replaceAll('\\', '/').split('/').last; + final errors = []; + print('[CsvService] === Loading JSON: $fileName ==='); + + if (kIsWeb) { + final urls = ['assets/assets/data/$fileName', 'assets/data/$fileName']; + for (final url in urls) { + try { + final response = await http + .get(Uri.parse(url)) + .timeout(const Duration(seconds: 15)); + if (response.statusCode != 200) { + throw Exception('HTTP ${response.statusCode}'); + } + final body = utf8.decode(response.bodyBytes, allowMalformed: true); + final parsed = jsonDecode(body); + if (parsed is Map) { + return parsed; + } + if (parsed is Map) { + return parsed.map((key, value) => MapEntry(key.toString(), value)); + } + throw Exception('Invalid JSON payload shape for $url'); + } catch (e) { + errors.add('HTTP $url: $e'); + } + } + } + + final bundlePaths = ['assets/data/$fileName', assetPath]; + final seen = {}; + final unique = bundlePaths.where((p) => seen.add(p)).toList(); + + for (final path in unique) { + try { + final raw = await rootBundle.loadString(path); + final parsed = jsonDecode(raw); + if (parsed is Map) { + return parsed; + } + if (parsed is Map) { + return parsed.map((key, value) => MapEntry(key.toString(), value)); + } + errors.add('AssetBundle $path: invalid JSON payload shape'); + } catch (e) { + errors.add('AssetBundle $path: $e'); + } + } + + throw Exception( + 'No se pudo cargar JSON $fileName.\nErrores:\n${errors.join('\n')}', + ); + } + + static int _asInt(dynamic value, {int fallback = 0}) { + if (value is int) return value; + return int.tryParse(value?.toString() ?? '') ?? fallback; + } + + static double _asDouble(dynamic value, {double fallback = 0.0}) { + if (value is double) return value; + if (value is int) return value.toDouble(); + return double.tryParse(value?.toString() ?? '') ?? 
fallback; + } + + static List> _normalizeTrendRowsFromCsv( + List> csvRows, + int topN, + ) { + final normalized = csvRows + .map( + (row) => { + 'ranking': _asInt(row['ranking'], fallback: 999999), + 'tecnologia': row['tecnologia']?.toString() ?? '', + 'trend_score': _asDouble(row['trend_score'], fallback: 0.0), + 'fuentes': _asInt(row['fuentes'], fallback: 0), + }, + ) + .where((row) => (row['tecnologia']?.toString().isNotEmpty ?? false)) + .toList(); + + normalized.sort( + (a, b) => _asInt(a['ranking']).compareTo(_asInt(b['ranking'])), + ); + return normalized.take(topN).toList(); + } + + /// Carga vista temporal de Trend Score. + /// + /// Si `useHistoryBridgeJson` está activo, intenta usar bridge JSON y + /// mantiene fallback automático a CSV. + static Future> loadTrendTemporalView({ + int topN = 5, + }) async { + final csvRows = await loadCsvAsMap('assets/data/trend_score.csv'); + final csvTop = _normalizeTrendRowsFromCsv(csvRows, topN); + + if (!FeatureFlags.useHistoryBridgeJson) { + return { + 'source': 'csv', + 'snapshotCount': csvTop.isEmpty ? 0 : 1, + 'items': csvTop, + }; + } + + try { + final bridgePayload = await loadJsonAsMap( + 'assets/data/trend_score_history.json', + ); + final rawSnapshots = (bridgePayload['snapshots'] as List?) ?? const []; + if (rawSnapshots.isEmpty) { + return { + 'source': 'csv_fallback', + 'snapshotCount': csvTop.isEmpty ? 0 : 1, + 'items': csvTop, + }; + } + + final latestSnapshot = rawSnapshots.last; + if (latestSnapshot is! Map) { + return { + 'source': 'csv_fallback', + 'snapshotCount': csvTop.isEmpty ? 0 : 1, + 'items': csvTop, + }; + } + + final topRows = (latestSnapshot['top_10'] as List?) ?? const []; + final bridgeItems = topRows + .whereType() + .map( + (row) => { + 'ranking': _asInt(row['ranking'], fallback: 999999), + 'tecnologia': row['tecnologia']?.toString() ?? 
'', + 'trend_score': _asDouble(row['trend_score'], fallback: 0.0), + 'fuentes': _asInt(row['fuentes'], fallback: 0), + }, + ) + .where((row) => (row['tecnologia']?.toString().isNotEmpty ?? false)) + .take(topN) + .toList(); + + if (bridgeItems.isNotEmpty) { + return { + 'source': 'bridge_json', + 'snapshotCount': rawSnapshots.length, + 'items': bridgeItems, + }; + } + } catch (e) { + print('[CsvService] Bridge JSON fallback to CSV: $e'); + } + + return { + 'source': 'csv_fallback', + 'snapshotCount': csvTop.isEmpty ? 0 : 1, + 'items': csvTop, + }; + } } diff --git a/tests/test_base_etl.py b/tests/test_base_etl.py index b41c61e..c64f339 100644 --- a/tests/test_base_etl.py +++ b/tests/test_base_etl.py @@ -15,6 +15,12 @@ def definir_pasos(self): return self._steps +def _configure_write_flags(monkeypatch, *, legacy=True, latest=False, history=False): + monkeypatch.setattr(base_etl, "WRITE_LEGACY_CSV", legacy) + monkeypatch.setattr(base_etl, "WRITE_LATEST_CSV", latest) + monkeypatch.setattr(base_etl, "WRITE_HISTORY_CSV", history) + + def test_ejecutar_continues_after_non_critical_extraction_error(monkeypatch): called = {"step2": False} @@ -65,6 +71,7 @@ def step2(): def test_guardar_csv_writes_file_when_route_exists(tmp_path, monkeypatch): destino = tmp_path / "out.csv" monkeypatch.setattr(base_etl, "ARCHIVOS_SALIDA", {"github_lenguajes": destino}) + _configure_write_flags(monkeypatch, legacy=True, latest=False, history=False) etl = DummyETL([]) df = pd.DataFrame( @@ -107,6 +114,30 @@ def validar_configuracion(self): def test_guardar_csv_actualiza_resumen_de_ejecucion(tmp_path, monkeypatch): destino = tmp_path / "out.csv" monkeypatch.setattr(base_etl, "ARCHIVOS_SALIDA", {"github_lenguajes": destino}) + _configure_write_flags(monkeypatch, legacy=True, latest=False, history=False) + + etl = DummyETL([]) + df = pd.DataFrame( + { + "lenguaje": ["Python", "Go"], + "repos_count": [10, 5], + "porcentaje": [66.6, 33.4], + } + ) + + etl.guardar_csv(df, "github_lenguajes") + + 
assert etl._run_summary["rows_written"] == 2 + assert len(etl._run_summary["files_written"]) == 1 + + +def test_guardar_csv_writes_legacy_and_latest_when_enabled(tmp_path, monkeypatch): + legacy_destino = tmp_path / "legacy" / "out.csv" + latest_destino = tmp_path / "latest" / "out.csv" + + monkeypatch.setattr(base_etl, "ARCHIVOS_SALIDA", {"github_lenguajes": legacy_destino}) + _configure_write_flags(monkeypatch, legacy=True, latest=True, history=False) + monkeypatch.setattr(base_etl, "get_latest_output_path", lambda _nombre: latest_destino) etl = DummyETL([]) df = pd.DataFrame( @@ -119,5 +150,44 @@ def test_guardar_csv_actualiza_resumen_de_ejecucion(tmp_path, monkeypatch): etl.guardar_csv(df, "github_lenguajes") + assert legacy_destino.exists() + assert latest_destino.exists() assert etl._run_summary["rows_written"] == 2 + assert len(etl._run_summary["files_written"]) == 2 + + +def test_guardar_csv_writes_history_only_when_enabled(tmp_path, monkeypatch): + legacy_destino = tmp_path / "legacy" / "out.csv" + history_destino = ( + tmp_path + / "history" + / "github_lenguajes" + / "year=2026" + / "month=03" + / "day=01" + / "out.csv" + ) + + monkeypatch.setattr(base_etl, "ARCHIVOS_SALIDA", {"github_lenguajes": legacy_destino}) + _configure_write_flags(monkeypatch, legacy=False, latest=False, history=True) + monkeypatch.setattr( + base_etl, + "get_history_output_path", + lambda _nombre, fecha=None: history_destino, + ) + + etl = DummyETL([]) + df = pd.DataFrame( + { + "lenguaje": ["Python"], + "repos_count": [1], + "porcentaje": [100.0], + } + ) + + etl.guardar_csv(df, "github_lenguajes") + + assert not legacy_destino.exists() + assert history_destino.exists() + assert etl._run_summary["rows_written"] == 1 assert len(etl._run_summary["files_written"]) == 1 diff --git a/tests/test_csv_contract.py b/tests/test_csv_contract.py index 4eaa078..cbd3c7f 100644 --- a/tests/test_csv_contract.py +++ b/tests/test_csv_contract.py @@ -1,6 +1,10 @@ import pandas as pd -from 
config.csv_contract import CSV_SCHEMA_CONTRACT, get_contract_version +from config.csv_contract import ( + CSV_SCHEMA_CONTRACT, + get_contract_version, + get_logical_dataset_names, +) from config.settings import ARCHIVOS_SALIDA @@ -41,3 +45,10 @@ def test_contract_includes_column_types_for_core_outputs(): assert type_map, f"{key} no define column_types" for required_col in schema["required_columns"]: assert required_col in type_map, f"{key} no define tipo para {required_col}" + + +def test_get_logical_dataset_names_includes_core_outputs(): + logical_names = get_logical_dataset_names() + assert "trend_score" in logical_names + assert "github_lenguajes" in logical_names + assert "so_volumen" in logical_names diff --git a/tests/test_data_product_contract.py b/tests/test_data_product_contract.py new file mode 100644 index 0000000..f4dd0c9 --- /dev/null +++ b/tests/test_data_product_contract.py @@ -0,0 +1,108 @@ +from config.data_product_contract import ( + build_dataset_manifest, + build_run_manifest, + get_data_product_contract_version, + is_valid_iso_utc, + is_valid_semver, + validate_dataset_manifest, + validate_run_manifest, +) + + +VALID_HASH = "a" * 64 +VALID_RUN_ID = "run-2026-03-01T08:00:00Z" +VALID_GENERATED_AT = "2026-03-01T08:00:00Z" +VALID_WINDOW_START = "2026-02-22T00:00:00Z" +VALID_WINDOW_END = "2026-03-01T00:00:00Z" + + +def _build_valid_manifest(): + dataset = build_dataset_manifest( + dataset_logical_name="trend_score", + version_semver="1.0.0", + source_run_id=VALID_RUN_ID, + schema_hash=VALID_HASH, + row_count=25, + quality_status="pass", + latest_path="datos/latest/trend_score.csv", + history_path="datos/history/trend_score/year=2026/month=03/day=01/part-0000.parquet", + generated_at_utc=VALID_GENERATED_AT, + ) + return build_run_manifest( + run_id=VALID_RUN_ID, + git_sha="d8d7c8133c60fc11d8ccd104198e1e317903b565", + branch="feat/backend", + source_window_start_utc=VALID_WINDOW_START, + source_window_end_utc=VALID_WINDOW_END, + 
quality_gate_status="pass", + datasets=[dataset], + generated_at_utc=VALID_GENERATED_AT, + ) + + +def test_contract_version_is_defined(): + version = get_data_product_contract_version() + assert isinstance(version, str) + assert version.strip() + + +def test_validate_run_manifest_valid_case(): + run_manifest = _build_valid_manifest() + ok, errors = validate_run_manifest(run_manifest) + assert ok is True + assert errors == [] + + +def test_validate_run_manifest_invalid_quality_gate_status(): + run_manifest = _build_valid_manifest() + run_manifest["quality_gate_status"] = "unknown-status" + ok, errors = validate_run_manifest(run_manifest) + assert ok is False + assert any("quality_gate_status" in err for err in errors) + + +def test_validate_run_manifest_requires_datasets(): + run_manifest = _build_valid_manifest() + run_manifest["datasets"] = [] + ok, errors = validate_run_manifest(run_manifest) + assert ok is False + assert any("'datasets' no puede estar vacio" in err for err in errors) + + +def test_validate_dataset_manifest_detects_source_run_id_mismatch(): + dataset_manifest = _build_valid_manifest()["datasets"][0] + errors = validate_dataset_manifest(dataset_manifest, expected_run_id="another-run") + assert errors + assert any("source_run_id" in err for err in errors) + + +def test_validate_dataset_manifest_rejects_invalid_semver(): + dataset_manifest = _build_valid_manifest()["datasets"][0] + dataset_manifest["version_semver"] = "1.0" + errors = validate_dataset_manifest(dataset_manifest) + assert errors + assert any("SemVer" in err for err in errors) + + +def test_validate_dataset_manifest_allows_null_history_path_when_failed_quality(): + dataset_manifest = _build_valid_manifest()["datasets"][0] + dataset_manifest["quality_status"] = "fail" + dataset_manifest["history_path"] = None + errors = validate_dataset_manifest(dataset_manifest) + assert errors == [] + + +def test_validate_dataset_manifest_requires_valid_schema_hash(): + dataset_manifest = 
_build_valid_manifest()["datasets"][0] + dataset_manifest["schema_hash"] = "abc123" + errors = validate_dataset_manifest(dataset_manifest) + assert errors + assert any("schema_hash" in err for err in errors) + + +def test_semver_and_iso_helpers(): + assert is_valid_semver("1.0.0") is True + assert is_valid_semver("1.0") is False + assert is_valid_iso_utc("2026-03-01T08:00:00Z") is True + assert is_valid_iso_utc("2026/03/01 08:00:00") is False + diff --git a/tests/test_degradation_policy.py b/tests/test_degradation_policy.py new file mode 100644 index 0000000..a5bc71a --- /dev/null +++ b/tests/test_degradation_policy.py @@ -0,0 +1,50 @@ +import pytest + +from quality.degradation_policy import ( + DEFAULT_SOURCE_WEIGHTS, + evaluate_degradation_policy, +) + + +@pytest.mark.parametrize( + ("source_status", "expected_publish", "expected_quality", "expected_mode", "expected_count"), + [ + ({"github": True, "stackoverflow": True, "reddit": True}, True, "pass", "default", 3), + ({"github": True, "stackoverflow": True, "reddit": False}, True, "pass_with_warnings", "renormalized", 2), + ({"github": True, "stackoverflow": False, "reddit": False}, False, "fail", "unavailable", 1), + ({"github": False, "stackoverflow": False, "reddit": False}, False, "fail", "unavailable", 0), + ], +) +def test_degradation_matrix(source_status, expected_publish, expected_quality, expected_mode, expected_count): + decision = evaluate_degradation_policy(source_status) + assert decision["publish_allowed"] is expected_publish + assert decision["quality_gate_status"] == expected_quality + assert decision["weights_mode"] == expected_mode + assert decision["available_count"] == expected_count + + +def test_degradation_policy_renormalizes_weights_when_one_source_is_missing(): + decision = evaluate_degradation_policy({"github": True, "stackoverflow": False, "reddit": True}) + assert decision["publish_allowed"] is True + assert decision["weights_mode"] == "renormalized" + + effective = 
decision["effective_weights"] + assert set(effective.keys()) == {"github", "reddit"} + assert round(sum(effective.values()), 6) == 1.0 + assert effective["github"] > effective["reddit"] + + +def test_degradation_policy_handles_missing_keys_as_unavailable(): + decision = evaluate_degradation_policy({"github": True}) + assert decision["available_count"] == 1 + assert decision["publish_allowed"] is False + assert decision["effective_weights"] == {} + + +def test_degradation_policy_preserves_default_weights_with_all_sources(): + decision = evaluate_degradation_policy( + {"github": True, "stackoverflow": True, "reddit": True}, + default_weights=DEFAULT_SOURCE_WEIGHTS, + ) + assert decision["weights_mode"] == "default" + assert decision["effective_weights"] == DEFAULT_SOURCE_WEIGHTS diff --git a/tests/test_export_history_json.py b/tests/test_export_history_json.py new file mode 100644 index 0000000..70146d0 --- /dev/null +++ b/tests/test_export_history_json.py @@ -0,0 +1,94 @@ +import json +from pathlib import Path + +import export_history_json + + +def test_export_bridge_assets_generates_history_and_trend_json(tmp_path): + project_root = tmp_path + history_dir = project_root / "datos" / "history" / "trend_score" / "year=2026" / "month=02" / "day=22" + latest_dir = project_root / "datos" / "latest" + + history_dir.mkdir(parents=True, exist_ok=True) + latest_dir.mkdir(parents=True, exist_ok=True) + (project_root / "frontend" / "assets" / "data").mkdir(parents=True, exist_ok=True) + + trend_csv_content = ( + "ranking,tecnologia,github_score,so_score,reddit_score,trend_score,fuentes\n" + "1,Python,100,100,5.8,76.45,3\n" + "2,TypeScript,70.7,17.7,1.0,34.74,3\n" + ) + (history_dir / "trend_score.csv").write_text(trend_csv_content, encoding="utf-8") + (latest_dir / "trend_score.csv").write_text(trend_csv_content, encoding="utf-8") + + summary = export_history_json.export_bridge_assets(project_root) + + assert summary["files_written"] == 2 + history_index = project_root / 
"frontend" / "assets" / "data" / "history_index.json" + trend_history = project_root / "frontend" / "assets" / "data" / "trend_score_history.json" + + assert history_index.exists() + assert trend_history.exists() + + history_payload = json.loads(history_index.read_text(encoding="utf-8")) + trend_payload = json.loads(trend_history.read_text(encoding="utf-8")) + + assert history_payload["dataset_count"] >= 1 + assert any(dataset["dataset"] == "trend_score" for dataset in history_payload["datasets"]) + assert trend_payload["snapshot_count"] == 1 + assert trend_payload["snapshots"][0]["source_type"] == "history" + assert trend_payload["snapshots"][0]["top_10"][0]["tecnologia"] == "Python" + + +def test_build_trend_score_history_falls_back_to_latest_when_history_missing(tmp_path): + project_root = tmp_path + latest_dir = project_root / "datos" / "latest" + latest_dir.mkdir(parents=True, exist_ok=True) + (project_root / "frontend" / "assets" / "data").mkdir(parents=True, exist_ok=True) + + (latest_dir / "trend_score.csv").write_text( + ( + "ranking,tecnologia,github_score,so_score,reddit_score,trend_score,fuentes\n" + "1,Python,100,100,5.8,76.45,3\n" + "2,TypeScript,70.7,17.7,1.0,34.74,3\n" + ), + encoding="utf-8", + ) + + history_index = export_history_json.build_history_index(project_root) + trend_payload = export_history_json.build_trend_score_history(project_root, history_index) + + assert trend_payload["snapshot_count"] == 1 + assert trend_payload["snapshots"][0]["source_type"] == "latest" + assert len(trend_payload["series"]) == 2 + + +def test_build_trend_score_history_falls_back_to_latest_when_history_is_corrupted(tmp_path): + project_root = tmp_path + history_dir = project_root / "datos" / "history" / "trend_score" / "year=2026" / "month=02" / "day=22" + latest_dir = project_root / "datos" / "latest" + + history_dir.mkdir(parents=True, exist_ok=True) + latest_dir.mkdir(parents=True, exist_ok=True) + (project_root / "frontend" / "assets" / 
"data").mkdir(parents=True, exist_ok=True) + + # Corrupted history schema for trend snapshot (missing required columns). + (history_dir / "trend_score.csv").write_text( + "foo,bar\n1,2\n", + encoding="utf-8", + ) + (latest_dir / "trend_score.csv").write_text( + ( + "ranking,tecnologia,github_score,so_score,reddit_score,trend_score,fuentes\n" + "1,Python,100,100,5.8,76.45,3\n" + "2,TypeScript,70.7,17.7,1.0,34.74,3\n" + ), + encoding="utf-8", + ) + + history_index = export_history_json.build_history_index(project_root) + trend_payload = export_history_json.build_trend_score_history(project_root, history_index) + + assert trend_payload["snapshot_count"] == 1 + assert trend_payload["snapshots"][0]["source_type"] == "latest" + assert trend_payload["snapshots"][0]["top_10"][0]["tecnologia"] == "Python" diff --git a/tests/test_frontend_bridge_contract.py b/tests/test_frontend_bridge_contract.py new file mode 100644 index 0000000..8c25b98 --- /dev/null +++ b/tests/test_frontend_bridge_contract.py @@ -0,0 +1,21 @@ +from pathlib import Path + + +PROJECT_ROOT = Path(__file__).resolve().parent.parent +CSV_SERVICE_PATH = PROJECT_ROOT / "frontend" / "lib" / "services" / "csv_service.dart" +FEATURE_FLAGS_PATH = PROJECT_ROOT / "frontend" / "lib" / "config" / "feature_flags.dart" + + +def test_feature_flag_defaults_to_csv_behavior(): + content = FEATURE_FLAGS_PATH.read_text(encoding="utf-8") + assert "USE_HISTORY_BRIDGE_JSON" in content + assert "defaultValue: false" in content + + +def test_csv_service_declares_bridge_json_fallback_paths(): + content = CSV_SERVICE_PATH.read_text(encoding="utf-8") + + assert "loadTrendTemporalView" in content + assert "'source': 'bridge_json'" in content + assert "'source': 'csv_fallback'" in content + assert "Bridge JSON fallback to CSV" in content diff --git a/tests/test_schema_contract_utils.py b/tests/test_schema_contract_utils.py new file mode 100644 index 0000000..c128281 --- /dev/null +++ b/tests/test_schema_contract_utils.py @@ -0,0 +1,98 
@@ +import pytest + +from config.schema_contract_utils import ( + SEMVER_MAJOR, + SEMVER_MINOR, + SEMVER_PATCH, + aggregate_semver_bump, + canonicalize_schema_columns, + compute_schema_hash, + recommend_semver_bump, +) + + +def test_compute_schema_hash_is_deterministic_for_equivalent_columns(): + schema_a = [ + {"name": "technology", "type": "string", "nullable": False}, + {"name": "trend_score", "type": "float64", "nullable": False}, + {"name": "ranking", "type": "int64", "nullable": False}, + ] + schema_b = [ + {"name": "ranking", "type": "integer", "nullable": False}, + {"name": "trend_score", "type": "number", "nullable": False}, + {"name": "technology", "type": "str", "nullable": False}, + ] + + hash_a = compute_schema_hash(schema_a) + hash_b = compute_schema_hash(schema_b) + assert hash_a == hash_b + assert len(hash_a) == 64 + + +def test_compute_schema_hash_changes_when_semantic_schema_changes(): + baseline = [ + {"name": "technology", "type": "string", "nullable": False}, + {"name": "trend_score", "type": "number", "nullable": False}, + ] + with_nullable_change = [ + {"name": "technology", "type": "string", "nullable": True}, + {"name": "trend_score", "type": "number", "nullable": False}, + ] + with_type_change = [ + {"name": "technology", "type": "string", "nullable": False}, + {"name": "trend_score", "type": "integer", "nullable": False}, + ] + + baseline_hash = compute_schema_hash(baseline) + nullable_hash = compute_schema_hash(with_nullable_change) + type_hash = compute_schema_hash(with_type_change) + + assert baseline_hash != nullable_hash + assert baseline_hash != type_hash + + +def test_canonicalize_schema_columns_drops_invalid_entries_and_sorts(): + raw_columns = [ + {"name": "", "type": "string", "nullable": False}, + {"name": "Trend_Score", "type": "double", "nullable": False}, + {"name": "technology", "type": "str", "nullable": False}, + ] + canonical = canonicalize_schema_columns(raw_columns) + assert canonical == [ + {"name": "technology", 
"type": "string", "nullable": False}, + {"name": "trend_score", "type": "number", "nullable": False}, + ] + + +@pytest.mark.parametrize( + ("change_kind", "expected_bump"), + [ + ("remove_required_column", SEMVER_MAJOR), + ("rename_required_column", SEMVER_MAJOR), + ("change_type_incompatible", SEMVER_MAJOR), + ("tighten_nullability", SEMVER_MAJOR), + ("drop_dataset", SEMVER_MAJOR), + ("add_optional_column", SEMVER_MINOR), + ("add_required_column_with_default", SEMVER_MINOR), + ("add_non_breaking_quality_rule", SEMVER_MINOR), + ("add_partition_field_backward_compatible", SEMVER_MINOR), + ("fix_quality_rule_bug", SEMVER_PATCH), + ("metadata_only_change", SEMVER_PATCH), + ("backfill_without_schema_change", SEMVER_PATCH), + ], +) +def test_recommend_semver_bump_for_representative_changes(change_kind, expected_bump): + assert recommend_semver_bump(change_kind) == expected_bump + + +def test_recommend_semver_bump_rejects_unknown_changes(): + with pytest.raises(ValueError): + recommend_semver_bump("unknown_change") + + +def test_aggregate_semver_bump_uses_highest_required_priority(): + assert aggregate_semver_bump(["metadata_only_change", "add_optional_column"]) == SEMVER_MINOR + assert ( + aggregate_semver_bump(["metadata_only_change", "add_optional_column", "remove_required_column"]) + == SEMVER_MAJOR + ) diff --git a/tests/test_sync_assets.py b/tests/test_sync_assets.py index fca3680..0591706 100644 --- a/tests/test_sync_assets.py +++ b/tests/test_sync_assets.py @@ -79,3 +79,112 @@ def test_sincronizar_returns_summary(tmp_path, monkeypatch): assert summary["files_copied"] == 1 assert summary["errors"] == 0 + + +def test_sincronizar_prefers_latest_directory_when_available(tmp_path, monkeypatch): + project_root = tmp_path + backend_dir = project_root / "backend" + datos_dir = project_root / "datos" + latest_dir = datos_dir / "latest" + destino_dir = project_root / "frontend" / "assets" / "data" + + backend_dir.mkdir(parents=True) + datos_dir.mkdir(parents=True) + 
latest_dir.mkdir(parents=True) + + (datos_dir / "same.csv").write_text("v\nlegacy\n", encoding="utf-8") + (latest_dir / "same.csv").write_text("v\nlatest\n", encoding="utf-8") + + monkeypatch.setattr(sync_assets, "__file__", str(backend_dir / "sync_assets.py")) + + summary = sync_assets.sincronizar() + + assert summary["files_copied"] == 1 + assert (destino_dir / "same.csv").read_text(encoding="utf-8") == "v\nlatest\n" + assert summary["source_mode"] == "latest" + assert summary["source"].endswith(str(Path("datos") / "latest")) + + +def test_sincronizar_uses_latest_per_file_with_legacy_fallback(tmp_path, monkeypatch): + project_root = tmp_path + backend_dir = project_root / "backend" + datos_dir = project_root / "datos" + latest_dir = datos_dir / "latest" + destino_dir = project_root / "frontend" / "assets" / "data" + + backend_dir.mkdir(parents=True) + datos_dir.mkdir(parents=True) + latest_dir.mkdir(parents=True) + + (datos_dir / "github_lenguajes.csv").write_text("lang\nlegacy\n", encoding="utf-8") + (datos_dir / "trend_score.csv").write_text("score\nlegacy\n", encoding="utf-8") + (latest_dir / "trend_score.csv").write_text("score\nlatest\n", encoding="utf-8") + + monkeypatch.setattr(sync_assets, "__file__", str(backend_dir / "sync_assets.py")) + monkeypatch.setenv("EXPORT_HISTORY_BRIDGE_JSON", "0") + + summary = sync_assets.sincronizar() + + assert summary["files_copied"] == 2 + assert summary["source_mode"] == "mixed" + assert summary["source"] == "mixed(latest+legacy)" + assert (destino_dir / "trend_score.csv").read_text(encoding="utf-8") == "score\nlatest\n" + assert (destino_dir / "github_lenguajes.csv").read_text(encoding="utf-8") == "lang\nlegacy\n" + + +def test_sincronizar_generates_bridge_json_when_enabled(tmp_path, monkeypatch): + project_root = tmp_path + backend_dir = project_root / "backend" + datos_dir = project_root / "datos" + latest_dir = datos_dir / "latest" + history_dir = datos_dir / "history" / "trend_score" / "year=2026" / "month=02" / 
"day=22" + destino_dir = project_root / "frontend" / "assets" / "data" + + backend_dir.mkdir(parents=True) + latest_dir.mkdir(parents=True) + history_dir.mkdir(parents=True) + + trend_csv = ( + "ranking,tecnologia,github_score,so_score,reddit_score,trend_score,fuentes\n" + "1,Python,100,100,5.8,76.45,3\n" + ) + (latest_dir / "trend_score.csv").write_text(trend_csv, encoding="utf-8") + (history_dir / "trend_score.csv").write_text(trend_csv, encoding="utf-8") + + monkeypatch.setattr(sync_assets, "__file__", str(backend_dir / "sync_assets.py")) + monkeypatch.setenv("EXPORT_HISTORY_BRIDGE_JSON", "1") + + summary = sync_assets.sincronizar() + + assert summary["bridge_export_enabled"] is True + assert summary["bridge_files_written"] == 2 + assert (destino_dir / "history_index.json").exists() + assert (destino_dir / "trend_score_history.json").exists() + + +def test_sincronizar_skips_bridge_json_when_disabled(tmp_path, monkeypatch): + project_root = tmp_path + backend_dir = project_root / "backend" + datos_dir = project_root / "datos" + latest_dir = datos_dir / "latest" + destino_dir = project_root / "frontend" / "assets" / "data" + + backend_dir.mkdir(parents=True) + latest_dir.mkdir(parents=True) + (latest_dir / "trend_score.csv").write_text( + ( + "ranking,tecnologia,github_score,so_score,reddit_score,trend_score,fuentes\n" + "1,Python,100,100,5.8,76.45,3\n" + ), + encoding="utf-8", + ) + + monkeypatch.setattr(sync_assets, "__file__", str(backend_dir / "sync_assets.py")) + monkeypatch.setenv("EXPORT_HISTORY_BRIDGE_JSON", "0") + + summary = sync_assets.sincronizar() + + assert summary["bridge_export_enabled"] is False + assert summary["bridge_files_written"] == 0 + assert not (destino_dir / "history_index.json").exists() + assert not (destino_dir / "trend_score_history.json").exists() diff --git a/tests/test_trend_equivalence_v1_v2.py b/tests/test_trend_equivalence_v1_v2.py new file mode 100644 index 0000000..33dfa1b --- /dev/null +++ 
b/tests/test_trend_equivalence_v1_v2.py @@ -0,0 +1,111 @@ +import pandas as pd + +from trend_score import calculate_trend_score_legacy +from trend_score_duckdb import calcular_trend_score_duckdb + + +PESOS = { + "github": 0.40, + "stackoverflow": 0.35, + "reddit": 0.25, +} + + +def _sample_sources(): + df_github = pd.DataFrame( + { + "tecnologia": [ + "Python", + "TypeScript", + "JavaScript", + "Go", + "Rust", + "Java", + "C#", + "PHP", + "Kotlin", + "Swift", + "Ruby", + "Dart", + ], + "github_score": [100, 82, 77, 51, 44, 39, 33, 29, 26, 22, 19, 17], + } + ) + df_so = pd.DataFrame( + { + "tecnologia": [ + "Python", + "TypeScript", + "JavaScript", + "Go", + "Java", + "C#", + "PHP", + "Ruby", + "Scala", + "Elixir", + ], + "so_score": [100, 76, 72, 45, 52, 35, 31, 20, 15, 11], + } + ) + df_reddit = pd.DataFrame( + { + "tecnologia": [ + "Python", + "TypeScript", + "JavaScript", + "Rust", + "Go", + "Kubernetes", + "DevOps", + "AI/ML", + "Cloud", + "Dart", + ], + "reddit_score": [95, 70, 68, 50, 47, 30, 27, 45, 25, 22], + } + ) + return df_github, df_so, df_reddit + + +def _compare_scores(df_legacy, df_duckdb): + merged = df_legacy.merge( + df_duckdb, + on="tecnologia", + how="inner", + suffixes=("_legacy", "_duckdb"), + ) + merged["score_abs_diff"] = (merged["trend_score_legacy"] - merged["trend_score_duckdb"]).abs() + return merged + + +def test_equivalence_score_abs_error_threshold(): + df_github, df_so, df_reddit = _sample_sources() + legacy = calculate_trend_score_legacy(df_github, df_so, df_reddit) + duckdb = calcular_trend_score_duckdb(df_github, df_so, df_reddit, PESOS) + + comparison = _compare_scores(legacy, duckdb) + assert not comparison.empty + assert (comparison["score_abs_diff"] <= 0.01).all() + + +def test_equivalence_top10_overlap_threshold(): + df_github, df_so, df_reddit = _sample_sources() + legacy = calculate_trend_score_legacy(df_github, df_so, df_reddit) + duckdb = calcular_trend_score_duckdb(df_github, df_so, df_reddit, PESOS) + + top10_legacy = 
set(legacy.head(10)["tecnologia"]) + top10_duckdb = set(duckdb.head(10)["tecnologia"]) + overlap = len(top10_legacy.intersection(top10_duckdb)) / 10.0 + assert overlap >= 0.90 + + +def test_equivalence_ranking_delta_threshold(): + df_github, df_so, df_reddit = _sample_sources() + legacy = calculate_trend_score_legacy(df_github, df_so, df_reddit) + duckdb = calcular_trend_score_duckdb(df_github, df_so, df_reddit, PESOS) + + comparison = _compare_scores(legacy, duckdb) + comparison["ranking_delta"] = (comparison["ranking_legacy"] - comparison["ranking_duckdb"]).abs() + pct_within_delta_1 = (comparison["ranking_delta"] <= 1).sum() / len(comparison) + assert pct_within_delta_1 >= 0.90 diff --git a/tests/test_trend_score.py b/tests/test_trend_score.py index f131c2b..8326829 100644 --- a/tests/test_trend_score.py +++ b/tests/test_trend_score.py @@ -14,6 +14,7 @@ normalizar_scores, cargar_github, calcular_trend_score, + resolve_trend_engine, PESOS ) @@ -171,6 +172,37 @@ def test_empty_all_sources(self): assert result.empty + def test_resolve_unknown_engine_falls_back_to_legacy(self): + assert resolve_trend_engine("custom_engine") == "legacy" + + def test_calcular_trend_score_duckdb_engine(self): + df_github = pd.DataFrame( + { + "tecnologia": ["Python", "JavaScript"], + "github_score": [100.0, 50.0], + } + ) + df_so = pd.DataFrame( + { + "tecnologia": ["Python", "JavaScript"], + "so_score": [80.0, 60.0], + } + ) + df_reddit = pd.DataFrame( + { + "tecnologia": ["Python", "JavaScript"], + "reddit_score": [70.0, 40.0], + } + ) + + with patch("trend_score.cargar_github", return_value=df_github), \ + patch("trend_score.cargar_stackoverflow", return_value=df_so), \ + patch("trend_score.cargar_reddit", return_value=df_reddit): + result = calcular_trend_score(engine="duckdb") + + assert not result.empty + assert result.iloc[0]["tecnologia"] == "Python" + class TestCargarGitHub: """Tests for GitHub data loading and filtering in trend score.""" diff --git a/tests/test_validador.py 
b/tests/test_validador.py index e98630a..65b2527 100644 --- a/tests/test_validador.py +++ b/tests/test_validador.py @@ -1,6 +1,7 @@ import pandas as pd import pytest +import validador from exceptions import ETLValidationError from validador import validar_dataframe @@ -78,3 +79,71 @@ def test_validar_dataframe_strict_raises_on_invalid_type(): with pytest.raises(ETLValidationError): validar_dataframe(df, "github_lenguajes", strict=True, validate_types=True) + + +def test_validar_dataframe_pandera_warning_warn_only_returns_report(monkeypatch): + df = pd.DataFrame( + { + "repo_name": ["org/repo1", "org/repo2"], + "stars": [100, 90], + "contributors": [10, 8], + "language": ["Python", "Go"], + } + ) + + monkeypatch.setattr( + validador, + "run_pandera_quality_checks", + lambda _df, _name: [ + { + "dataset": "github_correlacion", + "severity": "warning", + "rule": "mock_warning_rule", + "message": "mock warning", + } + ], + ) + + report = validar_dataframe( + df, + "github_correlacion", + enable_pandera=True, + pandera_warn_only=True, + return_quality_report=True, + ) + + assert report["critical"] == 0 + assert report["warning"] == 1 + assert report["info"] == 0 + + +def test_validar_dataframe_pandera_critical_strict_raises(monkeypatch): + df = pd.DataFrame( + { + "repo_name": ["org/repo1", "org/repo2"], + "stars": [100, 90], + "contributors": [10, 8], + "language": ["Python", "Go"], + } + ) + + monkeypatch.setattr( + validador, + "run_pandera_quality_checks", + lambda _df, _name: [ + { + "dataset": "github_correlacion", + "severity": "critical", + "rule": "mock_critical_rule", + "message": "mock critical", + } + ], + ) + + with pytest.raises(ETLValidationError): + validar_dataframe( + df, + "github_correlacion", + enable_pandera=True, + pandera_warn_only=False, + ) diff --git a/tests/test_validate_csv_contract.py b/tests/test_validate_csv_contract.py index ebf9344..c219cb0 100644 --- a/tests/test_validate_csv_contract.py +++ b/tests/test_validate_csv_contract.py @@ 
-46,3 +46,89 @@ def test_validate_contract_warn_only_missing_file(tmp_path, monkeypatch): ok, messages = validate_csv_contract.validate_contract(strict=False) assert ok is True assert any("[WARN] test_csv" in m for m in messages) + + +def test_validate_contract_pandera_warning_does_not_block(tmp_path, monkeypatch): + csv_path = tmp_path / "a.csv" + pd.DataFrame({"col1": [1], "col2": ["x"]}).to_csv(csv_path, index=False) + + monkeypatch.setattr( + validate_csv_contract, + "CSV_SCHEMA_CONTRACT", + { + "test_csv": { + "required_columns": ["col1", "col2"], + "critical_columns": ["col1"], + "column_types": {"col1": "integer", "col2": "string"}, + } + }, + ) + monkeypatch.setattr(validate_csv_contract, "ARCHIVOS_SALIDA", {"test_csv": Path(csv_path)}) + + def fake_validar_dataframe(**_kwargs): + return { + "critical": 0, + "warning": 1, + "info": 0, + "issues": [ + { + "dataset": "test_csv", + "severity": "warning", + "rule": "mock_warning_rule", + "message": "mock warning", + } + ], + } + + monkeypatch.setattr(validate_csv_contract, "validar_dataframe", fake_validar_dataframe) + + ok, messages = validate_csv_contract.validate_contract( + strict=True, + enable_pandera=True, + pandera_warn_only=True, + ) + assert ok is True + assert any("quality warnings=1" in m for m in messages) + + +def test_validate_contract_pandera_critical_blocks_in_strict_mode(tmp_path, monkeypatch): + csv_path = tmp_path / "a.csv" + pd.DataFrame({"col1": [1], "col2": ["x"]}).to_csv(csv_path, index=False) + + monkeypatch.setattr( + validate_csv_contract, + "CSV_SCHEMA_CONTRACT", + { + "test_csv": { + "required_columns": ["col1", "col2"], + "critical_columns": ["col1"], + "column_types": {"col1": "integer", "col2": "string"}, + } + }, + ) + monkeypatch.setattr(validate_csv_contract, "ARCHIVOS_SALIDA", {"test_csv": Path(csv_path)}) + + def fake_validar_dataframe(**_kwargs): + return { + "critical": 1, + "warning": 0, + "info": 0, + "issues": [ + { + "dataset": "test_csv", + "severity": "critical", + 
"rule": "mock_critical_rule", + "message": "mock critical", + } + ], + } + + monkeypatch.setattr(validate_csv_contract, "validar_dataframe", fake_validar_dataframe) + + ok, messages = validate_csv_contract.validate_contract( + strict=True, + enable_pandera=True, + pandera_warn_only=False, + ) + assert ok is False + assert any("quality gate failed" in m for m in messages) diff --git a/tests/test_workflow_etl_contract.py b/tests/test_workflow_etl_contract.py new file mode 100644 index 0000000..035aeef --- /dev/null +++ b/tests/test_workflow_etl_contract.py @@ -0,0 +1,62 @@ +import re +from pathlib import Path + + +WORKFLOW_PATH = Path(__file__).resolve().parent.parent / ".github" / "workflows" / "etl_semanal.yml" + + +def _load_workflow_text(): + return WORKFLOW_PATH.read_text(encoding="utf-8") + + +def test_workflow_has_parallel_source_jobs_and_aggregate_dependency_graph(): + content = _load_workflow_text() + + for job_name in ("job_github", "job_stackoverflow", "job_reddit", "job_aggregate", "job_publish"): + assert f"{job_name}:" in content + + aggregate_needs_pattern = ( + r"job_aggregate:\s*(?:.|\n)*?needs:\s*" + r"\n\s*-\s*job_github" + r"\n\s*-\s*job_stackoverflow" + r"\n\s*-\s*job_reddit" + ) + assert re.search(aggregate_needs_pattern, content, flags=re.MULTILINE) + + +def test_workflow_artifact_handoff_contract_is_defined(): + content = _load_workflow_text() + + assert "name: github-data" in content + assert "name: stackoverflow-data" in content + assert "name: reddit-data" in content + assert "name: aggregate-data" in content + + assert "Download GitHub artifacts" in content + assert "Download StackOverflow artifacts" in content + assert "Download Reddit artifacts" in content + assert "Download aggregate artifacts" in content + assert "if-no-files-found: error" in content + assert "Missing required artifact file" in content + assert "Optional artifact missing (degraded mode may continue)" in content + + +def 
test_workflow_publish_gate_and_bridge_asset_paths(): + content = _load_workflow_text() + + assert "if: ${{ needs.job_aggregate.result == 'success' }}" in content + assert "frontend/assets/data/*.json" in content + assert "artifact_payload/frontend/assets/data/*.json" in content + assert "frontend/assets/data/github_lenguajes.csv" in content + assert "frontend/assets/data/so_volumen_preguntas.csv" in content + assert "frontend/assets/data/reddit_temas_emergentes.csv" in content + + +def test_workflow_enables_dual_write_and_bridge_flags(): + content = _load_workflow_text() + + assert 'DATA_WRITE_LEGACY_CSV: "1"' in content + assert 'DATA_WRITE_LATEST_CSV: "1"' in content + assert 'DATA_WRITE_HISTORY_CSV: "1"' in content + assert 'EXPORT_HISTORY_BRIDGE_JSON: "1"' in content + assert 'TREND_SCORE_ENGINE: "duckdb"' in content